Source code for pyaerocom.io.read_aeronet_sunv3

import gzip
import logging
import os
import pathlib
import shutil
import tempfile

import numpy as np
import pandas as pd

from pyaerocom import const
from pyaerocom.aux_var_helpers import calc_ang4487aer, calc_od550aer, calc_od550lt1ang
from pyaerocom.exceptions import AeronetReadError
from pyaerocom.io.readaeronetbase import ReadAeronetBase
from pyaerocom.stationdata import StationData

logger = logging.getLogger(__name__)


class ReadAeronetSunV3(ReadAeronetBase):
    """Interface for reading Aeronet direct sun version 3 Level 1.5 and 2.0 data

    .. seealso::

        Base classes :class:`ReadAeronetBase` and :class:`ReadUngriddedBase`
    """

    #: Mask for identifying datafiles
    _FILEMASK = "*.lev*"

    #: version log of this class (for caching)
    __version__ = "0.12_" + ReadAeronetBase.__baseversion__

    #: Name of dataset (OBS_ID)
    DATA_ID = const.AERONET_SUN_V3L2_AOD_DAILY_NAME

    #: List of all datasets supported by this interface
    SUPPORTED_DATASETS = [
        const.AERONET_SUN_V3L15_AOD_DAILY_NAME,
        const.AERONET_SUN_V3L15_AOD_ALL_POINTS_NAME,
        const.AERONET_SUN_V3L2_AOD_DAILY_NAME,
        const.AERONET_SUN_V3L2_AOD_ALL_POINTS_NAME,
    ]

    #: dictionary assigning temporal resolution flags for supported datasets
    #: that are provided in a defined temporal resolution
    TS_TYPES = {
        const.AERONET_SUN_V3L15_AOD_DAILY_NAME: "daily",
        const.AERONET_SUN_V3L2_AOD_DAILY_NAME: "daily",
    }

    #: default variables for read method
    DEFAULT_VARS = ["od550aer", "ang4487aer"]

    #: value corresponding to invalid measurement
    # NAN_VAL = -9999.
    NAN_VAL = -999.0

    #: Mappings for identifying variables in file
    VAR_PATTERNS_FILE = {"AOD_([0-9]*)nm": "od*aer"}

    #: dictionary specifying the file column names (values) for each Aerocom
    #: variable (keys)
    VAR_NAMES_FILE = {}
    VAR_NAMES_FILE["od340aer"] = "AOD_340nm"
    VAR_NAMES_FILE["od440aer"] = "AOD_440nm"
    VAR_NAMES_FILE["od500aer"] = "AOD_500nm"
    # VAR_NAMES_FILE['od865aer'] = 'AOD_865nm'
    VAR_NAMES_FILE["od870aer"] = "AOD_870nm"
    VAR_NAMES_FILE["ang4487aer"] = "440-870_Angstrom_Exponent"

    #: dictionary specifying the file column names (values) for each
    #: metadata key (cf. attributes of :class:`StationData`, e.g.
    #: 'station_name', 'longitude', 'latitude', 'altitude')
    META_NAMES_FILE = {}
    META_NAMES_FILE["data_quality_level"] = "Data_Quality_Level"
    META_NAMES_FILE["instrument_number"] = "AERONET_Instrument_Number"
    META_NAMES_FILE["station_name"] = "AERONET_Site"
    META_NAMES_FILE["latitude"] = "Site_Latitude(Degrees)"
    META_NAMES_FILE["longitude"] = "Site_Longitude(Degrees)"
    META_NAMES_FILE["altitude"] = "Site_Elevation(m)"
    META_NAMES_FILE["date"] = "Date(dd:mm:yyyy)"
    META_NAMES_FILE["time"] = "Time(hh:mm:ss)"
    META_NAMES_FILE["day_of_year"] = "Day_of_Year"

    META_NAMES_FILE_ALT = {"AERONET_Site": ["AERONET_Site_Name"]}

    #: dictionary containing information about additionally required variables
    #: for each auxiliary variable (i.e. each variable that is not provided
    #: by the original data but computed on import)
    AUX_REQUIRES = {
        "ang44&87aer": ["od440aer", "od870aer"],
        "od550aer": ["od440aer", "od500aer", "ang4487aer"],
        "od550lt1ang": ["od440aer", "od500aer", "ang4487aer"],
        "proxyod550aerh2o": ["od440aer", "od500aer", "ang4487aer"],
        "proxyod550bc": ["od440aer", "od500aer", "ang4487aer"],
        "proxyod550dust": ["od440aer", "od500aer", "ang4487aer"],
        "proxyod550nh4": ["od440aer", "od500aer", "ang4487aer"],
        "proxyod550oa": ["od440aer", "od500aer", "ang4487aer"],
        "proxyod550so4": ["od440aer", "od500aer", "ang4487aer"],
        "proxyod550ss": ["od440aer", "od500aer", "ang4487aer"],
        "proxyod550no3": ["od440aer", "od500aer", "ang4487aer"],
        "proxyzaerosol": ["od440aer", "od500aer", "ang4487aer"],
        "proxyzdust": ["od440aer", "od500aer", "ang4487aer"],
    }

    #: Functions that are used to compute additional variables (i.e. one
    #: for each variable defined in AUX_REQUIRES)
    AUX_FUNS = {
        "ang44&87aer": calc_ang4487aer,
        "od550aer": calc_od550aer,
        "od550lt1ang": calc_od550lt1ang,
        "proxyod550aerh2o": calc_od550aer,
        "proxyod550bc": calc_od550aer,
        "proxyod550dust": calc_od550aer,
        "proxyod550nh4": calc_od550aer,
        "proxyod550oa": calc_od550aer,
        "proxyod550so4": calc_od550aer,
        "proxyod550ss": calc_od550aer,
        "proxyod550no3": calc_od550aer,
        "proxyzaerosol": calc_od550aer,
        "proxyzdust": calc_od550aer,
    }

    UNITS = {
        "proxyzdust": "km",
        "proxyzaerosol": "km",
    }

    #: List of variables that are provided by this dataset (will be extended
    #: by auxiliary variables on class init, for details see __init__ method of
    #: base class ReadUngriddedBase)
    PROVIDES_VARIABLES = list(VAR_NAMES_FILE)
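
    # Background note (illustrative, not part of the original source): the
    # helpers wired into AUX_FUNS implement standard Angstrom-exponent
    # relations between AOD channels. Conceptually,
    #
    #   ang4487aer = -ln(od440aer / od870aer) / ln(440 / 870)
    #
    # and calc_od550aer shifts a measured AOD to 550 nm via the power law
    #
    #   od550aer = od500aer * (550 / 500) ** (-ang4487aer)
    #
    # falling back to the 440 nm channel when the 500 nm column is missing.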
    def read_file(self, filename, vars_to_retrieve=None, vars_as_series=False):
        """Read Aeronet Sun V3 level 1.5 or 2 file

        Parameters
        ----------
        filename : str
            absolute path to filename to read
        vars_to_retrieve : :obj:`list`, optional
            list of str with variable names to read. If None, use
            :attr:`DEFAULT_VARS`
        vars_as_series : bool
            if True, the data columns of all variables in the result
            dictionary are converted into pandas Series objects

        Returns
        -------
        StationData
            dict-like object containing results
        """
        if vars_to_retrieve is None:
            vars_to_retrieve = self.DEFAULT_VARS
        # implemented in base class
        vars_to_read, vars_to_compute = self.check_vars_to_retrieve(vars_to_retrieve)

        # create empty data object (is dictionary with extended functionality)
        data_out = StationData()
        data_out.data_id = self.data_id

        # create empty arrays for meta information
        for item in self.META_NAMES_FILE:
            data_out[item] = []

        # Iterate over the lines of the file
        self.logger.info(f"Reading file {filename}")

        # enable alternative reading of .gz files here to save space on the file system
        suffix = pathlib.Path(filename).suffix
        tmp_name = filename
        if suffix == ".gz":
            f_out = tempfile.NamedTemporaryFile(delete=False)
            with gzip.open(filename, "r") as f_in:
                shutil.copyfileobj(f_in, f_out)
            filename = f_out.name
            f_out.close()
        try:
            with open(filename) as in_file:
                lines = in_file.readlines()
        except UnicodeDecodeError:
            with open(filename, encoding="ISO-8859-1") as in_file:
                lines = in_file.readlines()
        except OSError:
            # faulty gzip file, but also the gzip class raises some exceptions
            if suffix == ".gz":
                os.remove(f_out.name)
            raise AeronetReadError(f"gzip error in file {tmp_name}")

        _lines_ignored = []
        line_idx = 4
        _lines_ignored.append(lines[0 : line_idx - 1])

        # PI line
        dummy_arr = lines[line_idx].strip().split(";")
        line_idx += 1
        data_out["PI"] = dummy_arr[0].split("=")[1]
        data_out["PI_email"] = dummy_arr[1].split("=")[1]
        data_out["ts_type"] = self.TS_TYPE

        data_type_comment = lines[line_idx]
        line_idx += 1
        _lines_ignored.append(data_type_comment)

        self.logger.debug(f"Data type comment: {data_type_comment}")

        # put together a dict with the header string as key and the index
        # number as value so that we can access the index number via the
        # header string
        col_index_str = lines[line_idx]
        line_idx += 1
        if col_index_str != self._last_col_index_str:
            self.logger.info("Header has changed, reloading col_index map")
            self._update_col_index(col_index_str)
        col_index = self.col_index

        # dependent on the station, some of the required input variables
        # may not be provided in the data file. These will be ignored
        # in the following list that iterates over all data rows and will
        # be filled below, with vectors containing NaNs after the file
        # reading loop
        vars_available = {}
        for var in vars_to_read:
            data_out[var] = []
            if var in col_index:
                vars_available[var] = col_index[var]
            else:
                self.logger.warning(
                    f"Variable {var} not available in file {os.path.basename(filename)}"
                )
        pl = None
        for i, line in enumerate(lines[line_idx:]):
            # process line
            dummy_arr = line.split(self.COL_DELIM)
            if pl is not None and len(dummy_arr) != len(pl):
                self.logger.warning(f"Data line {i} in {filename} is corrupt, skipping...")
                continue

            # copy the meta data (array of type string)
            for var in self.META_NAMES_FILE:
                try:
                    val = dummy_arr[col_index[var]]
                except IndexError as e:
                    self.logger.warning(repr(e))
                try:
                    # e.g. lon, lat, altitude
                    val = float(val)
                except Exception:
                    pass
                data_out[var].append(val)
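
            # Worked example (illustrative): a row with date "01:06:2019"
            # and time "12:00:00" is rearranged below into the ISO 8601
            # string "2019-06-01T12:00:00", which np.datetime64 parses
            # directly.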
            # This uses the numpy datetime64 functions that e.g. also
            # support Months as a time step for timedelta
            # Build a proper ISO 8601 UTC date string
            day, month, year = dummy_arr[col_index["date"]].split(":")
            datestring = "-".join([year, month, day])
            datestring = "T".join([datestring, dummy_arr[col_index["time"]]])
            # NOTE JGLISS: parsing timezone offset was removed on 22/2/19
            # since it is deprecated in recent numpy versions, for details
            # see https://www.numpy.org/devdocs/reference/arrays.datetime.html#changes-with-numpy-1-11
            # datestring = '+'.join([datestring, '00:00'])
            data_out["dtime"].append(np.datetime64(datestring))

            for var, idx in vars_available.items():
                val = np.float_(dummy_arr[idx])
                if val == self.NAN_VAL:
                    val = np.nan
                data_out[var].append(val)
            pl = dummy_arr

        # remove the temp file in case the input file was a gz file
        if suffix == ".gz":
            os.remove(f_out.name)

        # convert all lists to numpy arrays
        data_out["dtime"] = np.asarray(data_out["dtime"])

        for item in self.META_NAMES_FILE:
            data_out[item] = np.asarray(data_out[item])

        for var in vars_to_read:
            if var in vars_available:
                array = np.asarray(data_out[var])
            else:
                array = np.zeros(len(data_out["dtime"])) * np.nan
            data_out[var] = array

        # compute additional variables (if applicable)
        data_out = self.compute_additional_vars(data_out, vars_to_compute)

        # convert data vectors to pandas.Series (if applicable)
        if vars_as_series:
            for var in vars_to_read + vars_to_compute:
                if var in vars_to_retrieve:
                    data_out[var] = pd.Series(data_out[var], index=data_out["dtime"])
                else:
                    del data_out[var]

        self.logger.debug(f"The following lines were ignored: {_lines_ignored}")
        return data_out
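

if __name__ == "__main__":
    # Minimal usage sketch (illustrative, not part of the original module).
    # The file path below is hypothetical; any Aeronet Sun V3 file matching
    # _FILEMASK ("*.lev*") would be read the same way.
    reader = ReadAeronetSunV3()
    station = reader.read_file(
        "/path/to/Example_Site.lev20",  # hypothetical path
        vars_to_retrieve=["od550aer", "ang4487aer"],
        vars_as_series=True,
    )
    # with vars_as_series=True, each variable is a pandas Series indexed
    # by measurement time
    print(station["od550aer"].head())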