# Source code for pyaerocom.utils

from __future__ import annotations

import fnmatch
from pathlib import Path

import pandas as pd

from pyaerocom.io import ReadGridded



# [docs]
def create_varinfo_table(
    model_ids, vars_or_var_patterns, read_data: bool = False, sort_by_cols=("Var", "Model")
) -> pd.DataFrame:
    """Create an info table for a list of models based on variables

    The method iterates over all models in *model_ids* and creates an
    instance of :class:`ReadGridded` for each of them. Variable matches are
    searched based on input list *vars_or_var_patterns* (you may also use
    wildcards to specify a family of variables) and for each match the
    information below is collected. The search also includes variables that
    are not directly available in the model data but can be computed from
    other available variables. That is, all variables that are defined in
    :attr:`ReadGridded.AUX_REQUIRES`.

    The output table (DataFrame) then consists of the following columns:

        - Var: variable name
        - Model: model name
        - Years: available years
        - Freq: frequency
        - Vertical: information about vertical dimension (inferred from
          Aerocom file name)
        - At stations: data is at stations (inferred from filename)
        - AUX vars: Auxiliary variable required to compute Var (col 1). Only
          relevant for variables that are computed by the interface
        - Dim: number of dimensions (only retrieved if *read_data* is True)
        - Dim names: names of dimension coordinates (only retrieved if
          *read_data* is True)
        - Shape: Shape of data (only retrieved if *read_data* is True)
        - Read ok: reading was successful (only retrieved if *read_data*
          is True, otherwise always False)

    One row is added per matching variable pattern and per entry in the
    ``ts_type`` info of the matched variable. A model for which an
    :class:`OSError` is raised while it is processed contributes one
    additional row in which only the *Model* column is set; the error is
    printed and processing continues with the next model.

    Parameters
    ----------
    model_ids : list
        list of model ids to be analysed (can also be string -> single model)
    vars_or_var_patterns : list
        list of variables or variable patterns to be analysed (can also be
        string -> single variable or variable family)
    read_data : bool
        if True, more information about the imported data will be available
        in the table (e.g. no. of dimensions, names of dimension coords)
        but the routine will run longer since the data is imported
    sort_by_cols : list or tuple or str
        column sort order (use header names in listing above). Defaults
        to `('Var', 'Model')`. Pass None (or an empty sequence) to skip
        sorting.

    Returns
    -------
    pandas.DataFrame
        dataframe including result table (ready to be saved as csv or other
        tabular format or to be displayed in a jupyter notebook)

    Example
    -------
    >>> from pyaerocom.utils import create_varinfo_table
    >>> models = ['INCA-BCext_CTRL2016-PD',
    ...           'GEOS5-freegcm_CTRL2016-PD']
    >>> vars = ['ang4487aer', 'od550aer', 'ec*']
    >>> df = create_varinfo_table(models, vars)
    >>> print(df)
    """
    if isinstance(model_ids, str):
        model_ids = [model_ids]
    if isinstance(vars_or_var_patterns, str):
        vars_or_var_patterns = [vars_or_var_patterns]

    header = [
        "Var",
        "Model",
        "Years",
        "Freq",
        "Vertical",
        "At stations",
        "AUX vars",
        "Dim",
        "Dim names",
        "Shape",
        "Read ok",
    ]
    # Keys looked up in the per-variable info dict; they fill the columns
    # "Years" .. "AUX vars" in this order.
    table_cols = ["year", "ts_type", "vert_code", "is_at_stations", "aux_vars"]
    # Values used for "Dim", "Dim names", "Shape", "Read ok" whenever the data
    # was not (successfully) read.
    not_read = [None, None, None, False]

    result = []
    num_models = len(model_ids)
    # Count from 1 so that the progress message ends at "N of N".
    for count, model in enumerate(model_ids, start=1):
        print(f"At model: {model} ({count} of {num_models})")
        try:
            reader = ReadGridded(model)
            var_info = reader.get_var_info_from_files()
            for var_avail, info in var_info.items():
                for var in vars_or_var_patterns:
                    if not fnmatch.fnmatch(var_avail, var):
                        continue
                    for freq in info["ts_type"]:
                        sub_res = [var_avail, model]
                        # NOTE(review): the "Freq" column receives the whole
                        # info["ts_type"] value, not the single `freq` of this
                        # row -- confirm whether that is intended.
                        sub_res.extend(
                            info[key] if key in info else None for key in table_cols
                        )
                        if read_data:
                            try:
                                data = reader.read_var(
                                    var_avail, ts_type=freq, flex_ts_type=False
                                )
                                dim_names = [d.name() for d in data.grid.dim_coords]
                                sub_res.extend([data.ndim, dim_names, data.shape, True])
                            except Exception:
                                # Deliberate best effort: a failed read is
                                # recorded as "Read ok" == False instead of
                                # aborting the whole table.
                                sub_res.extend(not_read)
                        else:
                            sub_res.extend(not_read)
                        result.append(sub_res)

        except OSError as e:
            # Keep a placeholder row so the failing model shows up in the table.
            dummy = [None] * len(header)
            dummy[1] = model
            result.append(dummy)
            print(repr(e))

    df = pd.DataFrame(result, columns=header)
    if sort_by_cols:
        # sort_values treats a tuple as ONE column label, so normalise any
        # sequence to a list; a plain string is a single column name.
        if isinstance(sort_by_cols, str):
            sort_keys = [sort_by_cols]
        else:
            sort_keys = list(sort_by_cols)
        df.sort_values(sort_keys, inplace=True)
    return df




# [docs]
def print_file(path: Path | str):
    if isinstance(path, str):
        path = Path(path)
    if not path.exists():
        raise OSError("File not found...")
    if not path.is_file():
        raise ValueError(f"{path} is not a file")

    for line in path.read_text().splitlines():
        if line.strip():
            print(line)