Source code for pyaerocom.io.read_airnow

import codecs
import logging
import os
from glob import glob

import numpy as np
import pandas as pd
from tqdm import tqdm

from pyaerocom.exceptions import DataRetrievalError
from pyaerocom.io import ReadUngriddedBase
from pyaerocom.stationdata import StationData
from pyaerocom.ungriddeddata import UngriddedData

logger = logging.getLogger(__name__)


class ReadAirNow(ReadUngriddedBase):
    """
    Reading routine for North American AirNow observations
    """

    # Data type of files
    _FILETYPE = ".dat"

    # File search mask to recursively retrieve list of data files
    _FILEMASK = f"/monthly/*{_FILETYPE}"

    #: Version log of this class (for caching)
    __version__ = "0.12"

    #: Column delimiter
    FILE_COL_DELIM = "|"

    #: Columns in data files
    FILE_COL_NAMES = [
        "date",
        "time",
        "station_id",
        "station_name",
        "time_zone",
        "variable",
        "unit",
        "value",
        "institute",
    ]

    # number of columns per data row; will be used millions of times,
    # therefore store it
    FILE_COL_ROW_NUMBER = len(FILE_COL_NAMES)

    # column number of the variable name
    ROW_VAR_COL = 5

    #: Mapping of columns in station metadata file to pyaerocom standard
    STATION_META_MAP = {
        "aqsid": "station_id",
        "name": "station_name",
        "lat": "latitude",
        "lon": "longitude",
        "elevation": "altitude",
        "city": "city",
        "address": "address",
        "timezone": "timezone",
        "environment": "area_classification",
        "populationclass": "station_classification",
        "modificationdate": "modificationdate",
        "comment": "comment",
    }

    #: Conversion functions for metadata dtypes
    STATION_META_DTYPES = {
        "station_id": str,
        "station_name": str,
        "latitude": float,
        "longitude": float,
        "altitude": float,
        "city": str,
        "address": str,
        "timezone": str,
        "area_classification": str,
        "station_classification": str,
        "modificationdate": str,
        "comment": str,
    }

    # strings to be replaced in original station names
    REPLACE_STATNAME = {"&": "and", "/": " ", ":": " ", ".": " ", "'": ""}

    # years in timestamps in the files are 2-digit (e.g. 20 for 2020)
    BASEYEAR = 2000

    #: Name of dataset (OBS_ID)
    DATA_ID = "AirNow"

    #: List of all datasets supported by this interface
    SUPPORTED_DATASETS = [DATA_ID]

    #: Units found in data files
    UNIT_MAP = {
        "C": "celcius",
        "M/S": "m s-1",
        "MILLIBAR": "mbar",
        "MM": "mm",
        "PERCENT": "%",
        "PPB": "ppb",
        "PPM": "ppm",
        "UG/M3": "ug m-3",
        "WATTS/M2": "W m-2",
    }

    #: Variable names in data files
    VAR_MAP = {
        "concbc": "BC",
        "concpm10": "PM10",
        "concpm25": "PM2.5",
        "vmrco": "CO",
        "vmrnh3": "NH3",
        "vmrno": "NO",
        "vmrno2": "NO2",
        "vmrnox": "NOX",
        "vmrnoy": "NOY",
        "vmro3": "OZONE",
        "vmrso2": "SO2",
    }

    #: List of variables that are provided
    PROVIDES_VARIABLES = list(VAR_MAP)

    #: Default variables
    DEFAULT_VARS = PROVIDES_VARIABLES

    #: Frequency of measurements
    TS_TYPE = "hourly"

    #: File containing station metadata
    STAT_METADATA_FILENAME = "allStations_20191224.csv"

    def __init__(self, data_id=None, data_dir=None):
        super().__init__(data_id=data_id, data_dir=data_dir)

        self.make_datetime64_array = np.vectorize(self._date_time_str_to_datetime64)
        self._station_metadata = None

    @property
    def station_metadata(self):
        """Dictionary containing global metadata for each site"""
        if self._station_metadata is None:
            self._init_station_metadata()
        return self._station_metadata

    def _date_time_str_to_datetime64(self, date, time):
        """
        Convert date and time strings into a datetime64 object

        Parameters
        ----------
        date : str
            date string as mm/dd/yy as in data files
        time : str
            time of the day as HH:MM

        Returns
        -------
        datetime64[s]
        """
        mm, dd, yy = date.split("/")
        yr = str(self.BASEYEAR + int(yy))
        # returns as datetime64[s]
        return np.datetime64(f"{yr}-{mm}-{dd}T{time}:00")

    def _read_metadata_file(self):
        """
        Read station metadata file

        Returns
        -------
        cfg : pandas.DataFrame
            metadata dataframe
        """
        fn = os.path.join(self.data_dir, self.STAT_METADATA_FILENAME)
        cfg = pd.read_csv(fn, sep=",", converters={"aqsid": lambda x: str(x)})
        return cfg

    def _correct_station_name(self, station_name):
        """
        Remove unwanted characters from original station names

        Parameters
        ----------
        station_name : str
            original station name

        Returns
        -------
        str
            station name cleaned of characters defined in
            :attr:`REPLACE_STATNAME`
        """
        for search, replace in self.REPLACE_STATNAME.items():
            station_name = station_name.replace(search, replace)
        return station_name

    def _init_station_metadata(self):
        """
        Initiate metadata for all stations

        Returns
        -------
        dict
            dictionary with metadata dictionaries for all stations
        """
        cfg = self._read_metadata_file()
        meta_map = self.STATION_META_MAP

        cols = list(cfg.columns.values)
        col_idx = {}
        for from_meta, to_meta in meta_map.items():
            col_idx[to_meta] = cols.index(from_meta)

        arr = cfg.values
        dtypes = self.STATION_META_DTYPES
        stats = {}
        for row in arr:
            stat = {}
            for meta_key, col_num in col_idx.items():
                stat[meta_key] = dtypes[meta_key](row[col_num])
            sid = stat["station_id"]

            stat["station_name"] = self._correct_station_name(stat["station_name"])
            stat["data_id"] = self.data_id
            stat["ts_type"] = self.TS_TYPE
            stats[sid] = stat

        self._station_metadata = stats
        return stats
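    # Worked example of the two helpers above (illustrative inputs, not taken
    # from a real file): _date_time_str_to_datetime64("01/15/20", "13:00")
    # returns np.datetime64("2020-01-15T13:00:00"), and _correct_station_name
    # maps a name like "5th & Main St." to "5th and Main St " (note the
    # trailing space produced by the "." replacement in REPLACE_STATNAME).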
    def get_file_list(self):
        """
        Retrieve list of data files

        Returns
        -------
        list
        """
        basepath = self.data_dir
        pattern = f"{basepath}{self._FILEMASK}"
        files = sorted(glob(pattern))
        return files
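    # For example (hypothetical layout; actual filenames depend on the local
    # archive): with data_dir "/data/AirNow", the glob pattern above matches
    # files such as "/data/AirNow/monthly/202001.dat".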
    def _read_file(self, file):
        """
        Read one data file using :func:`pandas.read_csv`

        Parameters
        ----------
        file : str
            file path

        Returns
        -------
        df : pandas.DataFrame
            DataFrame containing the file data
        """
        # Try utf_8 and cp863 first, then determine the file encoding and
        # provide that to pandas. Determining the encoding every time is too
        # slow given the number of files the dataset consists of; just trying
        # a couple of encodings first speeds up reading by a factor of 5.
        # Make sure certain dtypes are set for a few columns
        # (e.g. station code as str).
        try:
            encoding = "utf_8"
            df = pd.read_csv(
                file,
                sep=self.FILE_COL_DELIM,
                names=self.FILE_COL_NAMES,
                encoding=encoding,
                on_bad_lines="skip",
                dtype={2: str, 4: float, 7: float},
            )
        except UnicodeDecodeError:
            encoding = "cp863"
            df = pd.read_csv(
                file,
                sep=self.FILE_COL_DELIM,
                names=self.FILE_COL_NAMES,
                encoding=encoding,
                on_bad_lines="skip",
                dtype={2: str, 4: float, 7: float},
            )
        except Exception:
            encoding = self.get_file_encoding(file)
            df = pd.read_csv(
                file,
                sep=self.FILE_COL_DELIM,
                names=self.FILE_COL_NAMES,
                encoding=encoding,
                on_bad_lines="skip",
                dtype={2: str, 4: float, 7: float},
            )
        return df

    def _read_files(self, files, vars_to_retrieve):
        """
        Read input variables from list of files

        Parameters
        ----------
        files : list
            list of data files
        vars_to_retrieve : list
            list of variables to retrieve

        Raises
        ------
        NotImplementedError
            if several timezones are assigned to the same station
        AttributeError
            if data unit is unknown

        Returns
        -------
        stats : list
            list of StationData objects
        """
        logger.info("Read AirNow data file(s)")

        file_vars_to_retrieve = [self.VAR_MAP[x] for x in vars_to_retrieve]

        stat_id_col = self.FILE_COL_NAMES.index("station_id")
        arrs = []
        unique_stat_ids = None
        for i in tqdm(range(len(files)), disable=None):
            fp = files[i]
            filedata = self._read_file(fp)
            for filevar in file_vars_to_retrieve:
                arrs.append(filedata[filedata["variable"] == filevar].values)
                # keep a running set of all station IDs seen so far
                if unique_stat_ids is None:
                    unique_stat_ids = np.unique(arrs[-1][:, stat_id_col])
                else:
                    try:
                        unique_stat_ids = np.union1d(
                            unique_stat_ids, np.unique(arrs[-1][:, stat_id_col])
                        )
                    except (ValueError, TypeError) as e:
                        logger.error(arrs[-1][:, stat_id_col])
                        raise DataRetrievalError(
                            f"file {fp}: error in creating unique station list"
                        ) from e
        if len(arrs) == 0:
            raise DataRetrievalError("None of the input variables could be found in input list")
        return self._filedata_to_statlist(arrs, vars_to_retrieve, unique_stat_ids=unique_stat_ids)

    def _filedata_to_statlist(
        self,
        arrs: list[np.ndarray],
        vars_to_retrieve: list[str],
        unique_stat_ids: list[str] = None,
    ) -> list[StationData]:
        """
        Convert loaded file data into a list of StationData objects

        Parameters
        ----------
        arrs : list
            list of numpy arrays extracted from each file
            (see :func:`_read_files`).
        vars_to_retrieve : list
            list of variables to be retrieved from input data.
        unique_stat_ids : list, optional
            unique station IDs collected in :func:`_read_files`.

        Returns
        -------
        stats : list
            list of :class:`StationData` objects, one for each var and station.
        """
        # doubling of RAM usage!
        data = np.concatenate(arrs)
        # so kill the input data right afterwards
        arrs = None
        logger.info("Converting filedata to list of StationData")
        stat_meta = self.station_metadata
        stat_ids = list(stat_meta)
        varcol = self.FILE_COL_NAMES.index("variable")
        statcol = self.FILE_COL_NAMES.index("station_id")
        tzonecol = self.FILE_COL_NAMES.index("time_zone")
        unitcol = self.FILE_COL_NAMES.index("unit")
        valcol = self.FILE_COL_NAMES.index("value")

        stats = []
        # shortcut for the limited testing dataset, because the call below
        # fails on empty input with:
        # ValueError: cannot call `vectorize` on size 0 inputs unless `otypes` is set
        try:
            dtime = self.make_datetime64_array(data[:, 0], data[:, 1])
        except ValueError:
            raise DataRetrievalError("None of the input variables could be found in input list")
        for var in vars_to_retrieve:
            # extract only this variable's data (should speed things up)
            var_in_file = self.VAR_MAP[var]
            mask = data[:, varcol] == var_in_file
            # another RAM consuming op; the mask could be reused instead
            # (for the price of readability...)
            subset = data[mask]
            dtime_subset = dtime[mask]
            # there are stations with an all-numeric station ID, but the dtype
            # hints in pd.read_csv made sure they are read as str...
            if unique_stat_ids is None:
                statlist = np.unique(subset[:, statcol])
            else:
                statlist = unique_stat_ids
            for stat_id in tqdm(statlist, desc=var, disable=None):
                if stat_id not in stat_ids:
                    continue
                statmask = subset[:, statcol] == stat_id
                if statmask.sum() == 0:
                    continue
                # RAM consuming op...
                statdata = subset[statmask]
                timestamps = dtime_subset[statmask]
                # timezone offsets (there's a half-hour time zone, so float!)
                # not sure why the astype(float) is needed in between, but it
                # does not work without...
                toffs = statdata[:, tzonecol].astype(float).astype("timedelta64[h]")
                timestamps += toffs
                stat = StationData(**stat_meta[stat_id])

                vals = statdata[:, valcol]
                units = np.unique(statdata[:, unitcol])
                # errors that did not occur in v0 but that may occur
                assert len(units) == 1
                assert units[0] in self.UNIT_MAP
                stat["dtime"] = timestamps
                stat["timezone"] = "UTC"
                stat[var] = vals
                unit = self.UNIT_MAP[units[0]]
                stat["var_info"][var] = dict(units=unit)
                stats.append(stat)
        return stats
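    # Mechanics of the timestamp shift above, as a standalone numpy sketch
    # (values are illustrative):
    #
    #   ts = np.array(["2020-01-15T13:00:00"], dtype="datetime64[s]")
    #   offs = np.array(["-6.0"]).astype(float).astype("timedelta64[h]")
    #   ts + offs  # -> ["2020-01-15T07:00:00"]
    #
    # Note that the cast to timedelta64[h] truncates fractional (half-hour)
    # offsets to whole hours.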
    def read_file(self, filename, vars_to_retrieve=None):
        """
        Read just the raw content of a file into a dict

        Parameters
        ----------
        filename : str
            absolute path to filename to read
        vars_to_retrieve : :obj:`list`, optional
            list of str with variable names to read. If None, use
            :attr:`DEFAULT_VARS`

        Returns
        -------
        dict
            dictionary with one entry per variable, each containing the
            retrieved line data and the number of lines retrieved

        Raises
        ------
        DataRetrievalError
            if the file encoding could not be determined
        """
        # unfortunately the files have different encodings, so we have to try
        # them on the entire file first
        ret_data = {}
        encoding = "utf_8"
        try:
            with open(filename, encoding=encoding) as infile:
                linedata = infile.readlines()
        except UnicodeDecodeError:
            encoding = "cp863"
            with open(filename, encoding=encoding) as infile:
                linedata = infile.readlines()
        except Exception:
            logger.info(f"unforeseen encoding in file {filename}! Trying to determine encoding")
            try:
                encoding = self.get_file_encoding(filename)
                logger.info(f"determined encoding: {encoding}")
                with open(filename, encoding=encoding) as infile:
                    linedata = infile.readlines()
            except MemoryError:
                logger.info("could not determine encoding due to MemoryError: skipping file...")
                raise DataRetrievalError(
                    f"could not determine encoding of file {filename} due to MemoryError"
                )

        if vars_to_retrieve is None:
            vars_to_retrieve = self.DEFAULT_VARS
        file_vars_to_retrieve = [self.VAR_MAP[x] for x in vars_to_retrieve]
        # reverse lookup: file variable name -> pyaerocom variable name
        file_var_to_var = {self.VAR_MAP[x]: x for x in vars_to_retrieve}

        tot_lines_retrieved = 0
        for var in vars_to_retrieve:
            ret_data[var] = {}
            ret_data[var]["lines_retrieved"] = 0
            ret_data[var]["linedata"] = []

        for line in linedata:
            line_arr = line.split(self.FILE_COL_DELIM)
            # skip malformed lines
            if len(line_arr) != self.FILE_COL_ROW_NUMBER:
                continue
            # skip lines that do not contain data of an interesting variable
            if line_arr[self.ROW_VAR_COL] not in file_vars_to_retrieve:
                continue
            # make the numerical values numerical here already
            line_arr[4] = float(line_arr[4])
            line_arr[7] = float(line_arr[7])
            # file the line under the variable it belongs to
            var = file_var_to_var[line_arr[self.ROW_VAR_COL]]
            ret_data[var]["linedata"].append(line_arr)
            ret_data[var]["lines_retrieved"] += 1
            tot_lines_retrieved += 1

        return ret_data
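    # Shape of the dict returned by read_file, as a sketch (illustrative
    # values only):
    #
    #   {"concpm25": {"lines_retrieved": 1,
    #                 "linedata": [["01/15/20", "13:00", "000010101",
    #                               "Example", -6.0, "PM2.5", "UG/M3",
    #                               10.0, "Example"]]}}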
    def read(self, vars_to_retrieve=None, first_file=None, last_file=None):
        """
        Read variable data

        Parameters
        ----------
        vars_to_retrieve : str or list, optional
            List of variables to be retrieved. The default is None.
        first_file : int, optional
            Index of the first file to be read. The default is None, in which
            case index 0 in the file list is used.
        last_file : int, optional
            Index of the last file to be read. The default is None, in which
            case the last index in the file list is used.

        Returns
        -------
        data : UngriddedData
            loaded data object.
        """
        if vars_to_retrieve is None:
            vars_to_retrieve = self.DEFAULT_VARS
        elif isinstance(vars_to_retrieve, str):
            vars_to_retrieve = [vars_to_retrieve]

        self._init_station_metadata()
        files = self.get_file_list()
        if first_file is None:
            first_file = 0
        if last_file is None:
            last_file = len(files)
        files = files[first_file:last_file]
        stats = self._read_files(files, vars_to_retrieve)
        data = UngriddedData.from_station_data(
            stats, add_meta_keys=["timezone", "area_classification", "station_classification"]
        )
        return data
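    # Hypothetical usage sketch (the data_dir path is an assumption; it must
    # point to a local AirNow archive containing monthly/*.dat files and the
    # station metadata CSV):
    #
    #   reader = ReadAirNow(data_dir="/data/AirNow")
    #   data = reader.read(vars_to_retrieve="concpm25", first_file=0, last_file=2)
    #   print(data)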
    # the following has been adapted from
    # https://stackoverflow.com/questions/44402983/how-to-read-the-file-without-encoding-and-extract-desired-urls-with-python3
    def get_file_bom_encoding(self, filename):
        """Detect the file encoding from a byte order mark (BOM), if present"""
        with open(filename, "rb") as openfileobject:
            line = str(openfileobject.readline())
            # compare the escaped string representation of the first bytes
            # with the known BOM signatures; check the longer UTF-32
            # signatures first, since the UTF-32 LE BOM starts with the
            # UTF-16 LE BOM
            if line[2:18] == str(codecs.BOM_UTF32_BE).split("'")[1]:
                return "utf_32"
            if line[2:18] == str(codecs.BOM_UTF32_LE).split("'")[1]:
                return "utf_32"
            if line[2:14] == str(codecs.BOM_UTF8).split("'")[1]:
                return "utf_8"
            if line[2:10] == str(codecs.BOM_UTF16_BE).split("'")[1]:
                return "utf_16"
            if line[2:10] == str(codecs.BOM_UTF16_LE).split("'")[1]:
                return "utf_16"
        return ""
    def get_all_file_encodings(self, filename):
        """Return a list of all encodings that can decode the entire file"""
        encoding_list = []
        encodings = (
            "utf_8", "cp863", "utf_16", "utf_16_le", "utf_16_be", "utf_32",
            "utf_32_be", "utf_32_le", "cp850", "cp437", "cp852", "cp1252",
            "cp1250", "ascii", "utf_8_sig", "big5", "big5hkscs", "cp037",
            "cp424", "cp500", "cp720", "cp737", "cp775", "cp855", "cp856",
            "cp857", "cp858", "cp860", "cp861", "cp862", "cp864", "cp865",
            "cp866", "cp869", "cp874", "cp875", "cp932", "cp949", "cp950",
            "cp1006", "cp1026", "cp1140", "cp1251", "cp1253", "cp1254",
            "cp1255", "cp1256", "cp1257", "cp1258", "euc_jp", "euc_jis_2004",
            "euc_jisx0213", "euc_kr", "gb2312", "gbk", "gb18030", "hz",
            "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
            "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr", "latin_1",
            "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
            "iso8859_7", "iso8859_8", "iso8859_9", "iso8859_10", "iso8859_13",
            "iso8859_14", "iso8859_15", "iso8859_16", "johab", "koi8_r",
            "koi8_u", "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2",
            "mac_roman", "mac_turkish", "ptcp154", "shift_jis",
            "shift_jis_2004", "shift_jisx0213",
        )
        for e in encodings:
            # try to decode the whole file with each candidate encoding;
            # a decoding error simply rules the encoding out
            try:
                with codecs.open(filename, "r", encoding=e) as fh:
                    fh.readlines()
            except UnicodeError:
                continue
            encoding_list.append([e])
        return encoding_list
    def get_file_encoding(self, filename):
        """Determine the encoding of a file, trying a BOM check first"""
        file_encoding = self.get_file_bom_encoding(filename)
        if file_encoding != "":
            return file_encoding
        encoding_list = self.get_all_file_encodings(filename)
        file_encoding = str(encoding_list[0][0])
        # strip an explicit endianness suffix (e.g. utf_16_le -> utf_16)
        if file_encoding[-3:] == "_be" or file_encoding[-3:] == "_le":
            file_encoding = file_encoding[:-3]
        return file_encoding
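# Minimal, self-contained demonstration of the encoding helpers above (a
# sketch: it writes a temporary BOM-prefixed file with one illustrative data
# line and checks the detected encoding; the instance is created via __new__
# only because the helpers need no reader state):
if __name__ == "__main__":
    import tempfile

    with tempfile.NamedTemporaryFile(suffix=".dat", delete=False) as tmp:
        tmp.write(
            codecs.BOM_UTF8
            + b"01/15/20|13:00|000010101|Example|-6|PM2.5|UG/M3|10.0|Example\n"
        )
        fname = tmp.name
    reader = ReadAirNow.__new__(ReadAirNow)
    print(reader.get_file_encoding(fname))  # BOM detected -> "utf_8"
    os.unlink(fname)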