# Source code for metobs_toolkit.gap

from __future__ import annotations

import logging
from typing import Union, TYPE_CHECKING
import numpy as np
import pandas as pd

from metobs_toolkit.obstypes import Obstype
from metobs_toolkit.modeltimeseries import ModelTimeSeries
from metobs_toolkit.settings_collection import Settings
from metobs_toolkit.backend_collection.argumentcheckers import (
    fmt_timedelta_arg,
)

from metobs_toolkit.backend_collection.df_helpers import convert_to_numeric_series
import metobs_toolkit.backend_collection.printing_collection as printing

import metobs_toolkit.gf_collection.gf_common_methods as gf_methods
from metobs_toolkit.gf_collection.debias_gapfill import fill_regular_debias
from metobs_toolkit.gf_collection.diurnal_debias_gapfill import (
    fill_with_diurnal_debias,
    fill_with_weighted_diurnal_debias,
)

from metobs_toolkit.backend_collection.decorators import log_entry
from metobs_toolkit.backend_collection.dev_collection import copy_doc
from metobs_toolkit.backend_collection.dataframe_constructors import gap_df

if TYPE_CHECKING:
    from metobs_toolkit.sensordata import SensorData

# Module-level logger; the name (including the angle brackets) is the literal
# logger name requested here.
logger = logging.getLogger("<metobs_toolkit>")

# Fill-status labels returned by `Gap.fillstatus` and compared against in
# `Gap.flag_can_be_filled`.
_unfilled_label = "unfilled"  # no gapfill has been attempted yet
_failed_label = "failed gapfill"  # a gapfill was attempted, all records are still NaN
_successful_label = "successful gapfill"  # a gapfill was attempted, no NaN remains
_partially_successful_label = "partially successful gapfill"  # attempted, some NaN remain


[docs] class Gap: """ Represents a gap in observational data for a specific station and observation type. Parameters ---------- gaprecords : pd.DatetimeIndex The datetime index representing the gap records. obstype : Obstype The type of observation (e.g., temperature, humidity). stationname : str The name of the station where the gap occurred. """
[docs] def __init__( self, gaprecords: pd.DatetimeIndex, obstype: Obstype, stationname: str, ): """Initialize a Gap object.""" gaprecords.name = "datetime" self._records = pd.Series(data=np.nan, index=gaprecords, name="value") self._labels = pd.Series( data=Settings.get("label_def.regular_gap.label"), index=gaprecords, name="label", ) self._extra_info = pd.Series( data="no details", index=gaprecords, name="details" ) self._fillkwargs = {} self._obstype = obstype self._stationname = stationname
def __repr__(self): """Instance representation.""" return f"{type(self).__name__}(station={self.stationname}, obstype={self.obstype.name}, start={self.start_datetime}, end={self.end_datetime}, status={self.fillstatus})" @property def records(self) -> pd.Series: """Return the records of the gap.""" return convert_to_numeric_series(self._records, datadtype=np.float32) @property def obstype(self) -> Obstype: """Return the observation type.""" return self._obstype @property def stationname(self) -> str: """Return the station name.""" return self._stationname @property def fillsettings(self) -> dict: """ Return the settings used for filling the gap. The settings are the kwargs (keyword arguments) used in the gapfill methods. Returns ------- dict A dictionary containing the settings used for filling the gap. """ return self._fillkwargs @property def fillstatus(self) -> str: """ Returns the fill status of the gap. Returns ------- str The fill status, which can be one of the following: * 'unfilled' * 'failed gapfill' * 'successful gapfill' * 'partially successful gapfill' """ if self.records.isna().all() and not bool(self._fillkwargs): return _unfilled_label elif self.records.isna().all() and bool(self._fillkwargs): return _failed_label elif not self.records.isna().any() and bool(self._fillkwargs): return _successful_label elif self.records.isna().any() and bool(self._fillkwargs): return _partially_successful_label else: raise NotImplementedError( "This situation is unforeseen! Please notify developers." ) @property def start_datetime(self) -> pd.Timestamp: """Return the start datetime of the gap.""" return min(self.records.index) @property def end_datetime(self) -> pd.Timestamp: """Return the end datetime of the gap.""" return max(self.records.index) @copy_doc(gap_df) @property def df(self) -> pd.DataFrame: return gap_df(self) # ------------------------------------------ # Get info methods # ------------------------------------------
[docs] @log_entry def flag_can_be_filled(self, overwrite: bool = False) -> bool: """ Determine if the gap can be filled. By default, a gap can be filled if it is not already filled or if the previous gapfill method failed for the gap. A gap that is already filled can only be updated if the overwrite flag is set to True. Parameters ---------- overwrite : bool, optional If True, allows filling regardless of the current fill status. Default is False. Returns ------- bool True if the gap can be filled, False otherwise. """ if overwrite: return True if self.fillstatus in [_unfilled_label, _failed_label]: return True if self.fillstatus in [_partially_successful_label]: # required for sequential GF, this is the intuitive approach return True if self.fillstatus in [_successful_label]: return False else: raise NotImplementedError( "This situation is unforeseen! Please notify developers." )
[docs] @log_entry def get_info(self, printout: bool = True) -> Union[str, None]: """ Print or return detailed information about the Gap. Parameters ---------- printout : bool, optional If True, prints the information. If False, returns the information as a string. Default is True. Returns ------- str or None The gap information as a string if printout is False, otherwise None. """ infostr = "" infostr += printing.print_fmt_title("General info of Gap") infostr += printing.print_fmt_section("Gap details") infostr += printing.print_fmt_line( f"Gap of {self.obstype.name} for station: {self.stationname}", 0 ) infostr += printing.print_fmt_line( f"From {self.start_datetime} -> {self.end_datetime}", 1 ) infostr += printing.print_fmt_line( f"Duration gap: {self.end_datetime - self.start_datetime}", 1 ) infostr += printing.print_fmt_section("Gap filling details") infostr += printing.print_fmt_line(f"Gap status: {self.fillstatus}") infostr += printing.print_fmt_line("Gapfill settings used:") infostr += printing.print_fmt_dict(d=self.fillsettings, identlvl=2) if printout: print(infostr) else: return infostr
[docs] @log_entry def debiased_model_gapfill( self, sensordata: SensorData, modeltimeseries: ModelTimeSeries, leading_period_duration: Union[str, pd.Timedelta], min_leading_records_total: int, trailing_period_duration: Union[str, pd.Timedelta], min_trailing_records_total: int, max_gap_duration_to_fill: pd.Timedelta = pd.Timedelta("12h"), min_value=None, max_value=None, ) -> None: """ Fill the gaps using model data corrected for the bias. This method fills the gap using model data corrected for bias. The bias is estimated using a leading (before the gap) and trailing (after the gap) period. The bias is computed by combining the leading and trailing period, and comparing the model with the observations (not labeled as outliers). The model data is then interpolated to the missing records, and corrected with the estimated bias. Parameters ---------- sensordata : SensorData The corresponding SensorData used in the computation of the bias. Only the observations that are not labeled as outliers are used to compute the bias. modeltimeseries : ModelTimeSeries The model time series used to fill the gap records. The model data must be compatible (equivalent obstype and related to the same Station as the gap.) leading_period_duration : str or pandas.Timedelta The duration of the leading period. min_leading_records_total : int The minimum number of records required in the leading period. trailing_period_duration : str or pandas.Timedelta The duration of the trailing period. min_trailing_records_total : int The minimum number of records required in the trailing period. max_gap_duration_to_fill : pandas.Timedelta, optional The maximum gap duration of to fill with interpolation. The result is independent on the time-resolution of the gap. Defaults to 12 hours. min_value : float, optional Minimum allowed value for filled data. If provided, filled values below this threshold will be clipped to this value. Default is None (no minimum limit). 
max_value : float, optional Maximum allowed value for filled data. If provided, filled values above this threshold will be clipped to this value. Default is None (no maximum limit). Returns ---------- None. Notes ----- A schematic description of the debiased modeldata gap fill: #. Check the compatibility of the `ModelTimeSeries` with the `gap`. #. Construct a leading and trailing sample, and test if they meet the required conditions. #. Compute the bias of the modeldata (combine leading and trailing samples). #. Fill the gap records by using raw (interpolated) modeldata that is corrected by subtracting the bias. #. Clip filled values to the range [min_value, max_value] if specified. #. Update the `gap` attributes with the interpolated values, labels, and details. """ leading_period_duration = fmt_timedelta_arg(leading_period_duration) trailing_period_duration = fmt_timedelta_arg(trailing_period_duration) self._fillkwargs = { "applied_gapfill_method": "debias_model_gapfill", "leading_period_duration": leading_period_duration, "min_leading_records_total": min_leading_records_total, "trailing_period_duration": trailing_period_duration, "min_trailing_records_total": min_trailing_records_total, "max_gap_duration_to_fill": max_gap_duration_to_fill, "min_value": min_value, "max_value": max_value, } # 1. Check if the gap duration exceeds the max_gap_duration_to_fill gapsize_is_ok, setdetails = self.test_if_gf_is_suitable_with_gapsize( max_gap_duration_to_fill ) if not gapsize_is_ok: self._labels[:] = Settings.get( "label_def.failed_debias_modeldata_fill.label" ) self._extra_info[:] = setdetails return None # 2. 
Check validity of modeltimeseries is_compat, err_msg = gf_methods.check_if_modeltimeseries_is_compatible( gap=self, modeltimeseries=modeltimeseries, lp_duration=leading_period_duration, tp_duration=trailing_period_duration, ) if not is_compat: self._labels[:] = Settings.get( "label_def.failed_debias_modeldata_fill.label" ) self._extra_info[:] = err_msg logger.warning( f"Incompatible modeldata for debias_model_gapfill: \n{err_msg}" ) return None # 3. Construct and validity-test leading and trailing periods ( lead_period, trail_period, continueflag, ) = self._setup_lead_and_trail_for_debias_gapfill( sensordata=sensordata, fail_label=Settings.get("label_def.failed_debias_modeldata_fill.label"), leading_period_duration=leading_period_duration, min_leading_records_total=min_leading_records_total, trailing_period_duration=trailing_period_duration, min_trailing_records_total=min_trailing_records_total, ) if not continueflag: # warnings and gap attributes are already updated return None # 3. Fill the gap combdf = gf_methods.create_a_combined_df( leadseries=lead_period, trailseries=trail_period, gap=self ) # add modeldata to combdf combdf = gf_methods.add_modeldata_to_combdf( combineddf=combdf, modeltimeseries=modeltimeseries ) # Fill the missing records filleddf = fill_regular_debias( df=combdf, min_value=min_value, max_value=max_value ) filleddf = filleddf.loc[self.records.index] # subset to gap records # 4. Update attributes self._records = filleddf["fillvalue"].rename( "value" ) # set the new filled records # set labels self._labels.loc[self.records.notna()] = Settings.get( "label_def.debias_modeldata_fill.label" ) self._labels.loc[self.records.isna()] = Settings.get( "label_def.failed_debias_modeldata_fill.label" ) # update details self._extra_info = filleddf["msg"].rename("details")
[docs] @log_entry def diurnal_debiased_model_gapfill( self, sensordata: SensorData, modeltimeseries: ModelTimeSeries, leading_period_duration: pd.Timedelta, trailing_period_duration: pd.Timedelta, min_debias_sample_size: int, max_gap_duration_to_fill: pd.Timedelta = pd.Timedelta("12h"), min_value=None, max_value=None, ) -> None: """ Fill the gaps using model data corrected for the diurnal bias. This method fills the gap using model data corrected for its diurnal bias. The diurnal bias is a bias that is estimated for each timestamp in the leading and trailing period. All biases are averaged over hour, minute and second, to obtain a diurnal bias (for each timestamp). Parameters ---------- sensordata : SensorData The corresponding SensorData used in the computation of the bias. Only the observations that are not labeled as outliers are used to compute the bias. modeltimeseries : ModelTimeSeries The model time series used to fill the gap records. The model data must be compatible (equivalent obstype and related to the same Station as the gap.) leading_period_duration : pandas.Timedelta The duration of the leading period. That is the period before the gap, used for bias estimation. trailing_period_duration : pandas.Timedelta The duration of the trailing period. That is the period after the gap, used for bias estimation. min_debias_sample_size : int The minimum number of samples required for bias estimation. If this condition is not met, the gap is not filled. max_gap_duration_to_fill : pandas.Timedelta, optional The maximum gap duration of to fill with interpolation. The result is independent on the time-resolution of the gap. Defaults to 12 hours. min_value : float, optional Minimum allowed value for filled data. If provided, filled values below this threshold will be clipped to this value. Default is None (no minimum limit). max_value : float, optional Maximum allowed value for filled data. If provided, filled values above this threshold will be clipped to this value. 
Default is None (no maximum limit). Returns --------- None. Notes ----- A schematic description of the diurnal debiased modeldata gap fill: #. Check the compatibility of the `ModelTimeSeries` with the `gap`. #. Construct a leading and trailing sample, and test if they meet the required conditions. The required conditions are tested by testing the samplesizes per hour, minute and second for the leading + trailing periods. #. A diurnal bias is computed by grouping to hour, minute and second, and averaging the biases. #. Fill the gap records by using raw (interpolated) modeldata that is corrected by subtracting the coresponding diurnal bias. #. Clip filled values to the range [min_value, max_value] if specified. #. Update the `gap` attributes with the interpolated values, labels, and details. A suitable `min_debias_sample_size` depends on the sizes of the leading- and trailing periods, and also on the time resolution gap (=time resolution of the corresponding SensorData). References ---------- Jacobs A, et. al. (2024) `Filling gaps in urban temperature observations by debiasing ERA5 reanalysis data <https://doi.org/10.1016/j.uclim.2024.102226>`_ """ self._fillkwargs = { "applied_gapfill_method": "diurnal_debias_model_gapfill", "leading_period_duration": leading_period_duration, "trailing_period_duration": trailing_period_duration, "min_debias_sample_size": min_debias_sample_size, "max_gap_duration_to_fill": max_gap_duration_to_fill, "min_value": min_value, "max_value": max_value, } # 1. Check if the gap duration exceeds the max_gap_duration_to_fill gapsize_is_ok, setdetails = self.test_if_gf_is_suitable_with_gapsize( max_gap_duration_to_fill ) if not gapsize_is_ok: self._labels[:] = Settings.get( "label_def.failed_diurnal_debias_modeldata_fill.label" ) self._extra_info[:] = setdetails return None # 2. 
Check validity of modeltimeseries is_compat, err_msg = gf_methods.check_if_modeltimeseries_is_compatible( gap=self, modeltimeseries=modeltimeseries, lp_duration=leading_period_duration, tp_duration=trailing_period_duration, ) if not is_compat: self._labels[:] = Settings.get( "label_def.failed_diurnal_debias_modeldata_fill.label" ) self._extra_info[:] = err_msg logger.warning( f"Incompatible modeldata for diurnal_debias_model_gapfill: \n{err_msg}" ) return None # 3. Construct and validity-test leading and trailing periods ( lead_period, trail_period, continueflag, ) = self._setup_lead_and_trail_for_debias_gapfill( sensordata=sensordata, fail_label=Settings.get( "label_def.failed_diurnal_debias_modeldata_fill.label" ), leading_period_duration=leading_period_duration, min_leading_records_total=min_debias_sample_size, trailing_period_duration=trailing_period_duration, min_trailing_records_total=min_debias_sample_size, ) if not continueflag: # warnings and gap attributes are already been updated return None # 4. Fill the gap combdf = gf_methods.create_a_combined_df( leadseries=lead_period, trailseries=trail_period, gap=self ) # add modeldata to combdf combdf = gf_methods.add_modeldata_to_combdf( combineddf=combdf, modeltimeseries=modeltimeseries ) # Fill the missing records filleddf = fill_with_diurnal_debias( df=combdf, min_sample_size=int(min_debias_sample_size), min_value=min_value, max_value=max_value, ) filleddf = filleddf.loc[self.records.index] # subset to gap records # 4. Update attributes self._records = filleddf["fillvalue"].rename( "value" ) # set the new filled records # set labels self._labels.loc[self.records.notna()] = Settings.get( "label_def.diurnal_debias_modeldata_fill.label" ) self._labels.loc[self.records.isna()] = Settings.get( "label_def.failed_diurnal_debias_modeldata_fill.label" ) # update details self._extra_info = filleddf["msg"].rename("details")
[docs] @log_entry def weighted_diurnal_debiased_model_gapfill( self, sensordata: SensorData, modeltimeseries: ModelTimeSeries, leading_period_duration: pd.Timedelta, min_lead_debias_sample_size: int, trailing_period_duration: pd.Timedelta, min_trail_debias_sample_size: int, max_gap_duration_to_fill: pd.Timedelta = pd.Timedelta("12h"), min_value=None, max_value=None, ) -> None: """ Fill the gaps using a weighted sum of model data corrected for the diurnal bias and weights with respect to the start of the gap. This method fills the gap using model data corrected for its diurnal bias. The diurnal bias is a bias that is estimated for each timestamp in the leading and trailing period (separately). For both periods separately, all biases are averaged over hour, minute and second, to obtain a diurnal bias (for each timestamp). In addition, a normalized weight is computed for each gap record indicating the distance (in time) to the start and end of the gap. The correction applied on the interpolated (in time) model data is thus a weighted sum of corrections coming from both the leading and trailing period. Parameters ---------- sensordata : SensorData The corresponding SensorData used in the computation of the bias. Only the observations that are not labeled as outliers are used to compute the bias. modeltimeseries : ModelTimeSeries The model time series used to fill the gap records. The model data must be compatible (equivalent obstype and related to the same Station as the gap.) leading_period_duration : pandas.Timedelta The duration of the leading period. That is the period before the gap, used for bias estimation. min_lead_debias_sample_size : int The minimum number of leading samples required for bias estimation. If this condition is not met, the gap is not filled. trailing_period_duration : pandas.Timedelta The duration of the trailing period. That is the period after the gap, used for bias estimation. 
min_trail_debias_sample_size : int The minimum number of trailing samples required for bias estimation. If this condition is not met, the gap is not filled. max_gap_duration_to_fill : pandas.Timedelta, optional The maximum gap duration of to fill with interpolation. The result is independent on the time-resolution of the gap. Defaults to 12 hours. min_value : float, optional Minimum allowed value for filled data. If provided, filled values below this threshold will be clipped to this value. Default is None (no minimum limit). max_value : float, optional Maximum allowed value for filled data. If provided, filled values above this threshold will be clipped to this value. Default is None (no maximum limit). Returns -------- None. Notes ----- A schematic description of the weighted diurnal debiased modeldata gap fill: #. Check the compatibility of the `ModelTimeSeries` with the `gap`. #. Construct a leading and trailing sample, and test if they meet the required conditions. The required conditions are tested by testing the samplesizes per hour, minute and second for the leading and trailing periods (seperatly). #. A leading and trailing set of diurnal biases are computed by grouping to hour, minute and second, and averaging the biases. #. A weight is computed for each gap record, that is the normalized distance to the start and end of the gap. #. Fill the gap records by using raw (interpolated) modeldata is corrected by a weighted sum the coresponding diurnal bias for the lead and trail periods. #. Clip filled values to the range [min_value, max_value] if specified. #. Update the `gap` attributes with the interpolated values, labels, and details. A suitable `min_debias_sample_size` depends on the sizes of the leading- and trailing periods, and also on the time resolution gap (=time resolution of the corresponding SensorData). References ---------- Jacobs A, et. al. 
(2024) `Filling gaps in urban temperature observations by debiasing ERA5 reanalysis data <https://doi.org/10.1016/j.uclim.2024.102226>`_ """ self._fillkwargs = { "applied_gapfill_method": "weighted_diurnal_debias_model_gapfill", "leading_period_duration": leading_period_duration, "trailing_period_duration": trailing_period_duration, "min_lead_debias_sample_size": min_lead_debias_sample_size, "min_trail_debias_sample_size": min_trail_debias_sample_size, "max_gap_duration_to_fill": max_gap_duration_to_fill, "min_value": min_value, "max_value": max_value, } # 1. Check if the gap duration exceeds the max_gap_duration_to_fill gapsize_is_ok, setdetails = self.test_if_gf_is_suitable_with_gapsize( max_gap_duration_to_fill ) if not gapsize_is_ok: self._labels[:] = Settings.get( "label_def.failed_weighted_diurnal_debias_modeldata_fill.label" ) self._extra_info[:] = setdetails return None # 2. Check validity of modeltimeseries is_compat, err_msg = gf_methods.check_if_modeltimeseries_is_compatible( gap=self, modeltimeseries=modeltimeseries, lp_duration=leading_period_duration, tp_duration=trailing_period_duration, ) if not is_compat: self._labels[:] = Settings.get( "label_def.failed_weighted_diurnal_debias_modeldata_fill.label" ) self._extra_info[:] = err_msg logger.warning( f"Incompatible modeldata for weighted_diurnal_debias_model_gapfill: \n{err_msg}" ) return None # 3. Construct and validity-test leading and trailing periods ( lead_period, trail_period, continueflag, ) = self._setup_lead_and_trail_for_debias_gapfill( sensordata=sensordata, fail_label=Settings.get( "label_def.failed_weighted_diurnal_debias_modeldata_fill.label" ), leading_period_duration=leading_period_duration, min_leading_records_total=min_lead_debias_sample_size, trailing_period_duration=trailing_period_duration, min_trailing_records_total=min_trail_debias_sample_size, ) if not continueflag: # warnings and gap attributes are already been updated return None # 4. 
Fill the gap combdf = gf_methods.create_a_combined_df( leadseries=lead_period, trailseries=trail_period, gap=self ) # add modeldata to combdf combdf = gf_methods.add_modeldata_to_combdf( combineddf=combdf, modeltimeseries=modeltimeseries ) # Fill the missing records filleddf = fill_with_weighted_diurnal_debias( df=combdf, min_lead_sample_size=min_lead_debias_sample_size, min_trail_sample_size=min_trail_debias_sample_size, min_value=min_value, max_value=max_value, ) filleddf = filleddf.loc[self.records.index] # subset to gap records # 5. Update attributes self._records = filleddf["fillvalue"].rename( "value" ) # set the new filled records # set labels self._labels.loc[self.records.notna()] = Settings.get( "label_def.weighted_diurnal_debias_modeldata_fill.label" ) self._labels.loc[self.records.isna()] = Settings.get( "label_def.failed_weighted_diurnal_debias_modeldata_fill.label" ) # update details self._extra_info = filleddf["msg"].rename("details")
[docs] @log_entry def raw_model_gapfill( self, modeltimeseries: ModelTimeSeries, max_gap_duration_to_fill: pd.Timedelta = pd.Timedelta("12h"), min_value=None, max_value=None, ) -> None: """ Fill the gap using model data without correction. This method fills the gap by directly interpolating the model data to the missing records. Parameters ---------- modeltimeseries : ModelTimeSeries The model time series used to fill the gap records. The model data must be compatible (equivalent obstype and related to the same Station as the gap.) max_gap_duration_to_fill : pandas.Timedelta, optional The maximum gap duration of to fill with interpolation. The result is independent on the time-resolution of the gap. Defaults to 12 hours. min_value : float, optional Minimum allowed value for filled data. If provided, filled values below this threshold will be clipped to this value. Default is None (no minimum limit). max_value : float, optional Maximum allowed value for filled data. If provided, filled values above this threshold will be clipped to this value. Default is None (no maximum limit). Returns ------- None Notes ----- A schematic description of the raw model data gap fill: #. Check the compatibility of the `ModelTimeSeries` with the `gap`. #. Ensure both the `ModelTimeSeries` and `gap` have the same timezone. #. Interpolate the model data to match the missing records in the gap. #. Clip filled values to the range [min_value, max_value] if specified. #. Update the `gap` attributes with the interpolated values, labels, and details. """ self._fillkwargs = { "applied_gapfill_method": "raw_model_gapfill", "max_gap_duration_to_fill": max_gap_duration_to_fill, "min_value": min_value, "max_value": max_value, } # 1. 
Check if the gap duration exceeds the max_gap_duration_to_fill gapsize_is_ok, setdetails = self.test_if_gf_is_suitable_with_gapsize( max_gap_duration_to_fill ) if not gapsize_is_ok: self._labels[:] = Settings.get("label_def.failed_raw_modeldata_fill.label") self._extra_info[:] = setdetails return None # 2. Check validity of modeltimeseries is_compat, err_msg = gf_methods.check_if_modeltimeseries_is_compatible( gap=self, modeltimeseries=modeltimeseries, lp_duration=pd.Timedelta(0), tp_duration=pd.Timedelta(0), ) if not is_compat: self._labels.loc[self.records.isna()] = Settings.get( "label_def.failed_raw_modeldata_fill.label" ) self._extra_info.loc[self.records.isna()] = err_msg logger.warning(f"Incompatible modeldata for raw_model_gapfill: \n{err_msg}") return None modelseries = modeltimeseries.series gapseries = self.records # 3. Ensure both series have the same timezone if modelseries.index.tz != gapseries.index.tz: modelseries = modelseries.tz_convert(gapseries.index.tz) # 4. Fill the gap # 4. Reindex modelseries to match gapseries, interpolating if necessary modelseries_reindexed = ( pd.concat([modelseries, gapseries]) .sort_index() .interpolate(method="time", limit_area="inside") ) # duplicates are introduced when timestamps are both in modelseries and gapseries modelseries_reindexed = modelseries_reindexed[ ~modelseries_reindexed.index.duplicated(keep="first") ] # 5. 
Update attributes self._records = modelseries_reindexed.loc[ self.records.index ] # (save) set the new filled records # Apply min/max constraints if provided if min_value is not None: self._records = self._records.clip(lower=min_value) if max_value is not None: self._records = self._records.clip(upper=max_value) # set labels self._labels.loc[self.records.notna()] = Settings.get( "label_def.raw_modeldata_fill.label" ) self._labels.loc[self.records.isna()] = Settings.get( "label_def.failed_raw_modeldata_fill.label" ) # update details self._extra_info.loc[self.records.notna()] = ( f"Successful raw modeldata fill using {modeltimeseries.modelvariable} (but converted to {self.obstype.std_unit}) of {modeltimeseries.modelname}" ) self._extra_info.loc[self.records.isna()] = "Unsuccessful raw modeldata fill."
[docs] @log_entry def interpolate( self, sensordata: SensorData, method: str = "time", max_gap_duration_to_fill: pd.Timedelta = pd.Timedelta("3h"), n_leading_anchors: int = 1, n_trailing_anchors: int = 1, max_lead_to_gap_distance: Union[pd.Timedelta, None] = None, max_trail_to_gap_distance: Union[pd.Timedelta, None] = None, method_kwargs: dict = {}, ) -> None: """ Fill the gap using interpolation of SensorData. The gap is interpolated using the leading and trailing periods of the gap. One can select different interpolation methods. By using restrictions on the leading and trailing periods, one can ensure that the interpolation is only done when there are enough leading and trailing data available. Parameters ---------- sensordata : SensorData The corresponding SensorData used to interpolate the gap. method : str, optional Interpolation technique to use. See pandas.DataFrame.interpolate 'method' argument for possible values. Make sure that `n_leading_anchors`, `n_trailing_anchors` and `method_kwargs` are set accordingly to the method (higher order interpolation techniques require more leading and trailing anchors). The default is "time". max_gap_duration_to_fill : pandas.Timedelta, optional The maximum gap duration of to fill with interpolation. The result is independent on the time-resolution of the gap. Defaults to 3 hours. n_leading_anchors : int, optional The number of leading anchors to use for the interpolation. A leading anchor is a near record (not rejected by QC) just before the start of the gap, that is used for interpolation. Higher-order interpolation techniques require multiple leading anchors. Defaults to 1. n_trailing_anchors : int, optional The number of trailing anchors to use for the interpolation. A trailing anchor is a near record (not rejected by QC) just after the end of the gap, that is used for interpolation. Higher-order interpolation techniques require multiple leading anchors. Defaults to 1. 
max_lead_to_gap_distance : pandas.Timedelta or None, optional The maximum time difference between the start of the gap and a leading anchor(s). If None, no time restriction is applied on the leading anchors. The default is None. max_trail_to_gap_distance : pandas.Timedelta or None, optional The maximum time difference between the end of the gap and a trailing anchor(s). If None, no time restriction is applied on the trailing anchors. Defaults to None. method_kwargs : dict, optional Extra arguments that are passed to pandas.DataFrame.interpolate() structured in a dict. Defaults to {}. Notes ----- A schematic description: #. Get the leading and trailing periods of the gap. #. Check if the leading and trailing periods are valid. #. Create a combined DataFrame with the leading, trailing, and gap data. #. Interpolate the missing records using the specified method. #. Update the gap attributes with the interpolated values, labels, and details. Note ------ If you want to use a higher-order method of interpolation, make sure to increase the `n_leading_anchors` and `n_trailing_anchors` accordingly. For example, for a cubic interpolation, you need at least 2 leading and 2 trailing anchors. """ # store fill settings self._fillkwargs = { "applied_gapfill_method": "interpolation", "method": method, "n_leading_anchors": n_leading_anchors, "n_trailing_anchors": n_trailing_anchors, "max_lead_to_gap_distance": max_lead_to_gap_distance, "max_trail_to_gap_distance": max_trail_to_gap_distance, "max_gap_duration_to_fill": max_gap_duration_to_fill, **method_kwargs, } # 1. Check if the gap duration exceeds the max_gap_duration_to_fill gapsize_is_ok, setdetails = self.test_if_gf_is_suitable_with_gapsize( max_gap_duration_to_fill ) if not gapsize_is_ok: self._labels[:] = Settings.get("label_def.failed_interpolation_gap.label") self._extra_info[:] = setdetails return None # 2. 
Get leading period lead_period, continueflag, err_msg = gf_methods.get_leading_period( gap=self, sensordata=sensordata, n_records=n_leading_anchors, duration=max_lead_to_gap_distance, fixed_by_records=True, fixed_by_duration=False, ) if not continueflag: # Interpolation failed due to failing leading period self._labels[:] = Settings.get("label_def.failed_interpolation_gap.label") self._extra_info[:] = err_msg logger.warning( f"Cannot interpolate {self} because no valid leading period can be found." ) return None # 3. Get trailing period trail_period, continueflag, err_msg = gf_methods.get_trailing_period( gap=self, sensordata=sensordata, n_records=n_trailing_anchors, duration=max_trail_to_gap_distance, fixed_by_records=True, fixed_by_duration=False, ) if not continueflag: # Interpolation failed due to failing trailing period self._labels[:] = Settings.get("label_def.failed_interpolation_gap.label") self._extra_info[:] = err_msg logger.warning( f"Cannot interpolate {self} because no valid trailing period can be found." ) return None # 5. Combine the anchors with the observations combdf = gf_methods.create_a_combined_df( leadseries=lead_period, trailseries=trail_period, gap=self ) tofill_series = combdf["value"] # 4. Replace the NaN's (GAPFILLING) # Interpolate series tofill_series = tofill_series.interpolate( method=method, limit_area="inside", **method_kwargs, ) # Update attributes self._records = tofill_series.loc[ self.records.index ] # set the new filled records # set labels self._labels.loc[self.records.notna()] = Settings.get( "label_def.interpolated_gap.label" ) self._labels.loc[self.records.isna()] = Settings.get( "label_def.failed_interpolation_gap.label" ) # update details self._extra_info.loc[self.records.notna()] = "Successful interpolation" self._extra_info.loc[self.records.isna()] = ( "Unsuccessful interpolation, likely due to an error when calling pandas.Series.interpolate. See the error logs for further details." ) return None
# ------------------------------------------ # Helping methods # ------------------------------------------
[docs] @log_entry def flush_fill(self) -> None: """ Clear all fill information for this gap. This method resets the records, labels, extra info, and fillkwargs to their initial state. """ logger.debug(f"Flushing fill values of {self}") # 1. set nan for all records self._records.loc[:] = np.nan # 2. Convert all labels to 'gap' self._labels.loc[:] = Settings.get("label_def.regular_gap.label") # 3. Clears the extra data (per record) self._extra_info.loc[:] = "no details" # 4. Empty the fillkwargs self._fillkwargs = {}
def _setup_lead_and_trail_for_debias_gapfill(
    self,
    sensordata: SensorData,
    fail_label: str,
    leading_period_duration: pd.Timedelta,
    min_leading_records_total: int,
    trailing_period_duration: pd.Timedelta,
    min_trailing_records_total: int,
) -> tuple[Union[pd.Series, None], Union[pd.Series, None], bool]:
    """
    Build the leading and trailing periods needed by the debias gapfill methods.

    The leading period is requested first, then the trailing period. As soon
    as one of them cannot be constructed, all labels of this gap are set to
    `fail_label`, all per-record details are set to the error message
    returned by the helper, a warning is logged and the setup stops.

    Parameters
    ----------
    sensordata : SensorData
        The corresponding SensorData used to compute the bias.
    fail_label : str
        Label written to every gap record when the setup fails.
    leading_period_duration : pd.Timedelta
        The duration of the leading period.
    min_leading_records_total : int
        The minimum number of records required in the leading period.
    trailing_period_duration : pd.Timedelta
        The duration of the trailing period.
    min_trailing_records_total : int
        The minimum number of records required in the trailing period.

    Returns
    -------
    tuple of (pd.Series or None, pd.Series or None, bool)
        ``(leading period, trailing period, True)`` on success, or
        ``(None, None, False)`` when either period could not be built.
    """
    # Both helper calls use the same mode flags; keep them in one place.
    period_mode = {"fixed_by_records": False, "fixed_by_duration": True}

    # --- Leading period ---
    leading, leading_ok, leading_err = gf_methods.get_leading_period(
        gap=self,
        sensordata=sensordata,
        n_records=min_leading_records_total,
        duration=leading_period_duration,
        **period_mode,
    )
    if not leading_ok:
        # Mark the whole gap as failed and keep the reason per record.
        self._labels[:] = fail_label
        self._extra_info[:] = leading_err
        logger.warning(
            f"Cannot fill {self} because no valid leading period can be found."
        )
        return None, None, False

    # --- Trailing period ---
    trailing, trailing_ok, trailing_err = gf_methods.get_trailing_period(
        gap=self,
        sensordata=sensordata,
        n_records=min_trailing_records_total,
        duration=trailing_period_duration,
        **period_mode,
    )
    if not trailing_ok:
        # Mark the whole gap as failed and keep the reason per record.
        self._labels[:] = fail_label
        self._extra_info[:] = trailing_err
        logger.warning(
            f"Cannot fill {self} because no valid trailing period can be found."
        )
        return None, None, False

    logger.debug(f"Exiting _setup_lead_and_trail_for_debias_gapfill for {self}")
    return leading, trailing, True


def test_if_gf_is_suitable_with_gapsize(self, max_gapsize) -> tuple[bool, str]:
    """Tell whether this gap is short enough to be filled.

    The gap duration is taken as ``end_datetime - start_datetime`` and is
    compared against `max_gapsize`.

    Parameters
    ----------
    max_gapsize : pandas.Timedelta
        Maximum allowed gap duration. Gaps larger than this value cannot be
        filled.

    Returns
    -------
    tuple of (bool, str)
        ``(True, '')`` when the gap duration does not exceed `max_gapsize`;
        otherwise ``(False, detail_string)`` where *detail_string* describes
        why the gap is rejected. A warning is logged in the latter case.
    """
    gap_duration = self.end_datetime - self.start_datetime

    # Guard clause: anything that is not strictly larger is acceptable.
    if not (gap_duration > max_gapsize):
        return True, ""

    detailstring = f"Gap is too large ({gap_duration} ) to be filled with max_gapsize={max_gapsize}."
    logger.warning(
        f"Cannot fill {self} because the gap is too large (gapsize: {gap_duration} > {max_gapsize} : max_gapsize). Increase the max_gapsize or use another gapfill method."
    )
    return False, detailstring