# Source code for get_closest_value

"""Module for robust time series value interpolation.

This module provides functionality to find the closest value in a time
series to a target timestamp, with robust handling of missing values.
It implements:

- Exact timestamp matching when available
- Adaptive window median interpolation when exact match unavailable
- Progressive window size expansion until valid values found

Author
------
B. Heinesch
University of Liege, Gembloux Agro-Bio Tech
April, 2025
"""

import numpy as np
import datetime



# [docs]
def get_closest_value(df, target_timestamp):
    """Return the value closest to a target timestamp, skipping NaNs.

    If the series holds a non-NaN value exactly at ``target_timestamp``,
    that value is returned. Otherwise a window of rows centred on the row
    whose timestamp is nearest to the target is grown in steps of 10 rows
    until it contains at least one non-NaN value, and the median of the
    non-NaN values in that window is returned.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Time series with a datetime index. A single-column DataFrame is
        treated like the equivalent Series. May contain NaN values.
    target_timestamp : datetime.datetime
        Target timestamp to find the closest value for.

    Returns
    -------
    float
        One of:
        - the exact value if a timestamp match is found and is not NaN
        - the median of the non-NaN values in the smallest window
          (10, 20, 30, ... rows) that contains any
        - ``numpy.nan`` if the input is empty or holds no valid value

    Notes
    -----
    Window membership is decided by *row position* relative to the row
    with the nearest timestamp, not by elapsed time. When two rows are
    equally far away, the earlier row enters the window first.
    The final window always covers the whole series, so short series
    (fewer than 10 rows) and trailing rows are searched as well.
    """
    n_rows = len(df)
    if n_rows == 0:
        # Nothing to search; argmin would raise on an empty index.
        return np.nan

    values = df.to_numpy()
    if values.ndim == 2 and values.shape[1] == 1:
        # Single-column DataFrame: work on the column as a flat array so
        # that individual rows are scalars.
        values = values[:, 0]

    time_deltas = abs(df.index - target_timestamp)
    closest_index = time_deltas.argmin()

    # Exact timestamp match with a usable value: return it directly.
    if time_deltas.min() == datetime.timedelta():
        exact_value = values[closest_index]
        if not np.isnan(exact_value):
            return exact_value

    # Row positions ordered by distance from the closest row. The stable
    # sort keeps the lower position first on ties. This ordering does not
    # depend on the window size, so it is computed only once.
    step = 10
    positional_distance = np.abs(np.arange(n_rows) - closest_index)
    nearest_first = np.argsort(positional_distance, kind="stable")

    # The last window size is >= n_rows, so every row ends up examined.
    for window_size in range(step, n_rows + step, step):
        window_values = values[nearest_first[:window_size]]
        valid_values = window_values[~np.isnan(window_values)]
        if valid_values.size > 0:
            return np.median(valid_values)

    # Every value in the series is NaN.
    return np.nan