# Source code for read_GHG

"""Functions for reading and processing LI-COR GHG high-frequency data files.

This module provides functionality to read and process high-frequency data from LI-COR GHG files,
specifically focusing on the SMARTFLUX system output. It includes tools for:

* Reading and extracting data from zipped GHG files
* Processing diagnostic values from LI-7200 gas analyzer
* Handling AGC (Automatic Gain Control) values and other diagnostic flags

Author: Ariane Faures
Created: October 5, 2021
"""

import zipfile
import glob
import pandas as pd
import os
import numpy as np

# %% List of functions

# [docs] (Sphinx source-link marker left over from HTML extraction; not code)
def read_GHG(raw_file, raw_format='ghg', unzip_path=None):
    """Read high-frequency data from a zipped LI-COR SMARTFLUX GHG file.

    The archive is extracted into ``unzip_path``, the ``*.data`` file it
    provides is parsed into two pandas DataFrames, and the ``*.data`` and
    ``*.metadata`` files extracted from this archive are deleted again
    before the function returns.

    Parameters
    ----------
    raw_file : str
        Path to the GHG file to process.
    raw_format : str, optional
        Format of the raw data file. Only ``'ghg'`` is supported.
    unzip_path : str, optional
        Directory where the GHG file is temporarily extracted. If None,
        the directory containing ``raw_file`` is used.

    Returns
    -------
    list
        ``[file_header, file_data, data_name, metadata_name]`` with:

        - file_header : pandas.DataFrame
          The first 6 lines of the data file, read without a header row.
        - file_data : pandas.DataFrame
          The high-frequency records; column names are taken from the
          line at zero-based index 7 of the data file.
        - data_name : list of str
          Paths of the ``*.data`` files extracted from the archive. The
          last entry is the file that was parsed.
        - metadata_name : list of str
          Paths of the ``*.metadata`` files extracted from the archive.

    Raises
    ------
    ValueError
        If ``raw_format`` is not ``'ghg'``.
    FileNotFoundError
        If no ``*.data`` file from the archive is found in ``unzip_path``.

    Notes
    -----
    The returned paths point to files that have already been removed; they
    are kept in the result so callers can still derive names from them.
    Files that were already present in ``unzip_path`` and do not belong to
    this archive are neither read nor deleted.
    """
    if raw_format != 'ghg':
        raise ValueError(
            "Unsupported raw_format {!r}: only 'ghg' is supported".format(raw_format)
        )

    # Honour the documented default: extract next to the GHG file.
    if unzip_path is None:
        unzip_path = os.path.dirname(os.path.abspath(raw_file))

    with zipfile.ZipFile(raw_file, 'r') as zip_ref:
        # Remember what this archive provides so that unrelated files sitting
        # in unzip_path are never parsed or deleted by mistake.
        archive_members = {os.path.basename(name) for name in zip_ref.namelist()}
        zip_ref.extractall(unzip_path)

    def _extracted(pattern):
        """Return paths in unzip_path matching *pattern* that came from the archive."""
        return [path for path in glob.glob(os.path.join(unzip_path, pattern))
                if os.path.basename(path) in archive_members]

    data_name = _extracted('*.data')
    metadata_name = _extracted('*.metadata')

    try:
        if not data_name:
            raise FileNotFoundError(
                "No '.data' file found in {!r} after extracting {!r}".format(
                    unzip_path, raw_file)
            )

        # The file is opened twice because the first read consumes the handle:
        # first the leading header lines, then the body with its column names.
        with open(data_name[-1], mode='r') as file:
            file_header = pd.read_table(file, nrows=6, header=None)
        with open(data_name[-1], mode='r') as file:
            file_data = pd.read_table(file, header=7)
    finally:
        # Clean up the temporary extraction even when parsing fails.
        for extracted_file in data_name + metadata_name:
            os.remove(extracted_file)

    return [file_header, file_data, data_name, metadata_name]




# Legacy version of read_diag_val with additional debugging output
# Currently unused - the production version is defined below
# def read_diag_val(data, data_name_short):
#     """Process diagnostic values from LI-7200 gas analyzer data.
# 
#     This function processes the diagnostic values from LI-7200 gas analyzer data,
#     converting binary diagnostic flags into meaningful status indicators for various
#     instrument components.
# 
#     Parameters
#     ----------
#     data : pandas.DataFrame
#         DataFrame containing the raw data with a 'Diagnostic Value' column
#     data_name_short : str
#         Short identifier for the data file being processed
# 
#     Returns
#     -------
#     pandas.DataFrame
#         DataFrame containing diagnostic counts/values for:
#         - AGC (Automatic Gain Control)
#         - Sync
#         - PLL
#         - Detector
#         - Chopper
#         - DeltaPressure
#         - Aux_input
#         - Tinlet
#         - Toutlet
#         - Head detect
#         - Anemometer Diagnostics
# 
#     Notes
#     -----
#     AGC values are compared against a reference of 100.05 and stored as means.
#     Other diagnostic flags are counted when they indicate an issue (value != 1).
#     """
#     print('Reading diagnostic data of files')
#     diag_val = data.loc[:,'Diagnostic Value'].copy() # Copy of the initial column: change!!!
# 
#     print('Starting to count the number of times a flag was raised')
# 
#     def int_to_binary_vectorized(arr):
#         """Convert integer array to binary strings with leading zeros.
# 
#         Parameters
#         ----------
#         arr : numpy.ndarray
#             Array of integers to convert
# 
#         Returns
#         -------
#         numpy.ndarray
#             Array of binary strings, each prefixed with '000'
#         """
#         int2binary = np.vectorize(lambda x: '000' + format(x, 'b'))
#         return int2binary(arr)
#     
#     # Example usage
#     arr = diag_val.to_numpy()
#     diag_val = int_to_binary_vectorized(arr)
#     
#     
#     def split_bin_diag_vectorized(diag_val,data_name_short):
#         """Process binary diagnostic values into component-specific flags.
# 
#         Parameters
#         ----------
#         diag_val : numpy.ndarray
#             Array of binary strings representing diagnostic values
#         data_name_short : str
#             Short identifier for the data file being processed
# 
#         Returns
#         -------
#         pandas.DataFrame
#             DataFrame containing diagnostic counts for each component
#         """
#         # Create the binary array from the input series
#         binary_array = np.array([list(x) for x in diag_val.astype(str)])
#         
#         # Create a mask for the AGC columns: 4 last bits, so columns of the array
#         agc_mask = binary_array[:,-4:].astype(int)
#         num_rows, num_cols = agc_mask.shape
#         # Use the bitwise left shift operator (<<) to combine the binary digits
#         # and get one number per row without looping over all the records
#         result = np.zeros(num_rows, dtype=np.int)
#         for i in range(num_cols):
#             result = result << 1
#             result = result | agc_mask[:, i]
#             
#         # Convert the agc diagnostic into its real value to compare it to the
#         # reference value (*6.67 according to the manual)
#         agc_column = (result * 6.67).astype(float) 
#         # Count the number of times it isn't the reference value (100.05) with
#         # a limit of tolerance
#         # agc_count = (~np.isclose(agc_column, 100.05)).sum()
#         agc_count = np.mean(agc_column) # put the mean value instead of the count
#         
#         # Create a mask for the remaining columns
#         other_mask = binary_array[:,3:-4].astype(int)
#         other_count = (~(other_mask == 1)).sum(axis=0)
#         
#         # Create the count dataframe from the counts. First are the agc counts
#         # and then the other counts but flipped since the bits must be read
#         # right to left to correspond to the variable names
#         diag_count = pd.DataFrame([agc_count] + list(np.flip(other_count)),
#                                   columns = [data_name_short[:-9]])
#         diag_count = diag_count.T
#         diag_count.columns = ['AGC', 'Sync', 'PLL', 'Detector', 
#                                            'Chopper', 'DeltaPressure',
#                                            'Aux_input', 'Tinlet', 'Toutlet', 
#                                            'Head detect']
#         
#         return diag_count
# 
#     diag_count = split_bin_diag_vectorized(diag_val,data_name_short)
#     diag_count['Anemometer Diagnostics'] = data.loc[:,'Anemometer Diagnostics'].sum()
#         
#         # print_progress(k, diag_val)
#     agc = pd.Series(((diag_count.iloc[:,0])<100).any())
#     agc.index= ['AGC']
#     diag_flag = pd.concat([agc,((diag_count.iloc[:,1:])!=0).any()])
#     if not((diag_count.loc[:,diag_flag]).empty):     
#         print('File ' + data_name_short)
#         print('Number and type of LI7200 diagnostic flags raised:')
#         print(diag_count.loc[:,diag_flag])
#    
#     return(diag_count)








# [docs] (Sphinx source-link marker left over from HTML extraction; not code)
def read_diag_val(data):
    """Summarise LI-7200 gas analyzer diagnostic values.

    Each diagnostic value is treated as a bit field: the 4 least-significant
    bits hold the AGC (Automatic Gain Control) reading and bits 4 to 12 hold
    one status flag each. A flag bit equal to 1 means "no issue"; every
    record where the bit is not 1 is counted as an issue for that component.
    Bits above bit 12 are ignored.

    Parameters
    ----------
    data : pandas.Series or numpy.ndarray
        One-dimensional series or array of diagnostic values. It is cast to
        integers with ``data.astype(int)``.

    Returns
    -------
    pandas.DataFrame
        Single-row DataFrame (index ``0``, all columns stored as float64)
        with one column per component:

        AGC
            Mean of the 4-bit AGC reading multiplied by 6.67
            (NaN when ``data`` is empty)
        Sync
            Count of synchronization issues (bit 4)
        PLL
            Count of Phase-Locked Loop issues (bit 5)
        Detector
            Count of detector issues (bit 6)
        Chopper
            Count of chopper wheel issues (bit 7)
        DeltaPressure
            Count of pressure differential issues (bit 8)
        Aux_input
            Count of auxiliary input issues (bit 9)
        Tinlet
            Count of inlet temperature issues (bit 10)
        Toutlet
            Count of outlet temperature issues (bit 11)
        Head detect
            Count of head detection issues (bit 12)
    """
    # Flag names ordered from bit 4 upwards.
    flag_names = ['Sync', 'PLL', 'Detector', 'Chopper', 'DeltaPressure',
                  'Aux_input', 'Tinlet', 'Toutlet', 'Head detect']
    first_flag_bit = 4

    diag_val = np.asarray(data.astype(int))

    # Work on the integer bits directly instead of on binary strings: strings
    # produced by format(x, 'b') change length with the magnitude of x, which
    # misaligns (or breaks) the per-component slices as soon as a high bit
    # such as "Head detect" is cleared.
    if diag_val.size:
        # AGC: lowest 4 bits, scaled by 6.67 (per the instrument manual).
        agc_mean = np.mean((diag_val & 0b1111) * 6.67)
    else:
        agc_mean = np.nan

    # One column per flag bit, one row per record.
    bit_positions = np.arange(first_flag_bit, first_flag_bit + len(flag_names))
    flag_bits = (diag_val[:, np.newaxis] >> bit_positions) & 1
    flag_counts = (flag_bits != 1).sum(axis=0)

    # dtype=float keeps the historical all-float64 layout of the result.
    diag_count = pd.DataFrame([[agc_mean, *flag_counts]],
                              columns=['AGC'] + flag_names,
                              dtype=float)

    return diag_count