# Source code for micromet.format.transformers.columns

"""
Column naming and organization functions for the reformatter pipeline.

This module handles column renaming, prefix normalization, legacy format
updates, and column ordering operations.
"""

import logging
import re
from typing import Dict

import pandas as pd

def create_suffix_map(df, col_list, suffix):
    """
    Build a rename dictionary that appends ``suffix`` to selected columns.

    Only names from ``col_list`` that are present in ``df.columns`` are
    included; names that are absent from the DataFrame are silently dropped.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame whose columns are checked for membership.
    col_list : iterable
        Candidate column names to be suffixed.
    suffix : str
        Text appended to each retained column name.

    Returns
    -------
    dict
        Mapping of ``original name -> original name + suffix``, e.g.
        ``'CO2_SIGMA' -> 'CO2_SIGMA_1_1_1'`` for ``suffix='_1_1_1'``.
    """
    present = df.columns
    return {name: f"{name}{suffix}" for name in col_list if name in present}


# SoilVUE Depth/orientation conversion tables
# Depth in cm (as parsed from a legacy column name) -> index written into the
# modernised name. Depths missing from this table are left un-renamed by
# modernize_soil_legacy.
_DEPTH_MAP = {5: 1, 10: 2, 20: 3, 30: 4, 40: 5, 50: 6, 60: 7, 75: 8, 100: 9}
# Orientation letter from the legacy name -> number written into the
# modernised name (modernize_soil_legacy stores it as ``replic``).
_ORIENT_MAP = {"N": 3, "S": 4}
# Legacy column layout: <prefix>_<depth>cm_<N|S>_<anything>, matched
# case-insensitively, with named groups ``prefix``, ``depth`` and ``orient``.
_LEGACY_RE = re.compile(
    r"^(?P<prefix>(SWC|TS|EC|K|T))_(?P<depth>\d{1,3})cm_(?P<orient>[NS])_.*$",
    re.IGNORECASE,
)
# Leading-prefix regex -> replacement prefix, applied by normalize_prefixes
# (first matching pattern wins).
_PREFIX_PATTERNS: Dict[re.Pattern[str], str] = {
    re.compile(r"^BulkEC_", re.IGNORECASE): "EC_",
    re.compile(r"^VWC_", re.IGNORECASE): "SWC_",
    re.compile(r"^Ka_", re.IGNORECASE): "K_",
}


def rename_columns(
    df: pd.DataFrame, data_type: str, config: dict, logger: logging.Logger
) -> pd.DataFrame:
    """
    Rename DataFrame columns based on configuration and standardize their names.

    The steps are, in order: strip surrounding whitespace from the column
    labels, apply the configured rename map, normalize soil/temperature
    prefixes, modernize legacy soil sensor names, and upper-case every label.
    The caller's DataFrame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame with columns to be renamed.
    data_type : str
        ``'eddy'`` selects ``config['renames_eddy']``; any other value
        selects ``config['renames_met']``.
    config : dict
        The configuration dictionary containing the renaming maps. A missing
        or empty/``None`` map means no configured renames are applied.
    logger : logging.Logger
        The logger for tracking the renaming process.

    Returns
    -------
    pd.DataFrame
        A DataFrame with renamed and standardized (upper-case) column names.
    """
    map_key = "renames_eddy" if data_type == "eddy" else "renames_met"
    # ``or {}`` also covers a key that is present but set to None, which
    # would otherwise make ``DataFrame.rename`` raise.
    mapping = config.get(map_key) or {}
    logger.debug(f"Renaming columns from {df.columns} to {mapping}")

    # Assigning to ``df.columns`` mutates the object it is called on; use a
    # shallow copy so the caller's frame keeps its original labels.
    df = df.copy(deep=False)
    df.columns = df.columns.str.strip()

    df = df.rename(columns=mapping)
    df = normalize_prefixes(df, logger)
    df = modernize_soil_legacy(df, logger)
    df.columns = df.columns.str.upper()

    # Report the number of columns (``len(df)`` would be the row count).
    logger.debug("Len of renamed cols %d", len(df.columns))
    return df
# Temperature columns of the form ``T_<depth>cm_...``; compiled once instead
# of on every column inspected.
_T_DEPTH_RE = re.compile(r"^T_\d{1,3}cm_", re.IGNORECASE)


def normalize_prefixes(df: pd.DataFrame, logger: logging.Logger) -> pd.DataFrame:
    """
    Normalize column name prefixes for soil and temperature measurements.

    Each column is tested against the patterns in ``_PREFIX_PATTERNS`` and the
    first matching prefix is replaced (for example ``'BulkEC_'`` becomes
    ``'EC_'``). Columns that match none of those patterns but look like
    ``T_<depth>cm_...`` get their ``T_`` prefix replaced with ``Ts_``.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame with columns to be normalized.
    logger : logging.Logger
        The logger for tracking the normalization process.

    Returns
    -------
    pd.DataFrame
        The DataFrame with normalized column name prefixes. When nothing
        needs renaming the input object itself is returned.
    """
    rename_map: Dict[str, str] = {}
    for col in df.columns:
        for patt, repl in _PREFIX_PATTERNS.items():
            if patt.match(col):
                rename_map[col] = patt.sub(repl, col)
                break
        else:
            # No table pattern matched; fall back to the temperature rule.
            if _T_DEPTH_RE.match(col):
                rename_map[col] = re.sub(r"^T_", "Ts_", col, flags=re.IGNORECASE)

    if rename_map:
        logger.debug("Prefix normalisation: %s", rename_map)
        df = df.rename(columns=rename_map)

    # Report the number of columns (``len(df)`` would be the row count).
    logger.debug("Len of normalized prefix cols %d", len(df.columns))
    return df
def modernize_soil_legacy(df: pd.DataFrame, logger: logging.Logger) -> pd.DataFrame:
    """
    Update legacy soil sensor column names to a standardized format.

    A column is treated as legacy when it matches ``_LEGACY_RE``
    (``<prefix>_<depth>cm_<N|S>_...``). Such a column is renamed to
    ``<PREFIX>_<orientation number>_<depth index>_1`` using ``_ORIENT_MAP``
    and ``_DEPTH_MAP``; a ``T`` prefix is written as ``TS``. Columns whose
    depth is not in ``_DEPTH_MAP`` are left unchanged.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame with legacy soil sensor column names.
    logger : logging.Logger
        The logger for tracking the modernization process.

    Returns
    -------
    pd.DataFrame
        The DataFrame with updated soil sensor column names. When no column
        matches, the input object itself is returned.
    """
    renames: Dict[str, str] = {}
    for legacy_name in df.columns:
        hit = _LEGACY_RE.match(legacy_name)
        if hit is None:
            continue

        slot = _DEPTH_MAP.get(int(hit.group("depth")))
        if slot is None:
            # Depth has no entry in the conversion table: keep the old name.
            continue

        kind = hit.group("prefix").upper()
        kind = "TS" if kind == "T" else kind
        replicate = _ORIENT_MAP[hit.group("orient").upper()]
        renames[legacy_name] = f"{kind}_{replicate}_{slot}_1"

    if not renames:
        return df

    logger.info(f"Legacy soil columns modernised: {renames}")
    return df.rename(columns=renames)
def make_unique(cols):
    """
    Make a list of column names unique by appending numeric suffixes to duplicates.

    Every name is converted with ``str``. The first occurrence of a name is
    kept unchanged; later occurrences receive ``.1``, ``.2``, ... in order.
    A suffix is skipped when the resulting name is already present in the
    output (e.g. ``['a', 'a', 'a.1']`` becomes ``['a', 'a.1', 'a.1.1']``), so
    the returned names are always distinct.

    Parameters
    ----------
    cols : iterable
        The column names; items may be any type accepted by ``str``.

    Returns
    -------
    list
        A list of unique column names, in the same order as the input.
    """
    last_suffix = {}  # base name -> highest numeric suffix handed out so far
    taken = set()     # every name already emitted
    out = []
    for raw in cols:
        name = str(raw)
        if name not in taken:
            taken.add(name)
            out.append(name)
            continue

        # Advance past suffixes that would collide with an emitted name.
        n = last_suffix.get(name, 0) + 1
        candidate = f"{name}.{n}"
        while candidate in taken:
            n += 1
            candidate = f"{name}.{n}"

        last_suffix[name] = n
        taken.add(candidate)
        out.append(candidate)
    return out
def make_unique_cols(df: pd.DataFrame) -> pd.DataFrame:
    """
    Ensure that all column names in a DataFrame are unique.

    The frame is copied first, then its column labels are replaced with the
    result of ``make_unique``, which appends numeric suffixes to repeated
    names. The input DataFrame is left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame.

    Returns
    -------
    pd.DataFrame
        A copy of the DataFrame with de-duplicated column names.
    """
    deduped = df.copy()
    deduped.columns = make_unique(deduped.columns)
    return deduped
def col_order(df: pd.DataFrame, logger: logging.Logger) -> pd.DataFrame:
    """
    Reorder DataFrame columns to place the timestamp columns at the beginning.

    ``TIMESTAMP_END`` and then ``TIMESTAMP_START`` are each moved to position
    0 when present, so a frame containing both ends up with
    ``TIMESTAMP_START`` first and ``TIMESTAMP_END`` second. The move is done
    in place on ``df``, and that same object is returned.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame; modified in place.
    logger : logging.Logger
        The logger for tracking the reordering process.

    Returns
    -------
    pd.DataFrame
        The same DataFrame object with its columns reordered.
    """
    for name in ("TIMESTAMP_END", "TIMESTAMP_START"):
        if name not in df.columns:
            continue
        # Remove the column, then put it back at the front.
        values = df.pop(name)
        df.insert(0, name, values)

    logger.debug(f"Column Order: {df.columns}")
    return df
# Names exported by ``from ... import *``.
__all__ = [
    "rename_columns",
    "normalize_prefixes",
    "modernize_soil_legacy",
    "make_unique",
    "make_unique_cols",
    "col_order",
]