# Source code for micromet.format.merge

import pandas as pd
import numpy as np


# [docs]
def fillna_with_second_df(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    suffix1: str = '_df1',
    suffix2: str = '_df2'
) -> pd.DataFrame:
    """
    Outer-merge two DataFrames on their index, filling gaps in df1 from df2.

    For every column name present in both frames, the output holds df1's
    value where it is not NaN and df2's value otherwise. Columns that exist
    in only one frame are carried through unchanged. Rows are the union of
    both indices.

    Parameters
    ----------
    df1 : pd.DataFrame
        The primary DataFrame; its values take priority.
    df2 : pd.DataFrame
        The secondary DataFrame used to fill NaN values in the columns it
        shares with df1.
    suffix1 : str, optional
        Temporary suffix applied to df1's copy of each shared column during
        the merge. The default is '_df1'. It is removed from the output.
        It must not occur anywhere in a column name of either DataFrame.
    suffix2 : str, optional
        Temporary suffix applied to df2's copy of each shared column during
        the merge. The default is '_df2'. These columns are dropped from
        the output. It must not occur anywhere in a column name of either
        DataFrame.

    Returns
    -------
    pd.DataFrame
        The merged DataFrame, indexed by the union of both indices, with
        the original (unsuffixed) column names: df1's columns first, then
        the columns that exist only in df2.

    Raises
    ------
    ValueError
        If any column name in df1 or df2 already contains suffix1 or
        suffix2.

    Notes
    -----
    The two suffixes may overlap textually (for example '_left_x' and
    '_x'): shared columns are identified from the input column names, not
    by searching the merged column names for the suffix text.
    """
    # Reject inputs whose column names already contain either suffix, so the
    # temporary suffixed names created by the merge cannot collide with them.
    for label, frame in (("df1", df1), ("df2", df2)):
        if any(frame.columns.str.contains(suffix1, regex=False)) or \
           any(frame.columns.str.contains(suffix2, regex=False)):
            raise ValueError(
                f"Error: Columns in {label} already contain '{suffix1}' or '{suffix2}'. "
                "Please select different suffix values."
            )

    # Only column names present in both frames receive suffixes in an index
    # merge. Work from that explicit list instead of substring-matching the
    # merged names, which misclassifies columns when one suffix contains the
    # other.
    shared = df1.columns.intersection(df2.columns)

    merged = df1.merge(
        df2,
        left_index=True,
        right_index=True,
        how='outer',
        suffixes=(suffix1, suffix2),
    )

    # Coalesce: keep df1's value where present, otherwise take df2's.
    restore_names = {}
    secondary_cols = []
    for name in shared:
        primary = f"{name}{suffix1}"
        secondary = f"{name}{suffix2}"
        merged[primary] = merged[primary].fillna(merged[secondary])
        restore_names[primary] = name
        secondary_cols.append(secondary)

    # Discard df2's copies and give the coalesced columns their original names.
    merged = merged.drop(columns=secondary_cols)
    return merged.rename(columns=restore_names)