Source code for micromet.format.merge

import pandas as pd
import numpy as np

def fillna_with_second_df(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    suffix1: str = '_df1',
    suffix2: str = '_df2'
) -> pd.DataFrame:
    """
    Outer-merge two DataFrames on their index, preferring values from df1.

    Columns that exist in both frames are coalesced: df1's value is kept
    where it is present, and df2's value fills any position where df1 is
    NaN (including rows that exist only in df2). Columns that exist in
    only one frame are carried through unchanged.

    Parameters
    ----------
    df1 : pd.DataFrame
        The primary DataFrame whose values are prioritized.
    df2 : pd.DataFrame
        The secondary DataFrame used to fill NaN values in the columns it
        shares with df1.
    suffix1 : str, optional
        Temporary suffix applied to df1's shared columns during the merge.
        The default is '_df1'. It is removed from the output. It must not
        occur as a substring of any column name in either DataFrame.
    suffix2 : str, optional
        Temporary suffix applied to df2's shared columns during the merge.
        The default is '_df2'. These columns are dropped from the output.
        It must not occur as a substring of any column name in either
        DataFrame.

    Returns
    -------
    pd.DataFrame
        A merged DataFrame indexed by the union of both indices. Shared
        columns keep their original (unsuffixed) names.

    Raises
    ------
    ValueError
        If any column name in df1 or df2 already contains `suffix1` or
        `suffix2`, or if the two suffixes are identical while the frames
        share at least one column.

    Notes
    -----
    Columns are matched between the two DataFrames by exact name.
    """
    # Reject inputs whose column names already contain either suffix, so
    # the temporary suffixed labels created by the merge cannot collide
    # with a real column.
    for label, frame in (("df1", df1), ("df2", df2)):
        if any(frame.columns.str.contains(suffix1, regex=False)) or \
                any(frame.columns.str.contains(suffix2, regex=False)):
            raise ValueError(
                f"Error: Columns in {label} already contain '{suffix1}' or '{suffix2}'. "
                "Please select different suffix values."
            )

    # Only columns present in both frames receive suffixes in the merge.
    shared_cols = [col for col in df1.columns if col in df2.columns]

    if shared_cols and suffix1 == suffix2:
        # Identical suffixes would give df1's and df2's copies of a shared
        # column the same label, making the coalescing step ambiguous.
        raise ValueError(
            f"Error: suffix1 and suffix2 are both '{suffix1}'. "
            "Please select different suffix values."
        )

    mergedat = df1.merge(
        df2,
        left_index=True,
        right_index=True,
        how='outer',
        suffixes=(suffix1, suffix2),
    )

    # Build the exact suffixed labels rather than searching the merged
    # columns by substring: substring matching misclassifies columns when
    # one suffix contains the other (e.g. '_df' and '_df2').
    df1_cols = [f"{col}{suffix1}" for col in shared_cols]
    df2_cols = [f"{col}{suffix2}" for col in shared_cols]

    # Coalesce values: fill df1's NaN with df2's values.
    for col1, col2 in zip(df1_cols, df2_cols):
        mergedat[col1] = mergedat[col1].fillna(mergedat[col2])

    mergedat = mergedat.drop(columns=df2_cols)
    mergedat = mergedat.rename(columns=dict(zip(df1_cols, shared_cols)))
    return mergedat