Source code for crunchers.pandas_helpers.transformations

"""Provide functions for performing non-standard-ish column-wise transformations."""
import itertools as itr

import pandas as pd
import numpy as np

def apply_ignore_null(func, s, fillwith=None):
    """Apply `func` to `s` while keeping its null entries null.

    The nulls in `s` are temporarily replaced by `fillwith` so that `func`
    receives a fully populated series; afterwards every position that was
    null in `s` is set back to null in the result.

    Args:
        func: Callable taking a pandas.Series and returning something that
            `pd.Series(..., index=s.index)` can wrap (a Series or an
            array-like of the same length).
        s: The pandas.Series to transform.
        fillwith: Placeholder used for the nulls while `func` runs. If None,
            `s.min()` is used. You may prefer the mean or median, e.g.
            `apply_ignore_null(func, s, fillwith=np.mean(s))`.

    Returns:
        A pandas.Series with the same index, in the same order, as `s`:
        null wherever `s` was null and the transformed value everywhere else.
    """
    if fillwith is None:
        fillwith = s.min()

    # Plain boolean array so the mask is applied by position, which also
    # works when the index contains duplicate labels.
    null_mask = s.isnull().values

    applied = pd.Series(func(s.fillna(fillwith)), index=s.index)

    # Masking in place keeps the original row order; concatenating the
    # non-null and null parts would move every null row to the end.
    return applied.where(~null_mask)
def apply_pairwise(series, func):
    """Apply `func` to every unordered pair of items in `series`.

    Args:
        series: A pandas.Series; its index labels become both the row and
            the column labels of the result.
        func: Callable invoked as `func([a, b])`, i.e. with a two-element
            list holding the values of one pair.

    Returns:
        A square pandas.DataFrame. For each pair of labels `(row, col)`
        where `row` comes before `col` in `series`, the cell at
        `[row, col]` holds `func([series[row], series[col]])`. All other
        cells (the diagonal and the mirrored half) are left as NaN.
    """
    names = series.index.values
    df = pd.DataFrame(index=names, columns=names)

    for idx, col in itr.combinations(names, 2):
        # Single-step .loc write: chained `df[col][idx] = ...` may assign
        # into a temporary and never reach `df`.
        df.loc[idx, col] = func([series.loc[idx], series.loc[col]])

    return df
# Data Scaling
def zero_stdv_columns(df):
    """Return the labels of the columns of `df` whose standard deviation is 0.

    The result is the subset of `df.columns` (a pandas Index) selected by
    comparing `df.std()` with zero.
    """
    is_constant = df.std().eq(0)
    return df.columns[is_constant]
def std_scale(df):
    """Return a standard-scaled copy of `df`, tolerating zero-stdev columns.

    Each column is centered on its mean and divided by its standard
    deviation. Columns reported by `zero_stdv_columns` are set to 0 in the
    result instead of keeping the outcome of the division by zero.
    """
    constant_cols = zero_stdv_columns(df)

    centered = df - df.mean()
    result = (centered / df.std()).copy()

    # Overwrite the zero-stdev columns, whose division result is not usable.
    result.loc[:, constant_cols] = 0
    return result
def robust_scale(df):
    """Return a copy of `df` scaled as (df - df.median()) / MAD(df).

    MAD is the median absolute deviation: the per-column median of the
    absolute values of the median-centered data.
    """
    centered = df.sub(df.median())
    abs_dev_median = centered.abs().median()
    return centered.div(abs_dev_median)