Public
Snippet $8 authored by joshuamosesb

Anonymize the values of pandas data-frame column(s) with a prefix & hash-code, so that the values stay consistent across all reference-columns in other tables, & store the anonym-value >> original-key pairs for each column to allow column-level reverse lookup.

Edited
df_col_anonymize_util.py
def get_hash(prefix, val):
    """Return a deterministic integer hash code for ``prefix`` and ``val``.

    The built-in ``hash()`` is salted per interpreter process for ``str``
    input (see ``PYTHONHASHSEED``), so it yields different codes on every
    run. A SHA-256 digest of the same ``'<prefix>-<val>'`` text is used
    instead, so equal inputs always map to the same code, across runs and
    across machines.

    Note: the digest is unsalted, so low-entropy values (e.g. sequential
    ids) can be recovered by brute force; treat the result as a stable
    pseudonym, not as a secret.

    Args:
        prefix: Label mixed into the hashed text.
        val: Value to hash; rendered with ``str.format``.

    Returns:
        int: Non-negative integer built from the first 16 hex digits
        (64 bits) of the SHA-256 digest.
    """
    # Function-scope import keeps this block self-contained; the module is
    # cached by the interpreter after the first call.
    import hashlib

    text = '{}-{}'.format(prefix, val)
    hex_digest = hashlib.sha256(text.encode('utf-8')).hexdigest()
    return int(hex_digest[:16], 16)


def get_anonym_col(in_df, col_nm, prefix):
    """Return column ``col_nm`` of ``in_df`` with its values anonymized.

    Every distinct value of the column is mapped to the text
    ``'<prefix>-<hash>'``, where ``<hash>`` comes from ``get_hash``. The
    input frame is not modified; a new series is returned.

    Args:
        in_df: Data frame holding the column.
        col_nm: Name of the column to anonymize.
        prefix: Label placed in front of each hash code.

    Returns:
        The result of applying the mapping to ``in_df[col_nm]``; a value
        that is not found in the mapping becomes ``'Def-<prefix>'``.
    """
    fallback = 'Def-{}'.format(prefix)
    replacements = {}
    for original in in_df[col_nm].unique():
        replacements[original] = '{}-{}'.format(prefix, get_hash(prefix, original))
    return in_df[col_nm].apply(lambda cell: replacements.get(cell, fallback))

# Column name -> prefix used for that column's anonymized values.
# Several column names share one prefix; get_anonym_df falls back to
# 'Anonym-<column>' for any column that is not listed here.
anonym_col_map = {
    column: prefix
    for prefix, columns in (
        ('College', ('profile6', 'college_name')),
        ('UserID', ('student_id', 'studentid', 'userid', 'user_id')),
        ('UName', ('username', 'name')),
        ('email', ('email',)),
        ('CMID', ('cmid',)),
    )
    for column in columns
}


def get_anonym_df(in_df, cols, data_file_key='Dummy', base_dir='.'):
    """Anonymize the given columns of ``in_df`` and save each column's mapping.

    For every column in ``cols`` each distinct value is replaced by
    ``'<prefix>-<hash>'``. The prefix is looked up in ``anonym_col_map`` and
    defaults to ``'Anonym-<column>'``. The original-value -> anonymized-value
    mapping of each column is written to
    ``<base_dir>/hmap_<data_file_key>_<column>.csv`` for reverse lookup.

    Args:
        in_df: Data frame to anonymize. It is modified in place.
        cols: Iterable of column names to anonymize.
        data_file_key: Tag embedded in the mapping file names.
        base_dir: Directory the mapping files are written to.

    Returns:
        The same ``in_df`` object, with the listed columns replaced.
    """
    # Start the timer here: the original read ``st_time`` without ever
    # assigning it inside the function.
    st_time = dt.now()
    for col_nm in cols:
        prefix = anonym_col_map.get(col_nm, 'Anonym-{}'.format(col_nm))
        fallback = 'Def-{}'.format(prefix)
        col_amap = {
            orig_val: '{}-{}'.format(prefix, get_hash(prefix, orig_val))
            for orig_val in in_df[col_nm].unique()
        }
        # Replace the column as a whole. Writing through ``.loc[:, col_nm]``
        # sets values into the existing column's dtype, which newer pandas
        # versions warn about (and are slated to reject) when string labels
        # are written into a numeric column.
        in_df[col_nm] = in_df[col_nm].apply(lambda x: col_amap.get(x, fallback))
        hmap_path = '{}/hmap_{}_{}.csv'.format(base_dir, data_file_key, col_nm)
        hash_df = pd.DataFrame.from_dict(col_amap, orient='index')
        hash_df.to_csv(hmap_path)
        # Elapsed time is cumulative since the start of this call.
        logger.debug("@ get_anonym_df: {}s  hmap & colums:: {} ".format((dt.now() - st_time).seconds, [hmap_path]))
    return in_df