def get_hash(prefix, val):
    """Return the built-in ``hash`` of the string ``"<prefix>-<val>"``.

    Args:
        prefix: Label placed before the dash; rendered with ``str.format``.
        val: Value placed after the dash; rendered with ``str.format``.

    Returns:
        The integer produced by ``hash()`` for the combined string.

    NOTE(review): on Python 3, ``str`` hashes are salted per interpreter
    process unless ``PYTHONHASHSEED`` is fixed, so the result is only
    guaranteed to be stable within a single run.
    """
    salted_key = '{}-{}'.format(prefix, val)
    return hash(salted_key)
def get_anonym_col(in_df, col_nm, prefix):
    """Return column ``col_nm`` of ``in_df`` with each value replaced by a token.

    Every distinct value in the column is mapped to
    ``"<prefix>-<get_hash(prefix, value)>"``. The input frame is not
    modified; a new Series is returned.

    Args:
        in_df: DataFrame holding the column to anonymize.
        col_nm: Name of the column to anonymize.
        prefix: Label used both as the token prefix and as hash input.

    Returns:
        The Series produced by applying the value -> token lookup to
        ``in_df[col_nm]``.
    """
    token_for = {
        raw: '{}-{}'.format(prefix, get_hash(prefix, raw))
        for raw in in_df[col_nm].unique()
    }

    def _to_token(raw):
        # Values that fail the dict lookup fall back to a fixed
        # "Def-<prefix>" token instead of raising.
        return token_for.get(raw, 'Def-{}'.format(prefix))

    return in_df[col_nm].apply(_to_token)
# Maps a source column name to the prefix used for its anonymized tokens.
# Several column-name aliases share one prefix; they are grouped here by
# prefix and flattened into a plain ``{column_name: prefix}`` dict.
anonym_col_map = {
    alias: token_prefix
    for token_prefix, aliases in (
        ('College', ('profile6', 'college_name')),
        ('UserID', ('student_id', 'studentid', 'userid', 'user_id')),
        ('UName', ('username', 'name')),
        ('email', ('email',)),
        ('CMID', ('cmid',)),
    )
    for alias in aliases
}
def get_anonym_df(in_df, cols, data_file_key='Dummy', base_dir='.'):
    """Anonymize the listed columns of ``in_df`` in place and save each mapping.

    For every column in ``cols`` the distinct values are replaced by
    ``"<prefix>-<get_hash(prefix, value)>"`` tokens, where the prefix comes
    from ``anonym_col_map`` (falling back to ``"Anonym-<col_nm>"``). The
    value -> token mapping of each column is written to
    ``<base_dir>/hmap_<data_file_key>_<col_nm>.csv``.

    Args:
        in_df: DataFrame to anonymize. It is modified in place.
        cols: Iterable of column names to anonymize.
        data_file_key: Tag embedded in the mapping file names.
        base_dir: Directory the mapping CSV files are written to.

    Returns:
        The same ``in_df`` object, with the listed columns replaced.

    NOTE(review): the mapping CSVs contain the original values next to
    their tokens, so anyone holding those files can reverse the
    anonymization.
    """
    # The debug line below reports elapsed time; the start stamp must be
    # taken here, otherwise ``st_time`` is undefined inside this function.
    st_time = dt.now()

    for col_nm in cols:
        prefix = anonym_col_map.get(col_nm, 'Anonym-{}'.format(col_nm))
        col_amap = {
            val: '{}-{}'.format(prefix, get_hash(prefix, val))
            for val in in_df[col_nm].unique()
        }

        # Values that fail the dict lookup fall back to "Def-<prefix>".
        in_df.loc[:, col_nm] = in_df[col_nm].apply(
            lambda x: col_amap.get(x, 'Def-{}'.format(prefix))
        )

        # Build the output path once so the file written and the file
        # reported in the log can never drift apart.
        hmap_path = '{}/hmap_{}_{}.csv'.format(base_dir, data_file_key, col_nm)
        hash_df = pd.DataFrame.from_dict(col_amap, orient='index')
        hash_df.to_csv(hmap_path)

        logger.debug("@ get_anonym_df: {}s hmap & colums:: {} ".format((dt.now() - st_time).seconds, [hmap_path]))

    return in_df