def get_hash(prefix, val):
    """Return the built-in ``hash`` of the string ``'<prefix>-<val>'``.

    NOTE(review): on Python 3 the built-in ``hash()`` of a ``str`` is salted
    per interpreter process unless ``PYTHONHASHSEED`` is fixed, so the values
    returned here are only stable within a single run. The hash map CSV
    written by ``get_anonym_df`` records the mapping produced by that run.
    """
    return hash('{}-{}'.format(prefix, val))


def _build_anonym_map(values, prefix):
    """Map every value in *values* to its ``'<prefix>-<hash>'`` token."""
    return {
        val: '{}-{}'.format(prefix, get_hash(prefix, val))
        for val in values
    }


def get_anonym_col(in_df, col_nm, prefix):
    """Return an anonymized copy of column *col_nm*; *in_df* is not modified.

    Args:
        in_df: DataFrame holding the column.
        col_nm: Name of the column to anonymize.
        prefix: Label placed in front of each hash in the generated token.

    Returns:
        A Series in which every value is replaced by ``'<prefix>-<hash>'``.
        Values that miss the lookup become ``'Def-<prefix>'``.
    """
    col_amap = _build_anonym_map(in_df[col_nm].unique(), prefix)
    fallback = 'Def-{}'.format(prefix)
    return in_df[col_nm].apply(lambda x: col_amap.get(x, fallback))


# Column name -> token prefix. Columns not listed here fall back to
# 'Anonym-<column name>' inside get_anonym_df.
anonym_col_map = {
    'profile6': 'College',
    'college_name': 'College',
    'student_id': 'UserID',
    'studentid': 'UserID',
    'userid': 'UserID',
    'user_id': 'UserID',
    'username': 'UName',
    'name': 'UName',
    'email': 'email',
    'cmid': 'CMID'
}


def get_anonym_df(in_df, cols, data_file_key='Dummy', base_dir='.'):
    """Anonymize the columns *cols* of *in_df* in place and dump the hash maps.

    For each column the original-value -> token mapping is written to
    ``<base_dir>/hmap_<data_file_key>_<column>.csv``.

    Args:
        in_df: DataFrame to anonymize; its columns are overwritten.
        cols: Iterable of column names to anonymize.
        data_file_key: Tag embedded in the hash map file names.
        base_dir: Directory that receives the hash map CSV files.

    Returns:
        The same ``in_df`` object, with the listed columns replaced by tokens.
    """
    # The elapsed time logged below needs a start point; the name was
    # previously read without ever being assigned inside this function.
    st_time = dt.now()
    for col_nm in cols:
        prefix = anonym_col_map.get(col_nm, 'Anonym-{}'.format(col_nm))
        col_amap = _build_anonym_map(in_df[col_nm].unique(), prefix)
        fallback = 'Def-{}'.format(prefix)
        # Replace the whole column rather than writing through .loc[:, col]:
        # the in-place form tries to keep the old dtype, which does not work
        # when a numeric ID column is overwritten with string tokens.
        in_df[col_nm] = in_df[col_nm].apply(
            lambda x: col_amap.get(x, fallback))
        hmap_path = '{}/hmap_{}_{}.csv'.format(base_dir, data_file_key, col_nm)
        hash_df = pd.DataFrame.from_dict(col_amap, orient='index')
        hash_df.to_csv(hmap_path)
        logger.debug("@ get_anonym_df: {}s hmap & colums:: {} ".format(
            (dt.now() - st_time).seconds, [hmap_path]))
    return in_df