Spaces:
Runtime error
Runtime error
Commit
·
a51662f
1
Parent(s):
6c3e9dd
Implemented k_anonymizer.
Browse files- modules.py +55 -2
modules.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
|
| 3 |
SUPPORTED_TYPES = [".csv", ".json", ".xlsx"]
|
|
@@ -53,5 +55,56 @@ def data_cleaner(df, drop_missing=False, remove_duplicates=True):
|
|
| 53 |
if remove_duplicates: df = df.drop_duplicates()
|
| 54 |
return df
|
| 55 |
|
| 56 |
-
def
|
| 57 |
-
return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from itertools import combinations
|
| 2 |
+
import numpy as np
|
| 3 |
import pandas as pd
|
| 4 |
|
| 5 |
SUPPORTED_TYPES = [".csv", ".json", ".xlsx"]
|
|
|
|
| 55 |
if remove_duplicates: df = df.drop_duplicates()
|
| 56 |
return df
|
| 57 |
|
| 58 |
+
def unique_ratio(df, col):
    """Return the share of non-null values in ``df[col]`` that are distinct.

    Args:
        df: DataFrame holding the column.
        col: Label of the column to inspect.

    Returns:
        ``nunique / count`` for the column, where both figures ignore nulls.
        ``0.0`` is returned when the column has no non-null values, so the
        result is always a number that can be compared and sorted.
    """
    column = df[col]
    non_null = column.count()
    if non_null == 0:
        # Empty or all-null column: avoid 0/0, which would yield NaN.
        return 0.0
    return column.nunique() / non_null
|
| 60 |
+
|
| 61 |
+
def bin_numeric(df, name_col: str, num_bins: int):
    """Replace the values of a numeric column with ``"low - high"`` bin labels.

    Bin edges are taken from the data: the sorted non-null values are split
    into ``num_bins`` nearly equal-sized chunks and the first value of each
    chunk (plus the overall minimum and maximum) becomes an edge. Each value
    is then replaced by the label of the interval it falls into.

    Args:
        df: Source DataFrame. It is not modified; a copy is returned.
        name_col: Label of the numeric column to bin.
        num_bins: Requested number of bins. Fewer bins are produced when the
            column has fewer values than ``num_bins`` or when edges coincide.

    Returns:
        A copy of ``df`` in which ``name_col`` holds string labels of the
        form ``f"{low} - {high}"``. Null values are left as they are.

    Raises:
        ValueError: If ``num_bins`` is smaller than 1.
        TypeError: If ``name_col`` is not a numeric column.
    """
    if num_bins < 1:
        raise ValueError(f"num_bins must be at least 1, got {num_bins}")
    if df[[name_col]].select_dtypes(include=np.number).shape[1] == 0:
        raise TypeError(f"Column {name_col!r} is not numeric and cannot be binned")

    result = df.copy()
    column = df[name_col]
    present = column.notna().to_numpy()
    raw_values = column.to_numpy()[present]
    if raw_values.size == 0:
        # Nothing to bin (empty or all-null column).
        return result

    ordered = np.sort(raw_values)
    # Never ask for more chunks than there are values; empty chunks have no
    # first element to use as an edge.
    chunks = np.array_split(ordered, min(num_bins, ordered.size))
    edges = np.unique(
        [ordered[0]] + [chunk[0] for chunk in chunks[1:]] + [ordered[-1]]
    )
    if edges.size == 1:
        # Constant column: use a single degenerate "v - v" bin.
        edges = np.repeat(edges, 2)

    labels = np.array(
        [f"{low} - {high}" for low, high in zip(edges[:-1], edges[1:])],
        dtype=object,
    )
    # Intervals are [low, high); clipping folds the column maximum into the
    # last interval instead of leaving it unbinned.
    positions = np.searchsorted(edges, raw_values, side="right") - 1
    positions = np.clip(positions, 0, labels.size - 1)

    binned = column.to_numpy(dtype=object, copy=True)
    binned[present] = labels[positions]
    result[name_col] = binned
    return result
|
| 78 |
+
|
| 79 |
+
def get_kanon_false(df, k=2):
    """List numeric columns involved in a value combination rarer than ``k``.

    Every ``k``-sized combination of the numeric columns of ``df`` is
    examined. If some combination of values in those columns occurs fewer
    than ``k`` times, every column of that combination is reported.

    NOTE: ``k`` serves both as the number of columns per combination and as
    the minimum number of rows that must share a value combination.

    Args:
        df: DataFrame to inspect; non-numeric columns are ignored.
        k: Combination size and minimum group size. Must be at least 1.

    Returns:
        A list of ``(column, unique_ratio)`` tuples, one per reported column,
        sorted by ratio in descending order. Columns with equal ratios keep
        the order in which they were first reported.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    numeric = df.select_dtypes(include=np.number)
    # A dict keeps first-seen order (deterministic ties) and lets each
    # column's ratio be computed only once.
    flagged = {}

    for columns in combinations(numeric.columns, k):
        group_sizes = numeric[list(columns)].value_counts()
        # Any group smaller than k violates the threshold, not only groups of
        # exactly k - 1 rows.
        if (group_sizes < k).any():
            for col in columns:
                if col not in flagged:
                    flagged[col] = unique_ratio(numeric, col)

    return sorted(flagged.items(), key=lambda item: item[1], reverse=True)
|
| 97 |
+
|
| 98 |
+
def k_anonymize(df, k=2):
    """Bin flagged numeric columns until ``get_kanon_false`` reports none.

    Columns reported by ``get_kanon_false(df, k)`` are passed through
    ``bin_numeric`` with 15 bins, then the frame is checked again. Each
    column is binned at most once; the loop stops when nothing is reported
    or when every reported column has already been binned.

    Args:
        df: DataFrame to anonymize. A copy is taken, so the caller's frame is
            left untouched.
        k: Threshold forwarded to ``get_kanon_false``.

    Returns:
        The processed copy of ``df``.
    """
    df = df.copy()
    attempted = set()

    k_anon_false = get_kanon_false(df, k)
    while k_anon_false:
        pending = [col for col, _ in k_anon_false if col not in attempted]
        if not pending:
            # Every reported column was already binned; repeating the pass
            # cannot change the frame, so stop instead of looping forever.
            break

        for col in pending:
            print(f"Binning {col}")
            df = bin_numeric(df, col, num_bins=15)
            attempted.add(col)

        # Re-check once per pass; the check enumerates column combinations
        # and is too costly to repeat after every single column.
        k_anon_false = get_kanon_false(df, k)
        print(f"Updated sensitivity: {k_anon_false}")

    return df
|
| 108 |
+
|
| 109 |
+
def data_anonymizer(df, k=2):
    """Anonymize ``df`` by delegating to ``k_anonymize``.

    Args:
        df: DataFrame to anonymize.
        k: Threshold forwarded unchanged to ``k_anonymize``.

    Returns:
        Whatever ``k_anonymize(df, k)`` returns.
    """
    anonymized = k_anonymize(df, k)
    return anonymized
|