Seanyoon committed on
Commit
627d5f4
·
1 Parent(s): 7b2b370

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +50 -0
README.md CHANGED
@@ -14,3 +14,53 @@ def preprocess_data(data):
14
  data = data.drop(columns=sensitive_cols)
15
 
16
  return data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  data = data.drop(columns=sensitive_cols)
15
 
16
  return data
17
+
18
+
19
+
20
+ import transformers
21
+ import pandas as pd
22
+ import streamlit as st
23
+ from preprocess import preprocess_data
24
+
25
def anonymize_text(text):
    """Return the top five fill-in candidates for the first mask token in *text*.

    Despite its name, this function does not rewrite *text*. It runs the
    ``distilbert-base-uncased`` masked-language model over the input and
    returns the five highest-scoring predictions for the first position that
    holds the tokenizer's mask token.

    Args:
        text: Input string, expected to contain the tokenizer's mask token.

    Returns:
        A list of five decoded token strings, best-scoring first, or an
        empty list when *text* contains no mask token.
    """
    # Imported locally because the surrounding import block only brings in
    # transformers, pandas and streamlit; torch was used without an import.
    import torch

    model_name = "distilbert-base-uncased"
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

    input_ids = tokenizer.encode(text, return_tensors="pt")
    # torch.where returns (row_indices, column_indices); the columns are the
    # sequence positions of the mask tokens.
    mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]

    # Without a mask token there is nothing to predict. Bail out before
    # loading the model rather than failing on an empty index below.
    if mask_token_index.numel() == 0:
        return []

    # NOTE(review): tokenizer and model are reloaded on every call; consider
    # caching them at the call site if this runs once per row.
    model = transformers.AutoModelForMaskedLM.from_pretrained(model_name)

    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        token_logits = model(input_ids)[0]
    mask_token_logits = token_logits[0, mask_token_index, :]

    # Row 0 corresponds to the first mask token found in the input.
    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

    return [tokenizer.decode([token_id]) for token_id in top_5_tokens]
44
+
45
+ def run_app():
46
+ st.title("Text Anonymization App")
47
+
48
+ # File upload
49
+ st.subheader("Upload your data")
50
+ file = st.file_uploader("Upload CSV", type=["csv"])
51
+
52
+ if file is not None:
53
+ # Read the file
54
+ data = pd.read_csv(file)
55
+
56
+ # Preprocess the data
57
+ preprocessed_data = preprocess_data(data)
58
+
59
+ # Column selection
60
+ st.subheader("Select columns to anonymize")
61
+ selected_columns = []
62
+ for col in preprocessed_data.columns:
63
+ if st.checkbox(col):
64
+ selected_columns.append(col)
65
+
66
+ #