|
|
import functools

import pandas as pd
import streamlit as st
import torch
import transformers

from preprocess import preprocess_data
|
|
|
|
|
@functools.lru_cache(maxsize=1)
def _load_masked_lm(model_name):
    """Load the tokenizer and masked-LM model for *model_name*, once.

    Loading pretrained weights is expensive, so the pair is cached and
    reused by every later call with the same model name.
    """
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    model = transformers.AutoModelForMaskedLM.from_pretrained(model_name)
    return tokenizer, model


def anonymize_text(text):
    """Return the five most likely fill-ins for the first mask token in *text*.

    The text is encoded with the ``distilbert-base-uncased`` tokenizer, run
    through the matching masked-language model, and the five highest-scoring
    vocabulary entries at the first mask position are decoded to strings.

    Args:
        text: Input string, expected to contain the tokenizer's mask token.

    Returns:
        A list of up to five decoded token strings, best first. An empty
        list is returned when *text* is empty/None or contains no mask
        token, since there is no position to predict.
    """
    # Nothing to predict for empty input; skip loading the model entirely.
    if not text:
        return []

    tokenizer, model = _load_masked_lm("distilbert-base-uncased")

    input_ids = tokenizer.encode(text, return_tensors="pt")
    # Column indices (positions in the sequence) of every mask token.
    mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]
    if mask_token_index.numel() == 0:
        # Without this guard, indexing row 0 of the top-k result below
        # raises IndexError because there are no masked positions.
        return []

    # Inference only: no gradients are needed, so avoid building the graph.
    with torch.no_grad():
        token_logits = model(input_ids)[0]
    mask_token_logits = token_logits[0, mask_token_index, :]

    # Row 0 corresponds to the first mask token found in the text.
    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
    return [tokenizer.decode([token_id]) for token_id in top_5_tokens]
|
|
|
|
|
def run_app(): |
|
|
st.title("Text Anonymization App") |
|
|
|
|
|
|
|
|
st.subheader("Upload your data") |
|
|
file = st.file_uploader("Upload CSV", type=["csv"]) |
|
|
|
|
|
if file is not None: |
|
|
|
|
|
data = pd.read_csv(file) |
|
|
|
|
|
|
|
|
preprocessed_data = preprocess_data(data) |
|
|
|
|
|
|
|
|
st.subheader("Select columns to anonymize") |
|
|
selected_columns = [] |
|
|
for col in preprocessed_data.columns: |
|
|
if st.checkbox(col): |
|
|
selected_columns.append(col) |
|
|
|
|
|
|
|
|
|