| import pandas as pd | |
def preprocess_data(data, threshold=0.5):
    """Drop columns whose values are mostly unique.

    For each column the ratio ``distinct values / number of rows`` is
    computed, where the distinct values are those reported by
    ``Series.unique()``. Columns whose ratio is strictly greater than
    ``threshold`` are treated as sensitive and removed.

    Args:
        data: The ``pandas.DataFrame`` to filter. It is not modified.
        threshold: Uniqueness ratio above which a column is dropped.
            Defaults to 0.5, the cut-off that used to be hard-coded.

    Returns:
        A new DataFrame without the high-uniqueness columns. A frame with
        no rows is returned as an unchanged copy, because no ratio can be
        computed for it.
    """
    row_count = len(data.index)
    if row_count == 0:
        # Without rows the ratio would be 0 / 0 (ZeroDivisionError); there is
        # nothing to judge, so keep every column.
        return data.copy()

    # Columns are addressed by position so that duplicate labels are each
    # measured on their own data. No sorting is needed: dropping is
    # independent of the order in which the labels are listed.
    sensitive_cols = [
        name
        for position, name in enumerate(data.columns)
        if len(data.iloc[:, position].unique()) / row_count > threshold
    ]
    return data.drop(columns=sensitive_cols)
| import transformers | |
| import pandas as pd | |
| import streamlit as st | |
| from preprocess import preprocess_data | |
def anonymize_text(text):
    """Return the model's top five predictions for the first mask token.

    The text is encoded with the ``distilbert-base-uncased`` tokenizer and run
    through the matching masked-language model. The five highest-scoring
    vocabulary entries at the first mask position are decoded and returned.

    Args:
        text: Input string, expected to contain the tokenizer's mask token.

    Returns:
        A list of five decoded token strings, ordered from highest to lowest
        score. If ``text`` contains no mask token, an empty list is returned.
        When several mask tokens are present, only the first one is used.
    """
    # Imported locally because the visible import lines do not bring in
    # ``torch``; without this the references below raise NameError.
    import torch

    model_name = "distilbert-base-uncased"
    # NOTE: tokenizer and model are loaded again on every call.
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    model = transformers.AutoModelForMaskedLM.from_pretrained(model_name)

    input_ids = tokenizer.encode(text, return_tensors="pt")
    # Index [1] selects the column (token position) coordinates of the matches
    # within the single encoded sequence.
    mask_positions = torch.where(input_ids == tokenizer.mask_token_id)[1]
    if mask_positions.numel() == 0:
        # No mask token: indexing row 0 of the top-k result would raise
        # IndexError, so report that there is nothing to predict.
        return []

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        token_logits = model(input_ids)[0]
    mask_token_logits = token_logits[0, mask_positions, :]
    # Row 0 corresponds to the first mask position.
    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
    return [tokenizer.decode([token_id]) for token_id in top_5_tokens]
| def run_app(): | |
| st.title("Text Anonymization App") | |
| # File upload | |
| st.subheader("Upload your data") | |
| file = st.file_uploader("Upload CSV", type=["csv"]) | |
| if file is not None: | |
| # Read the file | |
| data = pd.read_csv(file) | |
| # Preprocess the data | |
| preprocessed_data = preprocess_data(data) | |
| # Column selection | |
| st.subheader("Select columns to anonymize") | |
| selected_columns = [] | |
| for col in preprocessed_data.columns: | |
| if st.checkbox(col): | |
| selected_columns.append(col) | |
| # | |