Upload 2 files
- app.py +60 -0
- requirements.txt +5 -0
app.py
ADDED
@@ -0,0 +1,60 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+import gradio as gr
+
+# Set up the model and tokenizer
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model = AutoModelForCausalLM.from_pretrained(
+    "betterdataai/PII_DETECTION_MODEL",
+    trust_remote_code=True
+).to(device)
+tokenizer = AutoTokenizer.from_pretrained(
+    "betterdataai/PII_DETECTION_MODEL",
+    trust_remote_code=True
+)
+
+classes_list = ['<pin>','<api_key>','<bank_routing_number>','<bban>','<company>','<credit_card_number>','<credit_card_security_code>','<customer_id>','<date>','<date_of_birth>','<date_time>','<driver_license_number>','<email>','<employee_id>','<first_name>','<iban>','<ipv4>','<ipv6>','<last_name>','<local_latlng>','<name>','<passport_number>','<password>','<phone_number>','<social_security_number>','<street_address>','<swift_bic_code>','<time>','<user_name>']
+
+prompt_template = """You are an AI assistant who is responsible for identifying Personally Identifiable Information (PII). You will be given a passage of text and you have to \
+identify the PII data present in the passage. You should only identify the data based on the classes provided and not make up any class on your own.
+
+```PII Classes```
+{classes}
+
+The given text is:
+{text}
+
+The PII data are:
+"""
+
+def detect_pii(user_input_text):
+    try:
+        # 1. Format the prompt
+        new_prompt = prompt_template.format(classes="\n".join(classes_list), text=user_input_text)
+
+        # 2. Tokenize
+        tokenized_input = tokenizer(new_prompt, return_tensors="pt").to(device)
+
+        # 3. Generate output
+        output = model.generate(**tokenized_input, max_new_tokens=250)
+
+        # 4. Decode and extract the PII part
+        # Use rsplit to be safer, splitting only on the last occurrence
+        decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
+        if "The PII data are:\n" in decoded_output:
+            pii_classes = decoded_output.rsplit("The PII data are:\n", 1)[1]
+        else:
+            pii_classes = "Could not parse model output."
+
+        return pii_classes
+    except Exception as e:
+        return f"An error occurred: {str(e)}"
+
+# Create the Gradio app
+iface = gr.Interface(
+    fn=detect_pii,
+    inputs=gr.Textbox(lines=5, label="Enter Text Here"),
+    outputs=gr.Textbox(label="Detected PII"),
+    title="PII Detection Model",
+    description="This app uses 'betterdataai/PII_DETECTION_MODEL' to find PII in text."
+)
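Note that app.py as committed constructs the `iface` Interface but never launches it, so running the script on its own would build the UI and exit. Whether that matters depends on how the Space's runtime picks up the app; if the script is meant to be the entry point, it would typically end with a launch call. A minimal sketch, assuming Gradio's default server settings are acceptable:

    # Hypothetical closing lines for app.py; launch() is called with
    # Gradio defaults here, no custom host, port, or share settings.
    if __name__ == "__main__":
        iface.launch()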
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+torch
+transformers
+gradio
+accelerate
+sentencepiece
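After installing these requirements, one way to sanity-check the pipeline before deploying is to call detect_pii directly. This is an illustrative sketch only: the sample sentence is made up, importing app triggers the full model download and load, and the exact tags returned depend on the model.

    # Hypothetical smoke test; assumes this file sits next to app.py.
    from app import detect_pii

    sample = "Contact Jane Doe at jane.doe@example.com or +1-555-0100."
    print(detect_pii(sample))  # expect tags such as <name>, <email>, <phone_number>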