Goofybaka committed on
Commit
072d53b
·
verified ·
1 Parent(s): 147c91c

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +60 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoModelForCausalLM, AutoTokenizer
2
+ import torch
3
+ import gradio as gr
4
+
5
# Run on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Weights and tokenizer are fetched from the same Hub repository.
# NOTE(review): trust_remote_code=True lets that repository's own Python code
# run locally -- confirm the source is trusted before deploying.
model = AutoModelForCausalLM.from_pretrained(
    "betterdataai/PII_DETECTION_MODEL",
    trust_remote_code=True,
)
model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained(
    "betterdataai/PII_DETECTION_MODEL",
    trust_remote_code=True,
)
15
+
16
# PII labels offered to the model; they are joined with newlines and
# substituted into the {classes} slot of the prompt.
classes_list = [
    "<pin>",
    "<api_key>",
    "<bank_routing_number>",
    "<bban>",
    "<company>",
    "<credit_card_number>",
    "<credit_card_security_code>",
    "<customer_id>",
    "<date>",
    "<date_of_birth>",
    "<date_time>",
    "<driver_license_number>",
    "<email>",
    "<employee_id>",
    "<first_name>",
    "<iban>",
    "<ipv4>",
    "<ipv6>",
    "<last_name>",
    "<local_latlng>",
    "<name>",
    "<passport_number>",
    "<password>",
    "<phone_number>",
    "<social_security_number>",
    "<street_address>",
    "<swift_bic_code>",
    "<time>",
    "<user_name>",
]
17
+
18
# Instruction prompt with two str.format slots: {classes} (the allowed PII
# labels) and {text} (the passage to scan). It ends with the answer marker so
# the model's continuation is the PII listing.
# NOTE(review): the wording (including the "responisble" spelling) is sent to
# the model verbatim; presumably the model was tuned on this exact text --
# confirm before "correcting" it.
prompt_template = (
    "You are an AI assistant who is responisble for identifying Personal "
    "Identifiable information (PII). You will be given a passage of text "
    "and you have to identify the PII data present in the passage. You "
    "should only identify the data based on the classes provided and not "
    "make up any class on your own.\n"
    "\n"
    "```PII Classes```\n"
    "{classes}\n"
    "\n"
    "The given text is:\n"
    "{text}\n"
    "\n"
    "The PII data are:\n"
)
29
+
30
def detect_pii(user_input_text):
    """Ask the PII model which PII items appear in a passage of text.

    Args:
        user_input_text: Passage to scan. ``None``, empty, or whitespace-only
            input is answered immediately without invoking the model.

    Returns:
        The text the model generated in response to the prompt, or a
        human-readable message when there is nothing to analyse or when an
        error occurred. Errors are returned as strings rather than raised so
        the output textbox always receives something displayable.
    """
    # Guard before touching the model: a blank passage cannot contain PII and
    # would otherwise cost a full generation pass.
    if user_input_text is None or not str(user_input_text).strip():
        return "Please enter some text to analyze."

    try:
        # 1. Fill the prompt with the allowed labels and the user's passage.
        prompt = prompt_template.format(
            classes="\n".join(classes_list),
            text=user_input_text,
        )

        # 2. Tokenize and move the tensors to the model's device.
        encoded = tokenizer(prompt, return_tensors="pt").to(device)

        # 3. Generate the continuation.
        generated = model.generate(**encoded, max_new_tokens=250)

        # 4. Decode only the newly generated tokens. For a causal LM the
        # returned sequence is the prompt followed by the continuation, so
        # slicing at the prompt length isolates the answer without depending
        # on a marker string surviving the encode/decode round trip (or on the
        # model not repeating that marker in its own output).
        prompt_length = encoded["input_ids"].shape[1]
        answer_tokens = generated[0][prompt_length:]
        return tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()
    except Exception as e:
        # UI boundary: surface the failure in the textbox instead of crashing.
        return f"An error occurred: {str(e)}"
52
+
53
# Build the web UI: a five-line input textbox whose content is passed to
# detect_pii, and a single output textbox showing the returned string.
iface = gr.Interface(
    fn=detect_pii,
    inputs=gr.Textbox(lines=5, label="Enter Text Here"),
    outputs=gr.Textbox(label="Detected PII"),
    title="PII Detection Model",
    description="This app uses 'betterdataai/PII_DETECTION_MODEL' to find PII in text.",
)

# Start serving only when this file is executed as a script, so importing the
# module builds the interface without blocking inside launch().
if __name__ == "__main__":
    iface.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ gradio
4
+ accelerate
5
+ sentencepiece