Update README.md
Browse files
README.md
CHANGED
|
@@ -108,4 +108,30 @@ The model is capable of detecting the following PII entities:
|
|
| 108 |
To use this model, you'll need to have the `transformers` library installed:

```bash
pip install transformers
```
To use this model, you'll need to have the `transformers` library installed:

```bash
pip install transformers
```

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and model
# NOTE(review): this is a generative (instruction-tuned) Phi-3 fine-tune, so it must be
# loaded with AutoModelForCausalLM — a token-classification head has no `.generate()`.
tokenizer = AutoTokenizer.from_pretrained("ab-ai/PII-Model-Phi3-Mini")
model = AutoModelForCausalLM.from_pretrained("ab-ai/PII-Model-Phi3-Mini")

# Pick a device and move the model onto it (previously `device` was used but never defined)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

input_text = "Hi Abner, just a reminder that your next primary care appointment is on 23/03/1926. Please confirm by replying to this email Nathen15@hotmail.com."

model_prompt = f"""### Instruction:
Identify and extract the following PII entities from the text, if present: companyname, pin, currencyname, email, phoneimei, litecoinaddress, currency, eyecolor, street, mac, state, time, vehiclevin, jobarea, date, bic, currencysymbol, currencycode, age, nearbygpscoordinate, amount, ssn, ethereumaddress, zipcode, buildingnumber, dob, firstname, middlename, ordinaldirection, jobtitle, bitcoinaddress, jobtype, phonenumber, height, password, ip, useragent, accountname, city, gender, secondaryaddress, iban, sex, prefix, ipv4, maskednumber, url, username, lastname, creditcardcvv, county, vehiclevrm, ipv6, creditcardissuer, accountnumber, creditcardnumber. Return the output in JSON format.

### Input:
{input_text}

### Output: """

inputs = tokenizer(model_prompt, return_tensors="pt").to(device)
# adjust max_new_tokens according to your need
outputs = model.generate(**inputs, do_sample=True, max_new_tokens=120)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
```