---
language:
- en
---
|
|
|
|
|
# Definition |
|
|
|
|
|
[phi-2] fine-tuned for [P]ersonally [I]dentifiable [I]nformation detection on a [B]anking and [I]nsurance dataset
|
|
|
|
|
# How to use the model
|
|
|
|
|
## Load model and tokenizer |
|
|
```
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer

torch.set_default_device("cuda")

model_name = "dcipheranalytics/phi-2-pii-bbi"

# 4-bit NF4 quantization keeps the memory footprint low while computing in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    # torch_dtype="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    quantization_config=quantization_config,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
```
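
If you want to confirm that 4-bit loading took effect, a quick sanity check is the model's memory footprint, using the standard `transformers` helper `get_memory_footprint()`:

```
# Approximate size of the loaded weights in GB; with 4-bit quantization this
# should be roughly a quarter of the full bfloat16 footprint.
print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")
```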
|
|
|
|
|
## Call the generate method
|
|
```
def generate(msg: str, max_new_tokens=300, temperature=0.3):
    # Wrap the instruction in the ChatML-style template the model was fine-tuned with.
    chat_template = "<|im_start|>user\n{msg}<|im_end|><|im_start|>assistant\n"
    prompt = chat_template.format(msg=msg)

    with torch.no_grad():
        token_ids = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
        output_ids = model.generate(
            token_ids.to(model.device),
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, dropping the trailing EOS token.
    output = tokenizer.decode(output_ids[0][token_ids.size(1):-1]).strip()
    return output


instruction_template = "List the personally identifiable information in the given text below.\nText:########\n{text}\n########"
text_with_pii = "My passport number is 123456789."
generate(instruction_template.format(text=text_with_pii))
```
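
Sampling at `temperature=0.3` keeps outputs fairly stable but not deterministic. If you need reproducible extractions, a greedy variant of the same function is a small change (a sketch reusing the `model` and `tokenizer` loaded above; `generate_greedy` is not part of the released code):

```
def generate_greedy(msg: str, max_new_tokens=300):
    prompt = f"<|im_start|>user\n{msg}<|im_end|><|im_start|>assistant\n"
    token_ids = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
    with torch.no_grad():
        output_ids = model.generate(
            token_ids.to(model.device),
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding: deterministic for a given input
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    # Decode only the continuation, dropping the trailing EOS token.
    return tokenizer.decode(output_ids[0][token_ids.size(1):-1]).strip()
```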
|
|
|
|
|
|
|
|
## Batch predictions |
|
|
```
from transformers import TextGenerationPipeline


def get_prompt(text):
    instruction_template = "List the personally identifiable information in the given text below.\nText:########\n{text}\n########"
    msg = instruction_template.format(text=text)
    chat_template = "<|im_start|>user\n{msg}<|im_end|><|im_start|>assistant\n"
    prompt = chat_template.format(msg=msg)
    return prompt


# Generation settings are fixed once at pipeline construction; the pad token is
# set to EOS so that prompts of different lengths can be batched together.
generator = TextGenerationPipeline(
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=300,
    do_sample=True,
    temperature=0.3,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

texts = [
    "My passport number is 123456789.",
    "My name is John Smith.",
]
prompts = list(map(get_prompt, texts))
outputs = generator(prompts, return_full_text=False, batch_size=2)
```
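
A pipeline called on a list returns one list per prompt, each containing a dict per generated sequence; with `return_full_text=False` the `generated_text` field holds only the model's continuation. Collecting the per-text results looks like this:

```
# outputs[i] is a list with one dict per generated sequence for prompts[i].
pii_per_text = [result[0]["generated_text"].strip() for result in outputs]
for text, pii in zip(texts, pii_per_text):
    print(f"{text}\n-> {pii}\n")
```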
|
|
|
|
|
# Train Data |
|
|
|
|
|
GPT-4-generated customer service conversations:

1. 100 unique banking topics, 8 examples each,
2. 100 new banking topics, 4 examples each,
3. 100 insurance topics, 4 examples each.
|
|
|
|
|
# Evaluation Results |
|
|
|
|
|
## Average |
|
|
```
precision    0.836223
recall       0.781132
f1           0.801837
```
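
The card does not state the exact evaluation protocol behind these numbers. Purely as an illustration, a micro-averaged precision/recall/F1 over sets of extracted PII strings could be computed as below; this is an assumption about the metric, not the authors' procedure:

```
def micro_prf(predicted: list[set], gold: list[set]):
    # Micro-averaging pools true/false positives and misses across all examples.
    tp = sum(len(p & g) for p, g in zip(predicted, gold))
    fp = sum(len(p - g) for p, g in zip(predicted, gold))
    fn = sum(len(g - p) for p, g in zip(predicted, gold))
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1
```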
|
|
|
|
|
## Per topic: |
|
|
 |
|
|
|
|
|
## On TAB (Text Anonymization Benchmark) test split:
|
|
```
precision    0.506118
recall       0.350976
f1           0.391614
```