# PIIGuard / utils.py
# DeepActionPotential's picture
# 🚀 Initial upload of my app
# 73a7314 verified
import torch
from transformers import BertTokenizerFast
from model import BiLSTMCRF # make sure model.py exists
def load_full_model_and_tokenizer(path):
    """
    Restore the complete pickled BiLSTM-CRF model together with its tokenizer.

    The checkpoint at ``path`` is expected to hold a whole model object
    (saved with ``torch.save(model, ...)``), not just a state dict. It is
    mapped onto the CPU and switched to evaluation mode before being returned.

    Args:
        path: Location of the checkpoint, passed straight to ``torch.load``.

    Returns:
        A ``(model, tokenizer, idx2tag)`` tuple, where ``tokenizer`` is the
        pretrained "bert-base-uncased" fast tokenizer and ``idx2tag`` maps
        integer label ids (0..110) to their BIO tag strings.
    """
    # Label strings listed in id order; the position in this tuple IS the
    # label id, so the order must stay in sync with the one used in training.
    # Note that the I- group has no ORDINALDIRECTION or SEX entry.
    ordered_tags = (
        # ids 0-55: B- (begin) tags
        'B-ACCOUNTNAME', 'B-ACCOUNTNUMBER', 'B-AGE', 'B-AMOUNT',
        'B-BIC', 'B-BITCOINADDRESS', 'B-BUILDINGNUMBER', 'B-CITY',
        'B-COMPANYNAME', 'B-COUNTY', 'B-CREDITCARDCVV', 'B-CREDITCARDISSUER',
        'B-CREDITCARDNUMBER', 'B-CURRENCY', 'B-CURRENCYCODE', 'B-CURRENCYNAME',
        'B-CURRENCYSYMBOL', 'B-DATE', 'B-DOB', 'B-EMAIL',
        'B-ETHEREUMADDRESS', 'B-EYECOLOR', 'B-FIRSTNAME', 'B-GENDER',
        'B-HEIGHT', 'B-IBAN', 'B-IP', 'B-IPV4',
        'B-IPV6', 'B-JOBAREA', 'B-JOBTITLE', 'B-JOBTYPE',
        'B-LASTNAME', 'B-LITECOINADDRESS', 'B-MAC', 'B-MASKEDNUMBER',
        'B-MIDDLENAME', 'B-NEARBYGPSCOORDINATE', 'B-ORDINALDIRECTION', 'B-PASSWORD',
        'B-PHONEIMEI', 'B-PHONENUMBER', 'B-PIN', 'B-PREFIX',
        'B-SECONDARYADDRESS', 'B-SEX', 'B-SSN', 'B-STATE',
        'B-STREET', 'B-TIME', 'B-URL', 'B-USERAGENT',
        'B-USERNAME', 'B-VEHICLEVIN', 'B-VEHICLEVRM', 'B-ZIPCODE',
        # ids 56-109: I- (inside) tags
        'I-ACCOUNTNAME', 'I-ACCOUNTNUMBER', 'I-AGE', 'I-AMOUNT',
        'I-BIC', 'I-BITCOINADDRESS', 'I-BUILDINGNUMBER', 'I-CITY',
        'I-COMPANYNAME', 'I-COUNTY', 'I-CREDITCARDCVV', 'I-CREDITCARDISSUER',
        'I-CREDITCARDNUMBER', 'I-CURRENCY', 'I-CURRENCYCODE', 'I-CURRENCYNAME',
        'I-CURRENCYSYMBOL', 'I-DATE', 'I-DOB', 'I-EMAIL',
        'I-ETHEREUMADDRESS', 'I-EYECOLOR', 'I-FIRSTNAME', 'I-GENDER',
        'I-HEIGHT', 'I-IBAN', 'I-IP', 'I-IPV4',
        'I-IPV6', 'I-JOBAREA', 'I-JOBTITLE', 'I-JOBTYPE',
        'I-LASTNAME', 'I-LITECOINADDRESS', 'I-MAC', 'I-MASKEDNUMBER',
        'I-MIDDLENAME', 'I-NEARBYGPSCOORDINATE', 'I-PASSWORD', 'I-PHONEIMEI',
        'I-PHONENUMBER', 'I-PIN', 'I-PREFIX', 'I-SECONDARYADDRESS',
        'I-SSN', 'I-STATE', 'I-STREET', 'I-TIME',
        'I-URL', 'I-USERAGENT', 'I-USERNAME', 'I-VEHICLEVIN',
        'I-VEHICLEVRM', 'I-ZIPCODE',
        # id 110: outside any entity
        'O',
    )

    bert_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

    # NOTE(security): weights_only=False performs a full unpickle of the file,
    # which can run arbitrary code embedded in it. Only point this at
    # checkpoints from a trusted source.
    # Presumably the BiLSTMCRF class must be importable for the unpickle to
    # succeed (hence the module-level import from model.py) -- confirm.
    restored = torch.load(path, map_location="cpu", weights_only=False)
    restored.eval()

    id_to_tag = dict(enumerate(ordered_tags))
    return restored, bert_tokenizer, id_to_tag
def prepare_inputs(text, tokenizer, max_length=128):
    """
    Turn a raw string into the token-id and mask tensors fed to the model.

    The text is split on whitespace and handed to ``tokenizer`` as a list of
    pre-split words. The encoding is padded to ``max_length``, truncated when
    longer, and requested as PyTorch tensors.

    Args:
        text: Input string; split with ``str.split()`` (any whitespace run).
        tokenizer: Callable tokenizer accepting the Hugging Face style keyword
            arguments used below and returning a mapping with "input_ids" and
            "attention_mask" entries.
        max_length: Length every encoding is padded/truncated to.

    Returns:
        ``(input_ids, mask)`` where ``mask`` is the attention mask converted
        to a boolean tensor. Presumably both have shape (1, max_length) for a
        Hugging Face tokenizer -- confirm against the caller.
    """
    words = text.split()
    tokenizer_options = {
        "is_split_into_words": True,   # input is already a list of words
        "padding": "max_length",       # always pad up to max_length
        "truncation": True,            # cut anything beyond max_length
        "max_length": max_length,
        "return_tensors": "pt",        # ask for PyTorch tensors
    }
    encoded = tokenizer(words, **tokenizer_options)

    token_ids = encoded["input_ids"]
    # Converted to bool before being handed on as the mask.
    bool_mask = encoded["attention_mask"].bool()
    return token_ids, bool_mask