# Spaces: Sleeping
# File size: 3,182 Bytes
# 73a7314
import torch
from transformers import BertTokenizerFast
from model import BiLSTMCRF # make sure model.py exists
def load_full_model_and_tokenizer(path):
    """
    Restore a fully pickled BiLSTM-CRF model plus its tokenizer and tag table.

    The checkpoint is expected to be a whole model object written with
    ``torch.save(model, ...)`` rather than a bare ``state_dict``.

    Args:
        path: Checkpoint location handed straight to ``torch.load``.

    Returns:
        A ``(model, tokenizer, idx2tag)`` tuple: the loaded model switched to
        eval mode (tensors mapped to CPU), the ``bert-base-uncased`` fast
        tokenizer, and a newly built dict mapping tag index -> tag label.
    """
    # Tag labels in index order: position in this tuple is the tag id, so the
    # order must match the one used at training time.
    # ORDINALDIRECTION and SEX only have a B- label; there is no I- variant.
    tag_names = (
        "B-ACCOUNTNAME",
        "B-ACCOUNTNUMBER",
        "B-AGE",
        "B-AMOUNT",
        "B-BIC",
        "B-BITCOINADDRESS",
        "B-BUILDINGNUMBER",
        "B-CITY",
        "B-COMPANYNAME",
        "B-COUNTY",
        "B-CREDITCARDCVV",
        "B-CREDITCARDISSUER",
        "B-CREDITCARDNUMBER",
        "B-CURRENCY",
        "B-CURRENCYCODE",
        "B-CURRENCYNAME",
        "B-CURRENCYSYMBOL",
        "B-DATE",
        "B-DOB",
        "B-EMAIL",
        "B-ETHEREUMADDRESS",
        "B-EYECOLOR",
        "B-FIRSTNAME",
        "B-GENDER",
        "B-HEIGHT",
        "B-IBAN",
        "B-IP",
        "B-IPV4",
        "B-IPV6",
        "B-JOBAREA",
        "B-JOBTITLE",
        "B-JOBTYPE",
        "B-LASTNAME",
        "B-LITECOINADDRESS",
        "B-MAC",
        "B-MASKEDNUMBER",
        "B-MIDDLENAME",
        "B-NEARBYGPSCOORDINATE",
        "B-ORDINALDIRECTION",
        "B-PASSWORD",
        "B-PHONEIMEI",
        "B-PHONENUMBER",
        "B-PIN",
        "B-PREFIX",
        "B-SECONDARYADDRESS",
        "B-SEX",
        "B-SSN",
        "B-STATE",
        "B-STREET",
        "B-TIME",
        "B-URL",
        "B-USERAGENT",
        "B-USERNAME",
        "B-VEHICLEVIN",
        "B-VEHICLEVRM",
        "B-ZIPCODE",
        "I-ACCOUNTNAME",
        "I-ACCOUNTNUMBER",
        "I-AGE",
        "I-AMOUNT",
        "I-BIC",
        "I-BITCOINADDRESS",
        "I-BUILDINGNUMBER",
        "I-CITY",
        "I-COMPANYNAME",
        "I-COUNTY",
        "I-CREDITCARDCVV",
        "I-CREDITCARDISSUER",
        "I-CREDITCARDNUMBER",
        "I-CURRENCY",
        "I-CURRENCYCODE",
        "I-CURRENCYNAME",
        "I-CURRENCYSYMBOL",
        "I-DATE",
        "I-DOB",
        "I-EMAIL",
        "I-ETHEREUMADDRESS",
        "I-EYECOLOR",
        "I-FIRSTNAME",
        "I-GENDER",
        "I-HEIGHT",
        "I-IBAN",
        "I-IP",
        "I-IPV4",
        "I-IPV6",
        "I-JOBAREA",
        "I-JOBTITLE",
        "I-JOBTYPE",
        "I-LASTNAME",
        "I-LITECOINADDRESS",
        "I-MAC",
        "I-MASKEDNUMBER",
        "I-MIDDLENAME",
        "I-NEARBYGPSCOORDINATE",
        "I-PASSWORD",
        "I-PHONEIMEI",
        "I-PHONENUMBER",
        "I-PIN",
        "I-PREFIX",
        "I-SECONDARYADDRESS",
        "I-SSN",
        "I-STATE",
        "I-STREET",
        "I-TIME",
        "I-URL",
        "I-USERAGENT",
        "I-USERNAME",
        "I-VEHICLEVIN",
        "I-VEHICLEVRM",
        "I-ZIPCODE",
        "O",
    )

    bert_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
    # Python objects, so only point this at checkpoints you trust.
    # Presumably the pickled object is a BiLSTMCRF instance, which is why the
    # file imports that class from model.py -- confirm against the checkpoint.
    network = torch.load(path, map_location="cpu", weights_only=False)
    network.eval()

    # enumerate() numbers the labels 0..110, giving the index -> tag mapping.
    return network, bert_tokenizer, dict(enumerate(tag_names))
def prepare_inputs(text, tokenizer, max_length=128):
    """
    Convert a raw string into fixed-length tokenizer outputs for the model.

    The text is split on whitespace and the resulting words are handed to the
    tokenizer as an already-split sequence, padded and truncated to
    ``max_length``.

    Args:
        text: Input string; it is split with ``str.split()``.
        tokenizer: Callable invoked with the word list and the encoding
            options below (a Hugging Face tokenizer in this file's usage).
        max_length: Length passed to the tokenizer for padding and
            truncation. Defaults to 128.

    Returns:
        A ``(input_ids, mask)`` pair: the encoding's ``"input_ids"`` entry and
        its ``"attention_mask"`` entry converted with ``.bool()``.
    """
    words = text.split()

    # Pad every input up to max_length and request PyTorch tensors back.
    encode_options = {
        "is_split_into_words": True,
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    batch = tokenizer(words, **encode_options)

    return batch["input_ids"], batch["attention_mask"].bool()
|