Spaces:
Sleeping
Sleeping
| import torch | |
| from transformers import BertTokenizerFast | |
| from model import BiLSTMCRF # make sure model.py exists | |
# Entity types known to the tagger, in the order that fixes the tag indices.
# The order must match the label encoding used at training time.
_ENTITY_TYPES = (
    "ACCOUNTNAME", "ACCOUNTNUMBER", "AGE", "AMOUNT", "BIC",
    "BITCOINADDRESS", "BUILDINGNUMBER", "CITY", "COMPANYNAME", "COUNTY",
    "CREDITCARDCVV", "CREDITCARDISSUER", "CREDITCARDNUMBER", "CURRENCY",
    "CURRENCYCODE", "CURRENCYNAME", "CURRENCYSYMBOL", "DATE", "DOB",
    "EMAIL", "ETHEREUMADDRESS", "EYECOLOR", "FIRSTNAME", "GENDER",
    "HEIGHT", "IBAN", "IP", "IPV4", "IPV6", "JOBAREA", "JOBTITLE",
    "JOBTYPE", "LASTNAME", "LITECOINADDRESS", "MAC", "MASKEDNUMBER",
    "MIDDLENAME", "NEARBYGPSCOORDINATE", "ORDINALDIRECTION", "PASSWORD",
    "PHONEIMEI", "PHONENUMBER", "PIN", "PREFIX", "SECONDARYADDRESS",
    "SEX", "SSN", "STATE", "STREET", "TIME", "URL", "USERAGENT",
    "USERNAME", "VEHICLEVIN", "VEHICLEVRM", "ZIPCODE",
)

# Entity types that have a "B-" tag but no "I-" tag in the label set.
_BEGIN_ONLY_ENTITY_TYPES = frozenset({"ORDINALDIRECTION", "SEX"})


def _build_idx2tag():
    """Return a fresh ``{index: tag}`` dict for the 111 BIO labels."""
    # Layout: 0..55 are the "B-" tags, 56..109 the "I-" tags (skipping the
    # begin-only entity types), and 110 is the outside tag "O".
    labels = [f"B-{entity}" for entity in _ENTITY_TYPES]
    labels.extend(
        f"I-{entity}"
        for entity in _ENTITY_TYPES
        if entity not in _BEGIN_ONLY_ENTITY_TYPES
    )
    labels.append("O")
    return dict(enumerate(labels))


def load_full_model_and_tokenizer(
    path,
    *,
    tokenizer_name="bert-base-uncased",
    map_location="cpu",
):
    """Load the FULL BiLSTM-CRF model, its tokenizer and the tag mapping.

    The checkpoint is expected to be a whole pickled module written with
    ``torch.save(model, ...)``, not a ``state_dict``. Unpickling such a file
    needs the model class to be importable, which is presumably why
    ``BiLSTMCRF`` is imported at the top of this file.

    Args:
        path: Location of the checkpoint, passed straight to ``torch.load``.
        tokenizer_name: Name or path given to
            ``BertTokenizerFast.from_pretrained``. Defaults to
            ``"bert-base-uncased"``.
        map_location: Forwarded to ``torch.load``. Defaults to ``"cpu"``.

    Returns:
        A ``(model, tokenizer, idx2tag)`` tuple. ``model`` has had ``eval()``
        called on it. ``idx2tag`` is a new dict on every call, mapping the
        integer label ids 0..110 to their BIO tag strings.
    """
    tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name)

    # SECURITY: weights_only=False performs a full unpickle, which can run
    # arbitrary code embedded in the file. Only load checkpoints from a
    # trusted source.
    model = torch.load(path, map_location=map_location, weights_only=False)
    model.eval()

    # The tag mapping must match the one used during training.
    return model, tokenizer, _build_idx2tag()
def prepare_inputs(text, tokenizer, max_length=128):
    """Turn a raw string into padded token ids and a boolean mask.

    The text is split on whitespace and handed to ``tokenizer`` as a list of
    pre-split words, padded and truncated to ``max_length``, with PyTorch
    tensors requested as output.

    Args:
        text: Input string; it is split with ``str.split()``.
        tokenizer: Callable that accepts the word list plus the keyword
            arguments below and returns a mapping with ``"input_ids"`` and
            ``"attention_mask"`` entries.
        max_length: Padding / truncation length passed to the tokenizer.

    Returns:
        ``(input_ids, mask)``, where ``mask`` is the attention mask converted
        with ``.bool()``. Presumably both have shape ``(1, max_length)``;
        confirm against the tokenizer in use.
    """
    words = text.split()
    tokenizer_options = {
        "is_split_into_words": True,
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    batch = tokenizer(words, **tokenizer_options)

    token_ids = batch["input_ids"]
    attention = batch["attention_mask"]
    return token_ids, attention.bool()