import torch
from transformers import BertTokenizerFast

# Not referenced by name below, but kept on purpose: the original note says
# "make sure model.py exists". Presumably the class must be importable for the
# fully pickled model to be restored by torch.load -- confirm before removing.
from model import BiLSTMCRF  # noqa: F401


# Entity types in label-id order. Each one has a "B-" tag; the order here
# fixes the numeric ids, so it must stay in sync with the training label set.
_ENTITY_TYPES = (
    "ACCOUNTNAME",
    "ACCOUNTNUMBER",
    "AGE",
    "AMOUNT",
    "BIC",
    "BITCOINADDRESS",
    "BUILDINGNUMBER",
    "CITY",
    "COMPANYNAME",
    "COUNTY",
    "CREDITCARDCVV",
    "CREDITCARDISSUER",
    "CREDITCARDNUMBER",
    "CURRENCY",
    "CURRENCYCODE",
    "CURRENCYNAME",
    "CURRENCYSYMBOL",
    "DATE",
    "DOB",
    "EMAIL",
    "ETHEREUMADDRESS",
    "EYECOLOR",
    "FIRSTNAME",
    "GENDER",
    "HEIGHT",
    "IBAN",
    "IP",
    "IPV4",
    "IPV6",
    "JOBAREA",
    "JOBTITLE",
    "JOBTYPE",
    "LASTNAME",
    "LITECOINADDRESS",
    "MAC",
    "MASKEDNUMBER",
    "MIDDLENAME",
    "NEARBYGPSCOORDINATE",
    "ORDINALDIRECTION",
    "PASSWORD",
    "PHONEIMEI",
    "PHONENUMBER",
    "PIN",
    "PREFIX",
    "SECONDARYADDRESS",
    "SEX",
    "SSN",
    "STATE",
    "STREET",
    "TIME",
    "URL",
    "USERAGENT",
    "USERNAME",
    "VEHICLEVIN",
    "VEHICLEVRM",
    "ZIPCODE",
)

# These two entity types have a "B-" tag but no "I-" tag in the label set, so
# the "I-" ids are NOT simply the "B-" ids shifted by a constant.
_NO_INSIDE_TAG = frozenset({"ORDINALDIRECTION", "SEX"})

# Label id -> tag string: ids 0-55 are "B-*", 56-109 are "I-*", 110 is "O".
# Must match the mapping used at training time.
IDX2TAG = dict(
    enumerate(
        [f"B-{name}" for name in _ENTITY_TYPES]
        + [f"I-{name}" for name in _ENTITY_TYPES if name not in _NO_INSIDE_TAG]
        + ["O"]
    )
)


def load_full_model_and_tokenizer(
    path,
    *,
    tokenizer_name="bert-base-uncased",
    map_location="cpu",
):
    """Load the FULL BiLSTM-CRF model (saved via ``torch.save(model, ...)``) and its tokenizer.

    Args:
        path: Checkpoint location passed straight to ``torch.load``.
        tokenizer_name: Name or path handed to
            ``BertTokenizerFast.from_pretrained``. Defaults to
            ``"bert-base-uncased"``.
        map_location: Forwarded to ``torch.load``. Defaults to ``"cpu"``.

    Returns:
        A ``(model, tokenizer, idx2tag)`` tuple. ``model`` has been switched
        to eval mode; ``idx2tag`` is a fresh ``dict`` mapping label id to tag
        string, so callers may mutate it without affecting later calls.

    Warning:
        The checkpoint is loaded with ``weights_only=False``, i.e. it is
        unpickled and can execute arbitrary code. Only load files you trust.
    """
    tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name)

    # SECURITY: weights_only=False runs the full pickle machinery, which is
    # needed to restore a whole pickled module object but is unsafe on
    # untrusted files.
    model = torch.load(path, map_location=map_location, weights_only=False)
    model.eval()

    # Hand out a copy so the module-level table cannot be mutated by callers.
    return model, tokenizer, dict(IDX2TAG)


def prepare_inputs(text: str, tokenizer, max_length: int = 128):
    """Tokenize ``text`` into model inputs.

    The text is split on whitespace and passed to ``tokenizer`` as
    pre-split words, padded and truncated to ``max_length``.

    Args:
        text: Raw input string.
        tokenizer: Callable tokenizer accepting the keyword arguments used
            below and returning a mapping with ``"input_ids"`` and
            ``"attention_mask"`` entries.
        max_length: Fixed sequence length used for padding and truncation.

    Returns:
        ``(input_ids, mask)``: the ``"input_ids"`` entry as returned by the
        tokenizer and the ``"attention_mask"`` entry converted with
        ``.bool()``. With ``return_tensors="pt"`` both are expected to be
        PyTorch tensors of shape ``(1, max_length)`` -- TODO confirm.
    """
    words = text.split()
    encoding = tokenizer(
        words,
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    input_ids = encoding["input_ids"]
    # Boolean mask: True where the attention mask is non-zero.
    mask = encoding["attention_mask"].bool()
    return input_ids, mask