import pandas as pd
import torch
import pickle
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from torch.optim import AdamW
from tqdm import tqdm

# Input text column and the six categorical targets predicted for each record
TEXT_COLUMN = 'Sanction_Context'
LABEL_COLUMNS = [
    'Red_Flag_Reason', 'Maker_Action', 'Escalation_Level',
    'Risk_Category', 'Risk_Drivers', 'Investigation_Outcome'
]

PRETRAINED_MODEL_NAME = 'bert-base-uncased'
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 1
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class BertMultiOutput(nn.Module):
    """Shared BERT encoder with one linear classification head per output column."""

    def __init__(self, num_labels_per_output):
        super().__init__()
        self.bert = BertModel.from_pretrained(PRETRAINED_MODEL_NAME)
        self.dropout = nn.Dropout(0.3)
        self.classifiers = nn.ModuleList([
            nn.Linear(self.bert.config.hidden_size, n_labels)
            for n_labels in num_labels_per_output
        ])

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(outputs.pooler_output)
        # One logits tensor per target column
        return [classifier(pooled_output) for classifier in self.classifiers]
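

# Illustrative sketch, not part of the original script: with two heads of 3 and 5
# classes, forward() returns one logits tensor per head, shaped [batch, 3] and
# [batch, 5]. Assumes the bert-base-uncased weights can be downloaded.
def _demo_head_shapes():
    demo_model = BertMultiOutput([3, 5]).to(DEVICE)
    demo_tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
    enc = demo_tokenizer(["sample sanction narrative"], padding=True,
                         truncation=True, max_length=MAX_LEN, return_tensors="pt")
    with torch.no_grad():
        logits = demo_model(enc["input_ids"].to(DEVICE), enc["attention_mask"].to(DEVICE))
    print([tuple(l.shape) for l in logits])  # expected: [(1, 3), (1, 5)]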


def train_and_save_model(csv_path, output_path='app/bert_model.pkl'):
    df = pd.read_csv(csv_path)
    X = df[TEXT_COLUMN].tolist()
    y = df[LABEL_COLUMNS]

    # Encode each categorical target column to integer class ids
    label_encoders = {}
    y_encoded = pd.DataFrame()
    for col in LABEL_COLUMNS:
        le = LabelEncoder()
        y_encoded[col] = le.fit_transform(y[col])
        label_encoders[col] = le

    # Hold out 20% of the rows; only the training split is used here
    X_train, _, y_train, _ = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

    def tokenize_texts(texts):
        return tokenizer(texts, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt")

    train_encodings = tokenize_texts(X_train)
    # One label tensor per output column, aligned with the tokenized texts
    labels = [torch.tensor(y_train[col].values) for col in LABEL_COLUMNS]

    num_labels_list = [len(le.classes_) for le in label_encoders.values()]
    model = BertMultiOutput(num_labels_list).to(DEVICE)
    optimizer = AdamW(model.parameters(), lr=2e-5)
    loss_fn = nn.CrossEntropyLoss()

    model.train()
    for epoch in range(EPOCHS):
        for i in tqdm(range(0, len(X_train), BATCH_SIZE)):
            input_ids = train_encodings['input_ids'][i:i+BATCH_SIZE].to(DEVICE)
            attention_mask = train_encodings['attention_mask'][i:i+BATCH_SIZE].to(DEVICE)
            batch_labels = [label[i:i+BATCH_SIZE].to(DEVICE) for label in labels]

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            # Total loss is the sum of the cross-entropy losses over all heads
            loss = sum(loss_fn(o, l) for o, l in zip(outputs, batch_labels))
            loss.backward()
            optimizer.step()

    # Bundle the weights, tokenizer and label encoders so inference can rebuild everything
    model_bundle = {
        'model_state_dict': model.state_dict(),
        'tokenizer': tokenizer,
        'label_encoders': label_encoders
    }
    with open(output_path, 'wb') as f:
        pickle.dump(model_bundle, f)
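

# Illustrative call (the CSV path is a placeholder): the file must contain
# TEXT_COLUMN plus every column listed in LABEL_COLUMNS, e.g.
#   train_and_save_model('data/sanctions.csv', output_path='app/bert_model.pkl')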


def load_model(path='app/bert_model.pkl'):
    with open(path, 'rb') as f:
        bundle = pickle.load(f)

    tokenizer = bundle['tokenizer']
    label_encoders = bundle['label_encoders']
    # Rebuild the architecture from the encoders, then restore the trained weights
    num_labels_list = [len(le.classes_) for le in label_encoders.values()]
    model = BertMultiOutput(num_labels_list).to(DEVICE)
    model.load_state_dict(bundle['model_state_dict'])
    model.eval()
    return model, tokenizer, label_encoders
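

# Illustrative inference sketch, not part of the original script: reload the bundle
# written by train_and_save_model and decode one prediction per head back to its
# original string label. The example text is made up.
if __name__ == "__main__":
    model, tokenizer, label_encoders = load_model('app/bert_model.pkl')

    text = "Payment routed through an intermediary flagged on a sanctions list."
    enc = tokenizer([text], padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt")
    with torch.no_grad():
        outputs = model(enc['input_ids'].to(DEVICE), enc['attention_mask'].to(DEVICE))

    # The heads are created in LABEL_COLUMNS order, so zip pairs each logits tensor
    # with its column name; argmax ids are mapped back through the fitted encoders.
    for col, logits in zip(LABEL_COLUMNS, outputs):
        pred_id = logits.argmax(dim=1).item()
        print(col, '->', label_encoders[col].inverse_transform([pred_id])[0])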