Spaces:

ganeshkonapalli
/

gk

Build error

App Files Files Community

gk / model_utils.py

ganeshkonapalli

Upload 5 files

10c2ac1 verified 8 months ago

raw

history blame contribute delete

3.5 kB


	import pandas as pd
	import torch
	import pickle
	import torch.nn as nn
	from sklearn.preprocessing import LabelEncoder
	from sklearn.model_selection import train_test_split
	from transformers import BertTokenizer, BertModel
	from torch.optim import AdamW
	from tqdm import tqdm

	# Name of the CSV column holding the free-text input fed to the tokenizer.
	TEXT_COLUMN = 'Sanction_Context'

	# Target columns; one classification head is created per entry, in this order.
	LABEL_COLUMNS = [
	    'Red_Flag_Reason',
	    'Maker_Action',
	    'Escalation_Level',
	    'Risk_Category',
	    'Risk_Drivers',
	    'Investigation_Outcome',
	]

	# Checkpoint name passed to the Hugging Face `from_pretrained` calls.
	PRETRAINED_MODEL_NAME = 'bert-base-uncased'

	# Training hyper-parameters.
	MAX_LEN = 128      # tokenizer truncation limit (max_length)
	BATCH_SIZE = 16    # rows per optimisation step
	EPOCHS = 1         # passes over the training split

	# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
	DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

	class BertMultiOutput(nn.Module):
	    """BERT encoder shared by several independent linear classification heads.

	    The pooled BERT output is passed through dropout and then through one
	    ``nn.Linear`` head per entry of ``num_labels_per_output``.
	    """

	    def __init__(self, num_labels_per_output):
	        """Build the encoder and one linear head per output.

	        Args:
	            num_labels_per_output: iterable of ints; each value is the number
	                of output units of the corresponding head.
	        """
	        super().__init__()
	        self.bert = BertModel.from_pretrained(PRETRAINED_MODEL_NAME)
	        self.dropout = nn.Dropout(0.3)

	        # Heads are created in input order so that head i matches entry i.
	        hidden_size = self.bert.config.hidden_size
	        heads = []
	        for n_classes in num_labels_per_output:
	            heads.append(nn.Linear(hidden_size, n_classes))
	        self.classifiers = nn.ModuleList(heads)

	    def forward(self, input_ids, attention_mask):
	        """Return a list with one output tensor per head, in head order."""
	        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
	        features = self.dropout(encoded.pooler_output)

	        per_head_outputs = []
	        for head in self.classifiers:
	            per_head_outputs.append(head(features))
	        return per_head_outputs

	def train_and_save_model(csv_path, output_path='app/bert_model.pkl'):
	    """Fine-tune the multi-head BERT classifier on a CSV and pickle the result.

	    The CSV must contain the ``TEXT_COLUMN`` column and every column listed in
	    ``LABEL_COLUMNS``. Each label column is integer-encoded with its own
	    ``LabelEncoder``. An 80/20 split is made with a fixed seed and only the
	    80% part is used for training; the held-out 20% is discarded here.

	    The pickle written to ``output_path`` is a dict with the keys
	    ``'model_state_dict'``, ``'tokenizer'`` and ``'label_encoders'``.

	    Args:
	        csv_path: path of the training CSV, passed to ``pandas.read_csv``.
	        output_path: destination of the pickled bundle. Missing parent
	            directories are created.

	    Returns:
	        None.
	    """
	    # Function-scope import keeps this block self-contained.
	    import os

	    df = pd.read_csv(csv_path)
	    X = df[TEXT_COLUMN].tolist()
	    y = df[LABEL_COLUMNS]

	    # One encoder per target column; kept so predictions can be decoded later.
	    label_encoders = {}
	    y_encoded = pd.DataFrame()
	    for col in LABEL_COLUMNS:
	        le = LabelEncoder()
	        y_encoded[col] = le.fit_transform(y[col])
	        label_encoders[col] = le

	    X_train, _, y_train, _ = train_test_split(
	        X, y_encoded, test_size=0.2, random_state=42
	    )
	    tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

	    # The whole training split is tokenized once and sliced per batch below.
	    train_encodings = tokenizer(
	        X_train,
	        padding=True,
	        truncation=True,
	        max_length=MAX_LEN,
	        return_tensors="pt",
	    )

	    # CrossEntropyLoss needs int64 class indices; the encoded NumPy dtype is
	    # platform dependent (it can be int32), so force torch.long explicitly.
	    labels = [
	        torch.tensor(y_train[col].values, dtype=torch.long)
	        for col in LABEL_COLUMNS
	    ]

	    # Head order follows LABEL_COLUMNS, matching the order of `labels`.
	    num_labels_list = [len(label_encoders[col].classes_) for col in LABEL_COLUMNS]
	    model = BertMultiOutput(num_labels_list).to(DEVICE)
	    optimizer = AdamW(model.parameters(), lr=2e-5)
	    loss_fn = nn.CrossEntropyLoss()

	    model.train()
	    for epoch in range(EPOCHS):
	        for i in tqdm(range(0, len(X_train), BATCH_SIZE)):
	            batch = slice(i, i + BATCH_SIZE)
	            input_ids = train_encodings['input_ids'][batch].to(DEVICE)
	            attention_mask = train_encodings['attention_mask'][batch].to(DEVICE)
	            batch_labels = [label[batch].to(DEVICE) for label in labels]

	            optimizer.zero_grad()
	            outputs = model(input_ids, attention_mask)
	            # Total loss is the unweighted sum of the per-head losses.
	            loss = sum(loss_fn(o, l) for o, l in zip(outputs, batch_labels))
	            loss.backward()
	            optimizer.step()

	    # Copy the weights to CPU before pickling: tensors pickled from a CUDA
	    # device cannot be unpickled on a host without CUDA.
	    cpu_state_dict = {
	        name: tensor.cpu() for name, tensor in model.state_dict().items()
	    }
	    model_bundle = {
	        'model_state_dict': cpu_state_dict,
	        'tokenizer': tokenizer,
	        'label_encoders': label_encoders
	    }

	    # Make sure the target directory exists so open() does not fail.
	    output_dir = os.path.dirname(output_path)
	    if output_dir:
	        os.makedirs(output_dir, exist_ok=True)
	    with open(output_path, 'wb') as f:
	        pickle.dump(model_bundle, f)

	def load_model(path='app/bert_model.pkl'):
	    """Restore a pickled model bundle and return it ready for inference.

	    The file at ``path`` is expected to hold a dict with the keys
	    ``'tokenizer'``, ``'label_encoders'`` and ``'model_state_dict'``.

	    NOTE: the file is read with ``pickle.load``, which can execute arbitrary
	    code; only load bundles from a trusted source.

	    Args:
	        path: location of the pickled bundle.

	    Returns:
	        A ``(model, tokenizer, label_encoders)`` tuple; the model is on
	        ``DEVICE`` and switched to evaluation mode.
	    """
	    with open(path, 'rb') as handle:
	        saved = pickle.load(handle)

	    restored_tokenizer = saved['tokenizer']
	    encoders = saved['label_encoders']

	    # One head per encoder, sized by the number of classes it learned.
	    head_sizes = []
	    for encoder in encoders.values():
	        head_sizes.append(len(encoder.classes_))

	    network = BertMultiOutput(head_sizes)
	    network = network.to(DEVICE)
	    network.load_state_dict(saved['model_state_dict'])
	    network.eval()
	    return network, restored_tokenizer, encoders