# aiba-bert-bilstm / nn_model.py
# Uploaded by primel — AIBA BERT-BiLSTM v2 (Avg F1: 0.9566), commit 0aaead9 (verified)
"""
AIBA BERT-BiLSTM Multi-Task Model Architecture
This file contains the model architecture for the AIBA multi-task model.
"""
import torch
import torch.nn as nn
from transformers import AutoModel
class EnhancedMultiTaskModel(nn.Module):
    """
    Multi-task model for:
    - Named Entity Recognition (NER, token-level)
    - Intent Classification (sequence-level)
    - Language Detection (sequence-level)

    Architecture: BERT encoder + BiLSTM NER head + attention-pooled shared
    representation feeding the intent and language classification heads.
    """

    def __init__(self, config):
        """
        Build the model from a config mapping.

        Args:
            config: dict-like with required keys 'num_ner_labels',
                'num_intent_labels', 'num_lang_labels', and optional keys
                'base_model' (default: multilingual BERT), 'lstm_hidden'
                (default 256), and 'dropout' (default 0.15).
        """
        super().__init__()
        # BERT encoder backbone.
        self.bert = AutoModel.from_pretrained(
            config.get("base_model", "google-bert/bert-base-multilingual-uncased")
        )
        hidden_size = self.bert.config.hidden_size

        # Normalize encoder outputs before feeding the task heads.
        self.layer_norm = nn.LayerNorm(hidden_size)

        # NER head: BiLSTM over token states. Each direction gets
        # lstm_hidden // 2 units so the concatenated output is lstm_hidden.
        lstm_hidden = config.get("lstm_hidden", 256)
        self.ner_lstm = nn.LSTM(
            hidden_size,
            lstm_hidden // 2,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
            dropout=0,  # inter-layer dropout is a no-op with num_layers=1
        )
        dropout = config.get("dropout", 0.15)
        self.ner_dropout = nn.Dropout(dropout)
        self.ner_classifier = nn.Linear(lstm_hidden, config["num_ner_labels"])

        # Attention pooling: learns per-token scores, softmax over the
        # sequence dimension, to collapse token states into one vector.
        self.attention_pool = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 4),
            nn.Tanh(),
            nn.Linear(hidden_size // 4, 1),
            nn.Softmax(dim=1),
        )

        # Shared representation layer for the sequence-level heads.
        self.shared_dense = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.GELU(),
            nn.LayerNorm(hidden_size // 2),
            nn.Dropout(dropout),
        )

        # Intent classification head.
        self.intent_classifier = nn.Sequential(
            nn.Linear(hidden_size // 2, hidden_size // 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size // 4, hidden_size // 8),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size // 8, config["num_intent_labels"]),
        )

        # Language detection head.
        self.lang_classifier = nn.Sequential(
            nn.Linear(hidden_size // 2, hidden_size // 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size // 4, config["num_lang_labels"]),
        )

        # Label-space sizes, kept for downstream consumers.
        self.num_ner_labels = config["num_ner_labels"]
        self.num_intent_labels = config["num_intent_labels"]
        self.num_lang_labels = config["num_lang_labels"]

    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
                labels_ner=None, labels_intent=None, labels_lang=None):
        """
        Forward pass.

        Args:
            input_ids: Input token IDs, shape (batch, seq_len).
            attention_mask: Attention mask (optional).
            token_type_ids: Token type IDs (optional).
            labels_ner / labels_intent / labels_lang: Accepted for
                trainer-signature compatibility but NOT used — no loss
                is computed in this forward pass.

        Returns:
            dict with keys 'ner_logits' (batch, seq_len, num_ner_labels),
            'intent_logits' (batch, num_intent_labels), and
            'lang_logits' (batch, num_lang_labels).
        """
        # BERT encoding.
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        sequence_output = self.layer_norm(outputs.last_hidden_state)

        # Token-level NER logits.
        ner_lstm_out, _ = self.ner_lstm(sequence_output)
        ner_logits = self.ner_classifier(self.ner_dropout(ner_lstm_out))

        # Attention-weighted pooling over the sequence dimension.
        # NOTE(review): padding positions are not masked out of the
        # softmax, so they can receive non-zero attention weight.
        attention_weights = self.attention_pool(sequence_output)
        pooled_output = torch.sum(sequence_output * attention_weights, dim=1)

        # Shared representation feeds both sequence-level heads.
        shared_repr = self.shared_dense(pooled_output)
        return {
            'ner_logits': ner_logits,
            'intent_logits': self.intent_classifier(shared_repr),
            'lang_logits': self.lang_classifier(shared_repr),
        }

    @classmethod
    def from_pretrained(cls, model_path):
        """
        Load a pretrained model from a local path or the Hugging Face Hub.

        Args:
            model_path: Path or repo ID (e.g. 'username/aiba-bert-bilstm').

        Returns:
            EnhancedMultiTaskModel with weights loaded.
        """
        import json
        from huggingface_hub import hf_hub_download

        # Download and parse the model configuration.
        config_path = hf_hub_download(repo_id=model_path, filename="config.json")
        with open(config_path, 'r') as f:
            config = json.load(f)

        model = cls(config)

        try:
            # Prefer safetensors weights (safe, non-executable format).
            from safetensors.torch import load_file
            weights_path = hf_hub_download(repo_id=model_path, filename="model.safetensors")
            state_dict = load_file(weights_path)
        except Exception:
            # Fall back to pytorch_model.bin. weights_only=True prevents
            # arbitrary-code execution from the pickle payload.
            weights_path = hf_hub_download(repo_id=model_path, filename="pytorch_model.bin")
            state_dict = torch.load(weights_path, map_location='cpu', weights_only=True)

        model.load_state_dict(state_dict)
        return model
def load_model_and_tokenizer(model_path):
    """
    Load the AIBA model, its tokenizer, and the raw config dict in one call.

    Args:
        model_path: Path or repo ID (e.g., 'username/aiba-bert-bilstm')

    Returns:
        tuple: (model, tokenizer, config_dict) — the model is in eval mode.
    """
    import json
    from transformers import AutoTokenizer
    from huggingface_hub import hf_hub_download

    # Fetch and parse the model configuration from the repo.
    cfg_file = hf_hub_download(repo_id=model_path, filename="config.json")
    with open(cfg_file, 'r') as fh:
        cfg = json.load(fh)

    # Tokenizer comes straight from the same path/repo.
    tok = AutoTokenizer.from_pretrained(model_path)

    # Model weights are resolved by the class's own loader.
    net = EnhancedMultiTaskModel.from_pretrained(model_path)
    net.eval()  # inference mode by default
    return net, tok, cfg