# aiba-bert-bilstm / nn_model.py
# Uploaded by primel — AIBA BERT-BiLSTM v2 (Avg F1: 0.9566), commit 0aaead9 (verified)
"""
AIBA BERT-BiLSTM Multi-Task Model Architecture
This file contains the model architecture for the AIBA multi-task model.
"""
import torch
import torch.nn as nn
from transformers import AutoModel
class EnhancedMultiTaskModel(nn.Module):
    """
    Multi-task model for:
    - Named Entity Recognition (NER, token-level)
    - Intent Classification (sequence-level)
    - Language Detection (sequence-level)

    Architecture: BERT encoder + BiLSTM NER head + attention-pooled shared
    representation feeding the intent and language classification heads.
    """

    def __init__(self, config):
        """
        Build the model from a config mapping.

        Args:
            config: dict-like with required keys 'num_ner_labels',
                'num_intent_labels', 'num_lang_labels', and optional keys
                'base_model' (default: multilingual BERT), 'lstm_hidden'
                (default 256), and 'dropout' (default 0.15).
        """
        super().__init__()
        # BERT encoder backbone.
        self.bert = AutoModel.from_pretrained(
            config.get("base_model", "google-bert/bert-base-multilingual-uncased")
        )
        hidden_size = self.bert.config.hidden_size

        # Normalize encoder outputs before feeding the task heads.
        self.layer_norm = nn.LayerNorm(hidden_size)

        # NER head: BiLSTM over token states. Each direction gets
        # lstm_hidden // 2 units so the concatenated output is lstm_hidden.
        lstm_hidden = config.get("lstm_hidden", 256)
        self.ner_lstm = nn.LSTM(
            hidden_size,
            lstm_hidden // 2,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
            dropout=0,  # inter-layer dropout is a no-op with num_layers=1
        )
        dropout = config.get("dropout", 0.15)
        self.ner_dropout = nn.Dropout(dropout)
        self.ner_classifier = nn.Linear(lstm_hidden, config["num_ner_labels"])

        # Attention pooling: learns per-token scores, softmax over the
        # sequence dimension, to collapse token states into one vector.
        self.attention_pool = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 4),
            nn.Tanh(),
            nn.Linear(hidden_size // 4, 1),
            nn.Softmax(dim=1),
        )

        # Shared representation layer for the sequence-level heads.
        self.shared_dense = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.GELU(),
            nn.LayerNorm(hidden_size // 2),
            nn.Dropout(dropout),
        )

        # Intent classification head.
        self.intent_classifier = nn.Sequential(
            nn.Linear(hidden_size // 2, hidden_size // 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size // 4, hidden_size // 8),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size // 8, config["num_intent_labels"]),
        )

        # Language detection head.
        self.lang_classifier = nn.Sequential(
            nn.Linear(hidden_size // 2, hidden_size // 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size // 4, config["num_lang_labels"]),
        )

        # Label-space sizes, kept for downstream consumers.
        self.num_ner_labels = config["num_ner_labels"]
        self.num_intent_labels = config["num_intent_labels"]
        self.num_lang_labels = config["num_lang_labels"]

    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
                labels_ner=None, labels_intent=None, labels_lang=None):
        """
        Forward pass.

        Args:
            input_ids: Input token IDs, shape (batch, seq_len).
            attention_mask: Attention mask (optional).
            token_type_ids: Token type IDs (optional).
            labels_ner / labels_intent / labels_lang: Accepted for
                trainer-signature compatibility but NOT used — no loss
                is computed in this forward pass.

        Returns:
            dict with keys 'ner_logits' (batch, seq_len, num_ner_labels),
            'intent_logits' (batch, num_intent_labels), and
            'lang_logits' (batch, num_lang_labels).
        """
        # BERT encoding.
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        sequence_output = self.layer_norm(outputs.last_hidden_state)

        # Token-level NER logits.
        ner_lstm_out, _ = self.ner_lstm(sequence_output)
        ner_logits = self.ner_classifier(self.ner_dropout(ner_lstm_out))

        # Attention-weighted pooling over the sequence dimension.
        # NOTE(review): padding positions are not masked out of the
        # softmax, so they can receive non-zero attention weight.
        attention_weights = self.attention_pool(sequence_output)
        pooled_output = torch.sum(sequence_output * attention_weights, dim=1)

        # Shared representation feeds both sequence-level heads.
        shared_repr = self.shared_dense(pooled_output)
        return {
            'ner_logits': ner_logits,
            'intent_logits': self.intent_classifier(shared_repr),
            'lang_logits': self.lang_classifier(shared_repr),
        }

    @classmethod
    def from_pretrained(cls, model_path):
        """
        Load a pretrained model from a local path or the Hugging Face Hub.

        Args:
            model_path: Path or repo ID (e.g. 'username/aiba-bert-bilstm').

        Returns:
            EnhancedMultiTaskModel with weights loaded.
        """
        import json
        from huggingface_hub import hf_hub_download

        # Download and parse the model configuration.
        config_path = hf_hub_download(repo_id=model_path, filename="config.json")
        with open(config_path, 'r') as f:
            config = json.load(f)

        model = cls(config)

        try:
            # Prefer safetensors weights (safe, non-executable format).
            from safetensors.torch import load_file
            weights_path = hf_hub_download(repo_id=model_path, filename="model.safetensors")
            state_dict = load_file(weights_path)
        except Exception:
            # Fall back to pytorch_model.bin. weights_only=True prevents
            # arbitrary-code execution from the pickle payload.
            weights_path = hf_hub_download(repo_id=model_path, filename="pytorch_model.bin")
            state_dict = torch.load(weights_path, map_location='cpu', weights_only=True)

        model.load_state_dict(state_dict)
        return model
def load_model_and_tokenizer(model_path):
    """
    Load the AIBA model, its tokenizer, and the raw config dict in one call.

    Args:
        model_path: Path or repo ID (e.g., 'username/aiba-bert-bilstm')

    Returns:
        tuple: (model, tokenizer, config_dict) — the model is in eval mode.
    """
    import json
    from transformers import AutoTokenizer
    from huggingface_hub import hf_hub_download

    # Fetch and parse the model configuration from the repo.
    cfg_file = hf_hub_download(repo_id=model_path, filename="config.json")
    with open(cfg_file, 'r') as fh:
        cfg = json.load(fh)

    # Tokenizer comes straight from the same path/repo.
    tok = AutoTokenizer.from_pretrained(model_path)

    # Model weights are resolved by the class's own loader.
    net = EnhancedMultiTaskModel.from_pretrained(model_path)
    net.eval()  # inference mode by default
    return net, tok, cfg