Initial Deployment Package with Inference Script
Browse files- README.md +14 -0
- config.json +36 -0
- inference.py +63 -0
- modeling.py +40 -0
- pytorch_model.bin +3 -0
- requirements.txt +5 -0
- tokenizer.json +0 -0
- tokenizer_config.json +16 -0
README.md
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# taMASRIBERT – Egyptian Arabic Multi-Task Model
|
| 3 |
+
|
| 4 |
+
This model jointly performs **sarcasm detection**, **sentiment analysis** (3 classes), and **emotion recognition** (8 classes) for Egyptian Arabic and code-switched text. It combines a BERT‑based transformer (`MASRIBERTv3`) with a BiLSTM+BiGRU sequence encoder fed with FastText embeddings.
|
| 5 |
+
|
| 6 |
+
## Model Architecture
|
| 7 |
+
- **BERT backbone**: `T0KII/MASRIBERTv3` (trained on MASRISET, 3.2M Egyptian Arabic texts)
|
| 8 |
+
- **Sequence encoder**: 2‑layer BiLSTM + 2‑layer BiGRU on FastText (300‑dim)
|
| 9 |
+
- **Deep fusion**: Concatenation of the BERT `[CLS]` vector with the last‑timestep outputs of the BiLSTM and BiGRU → task‑specific heads
|
| 10 |
+
|
| 11 |
+
## Usage
|
| 12 |
+
1. Clone this repository.
|
| 13 |
+
2. Install dependencies: `pip install -r requirements.txt`
|
| 14 |
+
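3. Run the inference script: `python inference.py` (the first run downloads the FastText vectors, ~3.3 GB, and the model weights from the Hugging Face Hub)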
|
config.json
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_cross_attention": false,
|
| 3 |
+
"architectures": [
|
| 4 |
+
"BertForMaskedLM"
|
| 5 |
+
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"bos_token_id": null,
|
| 8 |
+
"classifier_dropout": null,
|
| 9 |
+
"directionality": "bidi",
|
| 10 |
+
"dtype": "float32",
|
| 11 |
+
"eos_token_id": null,
|
| 12 |
+
"gradient_checkpointing": false,
|
| 13 |
+
"hidden_act": "gelu",
|
| 14 |
+
"hidden_dropout_prob": 0.1,
|
| 15 |
+
"hidden_size": 768,
|
| 16 |
+
"initializer_range": 0.02,
|
| 17 |
+
"intermediate_size": 3072,
|
| 18 |
+
"is_decoder": false,
|
| 19 |
+
"layer_norm_eps": 1e-12,
|
| 20 |
+
"max_position_embeddings": 512,
|
| 21 |
+
"model_type": "bert",
|
| 22 |
+
"num_attention_heads": 12,
|
| 23 |
+
"num_hidden_layers": 12,
|
| 24 |
+
"pad_token_id": 0,
|
| 25 |
+
"pooler_fc_size": 768,
|
| 26 |
+
"pooler_num_attention_heads": 12,
|
| 27 |
+
"pooler_num_fc_layers": 3,
|
| 28 |
+
"pooler_size_per_head": 128,
|
| 29 |
+
"pooler_type": "first_token_transform",
|
| 30 |
+
"position_embedding_type": "absolute",
|
| 31 |
+
"tie_word_embeddings": false,
|
| 32 |
+
"transformers_version": "5.0.0",
|
| 33 |
+
"type_vocab_size": 2,
|
| 34 |
+
"use_cache": false,
|
| 35 |
+
"vocab_size": 140274
|
| 36 |
+
}
|
inference.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import os
|
| 3 |
+
import torch
|
| 4 |
+
import numpy as np
|
| 5 |
+
from transformers import AutoTokenizer
|
| 6 |
+
from modeling import UnifiedMASRIHead
|
| 7 |
+
import fasttext
|
| 8 |
+
from huggingface_hub import hf_hub_download
|
| 9 |
+
|
| 10 |
+
class taMASRIBERT:
    """Inference wrapper for the taMASRIBERT multi-task classifier.

    Construction loads the tokenizer and the fine-tuned weights from the
    ``T0KII/taMASRIBERT`` Hub repository, loads FastText vectors from
    ``facebook/fasttext-arz-vectors`` and builds ``UnifiedMASRIHead`` on top
    of ``T0KII/MASRIBERTv3``. ``predict`` then scores one text for one task.
    """

    # Task names understood by predict(); anything else is rejected up front.
    _TASKS = ("sarcasm", "sentiment", "emotion")

    def __init__(self, device='cuda' if torch.cuda.is_available() else 'cpu'):
        """Fetch all artifacts and put the model in evaluation mode.

        Args:
            device: Torch device string that the model and all inputs are
                moved to. The default is evaluated once, when the class is
                defined.
        """
        self.device = device
        self.repo_id = "T0KII/taMASRIBERT"

        print("Loading Tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(self.repo_id)

        print("Loading FastText embeddings (~3.3GB)...")
        ft_path = hf_hub_download("facebook/fasttext-arz-vectors", "model.bin")
        self.ft_model = fasttext.load_model(ft_path)

        print("Initializing Deep Fusion Architecture...")
        self.model = UnifiedMASRIHead(bert_model_name="T0KII/MASRIBERTv3").to(self.device)

        print("Fetching Model Weights...")
        # Automatically downloads weights if they aren't local
        weights_path = hf_hub_download(repo_id=self.repo_id, filename="pytorch_model.bin")
        # The checkpoint is a pickle fetched over the network: weights_only=True
        # limits unpickling to tensors and plain containers so a tampered file
        # cannot run arbitrary code. Assumes the file is a plain state_dict.
        state_dict = torch.load(weights_path, map_location=self.device, weights_only=True)
        self.model.load_state_dict(state_dict)
        self.model.eval()
        print("✓ taMASRIBERT is ready for inference.")

    def _get_ft_embedding(self, text, max_len=128, embed_dim=300):
        """Build a zero-padded FastText matrix for the whitespace tokens of *text*.

        Args:
            text: Input text; converted with ``str()`` and split on whitespace.
            max_len: Number of rows; extra tokens are dropped, missing ones
                stay zero.
            embed_dim: Expected width of each FastText vector.

        Returns:
            A float32 tensor of shape ``(1, max_len, embed_dim)``.
        """
        tokens = str(text).split()[:max_len]
        matrix = np.zeros((max_len, embed_dim), dtype=np.float32)
        failures = []
        for i, tok in enumerate(tokens):
            try:
                matrix[i] = self.ft_model.get_word_vector(tok)
            except Exception as exc:
                # Best effort: the row stays zero, but remember the failure so
                # a systematic problem (e.g. embed_dim not matching the
                # FastText model) is not silently turned into all-zero input.
                failures.append((tok, exc))
        if failures:
            import warnings

            first_tok, first_exc = failures[0]
            warnings.warn(
                f"FastText lookup failed for {len(failures)} of {len(tokens)} "
                f"token(s); their rows are left as zeros "
                f"(first failure: {first_tok!r}: {first_exc})",
                RuntimeWarning,
                stacklevel=2,
            )
        return torch.from_numpy(matrix).unsqueeze(0)

    def predict(self, text, task='sarcasm'):
        """Classify *text* for one task.

        Args:
            text: The text to classify.
            task: One of ``'sarcasm'``, ``'sentiment'`` or ``'emotion'``.

        Returns:
            A ``(pred, probs)`` tuple: the arg-max class index as an ``int``
            and the softmax probabilities as a list of floats.

        Raises:
            ValueError: If *task* is not a supported task name.
        """
        if task not in self._TASKS:
            raise ValueError(
                f"Unknown task {task!r}; expected one of {', '.join(self._TASKS)}"
            )

        enc = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=128,
            padding='max_length',
        )
        input_ids = enc['input_ids'].to(self.device)
        attention_mask = enc['attention_mask'].to(self.device)
        ft_embeds = self._get_ft_embedding(text).to(self.device)

        with torch.no_grad():
            logits = self.model(input_ids, attention_mask, ft_embeds, task)
            probs = torch.softmax(logits, dim=1)
            pred = torch.argmax(probs, dim=1).item()
        return pred, probs.cpu().numpy().tolist()[0]
|
| 51 |
+
|
| 52 |
+
if __name__ == "__main__":
    # Demo entry point: load the model once, score one sample sentence with
    # every task head, then print the predicted class index and the
    # per-class probabilities formatted to four decimals.
    model = taMASRIBERT()
    text = "يا سلام عليك يا عبقري"

    # The generator is consumed left to right, so the heads run in the order
    # sarcasm, sentiment, emotion before anything is printed.
    (sarc_pred, sarc_probs), (sent_pred, sent_probs), (emo_pred, emo_probs) = (
        model.predict(text, task=task_name)
        for task_name in ('sarcasm', 'sentiment', 'emotion')
    )

    print(f"\nText: {text}")
    print(f"Sarcasm: {sarc_pred} | Probs: {[f'{p:.4f}' for p in sarc_probs]}")
    print(f"Sentiment: {sent_pred} | Probs: {[f'{p:.4f}' for p in sent_probs]}")
    print(f"Emotion: {emo_pred} | Probs: {[f'{p:.4f}' for p in emo_probs]}")
|
modeling.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from transformers import AutoModel
|
| 5 |
+
|
| 6 |
+
class TaskHead(nn.Module):
    """Classification head: Linear -> LayerNorm -> GELU -> Dropout -> Linear.

    Args:
        hidden_size: Width of the incoming feature vector.
        n_classes: Number of output logits.
        dropout: Dropout probability applied after the activation.
    """

    def __init__(self, hidden_size, n_classes, dropout=0.3):
        super().__init__()
        bottleneck = 256
        stages = [
            nn.Linear(hidden_size, bottleneck),
            nn.LayerNorm(bottleneck),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(bottleneck, n_classes),
        ]
        # Kept as a positional Sequential named ``net`` so parameter names
        # remain net.0.*, net.1.* and net.4.* for checkpoint loading.
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Return the logits produced by the head for features *x*."""
        logits = self.net(x)
        return logits
|
| 17 |
+
|
| 18 |
+
class UnifiedMASRIHead(nn.Module):
    """BERT + BiLSTM/BiGRU fusion model with one classification head per task.

    The BERT first-token vector is concatenated with the last-timestep
    outputs of a bidirectional LSTM and a bidirectional GRU that both read
    the same pre-computed word embeddings. The result feeds the sarcasm
    (2 logits), sentiment (3 logits) or emotion (8 logits) head.

    Args:
        bert_model_name: Name or path passed to ``AutoModel.from_pretrained``.
        ft_dim: Width of the word-embedding input to the RNNs.
        rnn_hidden: Hidden size per direction of each RNN.
        num_layers: Number of stacked layers in each RNN.
        dropout: Dropout used between RNN layers, on the pooled RNN features
            and inside the sentiment/emotion heads (the sarcasm head uses 0.5).
    """

    # Valid ``task_name`` values; each maps to the attribute ``<name>_head``.
    _TASKS = ("sarcasm", "sentiment", "emotion")

    def __init__(self, bert_model_name="T0KII/MASRIBERTv3", ft_dim=300, rnn_hidden=256, num_layers=2, dropout=0.3):
        super().__init__()
        self.bert = AutoModel.from_pretrained(bert_model_name)
        self.bert_hidden = self.bert.config.hidden_size

        # Inter-layer dropout only exists for stacked RNNs; PyTorch warns when
        # a non-zero value is passed together with a single layer.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.bilstm = nn.LSTM(
            ft_dim, rnn_hidden, num_layers,
            batch_first=True, bidirectional=True, dropout=inter_layer_dropout,
        )
        self.bigru = nn.GRU(
            ft_dim, rnn_hidden, num_layers,
            batch_first=True, bidirectional=True, dropout=inter_layer_dropout,
        )
        self.rnn_dropout = nn.Dropout(dropout)

        # Two bidirectional RNNs -> 2 * (2 * rnn_hidden) pooled RNN features.
        combined_dim = self.bert_hidden + (rnn_hidden * 4)
        self.sarcasm_head = TaskHead(combined_dim, 2, dropout=0.5)
        self.sentiment_head = TaskHead(combined_dim, 3, dropout=0.3)
        self.emotion_head = TaskHead(combined_dim, 8, dropout=0.3)

    def forward(self, input_ids, attention_mask, ft_embeds, task_name):
        """Return the logits of the head selected by *task_name*.

        Args:
            input_ids: Token ids for the BERT encoder.
            attention_mask: Attention mask for the BERT encoder.
            ft_embeds: Word-embedding sequence fed to both RNNs (batch first).
            task_name: ``'sarcasm'``, ``'sentiment'`` or ``'emotion'``.

        Returns:
            The selected head's output for the fused features.

        Raises:
            ValueError: If *task_name* is not a supported task.
        """
        # Fail fast, before the expensive encoder pass, instead of silently
        # returning None for an unknown task.
        if task_name not in self._TASKS:
            raise ValueError(
                f"Unknown task_name {task_name!r}; expected one of {', '.join(self._TASKS)}"
            )

        bert_out = self.bert(input_ids, attention_mask=attention_mask)
        cls_vec = bert_out.last_hidden_state[:, 0, :]

        lstm_out, _ = self.bilstm(ft_embeds)
        gru_out, _ = self.bigru(ft_embeds)
        # NOTE(review): index -1 is the last position of the (possibly padded)
        # sequence; for a bidirectional RNN the backward half at that position
        # has processed only that single step. Left unchanged because the
        # released checkpoint was presumably trained with this pooling -
        # confirm before altering.
        rnn_feat = torch.cat([lstm_out[:, -1, :], gru_out[:, -1, :]], dim=1)

        combined = torch.cat([cls_vec, self.rnn_dropout(rnn_feat)], dim=1)
        head = getattr(self, f"{task_name}_head")
        return head(combined)
|
pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e217e8ed9e5724b16b258f37c1b08f6eb3da7b7aacdb37ec6a0c84dcd4d570d3
|
| 3 |
+
size 799748536
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch>=2.0.0
|
| 2 |
+
transformers>=4.30.0
|
| 3 |
+
fasttext
|
| 4 |
+
numpy
|
| 5 |
+
huggingface_hub
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"backend": "tokenizers",
|
| 3 |
+
"cls_token": "[CLS]",
|
| 4 |
+
"do_basic_tokenize": true,
|
| 5 |
+
"do_lower_case": true,
|
| 6 |
+
"is_local": false,
|
| 7 |
+
"mask_token": "[MASK]",
|
| 8 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 9 |
+
"never_split": null,
|
| 10 |
+
"pad_token": "[PAD]",
|
| 11 |
+
"sep_token": "[SEP]",
|
| 12 |
+
"strip_accents": null,
|
| 13 |
+
"tokenize_chinese_chars": true,
|
| 14 |
+
"tokenizer_class": "BertTokenizer",
|
| 15 |
+
"unk_token": "[UNK]"
|
| 16 |
+
}
|