Initial Deployment Package with Inference Script

Browse files

Files changed (8) hide show

README.md +14 -0
config.json +36 -0
inference.py +63 -0
modeling.py +40 -0
pytorch_model.bin +3 -0
requirements.txt +5 -0
tokenizer.json +0 -0
tokenizer_config.json +16 -0

README.md ADDED Viewed

	@@ -0,0 +1,14 @@

+# taMASRIBERT – Egyptian Arabic Multi-Task Model
+This model jointly performs **sarcasm detection**, **sentiment analysis** (3 classes), and **emotion recognition** (8 classes) for Egyptian Arabic and code-switched text. It combines a BERT‑based transformer (`MASRIBERTv3`) with a BiLSTM+BiGRU sequence encoder fed with FastText embeddings.
+## Model Architecture
+- **BERT backbone**: `T0KII/MASRIBERTv3` (trained on MASRISET, 3.2M Egyptian Arabic texts)
+- **Sequence encoder**: 2‑layer BiLSTM + 2‑layer BiGRU on FastText (300‑dim)
+- **Deep fusion**: Concatenation of the BERT `[CLS]` vector with the last‑time‑step outputs of the BiLSTM and BiGRU → task‑specific heads
+## Usage
+1. Clone this repository.
+2. Install dependencies: `pip install -r requirements.txt`
+3. Run the inference script: `python inference.py`

config.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "add_cross_attention": false,
+  "architectures": [
+    "BertForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": null,
+  "classifier_dropout": null,
+  "directionality": "bidi",
+  "dtype": "float32",
+  "eos_token_id": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "is_decoder": false,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "position_embedding_type": "absolute",
+  "tie_word_embeddings": false,
+  "transformers_version": "5.0.0",
+  "type_vocab_size": 2,
+  "use_cache": false,
+  "vocab_size": 140274
+}

inference.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import os
+import torch
+import numpy as np
+from transformers import AutoTokenizer
+from modeling import UnifiedMASRIHead
+import fasttext
+from huggingface_hub import hf_hub_download
+class taMASRIBERT:
+    def __init__(self, device='cuda' if torch.cuda.is_available() else 'cpu'):
+        self.device = device
+        self.repo_id = "T0KII/taMASRIBERT"
+        print("Loading Tokenizer...")
+        self.tokenizer = AutoTokenizer.from_pretrained(self.repo_id)
+        print("Loading FastText embeddings (~3.3GB)...")
+        self.ft_model = fasttext.load_model(hf_hub_download("facebook/fasttext-arz-vectors", "model.bin"))
+        print("Initializing Deep Fusion Architecture...")
+        self.model = UnifiedMASRIHead(bert_model_name="T0KII/MASRIBERTv3").to(self.device)
+        print("Fetching Model Weights...")
+        # Automatically downloads weights if they aren't local
+        weights_path = hf_hub_download(repo_id=self.repo_id, filename="pytorch_model.bin")
+        state_dict = torch.load(weights_path, map_location=self.device)
+        self.model.load_state_dict(state_dict)
+        self.model.eval()
+        print("✓ taMASRIBERT is ready for inference.")
+    def _get_ft_embedding(self, text, max_len=128, embed_dim=300):
+        tokens = str(text).split()[:max_len]
+        matrix = np.zeros((max_len, embed_dim), dtype=np.float32)
+        for i, tok in enumerate(tokens):
+            try: matrix[i] = self.ft_model.get_word_vector(tok)
+            except Exception: pass
+        return torch.from_numpy(matrix).unsqueeze(0)
+    def predict(self, text, task='sarcasm'):
+        enc = self.tokenizer(text, return_tensors='pt', truncation=True, max_length=128, padding='max_length')
+        input_ids = enc['input_ids'].to(self.device)
+        attention_mask = enc['attention_mask'].to(self.device)
+        ft_embeds = self._get_ft_embedding(text).to(self.device)
+        with torch.no_grad():
+            logits = self.model(input_ids, attention_mask, ft_embeds, task)
+            probs = torch.softmax(logits, dim=1)
+            pred = torch.argmax(probs, dim=1).item()
+        return pred, probs.cpu().numpy().tolist()[0]
+if __name__ == "__main__":
+    model = taMASRIBERT()
+    text = "يا سلام عليك يا عبقري"
+    sarc_pred, sarc_probs = model.predict(text, task='sarcasm')
+    sent_pred, sent_probs = model.predict(text, task='sentiment')
+    emo_pred, emo_probs = model.predict(text, task='emotion')
+    print(f"\nText: {text}")
+    print(f"Sarcasm: {sarc_pred} | Probs: {[f'{p:.4f}' for p in sarc_probs]}")
+    print(f"Sentiment: {sent_pred} | Probs: {[f'{p:.4f}' for p in sent_probs]}")
+    print(f"Emotion: {emo_pred} | Probs: {[f'{p:.4f}' for p in emo_probs]}")

modeling.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import torch
+import torch.nn as nn
+from transformers import AutoModel
+class TaskHead(nn.Module):
+    def __init__(self, hidden_size, n_classes, dropout=0.3):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(hidden_size, 256),
+            nn.LayerNorm(256),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(256, n_classes),
+        )
+    def forward(self, x): return self.net(x)
+class UnifiedMASRIHead(nn.Module):
+    def __init__(self, bert_model_name="T0KII/MASRIBERTv3", ft_dim=300, rnn_hidden=256, num_layers=2, dropout=0.3):
+        super().__init__()
+        self.bert = AutoModel.from_pretrained(bert_model_name)
+        self.bert_hidden = self.bert.config.hidden_size
+        self.bilstm = nn.LSTM(ft_dim, rnn_hidden, num_layers, batch_first=True, bidirectional=True, dropout=dropout)
+        self.bigru = nn.GRU(ft_dim, rnn_hidden, num_layers, batch_first=True, bidirectional=True, dropout=dropout)
+        self.rnn_dropout = nn.Dropout(dropout)
+        combined_dim = self.bert_hidden + (rnn_hidden * 4)
+        self.sarcasm_head = TaskHead(combined_dim, 2, dropout=0.5)
+        self.sentiment_head = TaskHead(combined_dim, 3, dropout=0.3)
+        self.emotion_head = TaskHead(combined_dim, 8, dropout=0.3)
+    def forward(self, input_ids, attention_mask, ft_embeds, task_name):
+        bert_out = self.bert(input_ids, attention_mask=attention_mask)
+        cls_vec = bert_out.last_hidden_state[:, 0, :]
+        lstm_out, _ = self.bilstm(ft_embeds)
+        gru_out, _ = self.bigru(ft_embeds)
+        rnn_feat = torch.cat([lstm_out[:, -1, :], gru_out[:, -1, :]], dim=1)
+        combined = torch.cat([cls_vec, self.rnn_dropout(rnn_feat)], dim=1)
+        if task_name == 'sarcasm': return self.sarcasm_head(combined)
+        elif task_name == 'sentiment': return self.sentiment_head(combined)
+        elif task_name == 'emotion': return self.emotion_head(combined)

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e217e8ed9e5724b16b258f37c1b08f6eb3da7b7aacdb37ec6a0c84dcd4d570d3
+size 799748536

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+torch>=2.0.0
+transformers>=4.30.0
+fasttext
+numpy
+huggingface_hub

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "backend": "tokenizers",
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "is_local": false,
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}