T0KII committed on
Commit
31d7b01
·
verified ·
1 Parent(s): f23da70

Initial Deployment Package with Inference Script

Browse files
Files changed (8) hide show
  1. README.md +14 -0
  2. config.json +36 -0
  3. inference.py +63 -0
  4. modeling.py +40 -0
  5. pytorch_model.bin +3 -0
  6. requirements.txt +5 -0
  7. tokenizer.json +0 -0
  8. tokenizer_config.json +16 -0
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # taMASRIBERT – Egyptian Arabic Multi-Task Model
3
+
4
+ This model jointly performs **sarcasm detection**, **sentiment analysis** (3 classes), and **emotion recognition** (8 classes) for Egyptian Arabic and code-switched text. It combines a BERT‑based transformer (`MASRIBERTv3`) with a BiLSTM+BiGRU sequence encoder fed with FastText embeddings.
5
+
6
+ ## Model Architecture
7
+ - **BERT backbone**: `T0KII/MASRIBERTv3` (trained on MASRISET, 3.2M Egyptian Arabic texts)
8
+ - **Sequence encoder**: 2‑layer BiLSTM + 2‑layer BiGRU on FastText (300‑dim)
9
+ - **Deep fusion**: Concatenation of the BERT `[CLS]` vector with the last time-step outputs of the BiLSTM and BiGRU → task‑specific heads
10
+
11
+ ## Usage
12
+ 1. Clone this repository.
13
+ 2. Install dependencies: `pip install -r requirements.txt`
14
+ 3. Run the inference script: `python inference.py`
config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "BertForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": null,
8
+ "classifier_dropout": null,
9
+ "directionality": "bidi",
10
+ "dtype": "float32",
11
+ "eos_token_id": null,
12
+ "gradient_checkpointing": false,
13
+ "hidden_act": "gelu",
14
+ "hidden_dropout_prob": 0.1,
15
+ "hidden_size": 768,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 3072,
18
+ "is_decoder": false,
19
+ "layer_norm_eps": 1e-12,
20
+ "max_position_embeddings": 512,
21
+ "model_type": "bert",
22
+ "num_attention_heads": 12,
23
+ "num_hidden_layers": 12,
24
+ "pad_token_id": 0,
25
+ "pooler_fc_size": 768,
26
+ "pooler_num_attention_heads": 12,
27
+ "pooler_num_fc_layers": 3,
28
+ "pooler_size_per_head": 128,
29
+ "pooler_type": "first_token_transform",
30
+ "position_embedding_type": "absolute",
31
+ "tie_word_embeddings": false,
32
+ "transformers_version": "5.0.0",
33
+ "type_vocab_size": 2,
34
+ "use_cache": false,
35
+ "vocab_size": 140274
36
+ }
inference.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import torch
4
+ import numpy as np
5
+ from transformers import AutoTokenizer
6
+ from modeling import UnifiedMASRIHead
7
+ import fasttext
8
+ from huggingface_hub import hf_hub_download
9
+
10
class taMASRIBERT:
    """Inference wrapper for the taMASRIBERT multi-task classifier.

    On construction it loads the tokenizer, the FastText vectors, the
    ``UnifiedMASRIHead`` network and its trained weights. ``predict`` then
    classifies a single text for one of the tasks in ``VALID_TASKS``.
    """

    # Task names understood by ``predict`` (one per head of UnifiedMASRIHead).
    VALID_TASKS = ('sarcasm', 'sentiment', 'emotion')
    # Sequence length shared by the BERT tokenizer and the FastText matrix.
    MAX_LEN = 128

    def __init__(self, device='cuda' if torch.cuda.is_available() else 'cpu'):
        """Download (if needed) and load every artefact required for inference.

        Args:
            device: Torch device string the model and inputs are moved to.
        """
        self.device = device
        self.repo_id = "T0KII/taMASRIBERT"

        print("Loading Tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(self.repo_id)

        print("Loading FastText embeddings (~3.3GB)...")
        self.ft_model = fasttext.load_model(hf_hub_download("facebook/fasttext-arz-vectors", "model.bin"))

        print("Initializing Deep Fusion Architecture...")
        self.model = UnifiedMASRIHead(bert_model_name="T0KII/MASRIBERTv3").to(self.device)

        print("Fetching Model Weights...")
        # Automatically downloads weights if they aren't local
        weights_path = hf_hub_download(repo_id=self.repo_id, filename="pytorch_model.bin")
        # weights_only=True restricts unpickling to tensors/primitive containers,
        # so a tampered checkpoint cannot execute arbitrary code on load.
        state_dict = torch.load(weights_path, map_location=self.device, weights_only=True)
        self.model.load_state_dict(state_dict)
        self.model.eval()
        print("✓ taMASRIBERT is ready for inference.")

    def _get_ft_embedding(self, text, max_len=128, embed_dim=300):
        """Build a zero-padded FastText matrix for ``text``.

        The text is split on whitespace and truncated to ``max_len`` tokens;
        row ``i`` holds the vector of token ``i`` and unused rows stay zero.

        Args:
            text: Input text (coerced with ``str``).
            max_len: Number of rows in the returned matrix.
            embed_dim: Number of columns; must match the FastText vector size.

        Returns:
            A float32 tensor of shape ``(1, max_len, embed_dim)``.
        """
        import warnings

        tokens = str(text).split()[:max_len]
        matrix = np.zeros((max_len, embed_dim), dtype=np.float32)
        failed = 0
        for i, tok in enumerate(tokens):
            try:
                matrix[i] = self.ft_model.get_word_vector(tok)
            except Exception:
                # Best effort: keep the zero row, but do not hide the problem.
                failed += 1
        if failed:
            warnings.warn(
                f"FastText lookup failed for {failed} of {len(tokens)} token(s); "
                "their rows were left as zeros.",
                RuntimeWarning,
                stacklevel=2,
            )
        return torch.from_numpy(matrix).unsqueeze(0)

    def predict(self, text, task='sarcasm'):
        """Classify ``text`` for a single task.

        Args:
            text: The input text.
            task: One of ``VALID_TASKS``.

        Returns:
            ``(pred, probs)`` where ``pred`` is the arg-max class index and
            ``probs`` is the list of softmax probabilities for every class.

        Raises:
            ValueError: If ``task`` is not one of ``VALID_TASKS``.
        """
        # Fail early with a clear message instead of an opaque softmax error
        # caused by the model returning None for an unknown task.
        if task not in self.VALID_TASKS:
            raise ValueError(f"Unknown task {task!r}; expected one of {self.VALID_TASKS}.")

        enc = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=self.MAX_LEN,
            padding='max_length',
        )
        input_ids = enc['input_ids'].to(self.device)
        attention_mask = enc['attention_mask'].to(self.device)
        ft_embeds = self._get_ft_embedding(text, max_len=self.MAX_LEN).to(self.device)

        with torch.no_grad():
            logits = self.model(input_ids, attention_mask, ft_embeds, task)
            probs = torch.softmax(logits, dim=1)
            pred = torch.argmax(probs, dim=1).item()
        return pred, probs.cpu().numpy().tolist()[0]
51
+
52
if __name__ == "__main__":
    # Demo: run one sample sentence through all three task heads.
    model = taMASRIBERT()
    text = "يا سلام عليك يا عبقري"

    # All predictions are computed first, then reported together.
    (sarc_pred, sarc_probs), (sent_pred, sent_probs), (emo_pred, emo_probs) = (
        model.predict(text, task=task_name)
        for task_name in ('sarcasm', 'sentiment', 'emotion')
    )

    def _fmt(probs):
        """Render each probability with four decimal places."""
        return [f'{p:.4f}' for p in probs]

    print(f"\nText: {text}")
    print(f"Sarcasm: {sarc_pred} | Probs: {_fmt(sarc_probs)}")
    print(f"Sentiment: {sent_pred} | Probs: {_fmt(sent_probs)}")
    print(f"Emotion: {emo_pred} | Probs: {_fmt(emo_probs)}")
modeling.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+ import torch.nn as nn
4
+ from transformers import AutoModel
5
+
6
class TaskHead(nn.Module):
    """Small classification head: Linear -> LayerNorm -> GELU -> Dropout -> Linear.

    Args:
        hidden_size: Size of the incoming feature vector.
        n_classes: Number of output logits.
        dropout: Dropout probability applied before the output layer.
    """

    def __init__(self, hidden_size, n_classes, dropout=0.3):
        super().__init__()
        projection_width = 256
        # Layer order is kept so the ``net.N`` parameter names stay the same.
        layers = [
            nn.Linear(hidden_size, projection_width),
            nn.LayerNorm(projection_width),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(projection_width, n_classes),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the class logits for the feature batch ``x``."""
        return self.net(x)
17
+
18
class UnifiedMASRIHead(nn.Module):
    """BERT + BiLSTM/BiGRU fusion network with one head per task.

    The BERT first-token vector is concatenated with the last time-step
    outputs of a BiLSTM and a BiGRU run over pre-computed word embeddings,
    and the result is fed to the head selected by ``task_name``.

    Args:
        bert_model_name: Name or path passed to ``AutoModel.from_pretrained``.
        ft_dim: Feature size of the ``ft_embeds`` input.
        rnn_hidden: Hidden size per direction of each RNN.
        num_layers: Number of stacked layers in each RNN.
        dropout: Dropout used inside the RNNs and on their fused output.
    """

    def __init__(self, bert_model_name="T0KII/MASRIBERTv3", ft_dim=300, rnn_hidden=256, num_layers=2, dropout=0.3):
        super().__init__()
        self.bert = AutoModel.from_pretrained(bert_model_name)
        self.bert_hidden = self.bert.config.hidden_size
        self.bilstm = nn.LSTM(ft_dim, rnn_hidden, num_layers, batch_first=True, bidirectional=True, dropout=dropout)
        self.bigru = nn.GRU(ft_dim, rnn_hidden, num_layers, batch_first=True, bidirectional=True, dropout=dropout)
        self.rnn_dropout = nn.Dropout(dropout)
        # Two bidirectional RNNs -> 2 * (2 * rnn_hidden) features next to BERT's.
        combined_dim = self.bert_hidden + (rnn_hidden * 4)
        # Attribute names are part of the checkpoint's state-dict keys.
        self.sarcasm_head = TaskHead(combined_dim, 2, dropout=0.5)
        self.sentiment_head = TaskHead(combined_dim, 3, dropout=0.3)
        self.emotion_head = TaskHead(combined_dim, 8, dropout=0.3)

    def forward(self, input_ids, attention_mask, ft_embeds, task_name):
        """Return the logits of the head selected by ``task_name``.

        Args:
            input_ids: Token ids passed to BERT.
            attention_mask: Attention mask passed to BERT.
            ft_embeds: Batch-first word-embedding tensor fed to both RNNs.
            task_name: ``'sarcasm'``, ``'sentiment'`` or ``'emotion'``.

        Raises:
            ValueError: If ``task_name`` is not one of the supported tasks.
        """
        heads = {
            'sarcasm': self.sarcasm_head,
            'sentiment': self.sentiment_head,
            'emotion': self.emotion_head,
        }
        # Validate before the expensive encoders run; previously an unknown
        # task silently returned None.
        if task_name not in heads:
            raise ValueError(f"Unknown task_name {task_name!r}; expected one of {sorted(heads)}.")

        bert_out = self.bert(input_ids, attention_mask=attention_mask)
        cls_vec = bert_out.last_hidden_state[:, 0, :]
        lstm_out, _ = self.bilstm(ft_embeds)
        gru_out, _ = self.bigru(ft_embeds)
        # NOTE(review): index -1 is the last time step; with right-padded
        # ft_embeds that is a padded position. Kept unchanged because the
        # trained weights presumably depend on it — confirm before altering.
        rnn_feat = torch.cat([lstm_out[:, -1, :], gru_out[:, -1, :]], dim=1)
        combined = torch.cat([cls_vec, self.rnn_dropout(rnn_feat)], dim=1)
        return heads[task_name](combined)
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e217e8ed9e5724b16b258f37c1b08f6eb3da7b7aacdb37ec6a0c84dcd4d570d3
3
+ size 799748536
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ torch>=2.0.0
2
+ transformers>=4.30.0
3
+ fasttext
4
+ numpy
5
+ huggingface_hub
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "cls_token": "[CLS]",
4
+ "do_basic_tokenize": true,
5
+ "do_lower_case": true,
6
+ "is_local": false,
7
+ "mask_token": "[MASK]",
8
+ "model_max_length": 1000000000000000019884624838656,
9
+ "never_split": null,
10
+ "pad_token": "[PAD]",
11
+ "sep_token": "[SEP]",
12
+ "strip_accents": null,
13
+ "tokenize_chinese_chars": true,
14
+ "tokenizer_class": "BertTokenizer",
15
+ "unk_token": "[UNK]"
16
+ }