Commit bde1c71 · Parent(s): cec147a
Refactor the code for better scalability, rename the TSAC evaluation to sentiment analysis, and add the MADAR dataset for transliteration and normalization evaluation
Files changed:
- Roadmap.md +236 -0
- app.py +5 -5
- pyproject.toml +7 -2
- src/about.py +7 -6
- src/configs/config.json +23 -0
- src/configs/config.py +21 -0
- src/evaluators/__init__.py +18 -0
- src/evaluators/base_evaluator.py +17 -0
- src/{evaluator → evaluators}/evaluate.py +56 -71
- src/evaluators/madar_tun.py +108 -0
- src/evaluators/normalization/__init__.py +1 -0
- src/evaluators/normalization/datasets.py +10 -0
- src/evaluators/normalization/evaluator.py +96 -0
- src/{evaluator → evaluators}/run_evaluator.py +1 -1
- src/evaluators/sentiment_analysis/__init__.py +0 -0
- src/evaluators/sentiment_analysis/dataset.py +0 -0
- src/evaluators/sentiment_analysis/evaluator.py +207 -0
- src/evaluators/transliteration/__init__.py +1 -0
- src/evaluators/transliteration/datasets.py +10 -0
- src/evaluators/transliteration/evaluator.py +96 -0
- src/evaluators/tsac.py +133 -0
- src/{evaluator → evaluators}/tunisian_corpus_coverage.py +0 -0
- src/submission/submit.py +1 -1
Roadmap.md
ADDED
@@ -0,0 +1,236 @@
## 🗺️ Tunisian NLP Leaderboard Roadmap

### 📌 Phase 1: Dataset Acquisition & Preparation

#### 1. **Sentiment Analysis**

* **Existing Dataset**: **TUNIZI**
* **Description**: A large dataset containing 100,000 Tunisian Arabizi comments annotated as positive, negative, or neutral.
* **Source**: [K4All Foundation](https://k4all.org/project/database-tunisian-arabizi/)
* **Usage**: Use this dataset to evaluate models' performance on sentiment classification.

#### 2. **Named Entity Recognition (NER)**

* **Existing Dataset**: **ArabNER**
* **Description**: A comprehensive Arabic NER corpus that can be adapted to Tunisian dialects.
* **Source**: [ResearchGate](https://www.researchgate.net/publication/374279027_Named_Entity_Recognition_of_Tunisian_Arabic_Using_the_Bi-LSTM-CRF_Model)
* **Usage**: Fine-tune models on this dataset to assess their ability to recognize entities in Tunisian Arabic text.

#### 3. **Corpus Coverage**

* **Existing Dataset**: **Tunisian Dialect Corpus**
* **Description**: A sizable collection of Tunisian dialect texts, useful for assessing vocabulary coverage.
* **Source**: [Hugging Face](https://huggingface.co/collections/tunis-ai/arabic-datasets-66344cf0df31dc81eb1dcf55)
* **Usage**: Evaluate models' coverage of Tunisian dialect vocabulary using this corpus.

#### 4. **Arabizi Robustness**

* **Existing Dataset**: **TUNIZI**
* **Description**: Since it is written in Arabizi, it also serves to evaluate models' robustness to this writing style.
* **Source**: [K4All Foundation](https://k4all.org/project/database-tunisian-arabizi/)
* **Usage**: Assess models' robustness to Arabizi by measuring their performance on this dataset.

#### 5. **Code-Switching**

* **Existing Dataset**: **TunSwitch**
* **Description**: A dataset of code-switched Tunisian Arabic speech, valuable for training and evaluating models on code-switching tasks.
* **Source**: [Zenodo](https://zenodo.org/records/8342762)
* **Usage**: Evaluate models' ability to handle code-switching between Tunisian Arabic and other languages.

#### 6. **Typo Robustness**

* **Existing Dataset**: **TUNIZI**
* **Description**: Its informal nature includes typographical variation, making it suitable for evaluating models' tolerance to typos.
* **Source**: [K4All Foundation](https://k4all.org/project/database-tunisian-arabizi/)
* **Usage**: Assess models' robustness to typographical errors by evaluating their performance on this dataset.

#### 7. **Zero-Shot Transfer**

* **Existing Dataset**: **TUNIZI**
* **Description**: Can be used to test models' ability to generalize to tasks they were not explicitly trained on.
* **Source**: [K4All Foundation](https://k4all.org/project/database-tunisian-arabizi/)
* **Usage**: Evaluate models' zero-shot transfer capabilities on this dataset.

#### 8. **Domain Shift**

* **Existing Dataset**: **TUNIZI**
* **Description**: Its diverse sources provide a foundation for testing domain adaptation.
* **Source**: [K4All Foundation](https://k4all.org/project/database-tunisian-arabizi/)
* **Usage**: Assess models' ability to adapt to different domains by evaluating their performance on this dataset.

---
### 🧪 Phase 2: Metric Development & Evaluation Tasks

For each task, define the evaluation metric and the corresponding dataset:

| Task                     | Metric                  | Dataset |
| ------------------------ | ----------------------- | ------- |
| Sentiment Analysis       | Accuracy / F1 Score     | [TUNIZI](https://k4all.org/project/database-tunisian-arabizi/) |
| Named Entity Recognition | F1 Score                | [ArabNER](https://www.researchgate.net/publication/374279027_Named_Entity_Recognition_of_Tunisian_Arabic_Using_the_Bi-LSTM-CRF_Model) |
| Corpus Coverage          | Vocabulary Coverage (%) | [Tunisian Dialect Corpus](https://huggingface.co/collections/tunis-ai/arabic-datasets-66344cf0df31dc81eb1dcf55) |
| Arabizi Robustness       | Accuracy / F1 Score     | [TUNIZI](https://k4all.org/project/database-tunisian-arabizi/) |
| Code-Switching           | Accuracy / F1 Score     | [TunSwitch](https://zenodo.org/records/8342762) |
| Typo Robustness          | Accuracy / F1 Score     | [TUNIZI](https://k4all.org/project/database-tunisian-arabizi/) |
| Zero-Shot Transfer       | Accuracy / F1 Score     | [TUNIZI](https://k4all.org/project/database-tunisian-arabizi/) |
| Domain Shift             | Accuracy / F1 Score     | [TUNIZI](https://k4all.org/project/database-tunisian-arabizi/) |

---
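As a concrete reference for the metrics column above, here is a minimal sketch of how Accuracy and Macro-F1 could be computed with scikit-learn (already a project dependency). The label values are illustrative placeholders, not drawn from any of the datasets listed.

```python
from sklearn.metrics import accuracy_score, f1_score

# Toy gold labels vs. predictions for a binary sentiment task (0 = negative, 1 = positive).
y_true = [1, 0, 1, 1, 0, 0, 1, 0]
y_pred = [1, 0, 0, 1, 0, 1, 1, 0]

accuracy = accuracy_score(y_true, y_pred)             # fraction of exact matches
macro_f1 = f1_score(y_true, y_pred, average="macro")  # F1 per class, then unweighted mean

print(f"Accuracy: {accuracy:.4f} | Macro-F1: {macro_f1:.4f}")
```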
### 🗂️ Suggested Folder Structure

To maintain organization and clarity, consider the following structure:

```
TunisianEncoderModelsLeaderboard/
├── datasets/
│   ├── sentiment/
│   │   └── tunizi.json
│   ├── ner/
│   │   └── arabner.json
│   ├── coverage/
│   │   └── tunisian_dialect_corpus.json
│   ├── arabizi_robustness/
│   │   └── tunizi.json
│   ├── code_switching/
│   │   └── tunswitch.json
│   ├── typo_robustness/
│   │   └── tunizi_with_typos.json
│   ├── zero_shot/
│   │   └── tunizi.json
│   └── domain_shift/
│       └── tunisian_domain_shift.json
├── scripts/
│   ├── preprocess.py
│   ├── evaluate.py
│   └── visualize.py
└── README.md
```

---
### ✅ Next Steps

1. **Integrate Existing Datasets**: Incorporate the datasets above into the repository, ensuring they are properly formatted and documented.
2. **Develop Evaluation Scripts**: Write scripts to evaluate models on each task, ensuring they are compatible with the leaderboard format.
3. **Populate the Leaderboard**: As models are evaluated, update the leaderboard to reflect their performance across tasks.
4. **Documentation**: Update the README.md file with clear instructions on how to use the leaderboard, contribute models, and interpret results.

---
app.py
CHANGED
@@ -1,6 +1,5 @@
 from dotenv import load_dotenv

-load_dotenv()

 import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
@@ -31,8 +30,9 @@ from src.display.utils import (
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
-from src.
+from src.evaluators.run_evaluator import evaluator_runner

+load_dotenv()
 def restart_space():
     try:
         print("Restarting space...")
@@ -240,9 +240,9 @@ with demo:
-    scheduler = BackgroundScheduler()
-    scheduler.add_job(restart_space, "interval", seconds=120)
+    # scheduler = BackgroundScheduler()
+    # scheduler.add_job(restart_space, "interval", seconds=120)
     thread = threading.Thread(target=evaluator_runner)
-    scheduler.start()
+    # scheduler.start()
     thread.start()
     demo.queue(default_concurrency_limit=40).launch()
pyproject.toml
CHANGED
@@ -12,8 +12,6 @@ dependencies = [
     "gradio-leaderboard==0.0.13",
     "gradio[oauth]>=5.35.0",
     "huggingface-hub>=0.18.0",
-    "ipykernel>=6.29.5",
-    "ipywidgets>=8.1.7",
     "matplotlib>=3.10.3",
     "numpy>=2.3.1",
     "pandas>=2.3.0",
@@ -22,12 +20,19 @@ dependencies = [
     "python-dotenv>=1.1.1",
     "scikit-learn>=1.7.0",
     "sentencepiece>=0.2.0",
+    "seqeval>=1.2.2",
     "tokenizers>=0.15.0",
     "torch>=2.7.1",
     "tqdm>=4.67.1",
     "transformers>=4.53.1",
 ]

+[project.optional-dependencies]
+dev = [
+    "ipykernel>=6.30.1",
+    "ipywidgets>=8.1.7",
+]
+
 [tool.ruff]
 # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
 select = ["E", "F"]
src/about.py
CHANGED
@@ -8,13 +8,14 @@ class Task:
     col_name: str  # Column name


-# Tunisian Dialect Tasks
-# ---------------------------------------------------
 class Tasks(Enum):
-    coverage = Task("arbml/Tunisian_Dialect_Corpus", "coverage", "Coverage
+    sentiment_accuracy = Task("fbougares/tsac", "accuracy", "Accuracy (TSAC) ⬆️")
+    sentiment_f1 = Task("fbougares/tsac", "macro_f1", "Macro-F1 (TSAC) ⬆️")
+    ner_f1 = Task("arbml/tunisian_ner", "entity_f1", "Entity F1 (NER) ⬆️")
+    coverage = Task("arbml/Tunisian_Dialect_Corpus", "coverage", "Corpus Coverage % ⬆️")
+    arabizi_robustness = Task("tunis-ai/arabizi_eval", "arabizi_f1", "Arabizi Robustness F1 ⬆️")
+    code_switch = Task("tunis-ai/codeswitch_eval", "accuracy", "Code-Switch Accuracy ⬆️")
+    typo_robustness = Task("tunis-ai/typo_eval", "f1_drop", "Typo Robustness Drop % ⬇️")

 NUM_FEWSHOT = 0  # Change with your few shot
 # ---------------------------------------------------
src/configs/config.json
ADDED
@@ -0,0 +1,23 @@
{
    "tsac": {
        "path": "fbougares/tsac",
        "text_column": "sentence",
        "label_column": "target",
        "label_map": {
            "0": 0,
            "1": 1
        },
        "trust_remote_code": true
    },
    "tunisian_sentiment": {
        "path": "your-org/tunisian-sentiment",
        "text_column": "text",
        "label_column": "label",
        "label_map": {
            "negative": 0,
            "positive": 1,
            "neutral": -1
        },
        "trust_remote_code": false
    }
}
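A minimal sketch of how this JSON could be loaded and validated against the pydantic schema sketched in `src/configs/config.py` below; the `DatasetConfig` model name follows that fixed-up module and is an assumption about the intended usage, not something the commit itself demonstrates.

```python
import json
from pathlib import Path

from src.configs.config import DatasetConfig  # pydantic schema, see config.py below

# Load the raw JSON and validate each entry; pydantic raises on missing or bad fields.
raw = json.loads(Path("src/configs/config.json").read_text(encoding="utf-8"))
configs = {name: DatasetConfig(**cfg) for name, cfg in raw.items()}

print(configs["tsac"].path)  # -> "fbougares/tsac"
```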
src/configs/config.py
ADDED
@@ -0,0 +1,21 @@
from typing import Dict, Union

from pydantic import BaseModel


class DatasetConfig(BaseModel):
    """Schema for one sentiment-dataset entry (mirrors config.json)."""
    path: str
    text_column: str
    label_column: str
    label_map: Dict[Union[str, int], int]
    trust_remote_code: bool = False


# Registry of dataset configurations, validated through the model above.
DATASET_CONFIGS: Dict[str, DatasetConfig] = {
    "tsac": DatasetConfig(
        path="fbougares/tsac",
        text_column="sentence",
        label_column="target",
        label_map={0: 0, 1: 1},  # already binary
        trust_remote_code=True,
    ),
    "tunisian_sentiment": DatasetConfig(
        path="your-org/tunisian-sentiment",  # hypothetical
        text_column="text",
        label_column="label",
        label_map={"negative": 0, "positive": 1, "neutral": -1},  # drop neutral
        trust_remote_code=False,
    ),
    # Add more as they become available
}
src/evaluators/__init__.py
ADDED
@@ -0,0 +1,18 @@
# src/evaluators/__init__.py
from typing import Dict, Type

from .base_evaluator import BaseEvaluator

# Import all evaluators
from .sentiment_analysis.evaluator import SentimentAnalysisEvaluator
# from .tunisian_corpus_coverage import TunisianCorpusCoverageEvaluator
# Add new ones here as you create them:
from .normalization import NormalizationEvaluator
from .transliteration import TransliterationEvaluator

# Registry: task_name → Evaluator class
EVALUATOR_REGISTRY: Dict[str, Type[BaseEvaluator]] = {
    "Sentiment Analysis": SentimentAnalysisEvaluator,
    # "Corpus Coverage": TunisianCorpusCoverageEvaluator,
    "Normalization": NormalizationEvaluator,
    "Transliteration": TransliterationEvaluator,
}
src/evaluators/base_evaluator.py
ADDED
@@ -0,0 +1,17 @@
# evaluators/base_evaluator.py
from abc import ABC, abstractmethod
from typing import Dict, Any


class BaseEvaluator(ABC):
    @abstractmethod
    def load_dataset(self):
        pass

    @abstractmethod
    def evaluate(self, model, tokenizer, device) -> Dict[str, Any]:
        pass

    @property
    @abstractmethod
    def task_name(self) -> str:
        pass
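A minimal sketch of what a new evaluator plugging into this interface could look like. The task, dataset, and metric here are placeholders invented for illustration; only the `BaseEvaluator` contract comes from the commit.

```python
from typing import Dict, Any

from src.evaluators.base_evaluator import BaseEvaluator


class DummyCoverageEvaluator(BaseEvaluator):
    @property
    def task_name(self) -> str:
        return "Dummy Coverage"

    def load_dataset(self):
        # A real evaluator would call datasets.load_dataset(...); inlined for the sketch.
        return ["3ajbetni barcha", "y3ayechek"]

    def evaluate(self, model, tokenizer, device) -> Dict[str, Any]:
        texts = self.load_dataset()
        # Toy metric: share of texts the tokenizer encodes without <unk> tokens.
        # Tokenizers with no <unk> token (unk_token_id is None) trivially score 1.0.
        unk_id = tokenizer.unk_token_id
        covered = sum(unk_id not in tokenizer(t)["input_ids"] for t in texts)
        score = covered / len(texts)
        return {"task": self.task_name, "main_metric": score}
```

To wire it in, it would be added to `EVALUATOR_REGISTRY` in `src/evaluators/__init__.py` under its task name.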
src/{evaluator → evaluators}/evaluate.py
RENAMED
@@ -5,14 +5,16 @@ from typing import Dict
 from dataclasses import dataclass
 from enum import Enum
 import torch
-from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
 import traceback

-from src.
-from src.
-from src.
+from src.evaluators import EVALUATOR_REGISTRY
+from src.evaluators.base_evaluator import BaseEvaluator
+from src.envs import API, EVAL_REQUESTS_PATH, RESULTS_REPO, QUEUE_REPO, TOKEN
+# from src.evaluators.tunisian_corpus_coverage import evaluate_tunisian_corpus_coverage
+from src.evaluators.sentiment_analysis.evaluator import SentimentAnalysisEvaluator

+sa_evaluator = SentimentAnalysisEvaluator()
 class EvaluationStatus(Enum):
     PENDING = "PENDING"
     RUNNING = "RUNNING"
@@ -30,85 +32,66 @@ class EvaluationResult:
     error: str = None


 def evaluate_model(model_name: str, revision: str, precision: str, weight_type: str) -> EvaluationResult:
     """
-    Evaluates a
-
-    Args:
-        model_name (str): The name of the model on the Hugging Face Hub.
-        revision (str): The specific revision (commit hash or branch name) to use.
-        precision (str): The precision (e.g., 'float16') for model loading.
-        weight_type (str): The type of weights ('Original' or 'Adapter').
-
-    Returns:
-        EvaluationResult: A dataclass containing the evaluation results or an error message.
+    Evaluates a model on ALL registered tasks.
     """
     try:
+        print(f"\nStarting evaluation for model: {model_name}")
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        print(f"Using device: {device}")

+        # Load model & tokenizer ONCE
+        print("Loading classification model and tokenizer...")
+        classification_model = AutoModelForSequenceClassification.from_pretrained(
+            model_name,
+            revision=revision,
+            torch_dtype=getattr(torch, precision),
+            trust_remote_code=True
+        ).to(device)
+        tokenizer = AutoTokenizer.from_pretrained(model_name, revision=revision)
+        print("✅ Classification model loaded successfully.")
+        print("Loading base model...")
+        embedding_model = AutoModel.from_pretrained(
+            model_name,
+            revision=revision,
+            torch_dtype=getattr(torch, precision),
+            trust_remote_code=True
+        ).to(device)
+        print("✅ Embedding model loaded successfully.")
+        all_results = {}
+        for task_name, EvaluatorClass in EVALUATOR_REGISTRY.items():
+            print(f"\n--- Evaluating: {task_name} ---")
+            try:
+                if task_name == "Sentiment Analysis":
+                    model = classification_model
+                elif task_name in ["Transliteration", "Normalization"]:
+                    model = embedding_model
+                else:
+                    model = embedding_model  # default to the base encoder

+                evaluator: BaseEvaluator = EvaluatorClass()
+                result = evaluator.evaluate(model, tokenizer, device=device)

+                # Extract main metric (must be in every evaluator)
+                all_results[task_name] = result["main_metric"]
+                print(f"✅ {task_name}: {result['main_metric']:.4f}")

+            except Exception as e:
+                error_msg = f"Failed {task_name}: {str(e)}"
+                print(f"❌ {error_msg}")
+                all_results[task_name] = None  # or skip

-        print("\nStarting TSAC sentiment evaluation...")
-        try:
-            tsac_results = evaluate_tsac_sentiment(model, tokenizer, device)
-            print(f"TSAC results: {tsac_results}")
-        except Exception as e:
-            print(f"Error in TSAC evaluation for {model_name}: {str(e)}")
-            print(f"Full traceback: {traceback.format_exc()}")
-
-        print("\nStarting Tunisian Corpus evaluation...")
-        try:
-            tunisian_results = evaluate_tunisian_corpus_coverage(model, tokenizer, device)
-            print(f"Tunisian Corpus results: {tunisian_results}")
-        except Exception as e:
-            print(f"Error in Tunisian Corpus evaluation for {model_name}: {str(e)}")
-            print(f"Full traceback: {traceback.format_exc()}")
-
-        print("\nEvaluation completed successfully!")
-
         return EvaluationResult(
             model=model_name,
             revision=revision,
             precision=precision,
             weight_type=weight_type,
-            results={
-                "accuracy": tsac_results.get("fbougares/tsac"),
-                "coverage": tunisian_results.get("arbml/Tunisian_Dialect_Corpus")
-            }
+            results=all_results
         )
+
     except Exception as e:
-        error_msg = f"
-        print(f"
-        print(f"Full traceback: {traceback.format_exc()}")
+        error_msg = f"Critical failure: {str(e)}"
+        print(f"💥 {error_msg}")
         return EvaluationResult(
             model=model_name,
             revision=revision,
@@ -152,7 +135,7 @@ def process_evaluation_queue():
     This function acts as a worker that finds a PENDING job, runs it,
     and updates the status on the Hugging Face Hub.
     """
-    print(
+    print("\n=== Starting evaluation queue processing ===")
     print(f"Current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

     print(f"Looking for evaluation requests in: {EVAL_REQUESTS_PATH}")
@@ -206,6 +189,8 @@ def process_evaluation_queue():
     for v in eval_result.results.values():
         if v is None:
+            if eval_result.error is None:
+                eval_result.error = ""
             eval_result.error += f"Evaluation failed for {eval_entry['model']}: {v} is None"

 print("\n=== Evaluation completed ===")
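A sketch of a hypothetical invocation of the rewritten entry point; the model id is a placeholder, and the `precision` string must name a real torch dtype ("float16", "bfloat16", "float32"), since the code resolves it with `getattr(torch, precision)`.

```python
from src.evaluators.evaluate import evaluate_model

result = evaluate_model(
    model_name="some-org/tunisian-bert",  # hypothetical Hub model id
    revision="main",
    precision="float16",   # resolved via getattr(torch, precision)
    weight_type="Original",
)
print(result.results)  # e.g. {"Sentiment Analysis": 0.83, "Normalization": ..., ...}
```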
src/evaluators/madar_tun.py
ADDED
@@ -0,0 +1,108 @@
import torch
from datasets import load_dataset
# from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score
# import argparse
import warnings
warnings.filterwarnings("ignore")


def load_and_prepare_data():
    """Load MADAR-TUN and prepare normalization & transliteration pairs."""
    print("Loading MADAR-TUN dataset...")
    ds = load_dataset("tunis-ai/MADAR-TUN", split="train")

    valid_examples = [
        ex for ex in ds
        if ex["arabish"] != "<eos>"
        and ex["words"] != "<eos>"
        and ex["lem"] != "<eos>"
        and ex["arabish"] is not None
        and ex["arabish"].strip()
        and ex["words"] is not None
        and ex["words"].strip()
        and ex["lem"] is not None
        and ex["lem"].strip()
    ]

    print(f"Loaded {len(valid_examples)} valid token entries.")

    # Build unique pairs (deduplicate)
    norm_pairs = {}   # arabish -> canonical lemma
    trans_pairs = {}  # arabish <-> arabic

    for ex in valid_examples:
        arabizi = ex["arabish"]
        arabic = ex["words"]
        lemma = ex["lem"]

        # For normalization: use lemma as canonical form
        if arabizi not in norm_pairs:
            norm_pairs[arabizi] = lemma

        if arabizi not in trans_pairs:
            trans_pairs[arabizi] = arabic

    print(f"Normalization pairs: {len(norm_pairs)}")
    print(f"Transliteration pairs: {len(trans_pairs)}")

    return norm_pairs, trans_pairs


def evaluate_word_classification(model, tokenizer, word_pairs, device, task_name):
    """
    Evaluate word-level classification (normalization or transliteration).
    Treats it as closed-vocabulary classification via embedding similarity.
    """
    words = list(word_pairs.keys())
    targets = list(word_pairs.values())

    # Build target vocabulary
    unique_targets = sorted(set(targets))
    target_to_id = {t: i for i, t in enumerate(unique_targets)}

    print(f"\n[{task_name}] Vocabulary size: {len(unique_targets)}")
    print(f"[{task_name}] Evaluation samples: {len(words)}")

    # Get embeddings for all target forms
    print(f"[{task_name}] Encoding target vocabulary...")
    target_encodings = tokenizer(
        unique_targets,
        padding=True,
        truncation=True,
        max_length=32,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        target_embeds = model(**target_encodings).last_hidden_state[:, 0]  # [V, H]

    # Predict for each input word
    predictions = []
    batch_size = 32

    print(f"[{task_name}] Predicting...")
    for i in range(0, len(words), batch_size):
        batch_words = words[i:i + batch_size]
        inputs = tokenizer(
            batch_words,
            padding=True,
            truncation=True,
            max_length=32,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            word_embeds = model(**inputs).last_hidden_state[:, 0]  # [B, H]
            logits = torch.matmul(word_embeds, target_embeds.T)    # [B, V]
            preds = logits.argmax(dim=1).cpu().tolist()
            predictions.extend(preds)

    # Map gold targets to vocabulary IDs
    true_labels = [target_to_id[t] for t in targets]

    acc = accuracy_score(true_labels, predictions)
    print(f"[{task_name}] Accuracy: {acc:.4f}")
    return acc
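A sketch of running the two helpers above end to end with a generic encoder; `"some-org/tunisian-bert"` is a placeholder model id, not a real checkpoint named by the commit.

```python
import torch
from transformers import AutoModel, AutoTokenizer

from src.evaluators.madar_tun import load_and_prepare_data, evaluate_word_classification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("some-org/tunisian-bert")  # hypothetical
model = AutoModel.from_pretrained("some-org/tunisian-bert").to(device)

norm_pairs, trans_pairs = load_and_prepare_data()
norm_acc = evaluate_word_classification(model, tokenizer, norm_pairs, device, "Normalization")
trans_acc = evaluate_word_classification(model, tokenizer, trans_pairs, device, "Transliteration")
```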
src/evaluators/normalization/__init__.py
ADDED
@@ -0,0 +1 @@
from .evaluator import NormalizationEvaluator
src/evaluators/normalization/datasets.py
ADDED
@@ -0,0 +1,10 @@
# src/evaluators/normalization/datasets.py
NORMALIZATION_DATASETS = {
    "madar-tun": {
        "path": "tunis-ai/MADAR-TUN",
        "split": "test",
        "arabish_col": "arabish",
        "canonical_col": "lem",  # could also be "words"
        "description": "MADAR-TUN: Arabizi → Lemma normalization"
    }
}
src/evaluators/normalization/evaluator.py
ADDED
@@ -0,0 +1,96 @@
# src/evaluators/normalization/evaluator.py
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from typing import Dict, Any
import warnings

from ..base_evaluator import BaseEvaluator
from .datasets import NORMALIZATION_DATASETS

warnings.filterwarnings("ignore")


class NormalizationEvaluator(BaseEvaluator):
    def __init__(self, dataset_key: str = "madar-tun", max_samples: int = None):
        if dataset_key not in NORMALIZATION_DATASETS:
            raise ValueError(f"Unknown dataset: {dataset_key}")
        self.config = NORMALIZATION_DATASETS[dataset_key]
        self.max_samples = max_samples

    @property
    def task_name(self) -> str:
        return "Normalization"

    def load_dataset(self):
        print(f"\nLoading normalization data from {self.config['path']}...")
        ds = load_dataset(
            self.config["path"],
            split=self.config["split"]
        )

        valid = []
        for ex in ds:
            a = ex[self.config["arabish_col"]]
            c = ex[self.config["canonical_col"]]
            if a and c and a != "<eos>" and c != "<eos>" and a.strip() and c.strip():
                valid.append((a.strip(), c.strip()))

        if self.max_samples:
            valid = valid[:self.max_samples]

        print(f"Loaded {len(valid)} normalization pairs.")
        return valid  # List[Tuple[noisy, canonical]]

    def evaluate(self, model, tokenizer, device: str = "cuda") -> Dict[str, Any]:
        pairs = self.load_dataset()
        if not pairs:
            raise ValueError("No valid normalization pairs found!")

        words, targets = zip(*pairs)
        words, targets = list(words), list(targets)

        # Build vocab
        unique_targets = sorted(set(targets))
        target_to_id = {t: i for i, t in enumerate(unique_targets)}

        # Encode targets
        target_enc = tokenizer(
            unique_targets,
            padding=True,
            truncation=True,
            max_length=32,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            target_embeds = model(**target_enc).last_hidden_state[:, 0]

        # Predict
        predictions = []
        batch_size = 32
        for i in range(0, len(words), batch_size):
            batch = words[i:i + batch_size]
            inputs = tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=32,
                return_tensors="pt"
            ).to(device)

            with torch.no_grad():
                word_embeds = model(**inputs).last_hidden_state[:, 0]
                logits = torch.matmul(word_embeds, target_embeds.T)
                preds = logits.argmax(dim=1).cpu().tolist()
                predictions.extend(preds)

        true_labels = [target_to_id[t] for t in targets]
        acc = accuracy_score(true_labels, predictions)

        print(f"✅ Normalization Accuracy: {acc:.4f}")
        return {
            "task": self.task_name,
            "main_metric": acc,
            "accuracy": acc,
            "total_samples": len(pairs)
        }
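One design note on the scoring above: the evaluator ranks candidates by a raw dot product between [CLS] embeddings, which can favor targets with large embedding norms. A hedged alternative sketch (not what the committed code does) is to L2-normalize both sides, turning the score into cosine similarity:

```python
import torch
import torch.nn.functional as F


def cosine_logits(word_embeds: torch.Tensor, target_embeds: torch.Tensor) -> torch.Tensor:
    """Cosine-similarity scores: [B, H] x [V, H] -> [B, V]."""
    word_embeds = F.normalize(word_embeds, dim=-1)
    target_embeds = F.normalize(target_embeds, dim=-1)
    return word_embeds @ target_embeds.T
```

Swapping `torch.matmul(word_embeds, target_embeds.T)` for `cosine_logits(word_embeds, target_embeds)` would be the only change needed in either evaluator.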
src/{evaluator → evaluators}/run_evaluator.py
RENAMED
@@ -1,5 +1,5 @@
 import time
-from src.
+from src.evaluators.evaluate import process_evaluation_queue


 def evaluator_runner():
src/evaluators/sentiment_analysis/__init__.py
ADDED
File without changes
src/evaluators/sentiment_analysis/dataset.py
ADDED
File without changes
src/evaluators/sentiment_analysis/evaluator.py
ADDED
@@ -0,0 +1,207 @@
import torch
from torch.utils.data import DataLoader
from datasets import concatenate_datasets, load_dataset, Dataset
from typing import Dict, Any, List, Optional
import warnings

from ..base_evaluator import BaseEvaluator


SUPPORTED_DATASETS = {
    "tsac": {
        "path": "tunis-ai/tsac",
        "text_column": "sentence",
        "label_column": "target",
        "label_map": {0: 0, 1: 1},  # already binary
        "trust_remote_code": True,
        "split": "test"
    },
}


class SentimentAnalysisEvaluator(BaseEvaluator):
    """
    Unified evaluator for Tunisian sentiment analysis.
    Supports multiple datasets and harmonizes labels to binary (0=neg, 1=pos).
    Neutral or otherwise invalid labels are filtered out.
    """

    def __init__(
        self,
        datasets: Optional[List[str]] = None,
        max_samples_per_dataset: int = 500,
        batch_size: int = 16
    ):
        """
        Args:
            datasets: List of dataset keys from SUPPORTED_DATASETS.
                      If None, uses all available.
            max_samples_per_dataset: Limit samples per dataset for faster eval.
            batch_size: Inference batch size.
        """
        if datasets is None:
            self.dataset_keys = list(SUPPORTED_DATASETS.keys())
        else:
            for d in datasets:
                if d not in SUPPORTED_DATASETS:
                    raise ValueError(f"Dataset '{d}' not in supported list: {list(SUPPORTED_DATASETS.keys())}")
            self.dataset_keys = datasets

        self.max_samples_per_dataset = max_samples_per_dataset
        self.batch_size = batch_size

    @property
    def task_name(self) -> str:
        return "Sentiment Analysis"

    def load_dataset(self) -> Dataset:
        """Load and harmonize all configured sentiment datasets."""
        print("\n=== Loading Tunisian Sentiment Datasets ===")
        all_datasets = []

        for key in self.dataset_keys:
            cfg = SUPPORTED_DATASETS[key]
            print(f"\nLoading '{key}': {cfg.get('description', 'No description available.')}")

            try:
                ds = load_dataset(
                    cfg["path"],
                    split=cfg["split"],
                    trust_remote_code=cfg.get("trust_remote_code", False)
                )
                print(f"  Raw size: {len(ds)}")
            except Exception as e:
                warnings.warn(f"Failed to load {key}: {e}. Skipping.")
                continue

            # Harmonize to {"text": str, "label": int in {0, 1}}.
            # Invalid/neutral rows get label -1 and are filtered below, since
            # datasets.map cannot drop rows by returning None.
            def harmonize(example):
                try:
                    text = example[cfg["text_column"]]
                    orig_label = example[cfg["label_column"]]

                    if orig_label not in cfg["label_map"]:
                        return {"text": text, "label": -1}

                    new_label = cfg["label_map"][orig_label]
                    if new_label not in [0, 1]:
                        return {"text": text, "label": -1}  # neutral/invalid

                    return {"text": text, "label": new_label}
                except Exception:
                    return {"text": "", "label": -1}

            print("  Harmonizing and filtering...")
            ds = ds.map(
                harmonize,
                load_from_cache_file=False,
                desc=f"Harmonizing {key}"
            )

            print("  Filtering invalid/neutral samples...")
            ds = ds.filter(lambda x: x["label"] in (0, 1), load_from_cache_file=False)
            print(f"  Valid binary samples: {len(ds)}")

            if self.max_samples_per_dataset and len(ds) > self.max_samples_per_dataset:
                ds = ds.select(range(self.max_samples_per_dataset))
                print(f"  Trimmed to {self.max_samples_per_dataset} samples")

            if len(ds) > 0:
                all_datasets.append(ds)

        if not all_datasets:
            raise ValueError("No valid sentiment data found!")

        # Combine all datasets
        combined = concatenate_datasets(all_datasets)
        print(f"\n✅ Total Tunisian sentiment samples: {len(combined)}")
        return combined

    def _tokenize_batch(self, examples, tokenizer):
        # Tokenize the harmonized "text" column (not the raw dataset column).
        return tokenizer(
            examples["text"],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors=None
        )

    def _collate_fn(self, batch):
        input_ids = torch.stack([torch.tensor(b["input_ids"]) for b in batch])
        attention_mask = torch.stack([torch.tensor(b["attention_mask"]) for b in batch])
        labels = torch.tensor([b["labels"] for b in batch], dtype=torch.long)
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

    def evaluate(self, model, tokenizer, device: str = "cuda") -> Dict[str, Any]:
        """Evaluate model on the unified Tunisian sentiment task."""
        print(f"\n=== Evaluating {self.task_name} ===")
        print(f"Model: {model.__class__.__name__} | Device: {device}")
        print(f"Datasets: {self.dataset_keys}")

        # Load and prepare data
        raw_dataset = self.load_dataset()
        tokenized = raw_dataset.map(
            lambda ex: self._tokenize_batch(ex, tokenizer),
            batched=True,
            remove_columns=raw_dataset.column_names
        )
        tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
        tokenized = tokenized.add_column("labels", raw_dataset["label"])
        print(tokenized.column_names)

        dataloader = DataLoader(
            tokenized,
            batch_size=self.batch_size,
            shuffle=False,
            collate_fn=self._collate_fn
        )

        # Inference
        model.eval()
        all_preds, all_labels = [], []

        with torch.no_grad():
            for i, batch in enumerate(dataloader):
                inputs = {
                    k: v.to(device) for k, v in batch.items()
                    if k in ["input_ids", "attention_mask"]
                }
                labels = batch["labels"].to(device)

                outputs = model(**inputs)
                logits = outputs.logits if hasattr(outputs, "logits") else outputs[0]

                if logits.dim() == 3:  # [B, L, C]
                    logits = logits[:, 0, :]

                preds = logits.argmax(dim=-1).cpu().tolist()
                trues = labels.cpu().tolist()

                all_preds.extend(preds)
                all_labels.extend(trues)

        # Metrics
        correct = sum(p == t for p, t in zip(all_preds, all_labels))
        total = len(all_preds)
        accuracy = correct / total if total > 0 else 0.0

        print(f"\n✅ {self.task_name} Results:")
        print(f"  Accuracy: {accuracy:.4f} ({correct}/{total})")

        return {
            "task": self.task_name,
            "accuracy": accuracy,
            "main_metric": accuracy,
            "total_samples": total,
            "datasets_used": self.dataset_keys
        }
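A sketch of a standalone run of the evaluator above; the model id is a placeholder, and the checkpoint would need a fine-tuned sequence-classification head for the scores to be meaningful.

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from src.evaluators.sentiment_analysis.evaluator import SentimentAnalysisEvaluator

device = "cuda" if torch.cuda.is_available() else "cpu"
name = "some-org/tunisian-sentiment-bert"  # hypothetical model id
model = AutoModelForSequenceClassification.from_pretrained(name).to(device)
tokenizer = AutoTokenizer.from_pretrained(name)

evaluator = SentimentAnalysisEvaluator(datasets=["tsac"], max_samples_per_dataset=200)
results = evaluator.evaluate(model, tokenizer, device=device)
print(results["accuracy"])
```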
src/evaluators/transliteration/__init__.py
ADDED
@@ -0,0 +1 @@
from .evaluator import TransliterationEvaluator
src/evaluators/transliteration/datasets.py
ADDED
@@ -0,0 +1,10 @@
# src/evaluators/transliteration/datasets.py
TRANSLITERATION_DATASETS = {
    "madar-tun": {
        "path": "tunis-ai/MADAR-TUN",
        "split": "test",
        "source_col": "arabish",  # Latin script
        "target_col": "words",    # Arabic script
        "description": "MADAR-TUN: Arabizi ↔ Arabic script"
    }
}
src/evaluators/transliteration/evaluator.py
ADDED
@@ -0,0 +1,96 @@
# src/evaluators/transliteration/evaluator.py
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from typing import Dict, Any
import warnings

from ..base_evaluator import BaseEvaluator
from .datasets import TRANSLITERATION_DATASETS

warnings.filterwarnings("ignore")


class TransliterationEvaluator(BaseEvaluator):
    def __init__(self, dataset_key: str = "madar-tun", max_samples: int = None):
        if dataset_key not in TRANSLITERATION_DATASETS:
            raise ValueError(f"Unknown dataset: {dataset_key}")
        self.config = TRANSLITERATION_DATASETS[dataset_key]
        self.max_samples = max_samples

    @property
    def task_name(self) -> str:
        return "Transliteration"

    def load_dataset(self):
        print(f"\nLoading transliteration data from {self.config['path']}...")
        ds = load_dataset(
            self.config["path"],
            split=self.config["split"]
        )

        valid = []
        for ex in ds:
            src = ex[self.config["source_col"]]
            tgt = ex[self.config["target_col"]]
            if src and tgt and src != "<eos>" and tgt != "<eos>" and src.strip() and tgt.strip():
                valid.append((src.strip(), tgt.strip()))

        if self.max_samples:
            valid = valid[:self.max_samples]

        print(f"Loaded {len(valid)} transliteration pairs.")
        return valid

    def evaluate(self, model, tokenizer, device: str = "cuda") -> Dict[str, Any]:
        pairs = self.load_dataset()
        if not pairs:
            raise ValueError("No valid transliteration pairs found!")

        sources, targets = zip(*pairs)
        sources, targets = list(sources), list(targets)

        # Build target vocab
        unique_targets = sorted(set(targets))
        target_to_id = {t: i for i, t in enumerate(unique_targets)}

        # Encode targets
        target_enc = tokenizer(
            unique_targets,
            padding=True,
            truncation=True,
            max_length=32,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            target_embeds = model(**target_enc).last_hidden_state[:, 0]

        # Predict
        predictions = []
        batch_size = 32
        for i in range(0, len(sources), batch_size):
            batch = sources[i:i + batch_size]
            inputs = tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=32,
                return_tensors="pt"
            ).to(device)

            with torch.no_grad():
                src_embeds = model(**inputs).last_hidden_state[:, 0]
                logits = torch.matmul(src_embeds, target_embeds.T)
                preds = logits.argmax(dim=1).cpu().tolist()
                predictions.extend(preds)

        true_labels = [target_to_id[t] for t in targets]
        acc = accuracy_score(true_labels, predictions)

        print(f"✅ Transliteration Accuracy: {acc:.4f}")
        return {
            "task": self.task_name,
            "main_metric": acc,
            "accuracy": acc,
            "total_samples": len(pairs)
        }
src/evaluators/tsac.py
ADDED
@@ -0,0 +1,133 @@
import torch
from datasets import load_dataset
import traceback
import time


def evaluate_tsac_sentiment(model, tokenizer, device):
    """Evaluate model on the TSAC sentiment analysis task."""
    try:
        print("\n=== Starting TSAC sentiment evaluation ===")
        print(f"Current device: {device}")

        # Load and preprocess dataset
        print("\nLoading and preprocessing TSAC dataset...")
        dataset = load_dataset("fbougares/tsac", split="test", trust_remote_code=True)
        dataset = dataset.select(range(10))  # only the first 10 samples, as a quick smoke test

        def preprocess(examples):
            return tokenizer(
                examples['sentence'],
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors=None
            )

        print(dataset.column_names)
        dataset = dataset.map(preprocess, batched=True)
        dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'target'])

        # Check first example
        first_example = dataset[0]
        print("\nFirst example details:")
        print(f"Input IDs shape: {first_example['input_ids'].shape}")
        print(f"Attention mask shape: {first_example['attention_mask'].shape}")
        print(f"Target: {first_example['target']}")

        model.eval()
        print(f"\nModel class: {model.__class__.__name__}")
        print(f"Model device: {next(model.parameters()).device}")

        with torch.no_grad():
            predictions = []
            targets = []

            # Create DataLoader with batch size 16
            from torch.utils.data import DataLoader

            # Define a custom collate function
            def collate_fn(batch):
                input_ids = torch.stack([sample['input_ids'] for sample in batch])
                attention_mask = torch.stack([sample['attention_mask'] for sample in batch])
                targets = torch.stack([sample['target'] for sample in batch])
                return {
                    'input_ids': input_ids,
                    'attention_mask': attention_mask,
                    'target': targets
                }

            dataloader = DataLoader(
                dataset,
                batch_size=16,
                shuffle=False,
                collate_fn=collate_fn
            )

            for i, batch in enumerate(dataloader):
                if i % 10 == 0:
                    print(f"\nProcessing batch {i}...")
                    print(f"Batch keys: {list(batch.keys())}")
                    print(f"Target shape: {batch['target'].shape}")

                inputs = {k: v.to(device) for k, v in batch.items() if k != 'target'}
                target = batch['target'].to(device)
                before = time.time()
                outputs = model(**inputs)

                # Handle different model output formats
                if isinstance(outputs, dict):
                    if 'logits' in outputs:
                        logits = outputs['logits']
                    elif 'prediction_logits' in outputs:
                        logits = outputs['prediction_logits']
                    else:
                        raise ValueError(f"Unknown output format. Available keys: {list(outputs.keys())}")
                elif isinstance(outputs, tuple):
                    print(f"Output tuple length: {len(outputs)}")
                    logits = outputs[0]
                else:
                    logits = outputs

                # For sequence classification, use the [CLS] token's prediction
                if len(logits.shape) == 3:  # [batch_size, sequence_length, num_classes]
                    logits = logits[:, 0, :]

                batch_predictions = logits.argmax(dim=-1).cpu().tolist()
                batch_targets = target.cpu().tolist()

                predictions.extend(batch_predictions)
                targets.extend(batch_targets)

                if i % 10 == 0:
                    print(f"Predictions: {batch_predictions[:5]}")
                    print(f"Targets: {batch_targets[:5]}")

        print(f"\nTotal predictions: {len(predictions)}")
        print(f"Total targets: {len(targets)}")

        # Calculate accuracy
        correct = sum(p == t for p, t in zip(predictions, targets))
        total = len(predictions)
        accuracy = correct / total if total > 0 else 0.0

        print("\nEvaluation results:")
        print(f"Correct predictions: {correct}")
        print(f"Total predictions: {total}")
        print(f"Accuracy: {accuracy:.4f}")

        return {"fbougares/tsac": accuracy}
    except Exception as e:
        print(f"\n=== Error in TSAC evaluation: {str(e)} ===")
        print(f"Full traceback: {traceback.format_exc()}")
        raise e
src/{evaluator → evaluators}/tunisian_corpus_coverage.py
RENAMED
File without changes
src/submission/submit.py
CHANGED
@@ -12,7 +12,7 @@ from src.submission.check_validity import (
     get_model_size,
     is_model_on_hub,
 )
-from src.
+from src.evaluators.evaluate import EvaluationStatus


 REQUESTED_MODELS = None
|