hamzabouajila's picture
refactor the code for better scalability and update tsac naming to sentiment analysis, adding madar dataset for transliteration and normalization eval
bde1c71
raw
history blame contribute delete
344 Bytes
# src/evaluators/normalization/datasets.py
# Registry of datasets available for normalization evaluation.
# Maps a short dataset key to a configuration dict with these fields:
#   path          - dataset identifier (looks like a Hugging Face Hub repo id;
#                   TODO confirm against the loader that consumes this)
#   split         - name of the split to evaluate on
#   arabish_col   - name of the column holding the Arabizi ("arabish") text
#   canonical_col - name of the column holding the normalization target
#   description   - human-readable summary of the task
NORMALIZATION_DATASETS = {
    "madar-tun": {
        "path": "tunis-ai/MADAR-TUN",
        # NOTE(review): an earlier comment hinted another split might have
        # been intended as a fallback; only "test" is configured here.
        "split": "test",
        "arabish_col": "arabish",
        # Lemma column is used as the target; the original author noted
        # "words" as a possible alternative (verify against dataset schema).
        "canonical_col": "lem",
        "description": "MADAR-TUN: Arabizi → Lemma normalization",
    },
}