TunisianEncodersArena / src /evaluators /tunisian_corpus_coverage.py
hamzabouajila's picture
refactor the code for better scalability and update tsac naming to sentiment analysis, adding madar dataset for transliteration and normalization eval
bde1c71
import json
import os
from typing import Dict, Any
from dataclasses import dataclass
from enum import Enum
from datetime import datetime
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from datasets import load_dataset
import traceback
from src.envs import API, OWNER, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, RESULTS_REPO
def evaluate_tunisian_corpus_coverage(model, tokenizer, device):
    """Evaluate the tokenizer's vocabulary coverage on the Tunisian Dialect Corpus.

    Coverage is the fraction of corpus tokens that the tokenizer maps to
    something other than its unknown token.

    Args:
        model: Loaded model. Unused here; kept so all evaluators share a
            uniform (model, tokenizer, device) signature.
        tokenizer: Hugging Face tokenizer whose vocabulary is measured.
        device: Torch device. Unused here (no forward pass is run).

    Returns:
        dict: {"arbml/Tunisian_Dialect_Corpus": coverage}, coverage in [0, 1].

    Raises:
        Exception: Any failure from dataset loading or tokenization is
            logged with its traceback, then re-raised.
    """
    try:
        dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="train")

        def preprocess(examples):
            # 'Tweet' is the text field in this dataset. No padding or
            # truncation: we want raw token counts over the full text.
            return tokenizer(
                examples['Tweet'],
                padding=False,
                truncation=False,
                max_length=None
            )

        dataset = dataset.map(preprocess, batched=True)

        total_tokens = 0
        covered_tokens = 0
        # Compare ids directly against unk_token_id instead of converting
        # every example back to token strings: same result, much cheaper,
        # and well-defined when the tokenizer has no unk token at all
        # (unk_token_id is None -> nothing is "unknown", all tokens covered).
        unk_id = tokenizer.unk_token_id
        for example in dataset:
            input_ids = example['input_ids']
            total_tokens += len(input_ids)
            if unk_id is None:
                covered_tokens += len(input_ids)
            else:
                covered_tokens += sum(1 for i in input_ids if i != unk_id)

        coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
        print(f"Tunisian Corpus Coverage: {coverage:.2%}")
        return {"arbml/Tunisian_Dialect_Corpus": coverage}
    except Exception as e:
        print(f"Error in Tunisian Corpus evaluation: {str(e)}")
        print(f"Full traceback: {traceback.format_exc()}")
        raise  # bare raise preserves the original traceback (raise e would not)