Commit 34052ff · Parent(s): 1228655
feat: enhance evaluation pipeline and error handling
- Add Hugging Face Hub integration for downloading evaluation queue
- Improve error handling and status updates in evaluation process
- Streamline file upload and status management
- Add proper type hints and imports
- Update tokenizer loading to be more efficient
- Implement better logging for evaluation status
- Add snapshot download for evaluation requests
- Fix race conditions in file handling
- Update dependencies and imports
- app.py +60 -84
- src/about.py +5 -5
- src/evaluator/evaluate.py +157 -396
- src/evaluator/run_evaluator.py +2 -2
- src/evaluator/tsac.py +133 -0
- src/evaluator/tunisian_corpus_coverage.py +48 -0
- src/leaderboard/read_evals.py +49 -186
- src/populate.py +37 -30
- src/submission/submit.py +111 -313
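
The heart of this commit is the status lifecycle the worker now drives for each request file in the queue repository. Below is a minimal sketch of that lifecycle, using only names that appear in the diffs that follow; the `request` dict is a hypothetical example payload, not a file from the repo:

from enum import Enum

class EvaluationStatus(Enum):
    PENDING = "PENDING"    # submitted, waiting for the worker
    RUNNING = "RUNNING"    # picked up by the worker
    FINISHED = "FINISHED"  # results written back to the request file
    FAILED = "FAILED"      # error message written back instead

# Hypothetical request-file contents as they move through the pipeline:
request = {
    "model": "org/model",        # placeholder model id
    "revision": "main",
    "precision": "float16",
    "weight_type": "Original",
    "status": EvaluationStatus.PENDING.value,
}
request["status"] = EvaluationStatus.RUNNING.value         # worker starts the job
request["results"] = {"accuracy": 0.83, "coverage": 0.95}  # illustrative values
request["status"] = EvaluationStatus.FINISHED.value        # worker uploads the file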
app.py
CHANGED
@@ -1,7 +1,5 @@
-import os
 from dotenv import load_dotenv
 
-# Load environment variables from .env file
 load_dotenv()
 
 import gradio as gr
@@ -39,95 +37,36 @@ import time
 
 def restart_space():
     try:
-        # Restart the space
         API.restart_space(repo_id=REPO_ID)
     except Exception as e:
         print(f"Error restarting space: {str(e)}")
-        # If restart fails, try to download the datasets again
         try:
             print("Attempting to download datasets again...")
             snapshot_download(
-                repo_id=QUEUE_REPO,
-                local_dir=EVAL_REQUESTS_PATH,
-                repo_type="dataset",
-                tqdm_class=None,
-                etag_timeout=30,
-                token=TOKEN
-            )
+                repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN, force_download=True
+            )
             snapshot_download(
-                repo_id=RESULTS_REPO,
-                local_dir=EVAL_RESULTS_PATH,
-                repo_type="dataset",
-                tqdm_class=None,
-                etag_timeout=30,
-                token=TOKEN
+                repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN, force_download=True
             )
         except Exception as download_error:
             print(f"Error downloading datasets: {str(download_error)}")
 
-### Space initialisation
-try:
-    print(f"\n=== Starting space initialization ===")
-    print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")
-    print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")
-    print(f"QUEUE_REPO: {QUEUE_REPO}")
-    print(f"RESULTS_REPO: {RESULTS_REPO}")
-    print(f"TOKEN: {bool(TOKEN)}")
-
-    print("\n=== Downloading request files ===")
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-
-    print("\n=== Downloading results files ===")
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-
-    print("\n=== Loading leaderboard data ===")
-    LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-    print(f"Leaderboard DataFrame shape: {LEADERBOARD_DF.shape if LEADERBOARD_DF is not None else 'None'}")
-
-    print("\n=== Loading evaluation queue data ===")
-    (
-        finished_eval_queue_df,
-        running_eval_queue_df,
-        pending_eval_queue_df,
-    ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-    print(f"Finished eval queue shape: {finished_eval_queue_df.shape if finished_eval_queue_df is not None else 'None'}")
-    print(f"Running eval queue shape: {running_eval_queue_df.shape if running_eval_queue_df is not None else 'None'}")
-    print(f"Pending eval queue shape: {pending_eval_queue_df.shape if pending_eval_queue_df is not None else 'None'}")
-
-except Exception as e:
-    print(f"\n=== Error during space initialization ===")
-    print(f"Error: {str(e)}")
-    restart_space()
 
 
-
+
+
 def run_evaluator():
     print("Starting evaluator service...")
     while True:
        try:
            process_evaluation_queue()
            print("Evaluation queue processed. Sleeping for 5 minutes...")
-           time.sleep(
+           time.sleep(10) # Sleep for 5 minutes
        except Exception as e:
            print(f"Error in evaluation process: {e}")
            print("Retrying in 5 minutes...")
-           time.sleep(
-
-# Start evaluator in a separate thread
-evaluator_thread = threading.Thread(target=run_evaluator, daemon=True)
-evaluator_thread.start()
+           time.sleep(10)
 
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 def init_leaderboard(dataframe):
     if dataframe is None:
@@ -145,36 +84,26 @@ def init_leaderboard(dataframe):
        filter_columns=[
            ColumnFilter(AutoEvalColumn().model_type.name, type="checkboxgroup", label="Model types"),
            ColumnFilter(AutoEvalColumn().precision.name, type="checkboxgroup", label="Precision"),
-           ColumnFilter(
-               AutoEvalColumn().params.name,
-               type="slider",
-               min=0.01,
-               max=150,
-               label="Select the number of parameters (B)",
-           ),
-           ColumnFilter(
-               AutoEvalColumn().still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-           ),
+           ColumnFilter(AutoEvalColumn().params.name, type="slider", min=0.01, max=150, label="Select the number of parameters (B)"),
+           ColumnFilter(AutoEvalColumn().still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
 
-
+
+
 def evaluate_and_update(model_name, revision, precision, weight_type):
    """Add a model evaluation request to the queue"""
    try:
-       # Add evaluation request to queue
        add_new_eval(
            model_name=model_name,
            revision=revision,
            precision=precision,
            weight_type=weight_type,
-           model_type="LLM",
+           model_type="LLM",
        )
-
-       # Update leaderboard
-       LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+       get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
        return "Evaluation request added to queue! Check the leaderboard for updates."
    except Exception as e:
        print(f"Error in evaluate_and_update: {str(e)}")
@@ -182,6 +111,52 @@ def evaluate_and_update(model_name, revision, precision, weight_type):
        return f"Error adding evaluation request: {str(e)}"
 
 
+### Space initialisation
+try:
+    print(f"\n=== Starting space initialization ===")
+    print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")
+    print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")
+    print(f"QUEUE_REPO: {QUEUE_REPO}")
+    print(f"RESULTS_REPO: {RESULTS_REPO}")
+    print(f"TOKEN: {bool(TOKEN)}")
+
+    print("\n=== Downloading request files ===")
+    snapshot_download(
+        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN,force_download=True
+    )
+
+    print("\n=== Downloading results files ===")
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN,force_download=True
+    )
+
+    print("\n=== Loading leaderboard data ===")
+    LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+    print(f"Leaderboard DataFrame shape: {LEADERBOARD_DF.shape if LEADERBOARD_DF is not None else 'None'}")
+
+    print("\n=== Loading evaluation queue data ===")
+    finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+    print(f"Finished eval queue shape: {finished_eval_queue_df.shape if finished_eval_queue_df is not None else 'None'}")
+    print(f"Running eval queue shape: {running_eval_queue_df.shape if running_eval_queue_df is not None else 'None'}")
+    print(f"Pending eval queue shape: {pending_eval_queue_df.shape if pending_eval_queue_df is not None else 'None'}")
+
+except Exception as e:
+    print(f"\n=== Error during space initialization ===")
+    print(f"Error: {str(e)}")
+    restart_space()
+
+
+
+evaluator_thread = threading.Thread(target=run_evaluator, daemon=True)
+evaluator_thread.start()
+
+LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+
+
+
 demo = gr.Blocks(css=custom_css)
 with demo:
    gr.HTML(TITLE)
@@ -218,6 +193,7 @@ with demo:
            open=False,
        ):
            with gr.Row():
+               print(running_eval_queue_df)
                running_eval_table = gr.components.Dataframe(
                    value=running_eval_queue_df,
                    headers=EVAL_COLS,
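
The recurring change in app.py is adding force_download=True to every snapshot_download call, so the Space always pulls a fresh copy of the queue and results datasets instead of reusing a cached snapshot. A standalone sketch of the call as the diff uses it (the repo id and local path are placeholders for QUEUE_REPO/EVAL_REQUESTS_PATH and RESULTS_REPO/EVAL_RESULTS_PATH):

from huggingface_hub import snapshot_download

# Download the latest revision of a dataset repo, ignoring locally cached files.
snapshot_download(
    repo_id="org/requests",    # placeholder dataset repo id
    local_dir="./eval-queue",  # placeholder local directory
    repo_type="dataset",
    etag_timeout=30,
    force_download=True,       # re-fetch files even if cached copies exist
)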
src/about.py
CHANGED
@@ -3,18 +3,18 @@ from enum import Enum
 
 @dataclass
 class Task:
-    benchmark: str
-    metric: str
-    col_name: str
+    benchmark: str  # Dataset name
+    metric: str  # Metric name
+    col_name: str  # Column name
 
 
 # Tunisian Dialect Tasks
 # ---------------------------------------------------
 class Tasks(Enum):
     # Example: Sentiment Analysis on TSAC
-
+    accuracy = Task("fbougares/tsac", "accuracy", "Accuracy (TSAC) ⬆️")
     # Example: Text Classification or Corpus Coverage on Tunisian Dialect Corpus
-
+    coverage = Task("arbml/Tunisian_Dialect_Corpus", "coverage", "Coverage (Tunisian Corpus) %")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
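
For context on how these Task entries are consumed: the leaderboard code (see the read_evals.py diff further down) iterates the Tasks enum and looks scores up by benchmark name. A sketch assuming the enum defined above, with made-up example scores:

# Assumes the Tasks enum from src/about.py; the score values are illustrative.
results = {"fbougares/tsac": 0.87, "arbml/Tunisian_Dialect_Corpus": 0.95}

for task in Tasks:
    t = task.value  # the Task dataclass: benchmark, metric, col_name
    print(f"{t.col_name}: {results.get(t.benchmark)}")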
src/evaluator/evaluate.py
CHANGED
@@ -1,16 +1,20 @@
 import json
 import os
+import time
 from typing import Dict, Any
 from dataclasses import dataclass
 from enum import Enum
 from datetime import datetime
 import torch
-from transformers import AutoModelForSequenceClassification, AutoTokenizer
-from datasets import load_dataset
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
 import traceback
 
-
-from src.
+
+from src.envs import API, OWNER, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, RESULTS_REPO, QUEUE_REPO
+from src.evaluator.tunisian_corpus_coverage import evaluate_tunisian_corpus_coverage
+from src.evaluator.tsac import evaluate_tsac_sentiment
+from huggingface_hub import snapshot_download
+
 
 class EvaluationStatus(Enum):
     PENDING = "PENDING"
@@ -20,6 +24,7 @@ class EvaluationStatus(Enum):
 
 @dataclass
 class EvaluationResult:
+    """Dataclass to hold the results of a single model evaluation."""
     model: str
     revision: str
     precision: str
@@ -27,275 +32,41 @@ class EvaluationResult:
     results: Dict[str, float]
     error: str = None
 
-def evaluate_tsac_sentiment(model, tokenizer, device):
-    """Evaluate model on TSAC sentiment analysis task"""
-    try:
-        print("\n=== Starting TSAC sentiment evaluation ===")
-        print(f"Current device: {device}")
-
-        # Load and preprocess dataset
-        print("\nLoading and preprocessing TSAC dataset...")
-        dataset = load_dataset("fbougares/tsac", split="test", trust_remote_code=True)
-        print(f"Dataset size: {len(dataset)} examples")
-
-        def preprocess(examples):
-            print(f"\nProcessing batch of {len(examples['sentence'])} examples")
-            # Use 'sentence' field as per dataset structure
-            return tokenizer(
-                examples['sentence'],
-                padding=True,
-                truncation=True,
-                max_length=512,
-                return_tensors='pt'
-            )
-
-        dataset = dataset.map(preprocess, batched=True)
-        dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'target'])
-
-        # Check first example
-        first_example = dataset[0]
-        print("\nFirst example details:")
-        print(f"Input IDs shape: {first_example['input_ids'].shape}")
-        print(f"Attention mask shape: {first_example['attention_mask'].shape}")
-        print(f"Target: {first_example['target']}")
-
-        model.eval()
-        print(f"\nModel class: {model.__class__.__name__}")
-        print(f"Model device: {next(model.parameters()).device}")
-
-        with torch.no_grad():
-            predictions = []
-            targets = []
-
-            # Create DataLoader with batch size 16
-            from torch.utils.data import DataLoader
-
-            # Define a custom collate function
-            def collate_fn(batch):
-                # Stack tensors for input_ids and attention_mask
-                input_ids = torch.stack([sample['input_ids'] for sample in batch])
-                attention_mask = torch.stack([sample['attention_mask'] for sample in batch])
-                # Stack targets
-                targets = torch.stack([torch.tensor(sample['target']) for sample in batch])
-                return {
-                    'input_ids': input_ids,
-                    'attention_mask': attention_mask,
-                    'target': targets
-                }
-
-            dataloader = DataLoader(
-                dataset,
-                batch_size=16,
-                shuffle=False,
-                collate_fn=collate_fn
-            )
-
-            for i, batch in enumerate(dataloader):
-                if i == 0:
-                    print("\nProcessing first batch...")
-                    print(f"Batch keys: {list(batch.keys())}")
-                    print(f"Target shape: {batch['target'].shape}")
-
-                inputs = {k: v.to(device) for k, v in batch.items() if k != 'target'}
-                target = batch['target'].to(device)
-
-                outputs = model(**inputs)
-                print(f"\nBatch {i} output type: {type(outputs)}")
-
-                # Handle different model output formats
-                if isinstance(outputs, dict):
-                    print(f"Output keys: {list(outputs.keys())}")
-                    if 'logits' in outputs:
-                        logits = outputs['logits']
-                    elif 'prediction_logits' in outputs:
-                        logits = outputs['prediction_logits']
-                    else:
-                        raise ValueError(f"Unknown output format. Available keys: {list(outputs.keys())}")
-                elif isinstance(outputs, tuple):
-                    print(f"Output tuple length: {len(outputs)}")
-                    logits = outputs[0]
-                else:
-                    logits = outputs
-
-                print(f"Logits shape: {logits.shape}")
-
-                # For sequence classification, we typically use the [CLS] token's prediction
-                if len(logits.shape) == 3:  # [batch_size, sequence_length, num_classes]
-                    logits = logits[:, 0, :]  # Take the [CLS] token prediction
-
-                print(f"Final logits shape: {logits.shape}")
-
-                batch_predictions = logits.argmax(dim=-1).cpu().tolist()
-                batch_targets = target.cpu().tolist()
-
-                predictions.extend(batch_predictions)
-                targets.extend(batch_targets)
-
-                if i == 0:
-                    print("\nFirst batch predictions:")
-                    print(f"Predictions: {batch_predictions[:5]}")
-                    print(f"Targets: {batch_targets[:5]}")
-
-        print(f"\nTotal predictions: {len(predictions)}")
-        print(f"Total targets: {len(targets)}")
-
-        # Calculate accuracy
-        correct = sum(p == t for p, t in zip(predictions, targets))
-        total = len(predictions)
-        accuracy = correct / total if total > 0 else 0.0
-
-        print(f"\nEvaluation results:")
-        print(f"Correct predictions: {correct}")
-        print(f"Total predictions: {total}")
-        print(f"Accuracy: {accuracy:.4f}")
-
-        return {"fbougares/tsac": accuracy}
-    except Exception as e:
-        print(f"\n=== Error in TSAC evaluation: {str(e)} ===")
-        print(f"Full traceback: {traceback.format_exc()}")
-        raise e
-
-def evaluate_tunisian_corpus_coverage(model, tokenizer, device):
-    """Evaluate model's coverage on Tunisian Dialect Corpus"""
-    try:
-        dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="train")
-
-        def preprocess(examples):
-            print("Tunisian Corpus preprocess exemples -------------",examples)
-            # Use 'Tweet' field as per dataset structure
-            return tokenizer(
-                examples['Tweet'],
-                padding=False,  # We don't need padding for token coverage
-                truncation=False,  # Don't truncate long sequences
-                max_length=None  # Let tokenizer handle the length
-            )
-
-        dataset = dataset.map(preprocess, batched=True)
-
-        # Calculate token coverage
-        total_tokens = 0
-        covered_tokens = 0
-
-        for example in dataset:
-            # Get the tokenized input IDs
-            input_ids = example['input_ids']
-
-            # Convert to tokens and count
-            tokens = tokenizer.convert_ids_to_tokens(input_ids)
-            total_tokens += len(tokens)
-            covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
-
-        coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
-        print(f"Tunisian Corpus Coverage: {coverage:.2%}")
-        return {"arbml/Tunisian_Dialect_Corpus": coverage}
-    except Exception as e:
-        print(f"Error in Tunisian Corpus evaluation: {str(e)}")
-        print(f"Full traceback: {traceback.format_exc()}")
-        raise e
 
 def evaluate_model(model_name: str, revision: str, precision: str, weight_type: str) -> EvaluationResult:
-    """
+    """
+    Evaluates a single model on all defined tasks.
+
+    Args:
+        model_name (str): The name of the model on the Hugging Face Hub.
+        revision (str): The specific revision (commit hash or branch name) to use.
+        precision (str): The precision (e.g., 'float16') for model loading.
+        weight_type (str): The type of weights ('Original' or 'Adapter').
+
+    Returns:
+        EvaluationResult: A dataclass containing the evaluation results or an error message.
+    """
    try:
        print(f"\nStarting evaluation for model: {model_name} (revision: {revision}, precision: {precision}, weight_type: {weight_type})")
-       print(f"Current working directory: {os.getcwd()}")
-       print(f"Evaluation requests path: {EVAL_REQUESTS_PATH}")
-       print(f"Evaluation results path: {EVAL_RESULTS_PATH}")
 
-       # Initialize device
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {device}")
 
-       # Load model and tokenizer with enhanced error handling
        try:
-           print(f"\nLoading model: {model_name}")
-           print(f"Model path exists: {os.path.exists(model_name)}")
+           print(f"\nLoading model and tokenizer for: {model_name}")
 
-
-           config = AutoConfig.from_pretrained(model_name, revision=revision)
-           print(f"Model type from config: {config.model_type}")
-       except Exception as config_error:
-           print(f"Error loading config: {str(config_error)}")
-
-       # Try loading with trust_remote_code=True first
-       try:
-           print("\nAttempting to load with trust_remote_code=True...")
-           model = AutoModelForSequenceClassification.from_pretrained(
-               model_name,
-               revision=revision,
-               torch_dtype=getattr(torch, precision),
-               trust_remote_code=True
-           ).to(device)
-           print(f"Successfully loaded model {model_name} with trust_remote_code=True")
-           print(f"Model class: {model.__class__.__name__}")
-       except Exception as e1:
-           print(f"Error loading with trust_remote_code=True: {str(e1)}")
-           print(f"Error type: {type(e1).__name__}")
-
-           # If it's a model type error, try with llama as model type
-           if "Unrecognized model" in str(e1) and "llama" in model_name.lower():
-               print("\nAttempting to load as llama model...")
-               try:
-                   model = AutoModelForSequenceClassification.from_pretrained(
-                       model_name,
-                       revision=revision,
-                       torch_dtype=getattr(torch, precision),
-                       trust_remote_code=True,
-                       model_type="llama"
-                   ).to(device)
-                   print(f"Successfully loaded model {model_name} as llama model")
-                   print(f"Model class: {model.__class__.__name__}")
-               except Exception as e2:
-                   print(f"Error loading as llama model: {str(e2)}")
-                   print(f"Error type: {type(e2).__name__}")
-                   raise Exception(f"Failed to load model with both methods: {str(e1)}, {str(e2)}")
-           else:
-               raise e1
-
-       print(f"\nLoading tokenizer: {model_name}")
-       try:
-           tokenizer = AutoTokenizer.from_pretrained(model_name, revision=revision)
-           print(f"Successfully loaded tokenizer for {model_name}")
-           print(f"Tokenizer class: {tokenizer.__class__.__name__}")
-       except Exception as e:
-           print(f"Error loading tokenizer: {str(e)}")
-           print(f"Error type: {type(e).__name__}")
-           raise Exception(f"Failed to load tokenizer: {str(e)}")
-
-       # Run evaluations
-       print("\nStarting TSAC sentiment evaluation...")
-       try:
-           tsac_results = evaluate_tsac_sentiment(model, tokenizer, device)
-           print(f"TSAC results: {tsac_results}")
-       except Exception as e:
-           print(f"Error in TSAC evaluation for {model_name}: {str(e)}")
-           print(f"Error type: {type(e).__name__}")
-           tsac_results = {"accuracy": None}
-
-       print("\nStarting Tunisian Corpus evaluation...")
-       try:
-           tunisian_results = evaluate_tunisian_corpus_coverage(model, tokenizer, device)
-           print(f"Tunisian Corpus results: {tunisian_results}")
-       except Exception as e:
-           print(f"Error in Tunisian Corpus evaluation for {model_name}: {str(e)}")
-           print(f"Error type: {type(e).__name__}")
-           tunisian_results = {"coverage": None}
-
-       print("\nEvaluation completed successfully!")
-       print(f"Final results: {tsac_results} | {tunisian_results}")
-       return EvaluationResult(
-           model=model_name,
+           model = AutoModelForSequenceClassification.from_pretrained(
+               model_name,
                revision=revision,
-
-
-
-
-
-
-       )
+               torch_dtype=getattr(torch, precision),
+               trust_remote_code=True
+           ).to(device)
+           tokenizer = AutoTokenizer.from_pretrained(model_name, revision=revision)
+
+           print(f"Successfully loaded model and tokenizer.")
        except Exception as e:
-
-           print(f"Error
+           error_msg = f"Failed to load model or tokenizer: {str(e)}"
+           print(f"Error: {error_msg}")
            print(f"Full traceback: {traceback.format_exc()}")
            return EvaluationResult(
                model=model_name,
@@ -303,11 +74,43 @@ def evaluate_model(model_name: str, revision: str, precision: str, weight_type:
                precision=precision,
                weight_type=weight_type,
                results={},
-               error=
+               error=error_msg
            )
+
+       tsac_results = {"accuracy": None}
+       tunisian_results = {"coverage": None}
+
+       print("\nStarting TSAC sentiment evaluation...")
+       try:
+           tsac_results = evaluate_tsac_sentiment(model, tokenizer, device)
+           print(f"TSAC results: {tsac_results}")
+       except Exception as e:
+           print(f"Error in TSAC evaluation for {model_name}: {str(e)}")
+           print(f"Full traceback: {traceback.format_exc()}")
+
+       print("\nStarting Tunisian Corpus evaluation...")
+       try:
+           tunisian_results = evaluate_tunisian_corpus_coverage(model, tokenizer, device)
+           print(f"Tunisian Corpus results: {tunisian_results}")
+       except Exception as e:
+           print(f"Error in Tunisian Corpus evaluation for {model_name}: {str(e)}")
+           print(f"Full traceback: {traceback.format_exc()}")
+
+       print("\nEvaluation completed successfully!")
+
+       return EvaluationResult(
+           model=model_name,
+           revision=revision,
+           precision=precision,
+           weight_type=weight_type,
+           results={
+               "accuracy": tsac_results.get("fbougares/tsac"),
+               "coverage": tunisian_results.get("arbml/Tunisian_Dialect_Corpus")
+           }
+       )
    except Exception as e:
-
-       print(f"Error
+       error_msg = f"An unexpected error occurred during evaluation: {str(e)}"
+       print(f"Error: {error_msg}")
        print(f"Full traceback: {traceback.format_exc()}")
        return EvaluationResult(
            model=model_name,
@@ -315,54 +118,75 @@ def evaluate_model(model_name: str, revision: str, precision: str, weight_type:
            precision=precision,
            weight_type=weight_type,
            results={},
-           error=
+           error=error_msg
        )
 
+
 def process_evaluation_queue():
-    """
+    """
+    Processes all pending evaluations in the queue.
+    This function acts as a worker that finds a PENDING job, runs it,
+    and updates the status on the Hugging Face Hub.
+    """
    print(f"\n=== Starting evaluation queue processing ===")
    print(f"Current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+
+    # --- NEW STEP: Download the latest queue from Hugging Face Hub ---
+    try:
+        print(f"Downloading evaluation requests from: {QUEUE_REPO}")
+        snapshot_download(
+            repo_id=QUEUE_REPO,
+            repo_type="dataset",
+            local_dir=EVAL_REQUESTS_PATH,
+            local_dir_use_symlinks=False,
+            token=API.token
+        )
+        print("Successfully downloaded evaluation queue.")
+    except Exception as e:
+        print(f"Error downloading evaluation queue: {str(e)}")
+        print(f"Full traceback: {traceback.format_exc()}")
+        return
+
    print(f"Looking for evaluation requests in: {EVAL_REQUESTS_PATH}")
 
-    # Get all pending evaluations
    if not os.path.exists(EVAL_REQUESTS_PATH):
        print(f"Evaluation requests path does not exist: {EVAL_REQUESTS_PATH}")
        return
 
-
-
-
-
-
-        model_dir_path = os.path.join(EVAL_REQUESTS_PATH, model_dir)
-        print(f"\nChecking model directory: {model_dir_path}")
-
-        # Find all JSON files in the model directory
-        json_files = [f for f in os.listdir(model_dir_path) if f.endswith('.json')]
-        print(f"Found {len(json_files)} pending evaluation requests")
-        for file in json_files:
-            file_path = os.path.join(model_dir_path, file)
-            print(f" - {file_path}")
-            try:
-                with open(file_path, 'r') as f:
-                    eval_entry = json.load(f)
-
-
-
-
-                    print(f"\n=== Found pending evaluation ===")
-                    print(f"Model: {eval_entry['model']}")
-                    print(f"Revision: {eval_entry['revision']}")
-                    print(f"Precision: {eval_entry['precision']}")
-                    print(f"Weight type: {eval_entry['weight_type']}")
-
-                    eval_entry['status'] = EvaluationStatus.RUNNING.value
-                    with open(file_path, 'w') as f:
-                        json.dump(eval_entry, f, indent=2)
-
+    for root, _, files in os.walk(EVAL_REQUESTS_PATH):
+        for filename in files:
+            if filename.endswith('.json'):
+                file_path = os.path.join(root, filename)
+                print(f"\nProcessing file: {file_path}")
+
+                try:
+                    with open(file_path, 'r') as f:
+                        eval_entry = json.load(f)
+
+                    status = eval_entry.get('status', '')
+
+                    if status == EvaluationStatus.PENDING.value:
+                        print(f"Found pending evaluation for model: {eval_entry['model']}")
+
+                        # --- Step 1: Update status to RUNNING locally and on Hub ---
+                        eval_entry['status'] = EvaluationStatus.RUNNING.value
+                        with open(file_path, 'w') as f:
+                            json.dump(eval_entry, f, indent=2)
+
+                        user_name = os.path.basename(root)
+                        path_in_repo_queue = os.path.join(user_name, filename)
+
+                        # Upload the updated file to the queue repo to reflect 'RUNNING' status
+                        API.upload_file(
+                            path_or_fileobj=file_path,
+                            path_in_repo=path_in_repo_queue,
+                            repo_id=QUEUE_REPO,
+                            repo_type="dataset",
+                            commit_message=f"Update status to RUNNING for {eval_entry['model']}"
+                        )
+                        print(f"Updated status to RUNNING in queue: {path_in_repo_queue}")
+
+                        # --- Step 2: Run the evaluation ---
                        print("\n=== Starting evaluation ===")
                        eval_result = evaluate_model(
                            model_name=eval_entry['model'],
@@ -370,121 +194,58 @@ def process_evaluation_queue():
                            precision=eval_entry['precision'],
                            weight_type=eval_entry['weight_type']
                        )
-
                        print("\n=== Evaluation completed ===")
-                    print(f"Results: {eval_result.results}")
-
-                    # Update status to FINISHED and add results
-                    eval_entry['status'] = EvaluationStatus.FINISHED.value
-                    eval_entry['results'] = eval_result.results
 
+                        # --- Step 3: Update file with final status and results locally ---
                        if eval_result.error:
+                            eval_entry['status'] = EvaluationStatus.FAILED.value
                            eval_entry['error'] = eval_result.error
+                            print(f"Evaluation failed with error: {eval_result.error}")
+                        else:
+                            eval_entry['status'] = EvaluationStatus.FINISHED.value
+                            eval_entry['results'] = eval_result.results
+                            print(f"Evaluation finished successfully. Results: {eval_result.results}")
 
-                    # Save updated entry
                        with open(file_path, 'w') as f:
                            json.dump(eval_entry, f, indent=2)
 
-                    #
-                    if not os.path.exists(EVAL_RESULTS_PATH):
-                        os.makedirs(EVAL_RESULTS_PATH)
-
-                    result_filename = os.path.basename(file_path)
-                    result_path = os.path.join(EVAL_RESULTS_PATH, result_filename)
-
-                    os.rename(file_path, result_path)
-                    print(f"\nMoved evaluation result to: {result_path}")
-
-                    # Upload to Hugging Face
+                        # --- Step 4: Upload the final file to the results directory on the Hub ---
                        try:
+                            # Use the local file with its final status as the basis for the results file
+                            path_in_repo_results = os.path.join(user_name, filename)
                            API.upload_file(
-                            path_or_fileobj=
-                            path_in_repo=
+                                path_or_fileobj=file_path,
+                                path_in_repo=path_in_repo_results,
                                repo_id=RESULTS_REPO,
                                repo_type="dataset",
-                            commit_message=f"
+                                commit_message=f"Evaluation {'results' if not eval_result.error else 'error'} for {eval_entry['model']}"
                            )
-                        print("\nResults uploaded to Hugging Face")
+                            print("\nResults uploaded to Hugging Face successfully.")
+
                        except Exception as upload_error:
                            print(f"Error uploading results: {str(upload_error)}")
-
-
-
-            except Exception as eval_error:
-                print(f"\n=== Error during evaluation ===")
-                print(f"Error: {str(eval_error)}")
-                print(f"Full traceback: {traceback.format_exc()}")
-
-                # Update status to FAILED and add error
-                eval_entry['status'] = EvaluationStatus.FAILED.value
-                eval_entry['error'] = str(eval_error)
-
-                with open(file_path, 'w') as f:
-                    json.dump(eval_entry, f, indent=2)
-
-                # Move failed evaluation to results directory
-                if not os.path.exists(EVAL_RESULTS_PATH):
-                    os.makedirs(EVAL_RESULTS_PATH)
-
-                result_filename = os.path.basename(file_path)
-                result_path = os.path.join(EVAL_RESULTS_PATH, result_filename)
-
-                os.rename(file_path, result_path)
-                print(f"\nMoved failed evaluation to: {result_path}")
-
-                # Upload error file
+
+                        # --- Step 5: Update the status of the request in the queue to FINISHED/FAILED ---
+                        # This keeps a record of all processed jobs in the queue repo.
                        try:
-
-                        path_or_fileobj=
-                        path_in_repo=
-                        repo_id=
+                            API.upload_file(
+                                path_or_fileobj=file_path,
+                                path_in_repo=path_in_repo_queue,
+                                repo_id=QUEUE_REPO,
                                repo_type="dataset",
-                        commit_message=f"
+                                commit_message=f"Final status update for {eval_entry['model']}"
                            )
-
-                except Exception as
-                    print(f"Error
-            elif status == EvaluationStatus.RUNNING.value:
-                print(f"\n=== Found running evaluation ===")
-                print(f"Model: {eval_entry['model']}")
-                print(f"Revision: {eval_entry['revision']}")
-                print(f"Precision: {eval_entry['precision']}")
-                print(f"Weight type: {eval_entry['weight_type']}")
-
-                try:
-                    # Check if we have results for this evaluation
-                    result_filename = os.path.basename(file_path)
-                    result_path = os.path.join(EVAL_RESULTS_PATH, result_filename)
-
-                    if os.path.exists(result_path):
-                        print(f"\nFound existing results file: {result_path}")
-                        # Update status to FINISHED
-                        eval_entry['status'] = EvaluationStatus.FINISHED.value
-                        with open(file_path, 'w') as f:
-                            json.dump(eval_entry, f, indent=2)
-                    else:
-                        print("\nNo results found. Restarting evaluation...")
-                        # Restart the evaluation
-                        eval_entry['status'] = EvaluationStatus.PENDING.value
-                        with open(file_path, 'w') as f:
-                            json.dump(eval_entry, f, indent=2)
-                except Exception as check_error:
-                    print(f"\n=== Error checking running evaluation ===")
-                    print(f"Error: {str(check_error)}")
-                    print(f"Full traceback: {traceback.format_exc()}")
-
-                    # If we can't check the status, restart the evaluation
-                    eval_entry['status'] = EvaluationStatus.PENDING.value
-                    with open(file_path, 'w') as f:
-                        json.dump(eval_entry, f, indent=2)
-            except Exception as e:
-                print(f"Error processing file {file}: {str(e)}")
-                print(f"Full traceback: {traceback.format_exc()}")
-                continue
+                            print(f"Final status for {eval_entry['model']} updated in the queue repository.")
+                        except Exception as status_update_error:
+                            print(f"Error updating status in queue: {str(status_update_error)}")
 
-
-
-
-
-
+                    else:
+                        print(f"Skipping file with status: {status}")
+                except Exception as e:
+                    print(f"Error processing file {file_path}: {str(e)}")
+                    print(f"Full traceback: {traceback.format_exc()}")
+                    continue
 
+    print("\n=== Evaluation queue processed. ===")
+    print("No more pending jobs found.")
+    return
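
Taken together, the rewritten process_evaluation_queue is meant to be safe to call repeatedly: each invocation re-downloads the queue, walks every <user>/<file>.json under EVAL_REQUESTS_PATH, and only acts on entries whose status is PENDING. A minimal driver loop mirroring what app.py and run_evaluator.py do (the poll interval is illustrative; the diffs themselves use short test values):

import time

def worker_loop(poll_seconds: int = 300):
    """Poll the queue forever; log and retry on any error."""
    while True:
        try:
            process_evaluation_queue()
            print("Evaluation queue processed.")
        except Exception as e:
            print(f"Error in evaluation process: {e}")
        time.sleep(poll_seconds)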
src/evaluator/run_evaluator.py
CHANGED
@@ -17,11 +17,11 @@ def main():
        try:
            process_evaluation_queue()
            print("Evaluation queue processed. Sleeping for 5 minutes...")
-           time.sleep(
+           time.sleep(20) # Sleep for 5 minutes
        except Exception as e:
            print(f"Error in evaluation process: {e}")
            print("Retrying in 5 minutes...")
-           time.sleep(
+           time.sleep(20)
 
 if __name__ == "__main__":
    main()
src/evaluator/tsac.py
ADDED
@@ -0,0 +1,133 @@
+import torch
+from datasets import load_dataset
+import traceback
+import time
+
+
+def evaluate_tsac_sentiment(model, tokenizer, device):
+    """Evaluate model on TSAC sentiment analysis task"""
+    try:
+        print("\n=== Starting TSAC sentiment evaluation ===")
+        print(f"Current device: {device}")
+
+        # Load and preprocess dataset
+        print("\nLoading and preprocessing TSAC dataset...")
+        dataset = load_dataset("fbougares/tsac", split="test", trust_remote_code=True)
+        dataset = dataset.select(range(10)) # Only evaluate on 200 samples
+
+        # print(f"Dataset size: {len(dataset)} examples")
+
+        def preprocess(examples):
+            return tokenizer(
+                examples['sentence'],
+                padding=True,
+                truncation=True,
+                max_length=512,
+                return_tensors=None
+            )
+        print(dataset.column_names)
+        dataset = dataset.map(preprocess, batched=True)
+        dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'target'])
+
+        # Check first example
+        first_example = dataset[0]
+        print("\nFirst example details:")
+        print(f"Input IDs shape: {first_example['input_ids'].shape}")
+        print(f"Attention mask shape: {first_example['attention_mask'].shape}")
+        print(f"Target: {first_example['target']}")
+
+        model.eval()
+        print(f"\nModel class: {model.__class__.__name__}")
+        print(f"Model device: {next(model.parameters()).device}")
+
+        with torch.no_grad():
+            predictions = []
+            targets = []
+
+            # Create DataLoader with batch size 16
+            from torch.utils.data import DataLoader
+
+            # Define a custom collate function
+            def collate_fn(batch):
+                input_ids = torch.stack([sample['input_ids'] for sample in batch])
+                attention_mask = torch.stack([sample['attention_mask'] for sample in batch])
+                targets = torch.stack([sample['target'] for sample in batch])
+                return {
+                    'input_ids': input_ids,
+                    'attention_mask': attention_mask,
+                    'target': targets
+                }
+
+
+
+            dataloader = DataLoader(
+                dataset,
+                batch_size=16,
+                shuffle=False,
+                collate_fn=collate_fn
+            )
+
+            for i, batch in enumerate(dataloader):
+                if i % 10 == 0 :
+                    print("\nProcessing first batch...")
+                    print(f"Batch keys: {list(batch.keys())}")
+                    print(f"Target shape: {batch['target'].shape}")
+
+                inputs = {k: v.to(device) for k, v in batch.items() if k != 'target'}
+                target = batch['target'].to(device)
+                before = time.time()
+                outputs = model(**inputs)
+                # print(f"\nBatch {i} output type: {type(outputs)}")
+
+                # Handle different model output formats
+                if isinstance(outputs, dict):
+                    # print(f"Output keys: {list(outputs.keys())}")
+                    if 'logits' in outputs:
+                        logits = outputs['logits']
+                    elif 'prediction_logits' in outputs:
+                        logits = outputs['prediction_logits']
+                    else:
+                        raise ValueError(f"Unknown output format. Available keys: {list(outputs.keys())}")
+                elif isinstance(outputs, tuple):
+                    print(f"Output tuple length: {len(outputs)}")
+                    logits = outputs[0]
+                else:
+                    logits = outputs
+
+                # print(f"Logits shape: {logits.shape}")
+
+                # For sequence classification, we typically use the [CLS] token's prediction
+                if len(logits.shape) == 3:  # [batch_size, sequence_length, num_classes]
+                    logits = logits[:, 0, :]  # Take the [CLS] token prediction
+
+                # print(f"Final logits shape: {logits.shape}")
+
+                batch_predictions = logits.argmax(dim=-1).cpu().tolist()
+                batch_targets = target.cpu().tolist()
+
+                predictions.extend(batch_predictions)
+                targets.extend(batch_targets)
+
+                if i % 10 == 0:
+                    print("\nFirst batch predictions:")
+                    print(f"Predictions: {batch_predictions[:5]}")
+                    print(f"Targets: {batch_targets[:5]}")
+
+        print(f"\nTotal predictions: {len(predictions)}")
+        print(f"Total targets: {len(targets)}")
+
+        # Calculate accuracy
+        correct = sum(p == t for p, t in zip(predictions, targets))
+        total = len(predictions)
+        accuracy = correct / total if total > 0 else 0.0
+
+        print(f"\nEvaluation results:")
+        print(f"Correct predictions: {correct}")
+        print(f"Total predictions: {total}")
+        print(f"Accuracy: {accuracy:.4f}")
+
+        return {"fbougares/tsac": accuracy}
+    except Exception as e:
+        print(f"\n=== Error in TSAC evaluation: {str(e)} ===")
+        print(f"Full traceback: {traceback.format_exc()}")
+        raise e
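
A hypothetical call, assuming a sequence-classification checkpoint is already available (the model name is a placeholder; note that as committed the function only scores the first 10 test examples because of the dataset.select(range(10)) line above):

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
name = "some-org/some-tunisian-bert"  # placeholder checkpoint id
model = AutoModelForSequenceClassification.from_pretrained(name).to(device)
tokenizer = AutoTokenizer.from_pretrained(name)

scores = evaluate_tsac_sentiment(model, tokenizer, device)
print(scores)  # e.g. {"fbougares/tsac": 0.8} — illustrative output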
src/evaluator/tunisian_corpus_coverage.py
ADDED
@@ -0,0 +1,48 @@
+import json
+import os
+from typing import Dict, Any
+from dataclasses import dataclass
+from enum import Enum
+from datetime import datetime
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
+from datasets import load_dataset
+import traceback
+from src.envs import API, OWNER, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, RESULTS_REPO
+
+
+
+def evaluate_tunisian_corpus_coverage(model, tokenizer, device):
+    """Evaluate model's coverage on Tunisian Dialect Corpus"""
+    try:
+        dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="train")
+
+        def preprocess(examples):
+            # print("Tunisian Corpus preprocess exemples -------------",examples)
+            # Use 'Tweet' field as per dataset structure
+            return tokenizer(
+                examples['Tweet'],
+                padding=False,  # We don't need padding for token coverage
+                truncation=False,  # Don't truncate long sequences
+                max_length=None  # Let tokenizer handle the length
+            )
+
+        dataset = dataset.map(preprocess, batched=True)
+
+        total_tokens = 0
+        covered_tokens = 0
+
+        for example in dataset:
+            input_ids = example['input_ids']
+
+            tokens = tokenizer.convert_ids_to_tokens(input_ids)
+            total_tokens += len(tokens)
+            covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
+
+        coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
+        print(f"Tunisian Corpus Coverage: {coverage:.2%}")
+        return {"arbml/Tunisian_Dialect_Corpus": coverage}
+    except Exception as e:
+        print(f"Error in Tunisian Corpus evaluation: {str(e)}")
+        print(f"Full traceback: {traceback.format_exc()}")
+        raise e
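
The coverage metric here is simply the fraction of tokens the tokenizer can represent without falling back to its unknown token, so a vocabulary that misses dialect-specific words scores lower. A toy worked example (the token strings and unk symbol are illustrative):

unk = "[UNK]"                               # a common unk_token; varies by tokenizer
tokens = ["برشة", unk, "باهي", "##ة", unk]  # 5 tokens, 2 unknown
coverage = sum(t != unk for t in tokens) / len(tokens)
print(f"{coverage:.2%}")                    # 60.00%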
src/leaderboard/read_evals.py CHANGED
@@ -1,11 +1,7 @@
-import glob
 import json
-import math
 import os
 from dataclasses import dataclass
 
-import dateutil
-import numpy as np
 
 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
@@ -14,8 +10,7 @@ from src.submission.check_validity import is_model_on_hub
 
 @dataclass
 class EvalResult:
-    """Represents one full evaluation. Built from a
-    """
+    """Represents one full evaluation. Built from a single result file for a given run."""
     eval_name: str  # org_model_precision (uid)
     full_model: str  # org/model (path on hub)
     org: str
@@ -38,108 +33,61 @@ class EvalResult:
         try:
             with open(json_filepath) as fp:
                 data = json.load(fp)
-
-            #
-
-            org_and_model =
+
+            # Extract model information from the JSON data
+            full_model_name = data.get('model')
+            org_and_model = full_model_name.split("/", 1)
             org = org_and_model[0]
             model = org_and_model[1]
 
-            #
+            # Extract other metadata
+            precision_str = data.get('precision', 'Unknown')
+            precision = Precision.from_str(precision_str)
+            model_type = ModelType.from_str(data.get('model_type', 'Unknown'))
+            weight_type = WeightType.from_str(data.get('weight_type', 'Original'))
+            revision = data.get('revision', '')
+            date = data.get('submitted_at', '')
+
+            # Extract results and metadata
             results = data.get('results', {})
-
+            license = data.get('license', '?')
+            likes = data.get('likes', 0)
+            num_params = data.get('params', 0)
+            architecture = data.get('architecture', 'Unknown')
 
-            #
+            # Check if the model is still on the hub
+            still_on_hub, _, _ = is_model_on_hub(full_model_name, revision=revision)
+
             return EvalResult(
                 eval_name=f"{org}_{model}_{precision.value}",
-                full_model=
+                full_model=full_model_name,
                 org=org,
                 model=model,
-                revision=
+                revision=revision,
                 results=results,
                 precision=precision,
-                model_type=
-                weight_type=
-
-
+                model_type=model_type,
+                weight_type=weight_type,
+                architecture=architecture,
+                license=license,
+                likes=likes,
+                num_params=num_params,
+                date=date,
+                still_on_hub=still_on_hub
             )
         except Exception as e:
             print(f"Error reading evaluation file {json_filepath}: {str(e)}")
             return None
 
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-
-        # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
-
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
-
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, revision=config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        )
-        architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
-
-        # Extract results available in this file (some results are split in several files)
-        results = {}
-        for task in Tasks:
-            task = task.value
-
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-
-        return self(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
-            results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture
-        )
-
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
-
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-
+        # Calculate the average score for the leaderboard
+        scores = [v for k, v in self.results.items() if v is not None and k in [task.value.metric for task in Tasks]]
+        average = sum(scores) / len(scores) if scores else 0
+
         AutoEvalColumnInstance = AutoEvalColumn()
         data_dict = {
-            "eval_name": self.eval_name,
+            "eval_name": self.eval_name,
             AutoEvalColumnInstance.precision.name: self.precision.value.name,
             AutoEvalColumnInstance.model_type.name: self.model_type.value.name,
             AutoEvalColumnInstance.model_type_symbol.name: self.model_type.value.symbol,
@@ -151,124 +99,39 @@ class EvalResult:
             AutoEvalColumnInstance.license.name: self.license,
             AutoEvalColumnInstance.likes.name: self.likes,
             AutoEvalColumnInstance.params.name: self.num_params,
-            AutoEvalColumnInstance.still_on_hub.name:
+            AutoEvalColumnInstance.still_on_hub.name: self.still_on_hub,
         }
 
-        #
-        tsac_result = self.results.get("fbougares/tsac")
-        tunisian_result = self.results.get("arbml/Tunisian_Dialect_Corpus")
-
-        # Map metric values to their corresponding dataset names
+        # Dynamically map metric values to their corresponding column names
         for task in Tasks:
-
-
-
-
-        print("data_dict : ", data_dict)
+            task_metric = task.value.metric
+            task_col_name = task.value.col_name
+            data_dict[task_col_name] = self.results.get(task_metric)
+
         return data_dict
 
 
-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
-            req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
-                request_file = tmp_request_file
-    return request_file
 
 
 def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
+    # Recursively find all result files
     for root, _, files in os.walk(results_path):
-        # Only process .json files
         json_files = [f for f in files if f.endswith(".json")]
-        print(json_files)
         for file in json_files:
             model_result_filepaths.append(os.path.join(root, file))
-    print(model_result_filepaths)
 
-    eval_results =
+    eval_results = []
     for model_result_filepath in model_result_filepaths:
         try:
-            # Creation of result
            eval_result = EvalResult.init_from_json_file(model_result_filepath)
-
-
+            if eval_result is not None:
+                eval_results.append(eval_result)
+            else:
                print(f"Skipping invalid evaluation file: {model_result_filepath}")
-                continue
-
-            eval_result.update_with_request_file(requests_path)
-            # print(eval_result)
-            # Store results of same eval together
-            if eval_result.eval_name not in eval_results:
-                eval_results[eval_result.eval_name] = []
-            eval_results[eval_result.eval_name].append(eval_result)
-            # print(eval_results)
-
        except Exception as e:
            print(f"Error processing evaluation file {model_result_filepath}: {str(e)}")
            continue
-
-
-        eval_name = eval_result.eval_name
-        print("eval_name : ", eval_name)
-        if eval_name in eval_results.keys():
-            # If we already have results for this eval, append to list
-            eval_results[eval_name].append(eval_result)
-        else:
-            # Initialize list for this eval name
-            eval_results[eval_name] = [eval_result]
-    print("eval_results : ", eval_results)
-    # Process final results
-    final_results = {}
-    for eval_name, eval_list in eval_results.items():
-        # Create merged results from all evaluations, ensuring all required task keys are present
-        merged_results = {task.value.metric: None for task in Tasks}
-        for eval_result in eval_list:
-            merged_results.update({k: v for k, v in eval_result.results.items() if v is not None})
-
-        # Take the first eval_result as base and update with merged results
-        print("evaluation list : ", len(eval_list))
-        base_result = eval_list[0]
-        print("base_result : ", base_result)
-        # print(base_result)
-        final_results[eval_name] = EvalResult(
-            eval_name=eval_name,
-            full_model=base_result.full_model,
-            org=base_result.org,
-            model=base_result.model,
-            revision=base_result.revision,
-            results=merged_results,
-            precision=base_result.precision,
-            model_type=base_result.model_type,
-            weight_type=base_result.weight_type,
-            date=base_result.date,
-            still_on_hub=base_result.still_on_hub
-        )
-    print(len(final_results))
-    print(final_results.keys())
-    print(final_results.values())
-
-    results = []
-    for v in final_results.values():
-        try:
-            v.to_dict() # we test if the dict version is complete
-            results.append(v)
-        except KeyError as e: # not all eval values present
-            print("error in v",e)
-            continue
-    return results
+
+    return eval_results
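For reference, init_from_json_file now reads a flat result file rather than the old config/results layout; a hypothetical example of the fields it consumes (all values invented, shown as a Python dict):

# Invented example of the flat result-file layout that EvalResult.init_from_json_file reads.
example_result = {
    "model": "some-org/some-model",            # split into org and model
    "precision": "float16",
    "model_type": "fine-tuned",
    "weight_type": "Original",
    "revision": "main",
    "submitted_at": "2024-01-01T10:00:00Z",
    "results": {
        "fbougares/tsac": 0.82,                # sentiment accuracy
        "arbml/Tunisian_Dialect_Corpus": 0.95  # tokenizer coverage
    },
    "license": "apache-2.0",
    "likes": 3,
    "params": 0.135,
    "architecture": "BertForSequenceClassification",
}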
src/populate.py CHANGED
@@ -1,26 +1,23 @@
 import json
 import os
-
 import pandas as pd
+from datetime import datetime, timedelta
+import dateutil
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
+from src.display.utils import AutoEvalColumn, EvalQueueColumn, ModelType, Tasks, Precision, WeightType
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
-    print(raw_data)
     all_data_json = [v.to_dict() for v in raw_data]
-    print(all_data_json)
     df = pd.DataFrame.from_records(all_data_json)
-    print(df)
     if df.empty:
         print("No evaluation results found. Returning empty DataFrame with correct columns.")
         return pd.DataFrame(columns=cols)
     df = df.sort_values(by=[AutoEvalColumn().average.name], ascending=False)
-    # print(df)
     df = df[cols].round(decimals=4)
     df = df[has_no_nan_values(df, benchmark_cols)]
     return df
@@ -28,34 +25,44 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     """Creates the different dataframes for the evaluation queues requestes"""
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
     all_evals = []
+
+    # Define a threshold to identify "stuck" jobs
+    time_threshold = datetime.now() - timedelta(hours=1)
+
+    # Use os.walk for a robust way to find all files recursively
+    for root, _, files in os.walk(save_path):
+        for filename in files:
+            if filename.endswith(".json"):
+                file_path = os.path.join(root, filename)
+                try:
+                    with open(file_path, "r") as fp:
+                        data = json.load(fp)
+
+                    # Check for "stuck" jobs
+                    if data.get("status") == "RUNNING":
+                        submitted_time_str = data.get("submitted_at")
+                        if submitted_time_str:
+                            submitted_time = dateutil.parser.isoparse(submitted_time_str)
+                            if submitted_time < time_threshold:
+                                print(f"Stuck job detected for {data['model']}. Changing status to PENDING.")
+                                data["status"] = "PENDING"
+
+                    data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+                    data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+                    all_evals.append(data)
 
-
-
-
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
-            all_evals.append(data)
-        elif ".md" not in entry and os.path.isdir(os.path.join(save_path, entry)):
-            # this is a folder
-            sub_entries = [e for e in os.listdir(os.path.join(save_path, entry)) if os.path.isfile(os.path.join(save_path, entry, e)) and not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
+                except Exception as e:
+                    print(f"Error processing file {file_path}: {e}")
+                    continue
 
     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
     finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-
-
-
+
+    df_pending = pd.DataFrame.from_records(pending_list, columns=cols) if pending_list else pd.DataFrame(columns=cols)
+    df_running = pd.DataFrame.from_records(running_list, columns=cols) if running_list else pd.DataFrame(columns=cols)
+    df_finished = pd.DataFrame.from_records(finished_list, columns=cols) if finished_list else pd.DataFrame(columns=cols)
+
     return df_finished[cols], df_running[cols], df_pending[cols]
+
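The stuck-job handling above boils down to a timestamp comparison. A minimal, self-contained sketch of that check (the request payload is invented; unlike the patch, this sketch uses timezone-aware datetimes on both sides of the comparison):

# Invented example payload; only "status" and "submitted_at" matter for this check.
from datetime import datetime, timedelta, timezone
import dateutil.parser

request = {"model": "some-org/some-model", "status": "RUNNING",
           "submitted_at": "2024-01-01T10:00:00+00:00"}

time_threshold = datetime.now(timezone.utc) - timedelta(hours=1)
submitted_time = dateutil.parser.isoparse(request["submitted_at"])

# A RUNNING job submitted more than an hour ago is treated as stuck and re-queued.
if request["status"] == "RUNNING" and submitted_time < time_threshold:
    request["status"] = "PENDING"

print(request["status"])  # -> PENDING for this stale example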
src/submission/submit.py CHANGED
@@ -1,5 +1,7 @@
 import json
 import os
+import time
+import traceback
 from datetime import datetime, timezone
 
 from src.display.formatting import styled_error, styled_message, styled_warning
@@ -10,27 +12,27 @@ from src.submission.check_validity import (
     get_model_size,
     is_model_on_hub,
 )
-from src.evaluator.evaluate import
-
-import torch
-from transformers import AutoModelForSequenceClassification, AutoTokenizer
-from datasets import load_dataset
-import time
+from src.evaluator.evaluate import EvaluationStatus
+
 
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 
-
+
+def _create_eval_request(
     model: str,
     base_model: str,
     revision: str,
     precision: str,
     weight_type: str,
     model_type: str,
+    model_info: dict,
 ):
-    """
+    """
+    Creates and uploads a JSON file for a new model evaluation request.
+    This function is a helper for add_new_eval and should not be called directly.
+    """
     try:
-        # Create evaluation request file
         request_data = {
             'model': model,
             'base_model': base_model,
@@ -39,345 +41,141 @@ def create_eval_request(
             'weight_type': weight_type,
             'model_type': model_type,
             'status': EvaluationStatus.PENDING.value,
-            'submitted_time': datetime.now(timezone.utc).isoformat()
+            'submitted_time': datetime.now(timezone.utc).isoformat(),
+            'likes': model_info.likes,
+            'params': get_model_size(model_info, precision),
+            'license': model_info.cardData.get("license"),
+            'private': model_info.private,
         }
 
-
-
-        request_filename = f"{
-        request_path = os.path.join(EVAL_REQUESTS_PATH, request_filename)
+        user_name = model.split('/')[0] if '/' in model else 'unknown'
+        safe_revision = revision.replace('/', '_')
+        request_filename = f"{model.replace('/', '_')}_eval_request_{safe_revision}_{precision}_{weight_type}.json"
 
-
-
-
-
-        print(f"
-
-        # Upload to Hugging Face
-        API.upload_file(
-            path_or_fileobj=request_path,
-            path_in_repo=request_filename if not username else os.path.join(username, request_filename),
-            repo_id=QUEUE_REPO,
-            repo_type="dataset",
-            commit_message=f"Add evaluation request for {model}",
-            token=TOKEN
-        )
-
-        print(f"Uploaded evaluation request to {QUEUE_REPO}")
+        local_dir = os.path.join(EVAL_REQUESTS_PATH, user_name)
+        os.makedirs(local_dir, exist_ok=True)
+        local_path = os.path.join(local_dir, request_filename)
+
+        print(f"Creating local evaluation request file: {local_path}")
 
-
-
-
+        # Use a try-finally block to ensure the local file is always removed
+        try:
+            with open(local_path, 'w') as f:
+                json.dump(request_data, f, indent=2)
+
+            # Upload the request file to the Hugging Face queue repository
+            print(f"Uploading evaluation request to {QUEUE_REPO}")
+            path_in_repo = os.path.join(user_name, request_filename)
+            API.upload_file(
+                path_or_fileobj=local_path,
+                path_in_repo=path_in_repo,
+                repo_id=QUEUE_REPO,
+                repo_type="dataset",
+                commit_message=f"Add evaluation request for {model}",
+                token=TOKEN
+            )
+
+            print(f"Uploaded successfully to {path_in_repo} in {QUEUE_REPO}")
+
+            return styled_message(
+                "Evaluation request created successfully! Please wait for the evaluation to complete."
+            )
+        finally:
+            if os.path.exists(local_path):
+                os.remove(local_path)
+                print(f"Local file {local_path} removed.")
+
     except Exception as e:
-        print(f"Error creating evaluation request: {str(e)}")
+        print(f"Error creating or uploading evaluation request: {str(e)}")
+        print(f"Full traceback:\n{traceback.format_exc()}")
         return styled_error(f"Failed to create evaluation request: {str(e)}")
 
-
-
-
-
-
-
-    model_type: str,
-):
-    """Validate model and create evaluation request"""
+
+def add_new_eval(model: str, base_model: str, revision: str, precision: str, weight_type: str, model_type: str):
+    """
+    Validates a model and creates an evaluation request for it.
+    This is the main function to be called by the user.
+    """
     try:
-        print("\n=== Starting
+        print("\n=== Starting Evaluation Submission ===")
         print(f"Submission time: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC")
-        print(f"Model: {model}")
-        print(f"Base model: {base_model}")
-        print(f"Revision: {revision}")
-        print(f"Precision: {precision}")
-        print(f"Weight type: {weight_type}")
-        print(f"Model type: {model_type}")
-        print(f"Evaluation requests path: {EVAL_REQUESTS_PATH}")
-        print(f"Queue repo: {QUEUE_REPO}")
+        print(f"Model: {model}, Base: {base_model}, Revision: {revision}, Precision: {precision}")
 
-
-
+        precision = precision.split(" ")[0]
+        if not revision:
+            revision = "main"
+            print("Using default revision: main")
+
+        # --- Step 1: Check for existing submissions ---
+        print("\n=== Checking for existing submissions ===")
         global REQUESTED_MODELS
         global USERS_TO_SUBMISSION_DATES
         start_time = time.time()
         REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
-        print(f"Cache refresh completed in {time.time() - start_time:.2f} seconds")
-        print(f"Found {len(REQUESTED_MODELS)} existing submissions")
-
-        user_name = ""
-        model_path = model
-        if "/" in model:
-            user_name = model.split("/")[0]
-            model_path = model.split("/")[1]
-        print(f"\nUser name: {user_name}")
-        print(f"Model path: {model_path}")
-
-        precision = precision.split(" ")[0]
-        if revision == "":
-            revision = "main"
-            print("Using default revision: main")
+        print(f"Cache refresh completed in {time.time() - start_time:.2f} seconds. Found {len(REQUESTED_MODELS)} existing submissions.")
 
-        current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-
-        # Check if model is already submitted
-        print("\n=== Checking for existing submission ===")
         model_key = f"{model}_{revision}_{precision}"
         if model_key in REQUESTED_MODELS:
-
-            # Get the status from the queue file
-            queue_file = REQUESTED_MODELS[model_key]
+            queue_file_path = REQUESTED_MODELS[model_key]
             try:
-                with open(
+                with open(queue_file_path, 'r') as f:
                    queue_entry = json.load(f)
                status = queue_entry.get('status')
-
-
-                    print(f"Warning: No status found in queue file {queue_file}")
-                    return styled_warning("Error checking model status. Please try again later.")
-
-                if status != EvaluationStatus.FAILED.value:
-                    print(f"Model already submitted and in {status} status")
-                    return styled_warning(f"This model has been already submitted and is in {status} status.")
+                if status is not None and status != EvaluationStatus.FAILED.value:
+                    return styled_warning(f"This model has already been submitted and is in a '{status}' status.")
            except Exception as e:
                print(f"Error reading queue file: {e}")
-                print(f"Full traceback
+                print(f"Full traceback:\n{traceback.format_exc()}")
                return styled_warning("Error checking model status. Please try again later.")
-
-        print(f"Error during evaluation: {str(e)}")
-        raise
+        print(f"No existing submission found for key: {model_key} or previous submission had a FAILED status.")
 
-
-
-
+        # --- Step 2: Validate model type and existence on the Hub ---
+        print("\n=== Validating model existence and card === ")
+        if not model_type:
            return styled_error("Please select a model type.")
 
-        print("\n=== Validating model existence ===")
-        if revision == "":
-            revision = "main"
-            print("Using default revision: main")
-
-        print("\n=== Validating model on Hugging Face ===")
        try:
+            # Validate the base model first for delta/adapter weights
            if weight_type in ["Delta", "Adapter"]:
-                print(f"Checking base model {base_model} on Hugging Face...")
-                base_model_on_hub, error, _ = is_model_on_hub(
-                    model_name=base_model,
-                    revision=revision,
-                    token=TOKEN,
-                    test_tokenizer=True
-                )
-                print(f"Base model check result: {base_model_on_hub}")
+                print(f"Checking base model '{base_model}' on Hugging Face...")
+                base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN)
                if not base_model_on_hub:
-
-                    return styled_error(f'Base model "{base_model}" {error}')
+                    return styled_error(f'Base model "{base_model}" was not found on the Hugging Face Hub: {error}')
 
-
-
-
-
-
-                token=TOKEN,
-                test_tokenizer=True
-            )
-            print(f"Model check result: {model_on_hub}")
-            if not model_on_hub:
-                print(f"Error: Model not found: {error}")
-                return styled_error(f'Model "{model}" {error}')
-        except Exception as e:
-            print(f"Error checking model on Hugging Face: {e}")
-            print(f"Full traceback: {traceback.format_exc()}")
-            return styled_error(f"Failed to validate model on Hugging Face: {str(e)}")
+            # Validate the main model
+            print(f"Checking model '{model}' on Hugging Face...")
+            model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN)
+            if not model_on_hub:
+                return styled_error(f'Model "{model}" was not found on the Hugging Face Hub: {error}')
 
-
-        try:
+            # Get model information and validate the model card and license
            model_info = API.model_info(repo_id=model, revision=revision)
-
-
-            print(f"Error getting model info: {e}")
-            print(f"Full traceback: {traceback.format_exc()}")
-            return styled_error("Could not get your model information. Please fill it up properly.")
-
-        print("\n=== Getting model size ===")
-        try:
-            model_size = get_model_size(model_info=model_info, precision=precision)
-            print(f"Model size: {model_size}")
-        except Exception as e:
-            print(f"Error getting model size: {e}")
-            print(f"Full traceback: {traceback.format_exc()}")
-            model_size = "?"
-
-        print("\n=== Validating model card and license ===")
-        try:
-            license = model_info.cardData["license"]
-            print(f"Model license: {license}")
-        except Exception as e:
-            print(f"Error getting model license: {e}")
-            print(f"Full traceback: {traceback.format_exc()}")
-            return styled_error("Please select a license for your model")
-
-        print("\n=== Checking model card ===")
-        try:
-            modelcard_OK, error_msg = check_model_card(model)
-            print(f"Model card check result: {modelcard_OK}")
-            if not modelcard_OK:
-                print(f"Model card error: {error_msg}")
+            model_card_ok, error_msg = check_model_card(model)
+            if not model_card_ok:
                return styled_error(error_msg)
-        except Exception as e:
-            print(f"Error checking model card: {e}")
-            print(f"Full traceback: {traceback.format_exc()}")
-            return styled_error("Failed to validate model card")
-
-        print("\n=== Creating evaluation entry ===")
-        eval_entry = {
-            "model": model,
-            "base_model": base_model,
-            "revision": revision,
-            "precision": precision,
-            "weight_type": weight_type,
-            "status": "PENDING",
-            "submitted_time": current_time,
-            "model_type": model_type,
-            "likes": model_info.likes,
-            "params": model_size,
-            "license": license,
-            "private": False,
-        }
-        print(f"\nEvaluation entry created: {json.dumps(eval_entry, indent=2)}")
-
-        print("\n=== Checking for duplicate submission ===")
-        model_key = f"{model}_{revision}_{precision}"
-        if model_key in REQUESTED_MODELS:
-            print(f"Found existing submission with key: {model_key}")
-            # Get the status from the queue file
-            queue_file = REQUESTED_MODELS[model_key]
-            try:
-                with open(queue_file, 'r') as f:
-                    queue_entry = json.load(f)
-                status = queue_entry.get('status')
-                print(f"Found existing submission with status: {status}")
-                if status is None:
-                    print(f"Warning: No status found in queue file {queue_file}")
-                    return styled_warning("Error checking model status. Please try again later.")
-
-                if status != EvaluationStatus.FAILED.value:
-                    print(f"Model already submitted and in {status} status")
-                    return styled_warning(f"This model has been already submitted and is in {status} status.")
-            except Exception as e:
-                print(f"Error reading queue file: {e}")
-                print(f"Full traceback: {traceback.format_exc()}")
-                return styled_warning("Error checking model status. Please try again later.")
-
-        print("\n=== Creating evaluation file ===")
-        OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
-        print(f"Creating output directory: {OUT_DIR}")
-        os.makedirs(OUT_DIR, exist_ok=True)
-
-        out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
-        print(f"Output file path: {out_path}")
-
-        # Write evaluation entry to file
-        try:
-            with open(out_path, "w") as f:
-                f.write(json.dumps(eval_entry))
-            print("\nEvaluation file created successfully")
-
-            # Upload to Hugging Face
-            print("\n=== Uploading evaluation file ===")
-            API.upload_file(
-                path_or_fileobj=out_path,
-                path_in_repo=out_path.split("eval-queue/")[1],
-                repo_id=QUEUE_REPO,
-                repo_type="dataset",
-                commit_message=f"Add evaluation request for {model}",
-                token=TOKEN
-            )
-            print(f"\nEvaluation request uploaded successfully to {QUEUE_REPO}")
 
-
-
-            print("\nLocal evaluation file removed")
+            if "license" not in model_info.cardData:
+                return styled_error("Please select a license for your model in its model card.")
 
-            return styled_message(
-                "Evaluation request created successfully! Please wait for the evaluation to complete."
-            )
        except Exception as e:
-            print(f"Error during
-            print(f"Full traceback
-            return styled_error(f"Failed to
-
-
+            print(f"Error during model validation: {e}")
+            print(f"Full traceback:\n{traceback.format_exc()}")
+            return styled_error(f"Failed to validate model on Hugging Face: {str(e)}")
 
-
-
-
-
-
-
-
-
-
-
-
-
-            if len(predictions) == 0:  # Only log for the first batch
-                print(f"\nFirst batch example:")
-                print(f"Input keys: {list(inputs.keys())}")
-                print(f"Target shape: {target.shape}")
-
-            outputs = model_obj(**inputs)
-            print(f"\nModel output type: {type(outputs)}")
-
-            # Try to get logits from different possible formats
-            if isinstance(outputs, dict):
-                print(f"Output keys: {list(outputs.keys())}")
-                # Try different common keys
-                if 'logits' in outputs:
-                    logits = outputs['logits']
-                elif 'prediction_logits' in outputs:
-                    logits = outputs['prediction_logits']
-                else:
-                    raise ValueError(f"Unknown output format. Available keys: {list(outputs.keys())}")
-            elif isinstance(outputs, tuple):
-                print(f"Output tuple length: {len(outputs)}")
-                # Try different positions in the tuple
-                if len(outputs) > 0:
-                    logits = outputs[0]
-                else:
-                    raise ValueError("Empty output tuple")
-            else:
-                # If it's a single tensor, assume it's the logits
-                logits = outputs
-
-            print(f"Logits shape: {logits.shape}")
-            # For sequence classification, we typically use the [CLS] token's prediction
-            # Get the first token's prediction (CLS token)
-            cls_logits = logits[:, 0, :]  # Shape: [batch_size, num_classes]
-            predictions.extend(cls_logits.argmax(dim=-1).cpu().tolist())
-            targets.extend(target.cpu().tolist())
-
-        accuracy = sum(p == t for p, t in zip(predictions, targets)) / len(predictions)
-
-        eval_entry['results'] = {'accuracy': accuracy}
-
-        # Update the queue file with results
-        with open(out_path, "w") as f:
-            f.write(json.dumps(eval_entry))
+        # --- Step 3: Create the evaluation request ---
+        print("\n=== Creating and uploading evaluation request ===")
+        # This function encapsulates the file creation and upload logic.
+        return _create_eval_request(
+            model=model,
+            base_model=base_model,
+            revision=revision,
+            precision=precision,
+            weight_type=weight_type,
+            model_type=model_type,
+            model_info=model_info,
+        )
 
-
-        print("
-
-
-        def preprocess_arabml(examples):
-            return tokenizer(examples['Tweet'], padding=True, truncation=True, max_length=512)
-
-        arabml_dataset = arabml_dataset.map(preprocess_arabml, batched=True)
-
-        total_tokens = 0
-        covered_tokens = 0
-
-        for example in arabml_dataset:
-            tokens = tokenizer.tokenize(example['Tweet'])
-            total_tokens += len(tokens)
-            covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
-
-        arabml_coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
-
-        # Store results
+    except Exception as e:
+        print(f"An unexpected error occurred during submission: {e}")
+        print(f"Full traceback:\n{traceback.format_exc()}")
+        return styled_error(f"An unexpected error occurred during submission: {str(e)}")
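To make the new request-file naming concrete, here is a hypothetical illustration of what _create_eval_request would produce for an imaginary submission (the model name and values are invented):

# Invented inputs, mirroring the f-string used in _create_eval_request above.
model = "some-org/some-model"
revision = "refs/pr/1"
precision = "float16"
weight_type = "Original"

safe_revision = revision.replace('/', '_')
request_filename = f"{model.replace('/', '_')}_eval_request_{safe_revision}_{precision}_{weight_type}.json"

# -> "some-org_some-model_eval_request_refs_pr_1_float16_Original.json",
# uploaded under the submitting org's folder in the queue dataset repository.
print(request_filename)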