hamzabouajila committed on
Commit 34052ff · 1 Parent(s): 1228655

feat: enhance evaluation pipeline and error handling


- Add Hugging Face Hub integration for downloading evaluation queue
- Improve error handling and status updates in evaluation process
- Streamline file upload and status management
- Add proper type hints and imports
- Update tokenizer loading to be more efficient
- Implement better logging for evaluation status
- Add snapshot download for evaluation requests
- Fix race conditions in file handling
- Update dependencies and imports

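The core of the change is a background worker in app.py that polls the evaluation queue. A minimal sketch of that pattern, assuming `process_evaluation_queue` is importable from `src.evaluator.evaluate` as wired up in the diff below:

```python
import threading
import time

from src.evaluator.evaluate import process_evaluation_queue  # added in this commit

def run_evaluator():
    """Poll the evaluation queue forever, surviving transient errors."""
    while True:
        try:
            process_evaluation_queue()
        except Exception as e:
            print(f"Error in evaluation process: {e}")
        time.sleep(300)  # polling interval (the committed code currently uses a shorter interval for testing)

# daemon=True so the worker never blocks the Gradio app from starting or shutting down
threading.Thread(target=run_evaluator, daemon=True).start()
```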
app.py CHANGED
@@ -1,7 +1,5 @@
1
- import os
2
  from dotenv import load_dotenv
3
 
4
- # Load environment variables from .env file
5
  load_dotenv()
6
 
7
  import gradio as gr
@@ -39,95 +37,36 @@ import time
39
 
40
  def restart_space():
41
  try:
42
- # Restart the space
43
  API.restart_space(repo_id=REPO_ID)
44
  except Exception as e:
45
  print(f"Error restarting space: {str(e)}")
46
- # If restart fails, try to download the datasets again
47
  try:
48
  print("Attempting to download datasets again...")
49
  snapshot_download(
50
- repo_id=QUEUE_REPO,
51
- local_dir=EVAL_REQUESTS_PATH,
52
- repo_type="dataset",
53
- tqdm_class=None,
54
- etag_timeout=30,
55
- token=TOKEN
56
- )
57
  snapshot_download(
58
- repo_id=RESULTS_REPO,
59
- local_dir=EVAL_RESULTS_PATH,
60
- repo_type="dataset",
61
- tqdm_class=None,
62
- etag_timeout=30,
63
- token=TOKEN
64
  )
65
  except Exception as download_error:
66
  print(f"Error downloading datasets: {str(download_error)}")
67
 
68
- ### Space initialisation
69
- try:
70
- print(f"\n=== Starting space initialization ===")
71
- print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")
72
- print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")
73
- print(f"QUEUE_REPO: {QUEUE_REPO}")
74
- print(f"RESULTS_REPO: {RESULTS_REPO}")
75
- print(f"TOKEN: {bool(TOKEN)}")
76
-
77
- print("\n=== Downloading request files ===")
78
- snapshot_download(
79
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
80
- )
81
-
82
- print("\n=== Downloading results files ===")
83
- snapshot_download(
84
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
85
- )
86
-
87
- print("\n=== Loading leaderboard data ===")
88
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
89
- print(f"Leaderboard DataFrame shape: {LEADERBOARD_DF.shape if LEADERBOARD_DF is not None else 'None'}")
90
-
91
- print("\n=== Loading evaluation queue data ===")
92
- (
93
- finished_eval_queue_df,
94
- running_eval_queue_df,
95
- pending_eval_queue_df,
96
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
97
- print(f"Finished eval queue shape: {finished_eval_queue_df.shape if finished_eval_queue_df is not None else 'None'}")
98
- print(f"Running eval queue shape: {running_eval_queue_df.shape if running_eval_queue_df is not None else 'None'}")
99
- print(f"Pending eval queue shape: {pending_eval_queue_df.shape if pending_eval_queue_df is not None else 'None'}")
100
-
101
- except Exception as e:
102
- print(f"\n=== Error during space initialization ===")
103
- print(f"Error: {str(e)}")
104
- restart_space()
105
 
106
 
107
- # Start evaluator service in a separate thread
 
108
  def run_evaluator():
109
  print("Starting evaluator service...")
110
  while True:
111
  try:
112
  process_evaluation_queue()
113
  print("Evaluation queue processed. Sleeping for 5 minutes...")
114
- time.sleep(300) # Sleep for 5 minutes
115
  except Exception as e:
116
  print(f"Error in evaluation process: {e}")
117
  print("Retrying in 5 minutes...")
118
- time.sleep(300)
119
-
120
- # Start evaluator in a separate thread
121
- evaluator_thread = threading.Thread(target=run_evaluator, daemon=True)
122
- evaluator_thread.start()
123
 
124
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
125
-
126
- (
127
- finished_eval_queue_df,
128
- running_eval_queue_df,
129
- pending_eval_queue_df,
130
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
131
 
132
  def init_leaderboard(dataframe):
133
  if dataframe is None:
@@ -145,36 +84,26 @@ def init_leaderboard(dataframe):
145
  filter_columns=[
146
  ColumnFilter(AutoEvalColumn().model_type.name, type="checkboxgroup", label="Model types"),
147
  ColumnFilter(AutoEvalColumn().precision.name, type="checkboxgroup", label="Precision"),
148
- ColumnFilter(
149
- AutoEvalColumn().params.name,
150
- type="slider",
151
- min=0.01,
152
- max=150,
153
- label="Select the number of parameters (B)",
154
- ),
155
- ColumnFilter(
156
- AutoEvalColumn().still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
157
- ),
158
  ],
159
  bool_checkboxgroup_label="Hide models",
160
  interactive=False,
161
  )
162
 
163
- # Add model evaluation functionality
 
164
  def evaluate_and_update(model_name, revision, precision, weight_type):
165
  """Add a model evaluation request to the queue"""
166
  try:
167
- # Add evaluation request to queue
168
  add_new_eval(
169
  model_name=model_name,
170
  revision=revision,
171
  precision=precision,
172
  weight_type=weight_type,
173
- model_type="LLM", # Add appropriate model type
174
  )
175
-
176
- # Update leaderboard
177
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
178
  return "Evaluation request added to queue! Check the leaderboard for updates."
179
  except Exception as e:
180
  print(f"Error in evaluate_and_update: {str(e)}")
@@ -182,6 +111,52 @@ def evaluate_and_update(model_name, revision, precision, weight_type):
182
  return f"Error adding evaluation request: {str(e)}"
183
 
184
 
 
185
  demo = gr.Blocks(css=custom_css)
186
  with demo:
187
  gr.HTML(TITLE)
@@ -218,6 +193,7 @@ with demo:
218
  open=False,
219
  ):
220
  with gr.Row():
 
221
  running_eval_table = gr.components.Dataframe(
222
  value=running_eval_queue_df,
223
  headers=EVAL_COLS,
 
 
1
  from dotenv import load_dotenv
2
 
 
3
  load_dotenv()
4
 
5
  import gradio as gr
 
37
 
38
  def restart_space():
39
  try:
 
40
  API.restart_space(repo_id=REPO_ID)
41
  except Exception as e:
42
  print(f"Error restarting space: {str(e)}")
 
43
  try:
44
  print("Attempting to download datasets again...")
45
  snapshot_download(
46
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN, force_download=True
47
+ )
 
 
48
  snapshot_download(
49
+ repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN, force_download=True
 
 
50
  )
51
  except Exception as download_error:
52
  print(f"Error downloading datasets: {str(download_error)}")
53
 
 
 
 
54
 
55
 
56
+
57
+
58
  def run_evaluator():
59
  print("Starting evaluator service...")
60
  while True:
61
  try:
62
  process_evaluation_queue()
63
  print("Evaluation queue processed. Sleeping for 5 minutes...")
64
+ time.sleep(10) # Poll every 10 seconds (was 5 minutes)
65
  except Exception as e:
66
  print(f"Error in evaluation process: {e}")
67
  print("Retrying in 5 minutes...")
68
+ time.sleep(10)
 
 
69
 
 
70
 
71
  def init_leaderboard(dataframe):
72
  if dataframe is None:
 
84
  filter_columns=[
85
  ColumnFilter(AutoEvalColumn().model_type.name, type="checkboxgroup", label="Model types"),
86
  ColumnFilter(AutoEvalColumn().precision.name, type="checkboxgroup", label="Precision"),
87
+ ColumnFilter(AutoEvalColumn().params.name, type="slider", min=0.01, max=150, label="Select the number of parameters (B)"),
88
+ ColumnFilter(AutoEvalColumn().still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
 
 
89
  ],
90
  bool_checkboxgroup_label="Hide models",
91
  interactive=False,
92
  )
93
 
94
+
95
+
96
  def evaluate_and_update(model_name, revision, precision, weight_type):
97
  """Add a model evaluation request to the queue"""
98
  try:
 
99
  add_new_eval(
100
  model_name=model_name,
101
  revision=revision,
102
  precision=precision,
103
  weight_type=weight_type,
104
+ model_type="LLM",
105
  )
106
+ get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 
 
107
  return "Evaluation request added to queue! Check the leaderboard for updates."
108
  except Exception as e:
109
  print(f"Error in evaluate_and_update: {str(e)}")
 
111
  return f"Error adding evaluation request: {str(e)}"
112
 
113
 
114
+ ### Space initialisation
115
+ try:
116
+ print(f"\n=== Starting space initialization ===")
117
+ print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")
118
+ print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")
119
+ print(f"QUEUE_REPO: {QUEUE_REPO}")
120
+ print(f"RESULTS_REPO: {RESULTS_REPO}")
121
+ print(f"TOKEN: {bool(TOKEN)}")
122
+
123
+ print("\n=== Downloading request files ===")
124
+ snapshot_download(
125
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN, force_download=True
126
+ )
127
+
128
+ print("\n=== Downloading results files ===")
129
+ snapshot_download(
130
+ repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN, force_download=True
131
+ )
132
+
133
+ print("\n=== Loading leaderboard data ===")
134
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
135
+ print(f"Leaderboard DataFrame shape: {LEADERBOARD_DF.shape if LEADERBOARD_DF is not None else 'None'}")
136
+
137
+ print("\n=== Loading evaluation queue data ===")
138
+ finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
139
+
140
+ print(f"Finished eval queue shape: {finished_eval_queue_df.shape if finished_eval_queue_df is not None else 'None'}")
141
+ print(f"Running eval queue shape: {running_eval_queue_df.shape if running_eval_queue_df is not None else 'None'}")
142
+ print(f"Pending eval queue shape: {pending_eval_queue_df.shape if pending_eval_queue_df is not None else 'None'}")
143
+
144
+ except Exception as e:
145
+ print(f"\n=== Error during space initialization ===")
146
+ print(f"Error: {str(e)}")
147
+ restart_space()
148
+
149
+
150
+
151
+ evaluator_thread = threading.Thread(target=run_evaluator, daemon=True)
152
+ evaluator_thread.start()
153
+
154
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
155
+ finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
156
+
157
+
158
+
159
+
160
  demo = gr.Blocks(css=custom_css)
161
  with demo:
162
  gr.HTML(TITLE)
 
193
  open=False,
194
  ):
195
  with gr.Row():
196
+ print(running_eval_queue_df)
197
  running_eval_table = gr.components.Dataframe(
198
  value=running_eval_queue_df,
199
  headers=EVAL_COLS,
src/about.py CHANGED
@@ -3,18 +3,18 @@ from enum import Enum
3
 
4
  @dataclass
5
  class Task:
6
- benchmark: str
7
- metric: str
8
- col_name: str
9
 
10
 
11
  # Tunisian Dialect Tasks
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
  # Example: Sentiment Analysis on TSAC
15
- tsac_sentiment = Task("fbougares/tsac", "accuracy", "TSAC Sentiment")
16
  # Example: Text Classification or Corpus Coverage on Tunisian Dialect Corpus
17
- tunisian_corpus = Task("arbml/Tunisian_Dialect_Corpus", "coverage", "Tunisian Corpus Coverage")
18
 
19
  NUM_FEWSHOT = 0 # Change with your few shot
20
  # ---------------------------------------------------
 
3
 
4
  @dataclass
5
  class Task:
6
+ benchmark: str # Dataset name
7
+ metric: str # Metric name
8
+ col_name: str # Column name
9
 
10
 
11
  # Tunisian Dialect Tasks
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
  # Example: Sentiment Analysis on TSAC
15
+ accuracy = Task("fbougares/tsac", "accuracy", "Accuracy (TSAC) ⬆️")
16
  # Example: Text Classification or Corpus Coverage on Tunisian Dialect Corpus
17
+ coverage = Task("arbml/Tunisian_Dialect_Corpus", "coverage", "Coverage (Tunisian Corpus) %")
18
 
19
  NUM_FEWSHOT = 0 # Change with your few shot
20
  # ---------------------------------------------------
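For reference, downstream code consumes these enum members by unpacking the wrapped Task. A small sketch of how the renamed entries are read (import path follows this file; the leaderboard and read_evals iterate over Tasks to map dataset results onto display columns):

```python
from src.about import Tasks

# Each member wraps Task(benchmark, metric, col_name).
for task in Tasks:
    t = task.value
    print(t.benchmark, t.metric, t.col_name)
# -> fbougares/tsac accuracy Accuracy (TSAC) ⬆️
# -> arbml/Tunisian_Dialect_Corpus coverage Coverage (Tunisian Corpus) %
```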
src/evaluator/evaluate.py CHANGED
@@ -1,16 +1,20 @@
1
  import json
2
  import os
 
3
  from typing import Dict, Any
4
  from dataclasses import dataclass
5
  from enum import Enum
6
  from datetime import datetime
7
  import torch
8
- from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
9
- from datasets import load_dataset
10
  import traceback
11
 
12
- from src.envs import API, OWNER, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, RESULTS_REPO
13
- from src.display.utils import Tasks
 
 
14
 
15
  class EvaluationStatus(Enum):
16
  PENDING = "PENDING"
@@ -20,6 +24,7 @@ class EvaluationStatus(Enum):
20
 
21
  @dataclass
22
  class EvaluationResult:
 
23
  model: str
24
  revision: str
25
  precision: str
@@ -27,275 +32,41 @@ class EvaluationResult:
27
  results: Dict[str, float]
28
  error: str = None
29
 
30
- def evaluate_tsac_sentiment(model, tokenizer, device):
31
- """Evaluate model on TSAC sentiment analysis task"""
32
- try:
33
- print("\n=== Starting TSAC sentiment evaluation ===")
34
- print(f"Current device: {device}")
35
-
36
- # Load and preprocess dataset
37
- print("\nLoading and preprocessing TSAC dataset...")
38
- dataset = load_dataset("fbougares/tsac", split="test", trust_remote_code=True)
39
- print(f"Dataset size: {len(dataset)} examples")
40
-
41
- def preprocess(examples):
42
- print(f"\nProcessing batch of {len(examples['sentence'])} examples")
43
- # Use 'sentence' field as per dataset structure
44
- return tokenizer(
45
- examples['sentence'],
46
- padding=True,
47
- truncation=True,
48
- max_length=512,
49
- return_tensors='pt'
50
- )
51
-
52
- dataset = dataset.map(preprocess, batched=True)
53
- dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'target'])
54
-
55
- # Check first example
56
- first_example = dataset[0]
57
- print("\nFirst example details:")
58
- print(f"Input IDs shape: {first_example['input_ids'].shape}")
59
- print(f"Attention mask shape: {first_example['attention_mask'].shape}")
60
- print(f"Target: {first_example['target']}")
61
-
62
- model.eval()
63
- print(f"\nModel class: {model.__class__.__name__}")
64
- print(f"Model device: {next(model.parameters()).device}")
65
-
66
- with torch.no_grad():
67
- predictions = []
68
- targets = []
69
-
70
- # Create DataLoader with batch size 16
71
- from torch.utils.data import DataLoader
72
-
73
- # Define a custom collate function
74
- def collate_fn(batch):
75
- # Stack tensors for input_ids and attention_mask
76
- input_ids = torch.stack([sample['input_ids'] for sample in batch])
77
- attention_mask = torch.stack([sample['attention_mask'] for sample in batch])
78
- # Stack targets
79
- targets = torch.stack([torch.tensor(sample['target']) for sample in batch])
80
- return {
81
- 'input_ids': input_ids,
82
- 'attention_mask': attention_mask,
83
- 'target': targets
84
- }
85
-
86
- dataloader = DataLoader(
87
- dataset,
88
- batch_size=16,
89
- shuffle=False,
90
- collate_fn=collate_fn
91
- )
92
-
93
- for i, batch in enumerate(dataloader):
94
- if i == 0:
95
- print("\nProcessing first batch...")
96
- print(f"Batch keys: {list(batch.keys())}")
97
- print(f"Target shape: {batch['target'].shape}")
98
-
99
- inputs = {k: v.to(device) for k, v in batch.items() if k != 'target'}
100
- target = batch['target'].to(device)
101
-
102
- outputs = model(**inputs)
103
- print(f"\nBatch {i} output type: {type(outputs)}")
104
-
105
- # Handle different model output formats
106
- if isinstance(outputs, dict):
107
- print(f"Output keys: {list(outputs.keys())}")
108
- if 'logits' in outputs:
109
- logits = outputs['logits']
110
- elif 'prediction_logits' in outputs:
111
- logits = outputs['prediction_logits']
112
- else:
113
- raise ValueError(f"Unknown output format. Available keys: {list(outputs.keys())}")
114
- elif isinstance(outputs, tuple):
115
- print(f"Output tuple length: {len(outputs)}")
116
- logits = outputs[0]
117
- else:
118
- logits = outputs
119
-
120
- print(f"Logits shape: {logits.shape}")
121
-
122
- # For sequence classification, we typically use the [CLS] token's prediction
123
- if len(logits.shape) == 3: # [batch_size, sequence_length, num_classes]
124
- logits = logits[:, 0, :] # Take the [CLS] token prediction
125
-
126
- print(f"Final logits shape: {logits.shape}")
127
-
128
- batch_predictions = logits.argmax(dim=-1).cpu().tolist()
129
- batch_targets = target.cpu().tolist()
130
-
131
- predictions.extend(batch_predictions)
132
- targets.extend(batch_targets)
133
-
134
- if i == 0:
135
- print("\nFirst batch predictions:")
136
- print(f"Predictions: {batch_predictions[:5]}")
137
- print(f"Targets: {batch_targets[:5]}")
138
-
139
- print(f"\nTotal predictions: {len(predictions)}")
140
- print(f"Total targets: {len(targets)}")
141
-
142
- # Calculate accuracy
143
- correct = sum(p == t for p, t in zip(predictions, targets))
144
- total = len(predictions)
145
- accuracy = correct / total if total > 0 else 0.0
146
-
147
- print(f"\nEvaluation results:")
148
- print(f"Correct predictions: {correct}")
149
- print(f"Total predictions: {total}")
150
- print(f"Accuracy: {accuracy:.4f}")
151
-
152
- return {"fbougares/tsac": accuracy}
153
- except Exception as e:
154
- print(f"\n=== Error in TSAC evaluation: {str(e)} ===")
155
- print(f"Full traceback: {traceback.format_exc()}")
156
- raise e
157
-
158
- def evaluate_tunisian_corpus_coverage(model, tokenizer, device):
159
- """Evaluate model's coverage on Tunisian Dialect Corpus"""
160
- try:
161
- dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="train")
162
-
163
- def preprocess(examples):
164
- print("Tunisian Corpus preprocess exemples -------------",examples)
165
- # Use 'Tweet' field as per dataset structure
166
- return tokenizer(
167
- examples['Tweet'],
168
- padding=False, # We don't need padding for token coverage
169
- truncation=False, # Don't truncate long sequences
170
- max_length=None # Let tokenizer handle the length
171
- )
172
-
173
- dataset = dataset.map(preprocess, batched=True)
174
-
175
- # Calculate token coverage
176
- total_tokens = 0
177
- covered_tokens = 0
178
-
179
- for example in dataset:
180
- # Get the tokenized input IDs
181
- input_ids = example['input_ids']
182
-
183
- # Convert to tokens and count
184
- tokens = tokenizer.convert_ids_to_tokens(input_ids)
185
- total_tokens += len(tokens)
186
- covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
187
-
188
- coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
189
- print(f"Tunisian Corpus Coverage: {coverage:.2%}")
190
- return {"arbml/Tunisian_Dialect_Corpus": coverage}
191
- except Exception as e:
192
- print(f"Error in Tunisian Corpus evaluation: {str(e)}")
193
- print(f"Full traceback: {traceback.format_exc()}")
194
- raise e
195
 
196
  def evaluate_model(model_name: str, revision: str, precision: str, weight_type: str) -> EvaluationResult:
197
- """Evaluate a single model on all tasks"""
 
 
198
  try:
199
  print(f"\nStarting evaluation for model: {model_name} (revision: {revision}, precision: {precision}, weight_type: {weight_type})")
200
- print(f"Current working directory: {os.getcwd()}")
201
- print(f"Evaluation requests path: {EVAL_REQUESTS_PATH}")
202
- print(f"Evaluation results path: {EVAL_RESULTS_PATH}")
203
 
204
- # Initialize device
205
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
206
  print(f"Using device: {device}")
207
 
208
- # Load model and tokenizer with enhanced error handling
209
  try:
210
- print(f"\nLoading model: {model_name}")
211
- print(f"Model path exists: {os.path.exists(model_name)}")
212
 
213
- # First try to load the config to check model type
214
- try:
215
- config = AutoConfig.from_pretrained(model_name, revision=revision)
216
- print(f"Model type from config: {config.model_type}")
217
- except Exception as config_error:
218
- print(f"Error loading config: {str(config_error)}")
219
-
220
- # Try loading with trust_remote_code=True first
221
- try:
222
- print("\nAttempting to load with trust_remote_code=True...")
223
- model = AutoModelForSequenceClassification.from_pretrained(
224
- model_name,
225
- revision=revision,
226
- torch_dtype=getattr(torch, precision),
227
- trust_remote_code=True
228
- ).to(device)
229
- print(f"Successfully loaded model {model_name} with trust_remote_code=True")
230
- print(f"Model class: {model.__class__.__name__}")
231
- except Exception as e1:
232
- print(f"Error loading with trust_remote_code=True: {str(e1)}")
233
- print(f"Error type: {type(e1).__name__}")
234
-
235
- # If it's a model type error, try with llama as model type
236
- if "Unrecognized model" in str(e1) and "llama" in model_name.lower():
237
- print("\nAttempting to load as llama model...")
238
- try:
239
- model = AutoModelForSequenceClassification.from_pretrained(
240
- model_name,
241
- revision=revision,
242
- torch_dtype=getattr(torch, precision),
243
- trust_remote_code=True,
244
- model_type="llama"
245
- ).to(device)
246
- print(f"Successfully loaded model {model_name} as llama model")
247
- print(f"Model class: {model.__class__.__name__}")
248
- except Exception as e2:
249
- print(f"Error loading as llama model: {str(e2)}")
250
- print(f"Error type: {type(e2).__name__}")
251
- raise Exception(f"Failed to load model with both methods: {str(e1)}, {str(e2)}")
252
- else:
253
- raise e1
254
-
255
- print(f"\nLoading tokenizer: {model_name}")
256
- try:
257
- tokenizer = AutoTokenizer.from_pretrained(model_name, revision=revision)
258
- print(f"Successfully loaded tokenizer for {model_name}")
259
- print(f"Tokenizer class: {tokenizer.__class__.__name__}")
260
- except Exception as e:
261
- print(f"Error loading tokenizer: {str(e)}")
262
- print(f"Error type: {type(e).__name__}")
263
- raise Exception(f"Failed to load tokenizer: {str(e)}")
264
-
265
- # Run evaluations
266
- print("\nStarting TSAC sentiment evaluation...")
267
- try:
268
- tsac_results = evaluate_tsac_sentiment(model, tokenizer, device)
269
- print(f"TSAC results: {tsac_results}")
270
- except Exception as e:
271
- print(f"Error in TSAC evaluation for {model_name}: {str(e)}")
272
- print(f"Error type: {type(e).__name__}")
273
- tsac_results = {"accuracy": None}
274
-
275
- print("\nStarting Tunisian Corpus evaluation...")
276
- try:
277
- tunisian_results = evaluate_tunisian_corpus_coverage(model, tokenizer, device)
278
- print(f"Tunisian Corpus results: {tunisian_results}")
279
- except Exception as e:
280
- print(f"Error in Tunisian Corpus evaluation for {model_name}: {str(e)}")
281
- print(f"Error type: {type(e).__name__}")
282
- tunisian_results = {"coverage": None}
283
-
284
- print("\nEvaluation completed successfully!")
285
- print(f"Final results: {tsac_results} | {tunisian_results}")
286
- return EvaluationResult(
287
- model=model_name,
288
  revision=revision,
289
- precision=precision,
290
- weight_type=weight_type,
291
- results={
292
- Tasks.tsac_sentiment.value.metric: tsac_results.get(Tasks.tsac_sentiment.value.metric),
293
- Tasks.tunisian_corpus.value.metric: tunisian_results.get(Tasks.tunisian_corpus.value.metric)
294
- }
295
- )
296
  except Exception as e:
297
- print(f"\nError loading model {model_name}: {str(e)}")
298
- print(f"Error type: {type(e).__name__}")
299
  print(f"Full traceback: {traceback.format_exc()}")
300
  return EvaluationResult(
301
  model=model_name,
@@ -303,11 +74,43 @@ def evaluate_model(model_name: str, revision: str, precision: str, weight_type:
303
  precision=precision,
304
  weight_type=weight_type,
305
  results={},
306
- error=str(e)
307
  )
 
 
 
 
308
  except Exception as e:
309
- print(f"\nError evaluating model {model_name}: {str(e)}")
310
- print(f"Error type: {type(e).__name__}")
311
  print(f"Full traceback: {traceback.format_exc()}")
312
  return EvaluationResult(
313
  model=model_name,
@@ -315,54 +118,75 @@ def evaluate_model(model_name: str, revision: str, precision: str, weight_type:
315
  precision=precision,
316
  weight_type=weight_type,
317
  results={},
318
- error=str(e)
319
  )
320
 
 
321
  def process_evaluation_queue():
322
- """Process all pending evaluations in the queue"""
 
 
 
 
323
  print(f"\n=== Starting evaluation queue processing ===")
324
  print(f"Current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
 
 
 
 
325
  print(f"Looking for evaluation requests in: {EVAL_REQUESTS_PATH}")
326
 
327
- # Get all pending evaluations
328
  if not os.path.exists(EVAL_REQUESTS_PATH):
329
  print(f"Evaluation requests path does not exist: {EVAL_REQUESTS_PATH}")
330
  return
331
 
332
- # Find all model directories (each model has its own directory)
333
- model_dirs = [d for d in os.listdir(EVAL_REQUESTS_PATH) if os.path.isdir(os.path.join(EVAL_REQUESTS_PATH, d))]
334
- print(f"Found {len(model_dirs)} model directories")
335
-
336
- for model_dir in model_dirs:
337
- model_dir_path = os.path.join(EVAL_REQUESTS_PATH, model_dir)
338
- print(f"\nChecking model directory: {model_dir_path}")
339
-
340
- # Find all JSON files in the model directory
341
- json_files = [f for f in os.listdir(model_dir_path) if f.endswith('.json')]
342
- print(f"Found {len(json_files)} pending evaluation requests")
343
- for file in json_files:
344
- file_path = os.path.join(model_dir_path, file)
345
- print(f" - {file_path}")
346
- try:
347
- with open(file_path, 'r') as f:
348
- eval_entry = json.load(f)
349
 
350
- # Check if this is a pending or running evaluation
351
- status = eval_entry.get('status', '')
352
- if status == EvaluationStatus.PENDING.value:
353
- print(f"\n=== Found pending evaluation ===")
354
- print(f"Model: {eval_entry['model']}")
355
- print(f"Revision: {eval_entry['revision']}")
356
- print(f"Precision: {eval_entry['precision']}")
357
- print(f"Weight type: {eval_entry['weight_type']}")
358
 
359
- # Update status to RUNNING
360
- eval_entry['status'] = EvaluationStatus.RUNNING.value
361
- with open(file_path, 'w') as f:
362
- json.dump(eval_entry, f, indent=2)
363
 
364
- # Run evaluation
365
- try:
 
 
 
 
366
  print("\n=== Starting evaluation ===")
367
  eval_result = evaluate_model(
368
  model_name=eval_entry['model'],
@@ -370,121 +194,58 @@ def process_evaluation_queue():
370
  precision=eval_entry['precision'],
371
  weight_type=eval_entry['weight_type']
372
  )
373
-
374
  print("\n=== Evaluation completed ===")
375
- print(f"Results: {eval_result.results}")
376
-
377
- # Update status to FINISHED and add results
378
- eval_entry['status'] = EvaluationStatus.FINISHED.value
379
- eval_entry['results'] = eval_result.results
380
 
 
381
  if eval_result.error:
 
382
  eval_entry['error'] = eval_result.error
 
 
 
 
 
383
 
384
- # Save updated entry
385
  with open(file_path, 'w') as f:
386
  json.dump(eval_entry, f, indent=2)
387
 
388
- # Move file to results directory
389
- if not os.path.exists(EVAL_RESULTS_PATH):
390
- os.makedirs(EVAL_RESULTS_PATH)
391
-
392
- result_filename = os.path.basename(file_path)
393
- result_path = os.path.join(EVAL_RESULTS_PATH, result_filename)
394
-
395
- os.rename(file_path, result_path)
396
- print(f"\nMoved evaluation result to: {result_path}")
397
-
398
- # Upload to Hugging Face
399
  try:
 
 
400
  API.upload_file(
401
- path_or_fileobj=result_path,
402
- path_in_repo=result_filename,
403
  repo_id=RESULTS_REPO,
404
  repo_type="dataset",
405
- commit_message=f"Add evaluation results for {eval_entry['model']}"
406
  )
407
- print("\nResults uploaded to Hugging Face")
 
408
  except Exception as upload_error:
409
  print(f"Error uploading results: {str(upload_error)}")
410
- eval_entry['error'] = f"Evaluation completed but failed to upload results: {str(upload_error)}"
411
- with open(file_path, 'w') as f:
412
- json.dump(eval_entry, f, indent=2)
413
- except Exception as eval_error:
414
- print(f"\n=== Error during evaluation ===")
415
- print(f"Error: {str(eval_error)}")
416
- print(f"Full traceback: {traceback.format_exc()}")
417
-
418
- # Update status to FAILED and add error
419
- eval_entry['status'] = EvaluationStatus.FAILED.value
420
- eval_entry['error'] = str(eval_error)
421
-
422
- with open(file_path, 'w') as f:
423
- json.dump(eval_entry, f, indent=2)
424
-
425
- # Move failed evaluation to results directory
426
- if not os.path.exists(EVAL_RESULTS_PATH):
427
- os.makedirs(EVAL_RESULTS_PATH)
428
-
429
- result_filename = os.path.basename(file_path)
430
- result_path = os.path.join(EVAL_RESULTS_PATH, result_filename)
431
-
432
- os.rename(file_path, result_path)
433
- print(f"\nMoved failed evaluation to: {result_path}")
434
-
435
- # Upload error file
436
  try:
437
- API.upload_file(
438
- path_or_fileobj=result_path,
439
- path_in_repo=result_filename,
440
- repo_id=RESULTS_REPO,
441
  repo_type="dataset",
442
- commit_message=f"Add evaluation error for {eval_entry['model']}"
443
  )
444
- print("\nError file uploaded to Hugging Face")
445
- except Exception as upload_error:
446
- print(f"Error uploading error file: {str(upload_error)}")
447
- elif status == EvaluationStatus.RUNNING.value:
448
- print(f"\n=== Found running evaluation ===")
449
- print(f"Model: {eval_entry['model']}")
450
- print(f"Revision: {eval_entry['revision']}")
451
- print(f"Precision: {eval_entry['precision']}")
452
- print(f"Weight type: {eval_entry['weight_type']}")
453
-
454
- try:
455
- # Check if we have results for this evaluation
456
- result_filename = os.path.basename(file_path)
457
- result_path = os.path.join(EVAL_RESULTS_PATH, result_filename)
458
-
459
- if os.path.exists(result_path):
460
- print(f"\nFound existing results file: {result_path}")
461
- # Update status to FINISHED
462
- eval_entry['status'] = EvaluationStatus.FINISHED.value
463
- with open(file_path, 'w') as f:
464
- json.dump(eval_entry, f, indent=2)
465
- else:
466
- print("\nNo results found. Restarting evaluation...")
467
- # Restart the evaluation
468
- eval_entry['status'] = EvaluationStatus.PENDING.value
469
- with open(file_path, 'w') as f:
470
- json.dump(eval_entry, f, indent=2)
471
- except Exception as check_error:
472
- print(f"\n=== Error checking running evaluation ===")
473
- print(f"Error: {str(check_error)}")
474
- print(f"Full traceback: {traceback.format_exc()}")
475
-
476
- # If we can't check the status, restart the evaluation
477
- eval_entry['status'] = EvaluationStatus.PENDING.value
478
- with open(file_path, 'w') as f:
479
- json.dump(eval_entry, f, indent=2)
480
- except Exception as e:
481
- print(f"Error processing file {file}: {str(e)}")
482
- print(f"Full traceback: {traceback.format_exc()}")
483
- continue
484
 
485
- print(f"\n=== Evaluation queue summary ===")
486
- print(f"Total directories checked: {len(model_dirs)}")
487
- print(f"Total files processed: {len(json_files)}")
488
- print(f"\nEvaluation queue processed. Sleeping for 5 minutes...")
489
- return
 
490
 
 
 
 
 
1
  import json
2
  import os
3
+ import time
4
  from typing import Dict, Any
5
  from dataclasses import dataclass
6
  from enum import Enum
7
  from datetime import datetime
8
  import torch
9
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
 
10
  import traceback
11
 
12
+
13
+ from src.envs import API, OWNER, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, RESULTS_REPO, QUEUE_REPO
14
+ from src.evaluator.tunisian_corpus_coverage import evaluate_tunisian_corpus_coverage
15
+ from src.evaluator.tsac import evaluate_tsac_sentiment
16
+ from huggingface_hub import snapshot_download
17
+
18
 
19
  class EvaluationStatus(Enum):
20
  PENDING = "PENDING"
 
24
 
25
  @dataclass
26
  class EvaluationResult:
27
+ """Dataclass to hold the results of a single model evaluation."""
28
  model: str
29
  revision: str
30
  precision: str
 
32
  results: Dict[str, float]
33
  error: str = None
34
 
 
 
 
35
 
36
  def evaluate_model(model_name: str, revision: str, precision: str, weight_type: str) -> EvaluationResult:
37
+ """
38
+ Evaluates a single model on all defined tasks.
39
+
40
+ Args:
41
+ model_name (str): The name of the model on the Hugging Face Hub.
42
+ revision (str): The specific revision (commit hash or branch name) to use.
43
+ precision (str): The precision (e.g., 'float16') for model loading.
44
+ weight_type (str): The type of weights ('Original' or 'Adapter').
45
+
46
+ Returns:
47
+ EvaluationResult: A dataclass containing the evaluation results or an error message.
48
+ """
49
  try:
50
  print(f"\nStarting evaluation for model: {model_name} (revision: {revision}, precision: {precision}, weight_type: {weight_type})")
 
 
 
51
 
 
52
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
53
  print(f"Using device: {device}")
54
 
 
55
  try:
56
+ print(f"\nLoading model and tokenizer for: {model_name}")
 
57
 
58
+ model = AutoModelForSequenceClassification.from_pretrained(
59
+ model_name,
 
 
 
 
60
  revision=revision,
61
+ torch_dtype=getattr(torch, precision),
62
+ trust_remote_code=True
63
+ ).to(device)
64
+ tokenizer = AutoTokenizer.from_pretrained(model_name, revision=revision)
65
+
66
+ print(f"Successfully loaded model and tokenizer.")
 
67
  except Exception as e:
68
+ error_msg = f"Failed to load model or tokenizer: {str(e)}"
69
+ print(f"Error: {error_msg}")
70
  print(f"Full traceback: {traceback.format_exc()}")
71
  return EvaluationResult(
72
  model=model_name,
 
74
  precision=precision,
75
  weight_type=weight_type,
76
  results={},
77
+ error=error_msg
78
  )
79
+
80
+ tsac_results = {"accuracy": None}
81
+ tunisian_results = {"coverage": None}
82
+
83
+ print("\nStarting TSAC sentiment evaluation...")
84
+ try:
85
+ tsac_results = evaluate_tsac_sentiment(model, tokenizer, device)
86
+ print(f"TSAC results: {tsac_results}")
87
+ except Exception as e:
88
+ print(f"Error in TSAC evaluation for {model_name}: {str(e)}")
89
+ print(f"Full traceback: {traceback.format_exc()}")
90
+
91
+ print("\nStarting Tunisian Corpus evaluation...")
92
+ try:
93
+ tunisian_results = evaluate_tunisian_corpus_coverage(model, tokenizer, device)
94
+ print(f"Tunisian Corpus results: {tunisian_results}")
95
+ except Exception as e:
96
+ print(f"Error in Tunisian Corpus evaluation for {model_name}: {str(e)}")
97
+ print(f"Full traceback: {traceback.format_exc()}")
98
+
99
+ print("\nEvaluation completed successfully!")
100
+
101
+ return EvaluationResult(
102
+ model=model_name,
103
+ revision=revision,
104
+ precision=precision,
105
+ weight_type=weight_type,
106
+ results={
107
+ "accuracy": tsac_results.get("fbougares/tsac"),
108
+ "coverage": tunisian_results.get("arbml/Tunisian_Dialect_Corpus")
109
+ }
110
+ )
111
  except Exception as e:
112
+ error_msg = f"An unexpected error occurred during evaluation: {str(e)}"
113
+ print(f"Error: {error_msg}")
114
  print(f"Full traceback: {traceback.format_exc()}")
115
  return EvaluationResult(
116
  model=model_name,
 
118
  precision=precision,
119
  weight_type=weight_type,
120
  results={},
121
+ error=error_msg
122
  )
123
 
124
+
125
  def process_evaluation_queue():
126
+ """
127
+ Processes all pending evaluations in the queue.
128
+ This function acts as a worker that finds a PENDING job, runs it,
129
+ and updates the status on the Hugging Face Hub.
130
+ """
131
  print(f"\n=== Starting evaluation queue processing ===")
132
  print(f"Current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
133
+
134
+ # --- NEW STEP: Download the latest queue from Hugging Face Hub ---
135
+ try:
136
+ print(f"Downloading evaluation requests from: {QUEUE_REPO}")
137
+ snapshot_download(
138
+ repo_id=QUEUE_REPO,
139
+ repo_type="dataset",
140
+ local_dir=EVAL_REQUESTS_PATH,
141
+ local_dir_use_symlinks=False,
142
+ token=API.token
143
+ )
144
+ print("Successfully downloaded evaluation queue.")
145
+ except Exception as e:
146
+ print(f"Error downloading evaluation queue: {str(e)}")
147
+ print(f"Full traceback: {traceback.format_exc()}")
148
+ return
149
+
150
  print(f"Looking for evaluation requests in: {EVAL_REQUESTS_PATH}")
151
 
 
152
  if not os.path.exists(EVAL_REQUESTS_PATH):
153
  print(f"Evaluation requests path does not exist: {EVAL_REQUESTS_PATH}")
154
  return
155
 
156
+ for root, _, files in os.walk(EVAL_REQUESTS_PATH):
157
+ for filename in files:
158
+ if filename.endswith('.json'):
159
+ file_path = os.path.join(root, filename)
160
+ print(f"\nProcessing file: {file_path}")
 
 
 
 
161
 
162
+ try:
163
+ with open(file_path, 'r') as f:
164
+ eval_entry = json.load(f)
 
 
 
 
 
165
 
166
+ status = eval_entry.get('status', '')
 
 
 
167
 
168
+ if status == EvaluationStatus.PENDING.value:
169
+ print(f"Found pending evaluation for model: {eval_entry['model']}")
170
+
171
+ # --- Step 1: Update status to RUNNING locally and on Hub ---
172
+ eval_entry['status'] = EvaluationStatus.RUNNING.value
173
+ with open(file_path, 'w') as f:
174
+ json.dump(eval_entry, f, indent=2)
175
+
176
+ user_name = os.path.basename(root)
177
+ path_in_repo_queue = os.path.join(user_name, filename)
178
+
179
+ # Upload the updated file to the queue repo to reflect 'RUNNING' status
180
+ API.upload_file(
181
+ path_or_fileobj=file_path,
182
+ path_in_repo=path_in_repo_queue,
183
+ repo_id=QUEUE_REPO,
184
+ repo_type="dataset",
185
+ commit_message=f"Update status to RUNNING for {eval_entry['model']}"
186
+ )
187
+ print(f"Updated status to RUNNING in queue: {path_in_repo_queue}")
188
+
189
+ # --- Step 2: Run the evaluation ---
190
  print("\n=== Starting evaluation ===")
191
  eval_result = evaluate_model(
192
  model_name=eval_entry['model'],
 
194
  precision=eval_entry['precision'],
195
  weight_type=eval_entry['weight_type']
196
  )
 
197
  print("\n=== Evaluation completed ===")
 
 
 
 
 
198
 
199
+ # --- Step 3: Update file with final status and results locally ---
200
  if eval_result.error:
201
+ eval_entry['status'] = EvaluationStatus.FAILED.value
202
  eval_entry['error'] = eval_result.error
203
+ print(f"Evaluation failed with error: {eval_result.error}")
204
+ else:
205
+ eval_entry['status'] = EvaluationStatus.FINISHED.value
206
+ eval_entry['results'] = eval_result.results
207
+ print(f"Evaluation finished successfully. Results: {eval_result.results}")
208
 
 
209
  with open(file_path, 'w') as f:
210
  json.dump(eval_entry, f, indent=2)
211
 
212
+ # --- Step 4: Upload the final file to the results directory on the Hub ---
 
 
 
213
  try:
214
+ # Use the local file with its final status as the basis for the results file
215
+ path_in_repo_results = os.path.join(user_name, filename)
216
  API.upload_file(
217
+ path_or_fileobj=file_path,
218
+ path_in_repo=path_in_repo_results,
219
  repo_id=RESULTS_REPO,
220
  repo_type="dataset",
221
+ commit_message=f"Evaluation {'results' if not eval_result.error else 'error'} for {eval_entry['model']}"
222
  )
223
+ print("\nResults uploaded to Hugging Face successfully.")
224
+
225
  except Exception as upload_error:
226
  print(f"Error uploading results: {str(upload_error)}")
227
+
228
+ # --- Step 5: Update the status of the request in the queue to FINISHED/FAILED ---
229
+ # This keeps a record of all processed jobs in the queue repo.
 
 
 
 
230
  try:
231
+ API.upload_file(
232
+ path_or_fileobj=file_path,
233
+ path_in_repo=path_in_repo_queue,
234
+ repo_id=QUEUE_REPO,
235
  repo_type="dataset",
236
+ commit_message=f"Final status update for {eval_entry['model']}"
237
  )
238
+ print(f"Final status for {eval_entry['model']} updated in the queue repository.")
239
+ except Exception as status_update_error:
240
+ print(f"Error updating status in queue: {str(status_update_error)}")
 
 
 
 
241
 
242
+ else:
243
+ print(f"Skipping file with status: {status}")
244
+ except Exception as e:
245
+ print(f"Error processing file {file_path}: {str(e)}")
246
+ print(f"Full traceback: {traceback.format_exc()}")
247
+ continue
248
 
249
+ print("\n=== Evaluation queue processed. ===")
250
+ print("No more pending jobs found.")
251
+ return
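The worker above reads and rewrites one JSON request file per submission. The shape it expects, inferred from the keys `process_evaluation_queue` accesses (the path and all values below are placeholders, not real data):

```python
# Hypothetical queue entry, e.g. EVAL_REQUESTS_PATH/<user>/<model>_eval_request.json
example_request = {
    "model": "some-org/some-model",   # placeholder model id
    "revision": "main",
    "precision": "float16",
    "weight_type": "Original",
    "status": "PENDING",              # PENDING -> RUNNING -> FINISHED / FAILED
    "results": {},                    # filled with {"accuracy": ..., "coverage": ...} on success
    "error": None,                    # set when evaluation or upload fails
}
```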
src/evaluator/run_evaluator.py CHANGED
@@ -17,11 +17,11 @@ def main():
17
  try:
18
  process_evaluation_queue()
19
  print("Evaluation queue processed. Sleeping for 5 minutes...")
20
- time.sleep(300) # Sleep for 5 minutes
21
  except Exception as e:
22
  print(f"Error in evaluation process: {e}")
23
  print("Retrying in 5 minutes...")
24
- time.sleep(300)
25
 
26
  if __name__ == "__main__":
27
  main()
 
17
  try:
18
  process_evaluation_queue()
19
  print("Evaluation queue processed. Sleeping for 5 minutes...")
20
+ time.sleep(20) # Poll every 20 seconds (was 5 minutes)
21
  except Exception as e:
22
  print(f"Error in evaluation process: {e}")
23
  print("Retrying in 5 minutes...")
24
+ time.sleep(20)
25
 
26
  if __name__ == "__main__":
27
  main()
src/evaluator/tsac.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
1
+ import torch
2
+ from datasets import load_dataset
3
+ import traceback
4
+ import time
5
+
6
+
7
+ def evaluate_tsac_sentiment(model, tokenizer, device):
8
+ """Evaluate model on TSAC sentiment analysis task"""
9
+ try:
10
+ print("\n=== Starting TSAC sentiment evaluation ===")
11
+ print(f"Current device: {device}")
12
+
13
+ # Load and preprocess dataset
14
+ print("\nLoading and preprocessing TSAC dataset...")
15
+ dataset = load_dataset("fbougares/tsac", split="test", trust_remote_code=True)
16
+ dataset = dataset.select(range(10)) # Only evaluate on the first 10 samples
17
+
18
+ # print(f"Dataset size: {len(dataset)} examples")
19
+
20
+ def preprocess(examples):
21
+ return tokenizer(
22
+ examples['sentence'],
23
+ padding=True,
24
+ truncation=True,
25
+ max_length=512,
26
+ return_tensors=None
27
+ )
28
+ print(dataset.column_names)
29
+ dataset = dataset.map(preprocess, batched=True)
30
+ dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'target'])
31
+
32
+ # Check first example
33
+ first_example = dataset[0]
34
+ print("\nFirst example details:")
35
+ print(f"Input IDs shape: {first_example['input_ids'].shape}")
36
+ print(f"Attention mask shape: {first_example['attention_mask'].shape}")
37
+ print(f"Target: {first_example['target']}")
38
+
39
+ model.eval()
40
+ print(f"\nModel class: {model.__class__.__name__}")
41
+ print(f"Model device: {next(model.parameters()).device}")
42
+
43
+ with torch.no_grad():
44
+ predictions = []
45
+ targets = []
46
+
47
+ # Create DataLoader with batch size 16
48
+ from torch.utils.data import DataLoader
49
+
50
+ # Define a custom collate function
51
+ def collate_fn(batch):
52
+ input_ids = torch.stack([sample['input_ids'] for sample in batch])
53
+ attention_mask = torch.stack([sample['attention_mask'] for sample in batch])
54
+ targets = torch.stack([sample['target'] for sample in batch])
55
+ return {
56
+ 'input_ids': input_ids,
57
+ 'attention_mask': attention_mask,
58
+ 'target': targets
59
+ }
60
+
61
+
62
+
63
+ dataloader = DataLoader(
64
+ dataset,
65
+ batch_size=16,
66
+ shuffle=False,
67
+ collate_fn=collate_fn
68
+ )
69
+
70
+ for i, batch in enumerate(dataloader):
71
+ if i % 10 == 0:
72
+ print(f"\nProcessing batch {i}...")
73
+ print(f"Batch keys: {list(batch.keys())}")
74
+ print(f"Target shape: {batch['target'].shape}")
75
+
76
+ inputs = {k: v.to(device) for k, v in batch.items() if k != 'target'}
77
+ target = batch['target'].to(device)
78
+ before = time.time()
79
+ outputs = model(**inputs)
80
+ # print(f"\nBatch {i} output type: {type(outputs)}")
81
+
82
+ # Handle different model output formats
83
+ if isinstance(outputs, dict):
84
+ # print(f"Output keys: {list(outputs.keys())}")
85
+ if 'logits' in outputs:
86
+ logits = outputs['logits']
87
+ elif 'prediction_logits' in outputs:
88
+ logits = outputs['prediction_logits']
89
+ else:
90
+ raise ValueError(f"Unknown output format. Available keys: {list(outputs.keys())}")
91
+ elif isinstance(outputs, tuple):
92
+ print(f"Output tuple length: {len(outputs)}")
93
+ logits = outputs[0]
94
+ else:
95
+ logits = outputs
96
+
97
+ # print(f"Logits shape: {logits.shape}")
98
+
99
+ # For sequence classification, we typically use the [CLS] token's prediction
100
+ if len(logits.shape) == 3: # [batch_size, sequence_length, num_classes]
101
+ logits = logits[:, 0, :] # Take the [CLS] token prediction
102
+
103
+ # print(f"Final logits shape: {logits.shape}")
104
+
105
+ batch_predictions = logits.argmax(dim=-1).cpu().tolist()
106
+ batch_targets = target.cpu().tolist()
107
+
108
+ predictions.extend(batch_predictions)
109
+ targets.extend(batch_targets)
110
+
111
+ if i % 10 == 0:
112
+ print("\nSample predictions from this batch:")
113
+ print(f"Predictions: {batch_predictions[:5]}")
114
+ print(f"Targets: {batch_targets[:5]}")
115
+
116
+ print(f"\nTotal predictions: {len(predictions)}")
117
+ print(f"Total targets: {len(targets)}")
118
+
119
+ # Calculate accuracy
120
+ correct = sum(p == t for p, t in zip(predictions, targets))
121
+ total = len(predictions)
122
+ accuracy = correct / total if total > 0 else 0.0
123
+
124
+ print(f"\nEvaluation results:")
125
+ print(f"Correct predictions: {correct}")
126
+ print(f"Total predictions: {total}")
127
+ print(f"Accuracy: {accuracy:.4f}")
128
+
129
+ return {"fbougares/tsac": accuracy}
130
+ except Exception as e:
131
+ print(f"\n=== Error in TSAC evaluation: {str(e)} ===")
132
+ print(f"Full traceback: {traceback.format_exc()}")
133
+ raise e
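A standalone usage sketch of this evaluator, mirroring how evaluate_model wires it up in the diff above (the model id is a placeholder, not a recommendation):

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from src.evaluator.tsac import evaluate_tsac_sentiment

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_id = "some-org/some-sequence-classifier"  # placeholder
model = AutoModelForSequenceClassification.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

scores = evaluate_tsac_sentiment(model, tokenizer, device)
print(scores)  # {"fbougares/tsac": <accuracy between 0 and 1>}
```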
src/evaluator/tunisian_corpus_coverage.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
1
+ import json
2
+ import os
3
+ from typing import Dict, Any
4
+ from dataclasses import dataclass
5
+ from enum import Enum
6
+ from datetime import datetime
7
+ import torch
8
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
9
+ from datasets import load_dataset
10
+ import traceback
11
+ from src.envs import API, OWNER, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, RESULTS_REPO
12
+
13
+
14
+
15
+ def evaluate_tunisian_corpus_coverage(model, tokenizer, device):
16
+ """Evaluate model's coverage on Tunisian Dialect Corpus"""
17
+ try:
18
+ dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="train")
19
+
20
+ def preprocess(examples):
21
+ # print("Tunisian Corpus preprocess examples -------------", examples)
22
+ # Use 'Tweet' field as per dataset structure
23
+ return tokenizer(
24
+ examples['Tweet'],
25
+ padding=False, # We don't need padding for token coverage
26
+ truncation=False, # Don't truncate long sequences
27
+ max_length=None # Let tokenizer handle the length
28
+ )
29
+
30
+ dataset = dataset.map(preprocess, batched=True)
31
+
32
+ total_tokens = 0
33
+ covered_tokens = 0
34
+
35
+ for example in dataset:
36
+ input_ids = example['input_ids']
37
+
38
+ tokens = tokenizer.convert_ids_to_tokens(input_ids)
39
+ total_tokens += len(tokens)
40
+ covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
41
+
42
+ coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
43
+ print(f"Tunisian Corpus Coverage: {coverage:.2%}")
44
+ return {"arbml/Tunisian_Dialect_Corpus": coverage}
45
+ except Exception as e:
46
+ print(f"Error in Tunisian Corpus evaluation: {str(e)}")
47
+ print(f"Full traceback: {traceback.format_exc()}")
48
+ raise e
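The coverage metric above is simply the share of tokens that do not map to the tokenizer's unknown token. A toy worked example with made-up tokens:

```python
# Made-up tokenization of one tweet; "[UNK]" stands in for tokenizer.unk_token.
tokens = ["▁3la", "▁slema", "[UNK]", "▁brabbi"]
unk_token = "[UNK]"

covered = sum(1 for t in tokens if t != unk_token)  # 3
coverage = covered / len(tokens)                    # 3 / 4 = 0.75
print(f"Coverage: {coverage:.2%}")                  # Coverage: 75.00%
```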
src/leaderboard/read_evals.py CHANGED
@@ -1,11 +1,7 @@
1
- import glob
2
  import json
3
- import math
4
  import os
5
  from dataclasses import dataclass
6
 
7
- import dateutil
8
- import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
11
  from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
@@ -14,8 +10,7 @@ from src.submission.check_validity import is_model_on_hub
14
 
15
  @dataclass
16
  class EvalResult:
17
- """Represents one full evaluation. Built from a combination of the result and request file for a given run.
18
- """
19
  eval_name: str # org_model_precision (uid)
20
  full_model: str # org/model (path on hub)
21
  org: str
@@ -38,108 +33,61 @@ class EvalResult:
38
  try:
39
  with open(json_filepath) as fp:
40
  data = json.load(fp)
41
-
42
- # Get model info
43
- model_name = data.get('model')
44
- org_and_model = model_name.split("/", 1)
45
  org = org_and_model[0]
46
  model = org_and_model[1]
47
 
48
- # Get results
 
 
 
 
 
 
 
 
49
  results = data.get('results', {})
50
- precision = Precision.from_str(data.get('precision', 'Unknown'))
 
 
 
51
 
52
- # Create EvalResult
 
 
53
  return EvalResult(
54
  eval_name=f"{org}_{model}_{precision.value}",
55
- full_model=model_name,
56
  org=org,
57
  model=model,
58
- revision=data.get('revision', ''),
59
  results=results,
60
  precision=precision,
61
- model_type=ModelType.from_str(data.get('model_type', 'Unknown')),
62
- weight_type=WeightType.from_str(data.get('weight_type', 'Original')),
63
- date=data.get('submitted_at', ''),
64
- still_on_hub=is_model_on_hub(model_name, revision="main")
 
 
 
 
65
  )
66
  except Exception as e:
67
  print(f"Error reading evaluation file {json_filepath}: {str(e)}")
68
  return None
69
 
70
- # Precision
71
- precision = Precision.from_str(config.get("model_dtype"))
72
-
73
- # Get model and org
74
- org_and_model = config.get("model_name", config.get("model_args", None))
75
- org_and_model = org_and_model.split("/", 1)
76
-
77
- if len(org_and_model) == 1:
78
- org = None
79
- model = org_and_model[0]
80
- result_key = f"{model}_{precision.value.name}"
81
- else:
82
- org = org_and_model[0]
83
- model = org_and_model[1]
84
- result_key = f"{org}_{model}_{precision.value.name}"
85
- full_model = "/".join(org_and_model)
86
-
87
- still_on_hub, _, model_config = is_model_on_hub(
88
- full_model, revision=config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
89
- )
90
- architecture = "?"
91
- if model_config is not None:
92
- architectures = getattr(model_config, "architectures", None)
93
- if architectures:
94
- architecture = ";".join(architectures)
95
-
96
- # Extract results available in this file (some results are split in several files)
97
- results = {}
98
- for task in Tasks:
99
- task = task.value
100
-
101
- # We average all scores of a given metric (not all metrics are present in all files)
102
- accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
103
- if accs.size == 0 or any([acc is None for acc in accs]):
104
- continue
105
-
106
- mean_acc = np.mean(accs) * 100.0
107
- results[task.benchmark] = mean_acc
108
-
109
- return self(
110
- eval_name=result_key,
111
- full_model=full_model,
112
- org=org,
113
- model=model,
114
- results=results,
115
- precision=precision,
116
- revision= config.get("model_sha", ""),
117
- still_on_hub=still_on_hub,
118
- architecture=architecture
119
- )
120
-
121
- def update_with_request_file(self, requests_path):
122
- """Finds the relevant request file for the current model and updates info with it"""
123
- request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
124
-
125
- try:
126
- with open(request_file, "r") as f:
127
- request = json.load(f)
128
- self.model_type = ModelType.from_str(request.get("model_type", ""))
129
- self.weight_type = WeightType[request.get("weight_type", "Original")]
130
- self.license = request.get("license", "?")
131
- self.likes = request.get("likes", 0)
132
- self.num_params = request.get("params", 0)
133
- self.date = request.get("submitted_time", "")
134
- except Exception:
135
- print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
136
-
137
  def to_dict(self):
138
  """Converts the Eval Result to a dict compatible with our dataframe display"""
139
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
 
 
 
140
  AutoEvalColumnInstance = AutoEvalColumn()
141
  data_dict = {
142
- "eval_name": self.eval_name, # not a column, just a save name,
143
  AutoEvalColumnInstance.precision.name: self.precision.value.name,
144
  AutoEvalColumnInstance.model_type.name: self.model_type.value.name,
145
  AutoEvalColumnInstance.model_type_symbol.name: self.model_type.value.symbol,
@@ -151,124 +99,39 @@ class EvalResult:
151
  AutoEvalColumnInstance.license.name: self.license,
152
  AutoEvalColumnInstance.likes.name: self.likes,
153
  AutoEvalColumnInstance.params.name: self.num_params,
154
- AutoEvalColumnInstance.still_on_hub.name: True if isinstance(self.still_on_hub, tuple) and self.still_on_hub[0] else False,
155
  }
156
 
157
- # Map dataset names to their metric values
158
- tsac_result = self.results.get("fbougares/tsac")
159
- tunisian_result = self.results.get("arbml/Tunisian_Dialect_Corpus")
160
-
161
- # Map metric values to their corresponding dataset names
162
  for task in Tasks:
163
- if task.value.benchmark == "fbougares/tsac":
164
- data_dict[task.value.col_name] = self.results.get("accuracy")
165
- elif task.value.benchmark == "arbml/Tunisian_Dialect_Corpus":
166
- data_dict[task.value.col_name] = self.results.get("coverage")
167
- print("data_dict : ", data_dict)
168
  return data_dict
169
 
170
 
171
- def get_request_file_for_model(requests_path, model_name, precision):
172
- """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
173
- request_files = os.path.join(
174
- requests_path,
175
- f"{model_name}_eval_request_*.json",
176
- )
177
- request_files = glob.glob(request_files)
178
-
179
- # Select correct request file (precision)
180
- request_file = ""
181
- request_files = sorted(request_files, reverse=True)
182
- for tmp_request_file in request_files:
183
- with open(tmp_request_file, "r") as f:
184
- req_content = json.load(f)
185
- if (
186
- req_content["status"] in ["FINISHED"]
187
- and req_content["precision"] == precision.split(".")[-1]
188
- ):
189
- request_file = tmp_request_file
190
- return request_file
191
 
192
 
193
  def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
194
  """From the path of the results folder root, extract all needed info for results"""
195
  model_result_filepaths = []
 
196
  for root, _, files in os.walk(results_path):
197
- # Only process .json files
198
  json_files = [f for f in files if f.endswith(".json")]
199
- print(json_files)
200
  for file in json_files:
201
  model_result_filepaths.append(os.path.join(root, file))
202
- print(model_result_filepaths)
203
 
204
- eval_results = {}
205
  for model_result_filepath in model_result_filepaths:
206
  try:
207
- # Creation of result
208
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
209
- # print(eval_result)
210
- if eval_result is None:
 
211
  print(f"Skipping invalid evaluation file: {model_result_filepath}")
212
- continue
213
-
214
- eval_result.update_with_request_file(requests_path)
215
- # print(eval_result)
216
- # Store results of same eval together
217
- if eval_result.eval_name not in eval_results:
218
- eval_results[eval_result.eval_name] = []
219
- eval_results[eval_result.eval_name].append(eval_result)
220
- # print(eval_results)
221
-
222
  except Exception as e:
223
  print(f"Error processing evaluation file {model_result_filepath}: {str(e)}")
224
  continue
225
-
226
- # Store results of same eval together
227
- eval_name = eval_result.eval_name
228
- print("eval_name : ", eval_name)
229
- if eval_name in eval_results.keys():
230
- # If we already have results for this eval, append to list
231
- eval_results[eval_name].append(eval_result)
232
- else:
233
- # Initialize list for this eval name
234
- eval_results[eval_name] = [eval_result]
235
- print("eval_results : ", eval_results)
236
- # Process final results
237
- final_results = {}
238
- for eval_name, eval_list in eval_results.items():
239
- # Create merged results from all evaluations, ensuring all required task keys are present
240
- merged_results = {task.value.metric: None for task in Tasks}
241
- for eval_result in eval_list:
242
- merged_results.update({k: v for k, v in eval_result.results.items() if v is not None})
243
-
244
- # Take the first eval_result as base and update with merged results
245
- print("evaluation list : ", len(eval_list))
246
- base_result = eval_list[0]
247
- print("base_result : ", base_result)
248
- # print(base_result)
249
- final_results[eval_name] = EvalResult(
250
- eval_name=eval_name,
251
- full_model=base_result.full_model,
252
- org=base_result.org,
253
- model=base_result.model,
254
- revision=base_result.revision,
255
- results=merged_results,
256
- precision=base_result.precision,
257
- model_type=base_result.model_type,
258
- weight_type=base_result.weight_type,
259
- date=base_result.date,
260
- still_on_hub=base_result.still_on_hub
261
- )
262
- print(len(final_results))
263
- print(final_results.keys())
264
- print(final_results.values())
265
-
266
- results = []
267
- for v in final_results.values():
268
- try:
269
- v.to_dict() # we test if the dict version is complete
270
- results.append(v)
271
- except KeyError as e: # not all eval values present
272
- print("error in v",e)
273
- continue
274
- return results
 
 
1
  import json
 
2
  import os
3
  from dataclasses import dataclass
4
 
 
 
5
 
6
  from src.display.formatting import make_clickable_model
7
  from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
 
10
 
11
  @dataclass
12
  class EvalResult:
13
+ """Represents one full evaluation. Built from a single result file for a given run."""
 
14
  eval_name: str # org_model_precision (uid)
15
  full_model: str # org/model (path on hub)
16
  org: str
 
33
  try:
34
  with open(json_filepath) as fp:
35
  data = json.load(fp)
36
+
37
+ # Extract model information from the JSON data
38
+ full_model_name = data.get('model')
39
+ org_and_model = full_model_name.split("/", 1)
40
  org = org_and_model[0]
41
  model = org_and_model[1]
42
 
43
+ # Extract other metadata
44
+ precision_str = data.get('precision', 'Unknown')
45
+ precision = Precision.from_str(precision_str)
46
+ model_type = ModelType.from_str(data.get('model_type', 'Unknown'))
47
+ weight_type = WeightType.from_str(data.get('weight_type', 'Original'))
48
+ revision = data.get('revision', '')
49
+ date = data.get('submitted_at', '')
50
+
51
+ # Extract results and metadata
52
  results = data.get('results', {})
53
+ license = data.get('license', '?')
54
+ likes = data.get('likes', 0)
55
+ num_params = data.get('params', 0)
56
+ architecture = data.get('architecture', 'Unknown')
57
 
58
+ # Check if the model is still on the hub
59
+ still_on_hub, _, _ = is_model_on_hub(full_model_name, revision=revision)
60
+
61
  return EvalResult(
62
  eval_name=f"{org}_{model}_{precision.value}",
63
+ full_model=full_model_name,
64
  org=org,
65
  model=model,
66
+ revision=revision,
67
  results=results,
68
  precision=precision,
69
+ model_type=model_type,
70
+ weight_type=weight_type,
71
+ architecture=architecture,
72
+ license=license,
73
+ likes=likes,
74
+ num_params=num_params,
75
+ date=date,
76
+ still_on_hub=still_on_hub
77
  )
78
  except Exception as e:
79
  print(f"Error reading evaluation file {json_filepath}: {str(e)}")
80
  return None
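
Note: the rewritten init_from_json_file above reads everything it needs from a single result file. A minimal sketch of the JSON shape it is written against (key names mirror the data.get(...) calls above; the concrete values, and the metric names inside "results", are illustrative assumptions):

    # Hypothetical result file consumed by EvalResult.init_from_json_file (values are examples only).
    example_result = {
        "model": "some-org/some-model",                    # org/model path on the Hub
        "precision": "float16",
        "model_type": "pretrained",
        "weight_type": "Original",
        "revision": "main",
        "submitted_at": "2024-01-01T00:00:00+00:00",
        "results": {"accuracy": 0.81, "coverage": 0.93},   # keyed by each task's metric name
        "license": "apache-2.0",
        "likes": 12,
        "params": 0.135,
        "architecture": "BertForSequenceClassification",
    }
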
81
 
82
  def to_dict(self):
83
  """Converts the Eval Result to a dict compatible with our dataframe display"""
84
+ # Calculate the average score for the leaderboard
85
+ scores = [v for k, v in self.results.items() if v is not None and k in [task.value.metric for task in Tasks]]
86
+ average = sum(scores) / len(scores) if scores else 0
87
+
88
  AutoEvalColumnInstance = AutoEvalColumn()
89
  data_dict = {
90
+ "eval_name": self.eval_name,
91
  AutoEvalColumnInstance.precision.name: self.precision.value.name,
92
  AutoEvalColumnInstance.model_type.name: self.model_type.value.name,
93
  AutoEvalColumnInstance.model_type_symbol.name: self.model_type.value.symbol,
 
99
  AutoEvalColumnInstance.license.name: self.license,
100
  AutoEvalColumnInstance.likes.name: self.likes,
101
  AutoEvalColumnInstance.params.name: self.num_params,
102
+ AutoEvalColumnInstance.still_on_hub.name: self.still_on_hub,
103
  }
104
 
105
+ # Dynamically map metric values to their corresponding column names
 
 
 
 
106
  for task in Tasks:
107
+ task_metric = task.value.metric
108
+ task_col_name = task.value.col_name
109
+ data_dict[task_col_name] = self.results.get(task_metric)
110
+
 
111
  return data_dict
112
 
113
 
114
 
115
 
116
  def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
117
  """From the path of the results folder root, extract all needed info for results"""
118
  model_result_filepaths = []
119
+ # Recursively find all result files
120
  for root, _, files in os.walk(results_path):
 
121
  json_files = [f for f in files if f.endswith(".json")]
 
122
  for file in json_files:
123
  model_result_filepaths.append(os.path.join(root, file))
 
124
 
125
+ eval_results = []
126
  for model_result_filepath in model_result_filepaths:
127
  try:
 
128
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
129
+ if eval_result is not None:
130
+ eval_results.append(eval_result)
131
+ else:
132
  print(f"Skipping invalid evaluation file: {model_result_filepath}")
133
  except Exception as e:
134
  print(f"Error processing evaluation file {model_result_filepath}: {str(e)}")
135
  continue
136
+
137
+ return eval_results
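
Note: with the old aggregation logic removed, the read path is now a flat "load every file, convert to a row" loop. A minimal usage sketch, assuming local checkout directories like the ones used elsewhere in the Space (the directory names here are illustrative):

    # Hypothetical usage: build the leaderboard rows from downloaded result files.
    raw_results = get_raw_eval_results("eval-results", "eval-requests")
    rows = [res.to_dict() for res in raw_results]
    # Each row holds the model metadata plus one column per task (task.value.col_name);
    # metrics missing from a result file stay as None instead of raising KeyError.
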
src/populate.py CHANGED
@@ -1,26 +1,23 @@
1
  import json
2
  import os
3
-
4
  import pandas as pd
 
 
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
- from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
  from src.leaderboard.read_evals import get_raw_eval_results
9
 
10
 
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
  """Creates a dataframe from all the individual experiment results"""
13
  raw_data = get_raw_eval_results(results_path, requests_path)
14
- print(raw_data)
15
  all_data_json = [v.to_dict() for v in raw_data]
16
- print(all_data_json)
17
  df = pd.DataFrame.from_records(all_data_json)
18
- print(df)
19
  if df.empty:
20
  print("No evaluation results found. Returning empty DataFrame with correct columns.")
21
  return pd.DataFrame(columns=cols)
22
  df = df.sort_values(by=[AutoEvalColumn().average.name], ascending=False)
23
- # print(df)
24
  df = df[cols].round(decimals=4)
25
  df = df[has_no_nan_values(df, benchmark_cols)]
26
  return df
@@ -28,34 +25,44 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
28
 
29
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
30
  """Creates the different dataframes for the evaluation queues requestes"""
31
- entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
32
  all_evals = []
33
 
34
- for entry in entries:
35
- if ".json" in entry:
36
- file_path = os.path.join(save_path, entry)
37
- with open(file_path) as fp:
38
- data = json.load(fp)
39
-
40
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
41
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
42
-
43
- all_evals.append(data)
44
- elif ".md" not in entry and os.path.isdir(os.path.join(save_path, entry)):
45
- # this is a folder
46
- sub_entries = [e for e in os.listdir(os.path.join(save_path, entry)) if os.path.isfile(os.path.join(save_path, entry, e)) and not e.startswith(".")]
47
- for sub_entry in sub_entries:
48
- file_path = os.path.join(save_path, entry, sub_entry)
49
- with open(file_path) as fp:
50
- data = json.load(fp)
51
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
52
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
53
- all_evals.append(data)
54
 
55
  pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
56
  running_list = [e for e in all_evals if e["status"] == "RUNNING"]
57
  finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
58
- df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
59
- df_running = pd.DataFrame.from_records(running_list, columns=cols)
60
- df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
 
 
61
  return df_finished[cols], df_running[cols], df_pending[cols]
 
 
1
  import json
2
  import os
 
3
  import pandas as pd
4
+ from datetime import datetime, timedelta, timezone
5
+ import dateutil.parser
6
 
7
  from src.display.formatting import has_no_nan_values, make_clickable_model
8
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn, ModelType, Tasks, Precision, WeightType
9
  from src.leaderboard.read_evals import get_raw_eval_results
10
 
11
 
12
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
13
  """Creates a dataframe from all the individual experiment results"""
14
  raw_data = get_raw_eval_results(results_path, requests_path)
 
15
  all_data_json = [v.to_dict() for v in raw_data]
 
16
  df = pd.DataFrame.from_records(all_data_json)
 
17
  if df.empty:
18
  print("No evaluation results found. Returning empty DataFrame with correct columns.")
19
  return pd.DataFrame(columns=cols)
20
  df = df.sort_values(by=[AutoEvalColumn().average.name], ascending=False)
 
21
  df = df[cols].round(decimals=4)
22
  df = df[has_no_nan_values(df, benchmark_cols)]
23
  return df
 
25
 
26
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
27
  """Creates the different dataframes for the evaluation queues requestes"""
 
28
  all_evals = []
29
+
30
+ # Define a threshold to identify "stuck" jobs
31
+ time_threshold = datetime.now(timezone.utc) - timedelta(hours=1)
32
+
33
+ # Walk the queue directory recursively to find every request file
34
+ for root, _, files in os.walk(save_path):
35
+ for filename in files:
36
+ if filename.endswith(".json"):
37
+ file_path = os.path.join(root, filename)
38
+ try:
39
+ with open(file_path, "r") as fp:
40
+ data = json.load(fp)
41
+
42
+ # Check for "stuck" jobs
43
+ if data.get("status") == "RUNNING":
44
+ submitted_time_str = data.get("submitted_at")
45
+ if submitted_time_str:
46
+ submitted_time = dateutil.parser.isoparse(submitted_time_str)
47
+ if submitted_time < time_threshold:
48
+ print(f"Stuck job detected for {data['model']}. Changing status to PENDING.")
49
+ data["status"] = "PENDING"
50
+
51
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
52
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
53
+ all_evals.append(data)
54
 
55
+ except Exception as e:
56
+ print(f"Error processing file {file_path}: {e}")
57
+ continue
58
 
59
  pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
60
  running_list = [e for e in all_evals if e["status"] == "RUNNING"]
61
  finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
62
+
63
+ df_pending = pd.DataFrame.from_records(pending_list, columns=cols) if pending_list else pd.DataFrame(columns=cols)
64
+ df_running = pd.DataFrame.from_records(running_list, columns=cols) if running_list else pd.DataFrame(columns=cols)
65
+ df_finished = pd.DataFrame.from_records(finished_list, columns=cols) if finished_list else pd.DataFrame(columns=cols)
66
+
67
  return df_finished[cols], df_running[cols], df_pending[cols]
68
+
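
Note: the "stuck job" handling above reduces to a single timestamp comparison. A small, self-contained sketch of that check (the one-hour threshold and the "submitted_at" key follow the code above; the helper name is an assumption):

    from datetime import datetime, timedelta, timezone
    import dateutil.parser

    def is_stuck(entry: dict, max_age: timedelta = timedelta(hours=1)) -> bool:
        """Return True when a RUNNING queue entry has exceeded max_age."""
        if entry.get("status") != "RUNNING":
            return False
        submitted = entry.get("submitted_at")
        if not submitted:
            return False
        # Assumes the stored timestamp is timezone-aware ISO 8601, as written at submission time.
        return dateutil.parser.isoparse(submitted) < datetime.now(timezone.utc) - max_age
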
src/submission/submit.py CHANGED
@@ -1,5 +1,7 @@
1
  import json
2
  import os
 
 
3
  from datetime import datetime, timezone
4
 
5
  from src.display.formatting import styled_error, styled_message, styled_warning
@@ -10,27 +12,27 @@ from src.submission.check_validity import (
10
  get_model_size,
11
  is_model_on_hub,
12
  )
13
- from src.evaluator.evaluate import evaluate_model, EvaluationStatus, EvaluationResult
14
- from src.display.utils import Tasks
15
- import torch
16
- from transformers import AutoModelForSequenceClassification, AutoTokenizer
17
- from datasets import load_dataset
18
- import time
19
 
20
  REQUESTED_MODELS = None
21
  USERS_TO_SUBMISSION_DATES = None
22
 
23
- def create_eval_request(
 
24
  model: str,
25
  base_model: str,
26
  revision: str,
27
  precision: str,
28
  weight_type: str,
29
  model_type: str,
 
30
  ):
31
- """Create and upload an evaluation request"""
 
 
 
32
  try:
33
- # Create evaluation request file
34
  request_data = {
35
  'model': model,
36
  'base_model': base_model,
@@ -39,345 +41,141 @@ def create_eval_request(
39
  'weight_type': weight_type,
40
  'model_type': model_type,
41
  'status': EvaluationStatus.PENDING.value,
42
- 'submitted_time': datetime.now(timezone.utc).isoformat()
 
 
 
 
43
  }
44
 
45
- # Create filename
46
- username = model.split('/')[0] if '/' in model else None
47
- request_filename = f"{username or 'unknown'}_{model.replace('/', '_')}_eval_request_{revision}_{precision}_{weight_type}.json"
48
- request_path = os.path.join(EVAL_REQUESTS_PATH, request_filename)
49
 
50
- # Write request file
51
- with open(request_path, 'w') as f:
52
- json.dump(request_data, f, indent=2)
53
-
54
- print(f"Created evaluation request: {request_filename}")
55
-
56
- # Upload to Hugging Face
57
- API.upload_file(
58
- path_or_fileobj=request_path,
59
- path_in_repo=request_filename if not username else os.path.join(username, request_filename),
60
- repo_id=QUEUE_REPO,
61
- repo_type="dataset",
62
- commit_message=f"Add evaluation request for {model}",
63
- token=TOKEN
64
- )
65
-
66
- print(f"Uploaded evaluation request to {QUEUE_REPO}")
67
 
68
- return styled_message(
69
- "Evaluation request created! Please wait for the evaluation to complete."
70
- )
71
  except Exception as e:
72
- print(f"Error creating evaluation request: {str(e)}")
 
73
  return styled_error(f"Failed to create evaluation request: {str(e)}")
74
 
75
- def add_new_eval(
76
- model: str,
77
- base_model: str,
78
- revision: str,
79
- precision: str,
80
- weight_type: str,
81
- model_type: str,
82
- ):
83
- """Validate model and create evaluation request"""
84
  try:
85
- print("\n=== Starting evaluation submission ===")
86
  print(f"Submission time: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC")
87
- print(f"Model: {model}")
88
- print(f"Base model: {base_model}")
89
- print(f"Revision: {revision}")
90
- print(f"Precision: {precision}")
91
- print(f"Weight type: {weight_type}")
92
- print(f"Model type: {model_type}")
93
- print(f"Evaluation requests path: {EVAL_REQUESTS_PATH}")
94
- print(f"Queue repo: {QUEUE_REPO}")
95
 
96
- # Always refresh the cache before checking for duplicates
97
98
  global REQUESTED_MODELS
99
  global USERS_TO_SUBMISSION_DATES
100
  start_time = time.time()
101
  REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
102
- print(f"Cache refresh completed in {time.time() - start_time:.2f} seconds")
103
- print(f"Found {len(REQUESTED_MODELS)} existing submissions")
104
-
105
- user_name = ""
106
- model_path = model
107
- if "/" in model:
108
- user_name = model.split("/")[0]
109
- model_path = model.split("/")[1]
110
- print(f"\nUser name: {user_name}")
111
- print(f"Model path: {model_path}")
112
-
113
- precision = precision.split(" ")[0]
114
- if revision == "":
115
- revision = "main"
116
- print("Using default revision: main")
117
 
118
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
119
-
120
- # Check if model is already submitted
121
- print("\n=== Checking for existing submission ===")
122
  model_key = f"{model}_{revision}_{precision}"
123
  if model_key in REQUESTED_MODELS:
124
- print(f"Found existing submission with key: {model_key}")
125
- # Get the status from the queue file
126
- queue_file = REQUESTED_MODELS[model_key]
127
  try:
128
- with open(queue_file, 'r') as f:
129
  queue_entry = json.load(f)
130
  status = queue_entry.get('status')
131
- print(f"Found existing submission with status: {status}")
132
- if status is None:
133
- print(f"Warning: No status found in queue file {queue_file}")
134
- return styled_warning("Error checking model status. Please try again later.")
135
-
136
- if status != EvaluationStatus.FAILED.value:
137
- print(f"Model already submitted and in {status} status")
138
- return styled_warning(f"This model has been already submitted and is in {status} status.")
139
  except Exception as e:
140
  print(f"Error reading queue file: {e}")
141
- print(f"Full traceback: {traceback.format_exc()}")
142
  return styled_warning("Error checking model status. Please try again later.")
143
- except Exception as e:
144
- print(f"Error during evaluation: {str(e)}")
145
- raise
146
 
147
- print("\n=== Validating model type ===")
148
- if model_type is None or model_type == "":
149
- print("Error: Model type is missing")
150
  return styled_error("Please select a model type.")
151
 
152
- print("\n=== Validating model existence ===")
153
- if revision == "":
154
- revision = "main"
155
- print("Using default revision: main")
156
-
157
- print("\n=== Validating model on Hugging Face ===")
158
  try:
 
159
  if weight_type in ["Delta", "Adapter"]:
160
- print(f"Checking base model {base_model} on Hugging Face...")
161
- base_model_on_hub, error, _ = is_model_on_hub(
162
- model_name=base_model,
163
- revision=revision,
164
- token=TOKEN,
165
- test_tokenizer=True
166
- )
167
- print(f"Base model check result: {base_model_on_hub}")
168
  if not base_model_on_hub:
169
- print(f"Error: Base model not found: {error}")
170
- return styled_error(f'Base model "{base_model}" {error}')
171
 
172
- if not weight_type == "Adapter":
173
- print(f"Checking model {model} on Hugging Face...")
174
- model_on_hub, error, _ = is_model_on_hub(
175
- model_name=model,
176
- revision=revision,
177
- token=TOKEN,
178
- test_tokenizer=True
179
- )
180
- print(f"Model check result: {model_on_hub}")
181
- if not model_on_hub:
182
- print(f"Error: Model not found: {error}")
183
- return styled_error(f'Model "{model}" {error}')
184
- except Exception as e:
185
- print(f"Error checking model on Hugging Face: {e}")
186
- print(f"Full traceback: {traceback.format_exc()}")
187
- return styled_error(f"Failed to validate model on Hugging Face: {str(e)}")
188
 
189
- print("\n=== Getting model info ===")
190
- try:
191
  model_info = API.model_info(repo_id=model, revision=revision)
192
- print(f"Successfully retrieved model info for {model}")
193
- except Exception as e:
194
- print(f"Error getting model info: {e}")
195
- print(f"Full traceback: {traceback.format_exc()}")
196
- return styled_error("Could not get your model information. Please fill it up properly.")
197
-
198
- print("\n=== Getting model size ===")
199
- try:
200
- model_size = get_model_size(model_info=model_info, precision=precision)
201
- print(f"Model size: {model_size}")
202
- except Exception as e:
203
- print(f"Error getting model size: {e}")
204
- print(f"Full traceback: {traceback.format_exc()}")
205
- model_size = "?"
206
-
207
- print("\n=== Validating model card and license ===")
208
- try:
209
- license = model_info.cardData["license"]
210
- print(f"Model license: {license}")
211
- except Exception as e:
212
- print(f"Error getting model license: {e}")
213
- print(f"Full traceback: {traceback.format_exc()}")
214
- return styled_error("Please select a license for your model")
215
-
216
- print("\n=== Checking model card ===")
217
- try:
218
- modelcard_OK, error_msg = check_model_card(model)
219
- print(f"Model card check result: {modelcard_OK}")
220
- if not modelcard_OK:
221
- print(f"Model card error: {error_msg}")
222
  return styled_error(error_msg)
223
- except Exception as e:
224
- print(f"Error checking model card: {e}")
225
- print(f"Full traceback: {traceback.format_exc()}")
226
- return styled_error("Failed to validate model card")
227
-
228
- print("\n=== Creating evaluation entry ===")
229
- eval_entry = {
230
- "model": model,
231
- "base_model": base_model,
232
- "revision": revision,
233
- "precision": precision,
234
- "weight_type": weight_type,
235
- "status": "PENDING",
236
- "submitted_time": current_time,
237
- "model_type": model_type,
238
- "likes": model_info.likes,
239
- "params": model_size,
240
- "license": license,
241
- "private": False,
242
- }
243
- print(f"\nEvaluation entry created: {json.dumps(eval_entry, indent=2)}")
244
-
245
- print("\n=== Checking for duplicate submission ===")
246
- model_key = f"{model}_{revision}_{precision}"
247
- if model_key in REQUESTED_MODELS:
248
- print(f"Found existing submission with key: {model_key}")
249
- # Get the status from the queue file
250
- queue_file = REQUESTED_MODELS[model_key]
251
- try:
252
- with open(queue_file, 'r') as f:
253
- queue_entry = json.load(f)
254
- status = queue_entry.get('status')
255
- print(f"Found existing submission with status: {status}")
256
- if status is None:
257
- print(f"Warning: No status found in queue file {queue_file}")
258
- return styled_warning("Error checking model status. Please try again later.")
259
-
260
- if status != EvaluationStatus.FAILED.value:
261
- print(f"Model already submitted and in {status} status")
262
- return styled_warning(f"This model has been already submitted and is in {status} status.")
263
- except Exception as e:
264
- print(f"Error reading queue file: {e}")
265
- print(f"Full traceback: {traceback.format_exc()}")
266
- return styled_warning("Error checking model status. Please try again later.")
267
-
268
- print("\n=== Creating evaluation file ===")
269
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
270
- print(f"Creating output directory: {OUT_DIR}")
271
- os.makedirs(OUT_DIR, exist_ok=True)
272
-
273
- out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
274
- print(f"Output file path: {out_path}")
275
-
276
- # Write evaluation entry to file
277
- try:
278
- with open(out_path, "w") as f:
279
- f.write(json.dumps(eval_entry))
280
- print("\nEvaluation file created successfully")
281
-
282
- # Upload to Hugging Face
283
- print("\n=== Uploading evaluation file ===")
284
- API.upload_file(
285
- path_or_fileobj=out_path,
286
- path_in_repo=out_path.split("eval-queue/")[1],
287
- repo_id=QUEUE_REPO,
288
- repo_type="dataset",
289
- commit_message=f"Add evaluation request for {model}",
290
- token=TOKEN
291
- )
292
- print(f"\nEvaluation request uploaded successfully to {QUEUE_REPO}")
293
 
294
- # Clean up local file
295
- os.remove(out_path)
296
- print("\nLocal evaluation file removed")
297
 
298
- return styled_message(
299
- "Evaluation request created successfully! Please wait for the evaluation to complete."
300
- )
301
  except Exception as e:
302
- print(f"Error during file operations: {str(e)}")
303
- print(f"Full traceback: {traceback.format_exc()}")
304
- return styled_error(f"Failed to create evaluation request: {str(e)}")
305
-
306
-
307
 
308
- dataloader = DataLoader(tsac_dataset, batch_size=32, shuffle=False)
309
-
310
- model_obj.eval()
311
- with torch.no_grad():
312
- predictions = []
313
- targets = []
314
-
315
- for batch in dataloader:
316
- inputs = {k: v.to(device) for k, v in batch.items() if k != 'target'}
317
- target = batch['target'].to(device)
318
-
319
- # Log the first batch details
320
- if len(predictions) == 0: # Only log for the first batch
321
- print(f"\nFirst batch example:")
322
- print(f"Input keys: {list(inputs.keys())}")
323
- print(f"Target shape: {target.shape}")
324
-
325
- outputs = model_obj(**inputs)
326
- print(f"\nModel output type: {type(outputs)}")
327
-
328
- # Try to get logits from different possible formats
329
- if isinstance(outputs, dict):
330
- print(f"Output keys: {list(outputs.keys())}")
331
- # Try different common keys
332
- if 'logits' in outputs:
333
- logits = outputs['logits']
334
- elif 'prediction_logits' in outputs:
335
- logits = outputs['prediction_logits']
336
- else:
337
- raise ValueError(f"Unknown output format. Available keys: {list(outputs.keys())}")
338
- elif isinstance(outputs, tuple):
339
- print(f"Output tuple length: {len(outputs)}")
340
- # Try different positions in the tuple
341
- if len(outputs) > 0:
342
- logits = outputs[0]
343
- else:
344
- raise ValueError("Empty output tuple")
345
- else:
346
- # If it's a single tensor, assume it's the logits
347
- logits = outputs
348
-
349
- print(f"Logits shape: {logits.shape}")
350
- # For sequence classification, we typically use the [CLS] token's prediction
351
- # Get the first token's prediction (CLS token)
352
- cls_logits = logits[:, 0, :] # Shape: [batch_size, num_classes]
353
- predictions.extend(cls_logits.argmax(dim=-1).cpu().tolist())
354
- targets.extend(target.cpu().tolist())
355
-
356
- accuracy = sum(p == t for p, t in zip(predictions, targets)) / len(predictions)
357
-
358
- eval_entry['results'] = {'accuracy': accuracy}
359
-
360
- # Update the queue file with results
361
- with open(out_path, "w") as f:
362
- f.write(json.dumps(eval_entry))
363
 
364
- # Evaluate on ArabML
365
- print("Evaluating on ArabML Tunisian Corpus...")
366
- arabml_dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="train", trust_remote_code=True)
367
-
368
- def preprocess_arabml(examples):
369
- return tokenizer(examples['Tweet'], padding=True, truncation=True, max_length=512)
370
-
371
- arabml_dataset = arabml_dataset.map(preprocess_arabml, batched=True)
372
-
373
- total_tokens = 0
374
- covered_tokens = 0
375
-
376
- for example in arabml_dataset:
377
- tokens = tokenizer.tokenize(example['Tweet'])
378
- total_tokens += len(tokens)
379
- covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
380
-
381
- arabml_coverage = covered_tokens / total_tokens if total_tokens > 0 else 0
382
-
383
- # Store results
 
1
  import json
2
  import os
3
+ import time
4
+ import traceback
5
  from datetime import datetime, timezone
6
 
7
  from src.display.formatting import styled_error, styled_message, styled_warning
 
12
  get_model_size,
13
  is_model_on_hub,
14
  )
15
+ from src.evaluator.evaluate import EvaluationStatus
16
+
 
 
 
 
17
 
18
  REQUESTED_MODELS = None
19
  USERS_TO_SUBMISSION_DATES = None
20
 
21
+
22
+ def _create_eval_request(
23
  model: str,
24
  base_model: str,
25
  revision: str,
26
  precision: str,
27
  weight_type: str,
28
  model_type: str,
29
+ model_info: dict,
30
  ):
31
+ """
32
+ Creates and uploads a JSON file for a new model evaluation request.
33
+ This function is a helper for add_new_eval and should not be called directly.
34
+ """
35
  try:
 
36
  request_data = {
37
  'model': model,
38
  'base_model': base_model,
 
41
  'weight_type': weight_type,
42
  'model_type': model_type,
43
  'status': EvaluationStatus.PENDING.value,
44
+ 'submitted_time': datetime.now(timezone.utc).isoformat(),
45
+ 'likes': model_info.likes,
46
+ 'params': get_model_size(model_info, precision),
47
+ 'license': model_info.cardData.get("license"),
48
+ 'private': model_info.private,
49
  }
50
 
51
+ user_name = model.split('/')[0] if '/' in model else 'unknown'
52
+ safe_revision = revision.replace('/', '_')
53
+ request_filename = f"{model.replace('/', '_')}_eval_request_{safe_revision}_{precision}_{weight_type}.json"
 
54
 
55
+ local_dir = os.path.join(EVAL_REQUESTS_PATH, user_name)
56
+ os.makedirs(local_dir, exist_ok=True)
57
+ local_path = os.path.join(local_dir, request_filename)
58
+
59
+ print(f"Creating local evaluation request file: {local_path}")
60
 
61
+ # Use a try-finally block to ensure the local file is always removed
62
+ try:
63
+ with open(local_path, 'w') as f:
64
+ json.dump(request_data, f, indent=2)
65
+
66
+ # Upload the request file to the Hugging Face queue repository
67
+ print(f"Uploading evaluation request to {QUEUE_REPO}")
68
+ path_in_repo = os.path.join(user_name, request_filename)
69
+ API.upload_file(
70
+ path_or_fileobj=local_path,
71
+ path_in_repo=path_in_repo,
72
+ repo_id=QUEUE_REPO,
73
+ repo_type="dataset",
74
+ commit_message=f"Add evaluation request for {model}",
75
+ token=TOKEN
76
+ )
77
+
78
+ print(f"Uploaded successfully to {path_in_repo} in {QUEUE_REPO}")
79
+
80
+ return styled_message(
81
+ "Evaluation request created successfully! Please wait for the evaluation to complete."
82
+ )
83
+ finally:
84
+ if os.path.exists(local_path):
85
+ os.remove(local_path)
86
+ print(f"Local file {local_path} removed.")
87
+
88
  except Exception as e:
89
+ print(f"Error creating or uploading evaluation request: {str(e)}")
90
+ print(f"Full traceback:\n{traceback.format_exc()}")
91
  return styled_error(f"Failed to create evaluation request: {str(e)}")
92
 
93
+
94
+ def add_new_eval(model: str, base_model: str, revision: str, precision: str, weight_type: str, model_type: str):
95
+ """
96
+ Validates a model and creates an evaluation request for it.
97
+ This is the main function to be called by the user.
98
+ """
 
 
 
99
  try:
100
+ print("\n=== Starting Evaluation Submission ===")
101
  print(f"Submission time: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC")
102
+ print(f"Model: {model}, Base: {base_model}, Revision: {revision}, Precision: {precision}")
103
 
104
+ precision = precision.split(" ")[0]
105
+ if not revision:
106
+ revision = "main"
107
+ print("Using default revision: main")
108
+
109
+ # --- Step 1: Check for existing submissions ---
110
+ print("\n=== Checking for existing submissions ===")
111
  global REQUESTED_MODELS
112
  global USERS_TO_SUBMISSION_DATES
113
  start_time = time.time()
114
  REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
115
+ print(f"Cache refresh completed in {time.time() - start_time:.2f} seconds. Found {len(REQUESTED_MODELS)} existing submissions.")
116
 
 
 
 
 
117
  model_key = f"{model}_{revision}_{precision}"
118
  if model_key in REQUESTED_MODELS:
119
+ queue_file_path = REQUESTED_MODELS[model_key]
 
 
120
  try:
121
+ with open(queue_file_path, 'r') as f:
122
  queue_entry = json.load(f)
123
  status = queue_entry.get('status')
124
+ if status is not None and status != EvaluationStatus.FAILED.value:
125
+ return styled_warning(f"This model has already been submitted and is in a '{status}' status.")
126
  except Exception as e:
127
  print(f"Error reading queue file: {e}")
128
+ print(f"Full traceback:\n{traceback.format_exc()}")
129
  return styled_warning("Error checking model status. Please try again later.")
130
+ print(f"No existing submission found for key: {model_key} or previous submission had a FAILED status.")
 
 
131
 
132
+ # --- Step 2: Validate model type and existence on the Hub ---
133
+ print("\n=== Validating model existence and card === ")
134
+ if not model_type:
135
  return styled_error("Please select a model type.")
136
 
137
  try:
138
+ # Validate the base model first for delta/adapter weights
139
  if weight_type in ["Delta", "Adapter"]:
140
+ print(f"Checking base model '{base_model}' on Hugging Face...")
141
+ base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN)
142
  if not base_model_on_hub:
143
+ return styled_error(f'Base model "{base_model}" was not found on the Hugging Face Hub: {error}')
 
144
 
145
+ # Validate the main model
146
+ print(f"Checking model '{model}' on Hugging Face...")
147
+ model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN)
148
+ if not model_on_hub:
149
+ return styled_error(f'Model "{model}" was not found on the Hugging Face Hub: {error}')
150
 
151
+ # Get model information and validate the model card and license
 
152
  model_info = API.model_info(repo_id=model, revision=revision)
153
+ model_card_ok, error_msg = check_model_card(model)
154
+ if not model_card_ok:
155
  return styled_error(error_msg)
156
 
157
+ if "license" not in model_info.cardData:
158
+ return styled_error("Please select a license for your model in its model card.")
 
159
 
 
 
 
160
  except Exception as e:
161
+ print(f"Error during model validation: {e}")
162
+ print(f"Full traceback:\n{traceback.format_exc()}")
163
+ return styled_error(f"Failed to validate model on Hugging Face: {str(e)}")
 
 
164
 
165
+ # --- Step 3: Create the evaluation request ---
166
+ print("\n=== Creating and uploading evaluation request ===")
167
+ # This function encapsulates the file creation and upload logic.
168
+ return _create_eval_request(
169
+ model=model,
170
+ base_model=base_model,
171
+ revision=revision,
172
+ precision=precision,
173
+ weight_type=weight_type,
174
+ model_type=model_type,
175
+ model_info=model_info,
176
+ )
177
 
178
+ except Exception as e:
179
+ print(f"An unexpected error occurred during submission: {e}")
180
+ print(f"Full traceback:\n{traceback.format_exc()}")
181
+ return styled_error(f"An unexpected error occurred during submission: {str(e)}")