import json
import os
import time
import traceback
from datetime import datetime, timezone

import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset

from src.display.formatting import styled_error, styled_message, styled_warning
from src.display.utils import Tasks
from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
from src.evaluator.evaluate import evaluate_model, EvaluationStatus, EvaluationResult
from src.submission.check_validity import (
    already_submitted_models,
    check_model_card,
    get_model_size,
    is_model_on_hub,
)

REQUESTED_MODELS = None
USERS_TO_SUBMISSION_DATES = None


def create_eval_request(
    model: str,
    base_model: str,
    revision: str,
    precision: str,
    weight_type: str,
    model_type: str,
):
    """Create an evaluation request file and upload it to the queue repo."""
    try:
        # Build the evaluation request payload
        request_data = {
            "model": model,
            "base_model": base_model,
            "revision": revision,
            "precision": precision,
            "weight_type": weight_type,
            "model_type": model_type,
            "status": EvaluationStatus.PENDING.value,
            "submitted_time": datetime.now(timezone.utc).isoformat(),
        }

        # Derive the request filename from the model id
        username = model.split("/")[0] if "/" in model else None
        request_filename = (
            f"{username or 'unknown'}_{model.replace('/', '_')}"
            f"_eval_request_{revision}_{precision}_{weight_type}.json"
        )
        request_path = os.path.join(EVAL_REQUESTS_PATH, request_filename)

        # Write the request file locally
        with open(request_path, "w") as f:
            json.dump(request_data, f, indent=2)
        print(f"Created evaluation request: {request_filename}")

        # Upload to the Hugging Face queue repository
        API.upload_file(
            path_or_fileobj=request_path,
            path_in_repo=request_filename if not username else os.path.join(username, request_filename),
            repo_id=QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add evaluation request for {model}",
            token=TOKEN,
        )
        print(f"Uploaded evaluation request to {QUEUE_REPO}")

        return styled_message(
            "Evaluation request created! Please wait for the evaluation to complete."
        )
    except Exception as e:
        print(f"Error creating evaluation request: {str(e)}")
        return styled_error(f"Failed to create evaluation request: {str(e)}")
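# Illustrative call of create_eval_request. The repo id and settings below are
# hypothetical, shown only to document the expected argument shapes:
#
#   create_eval_request(
#       model="some-user/some-model",   # hypothetical Hub repo id
#       base_model="",
#       revision="main",
#       precision="float16",
#       weight_type="Original",
#       model_type="pretrained",
#   )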


def add_new_eval(
    model: str,
    base_model: str,
    revision: str,
    precision: str,
    weight_type: str,
    model_type: str,
):
    """Validate a model submission and create an evaluation request."""
    try:
        print("\n=== Starting evaluation submission ===")
        print(f"Submission time: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC")
        print(f"Model: {model}")
        print(f"Base model: {base_model}")
        print(f"Revision: {revision}")
        print(f"Precision: {precision}")
        print(f"Weight type: {weight_type}")
        print(f"Model type: {model_type}")
        print(f"Evaluation requests path: {EVAL_REQUESTS_PATH}")
        print(f"Queue repo: {QUEUE_REPO}")

        # Always refresh the cache before checking for duplicates
        print("\n=== Checking for duplicate submissions ===")
        global REQUESTED_MODELS
        global USERS_TO_SUBMISSION_DATES
        start_time = time.time()
        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
        print(f"Cache refresh completed in {time.time() - start_time:.2f} seconds")
        print(f"Found {len(REQUESTED_MODELS)} existing submissions")

        # Split the model id into user name and model path
        user_name = ""
        model_path = model
        if "/" in model:
            user_name = model.split("/")[0]
            model_path = model.split("/")[1]
        print(f"\nUser name: {user_name}")
        print(f"Model path: {model_path}")

        precision = precision.split(" ")[0]
        if revision == "":
            revision = "main"
            print("Using default revision: main")
        current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

        # Check whether this model/revision/precision combination was already submitted
        print("\n=== Checking for existing submission ===")
        model_key = f"{model}_{revision}_{precision}"
        if model_key in REQUESTED_MODELS:
            print(f"Found existing submission with key: {model_key}")
            # Get the status from the queue file
            queue_file = REQUESTED_MODELS[model_key]
            try:
                with open(queue_file, "r") as f:
                    queue_entry = json.load(f)
                status = queue_entry.get("status")
                print(f"Found existing submission with status: {status}")

                if status is None:
                    print(f"Warning: No status found in queue file {queue_file}")
                    return styled_warning("Error checking model status. Please try again later.")

                # Only submissions whose previous run failed may be resubmitted
                if status != EvaluationStatus.FAILED.value:
                    print(f"Model already submitted and in {status} status")
                    return styled_warning(f"This model has been already submitted and is in {status} status.")
            except Exception as e:
                print(f"Error reading queue file: {e}")
                print(f"Full traceback: {traceback.format_exc()}")
                return styled_warning("Error checking model status. Please try again later.")
    except Exception as e:
        print(f"Error during evaluation: {str(e)}")
        raise
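
    # The remaining checks below run outside the try/except above, in this order:
    # model type -> presence on the Hub -> model info -> model size -> license -> model card.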
    print("\n=== Validating model type ===")
    if model_type is None or model_type == "":
        print("Error: Model type is missing")
        return styled_error("Please select a model type.")

    print("\n=== Validating model existence ===")
    if revision == "":
        revision = "main"
        print("Using default revision: main")

    print("\n=== Validating model on Hugging Face ===")
    try:
        if weight_type in ["Delta", "Adapter"]:
            print(f"Checking base model {base_model} on Hugging Face...")
            base_model_on_hub, error, _ = is_model_on_hub(
                model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True
            )
            print(f"Base model check result: {base_model_on_hub}")
            if not base_model_on_hub:
                print(f"Error: Base model not found: {error}")
                return styled_error(f'Base model "{base_model}" {error}')

        if not weight_type == "Adapter":
            print(f"Checking model {model} on Hugging Face...")
            model_on_hub, error, _ = is_model_on_hub(
                model_name=model, revision=revision, token=TOKEN, test_tokenizer=True
            )
            print(f"Model check result: {model_on_hub}")
            if not model_on_hub:
                print(f"Error: Model not found: {error}")
                return styled_error(f'Model "{model}" {error}')
    except Exception as e:
        print(f"Error checking model on Hugging Face: {e}")
        print(f"Full traceback: {traceback.format_exc()}")
        return styled_error(f"Failed to validate model on Hugging Face: {str(e)}")

    print("\n=== Getting model info ===")
    try:
        model_info = API.model_info(repo_id=model, revision=revision)
        print(f"Successfully retrieved model info for {model}")
    except Exception as e:
        print(f"Error getting model info: {e}")
        print(f"Full traceback: {traceback.format_exc()}")
        return styled_error("Could not get your model information. Please fill it up properly.")

    print("\n=== Getting model size ===")
    try:
        model_size = get_model_size(model_info=model_info, precision=precision)
        print(f"Model size: {model_size}")
    except Exception as e:
        print(f"Error getting model size: {e}")
        print(f"Full traceback: {traceback.format_exc()}")
        model_size = "?"
print("\n=== Validating model card and license ===") try: license = model_info.cardData["license"] print(f"Model license: {license}") except Exception as e: print(f"Error getting model license: {e}") print(f"Full traceback: {traceback.format_exc()}") return styled_error("Please select a license for your model") print("\n=== Checking model card ===") try: modelcard_OK, error_msg = check_model_card(model) print(f"Model card check result: {modelcard_OK}") if not modelcard_OK: print(f"Model card error: {error_msg}") return styled_error(error_msg) except Exception as e: print(f"Error checking model card: {e}") print(f"Full traceback: {traceback.format_exc()}") return styled_error("Failed to validate model card") print("\n=== Creating evaluation entry ===") eval_entry = { "model": model, "base_model": base_model, "revision": revision, "precision": precision, "weight_type": weight_type, "status": "PENDING", "submitted_time": current_time, "model_type": model_type, "likes": model_info.likes, "params": model_size, "license": license, "private": False, } print(f"\nEvaluation entry created: {json.dumps(eval_entry, indent=2)}") print("\n=== Checking for duplicate submission ===") model_key = f"{model}_{revision}_{precision}" if model_key in REQUESTED_MODELS: print(f"Found existing submission with key: {model_key}") # Get the status from the queue file queue_file = REQUESTED_MODELS[model_key] try: with open(queue_file, 'r') as f: queue_entry = json.load(f) status = queue_entry.get('status') print(f"Found existing submission with status: {status}") if status is None: print(f"Warning: No status found in queue file {queue_file}") return styled_warning("Error checking model status. Please try again later.") if status != EvaluationStatus.FAILED.value: print(f"Model already submitted and in {status} status") return styled_warning(f"This model has been already submitted and is in {status} status.") except Exception as e: print(f"Error reading queue file: {e}") print(f"Full traceback: {traceback.format_exc()}") return styled_warning("Error checking model status. Please try again later.") print("\n=== Creating evaluation file ===") OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}" print(f"Creating output directory: {OUT_DIR}") os.makedirs(OUT_DIR, exist_ok=True) out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json" print(f"Output file path: {out_path}") # Write evaluation entry to file try: with open(out_path, "w") as f: f.write(json.dumps(eval_entry)) print("\nEvaluation file created successfully") # Upload to Hugging Face print("\n=== Uploading evaluation file ===") API.upload_file( path_or_fileobj=out_path, path_in_repo=out_path.split("eval-queue/")[1], repo_id=QUEUE_REPO, repo_type="dataset", commit_message=f"Add evaluation request for {model}", token=TOKEN ) print(f"\nEvaluation request uploaded successfully to {QUEUE_REPO}") # Clean up local file os.remove(out_path) print("\nLocal evaluation file removed") return styled_message( "Evaluation request created successfully! Please wait for the evaluation to complete." 


# Evaluation over tsac_dataset and the ArabML Tunisian corpus. This block expects
# `model_obj`, `tokenizer`, `tsac_dataset`, `device`, `eval_entry`, and `out_path`
# to be provided by the enclosing evaluation pipeline.
dataloader = DataLoader(tsac_dataset, batch_size=32, shuffle=False)

model_obj.eval()
with torch.no_grad():
    predictions = []
    targets = []
    for batch in dataloader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != "target"}
        target = batch["target"].to(device)

        # Log the first batch details
        if len(predictions) == 0:  # Only log for the first batch
            print("\nFirst batch example:")
            print(f"Input keys: {list(inputs.keys())}")
            print(f"Target shape: {target.shape}")

        outputs = model_obj(**inputs)
        print(f"\nModel output type: {type(outputs)}")

        # Try to get logits from different possible output formats
        if isinstance(outputs, dict):
            print(f"Output keys: {list(outputs.keys())}")
            # Try different common keys
            if "logits" in outputs:
                logits = outputs["logits"]
            elif "prediction_logits" in outputs:
                logits = outputs["prediction_logits"]
            else:
                raise ValueError(f"Unknown output format. Available keys: {list(outputs.keys())}")
        elif isinstance(outputs, tuple):
            print(f"Output tuple length: {len(outputs)}")
            # Try different positions in the tuple
            if len(outputs) > 0:
                logits = outputs[0]
            else:
                raise ValueError("Empty output tuple")
        else:
            # If it's a single tensor, assume it's the logits
            logits = outputs

        print(f"Logits shape: {logits.shape}")

        # For sequence classification, we typically use the [CLS] token's prediction
        # Get the first token's prediction (CLS token)
        cls_logits = logits[:, 0, :]  # Shape: [batch_size, num_classes]

        predictions.extend(cls_logits.argmax(dim=-1).cpu().tolist())
        targets.extend(target.cpu().tolist())

accuracy = sum(p == t for p, t in zip(predictions, targets)) / len(predictions)
eval_entry["results"] = {"accuracy": accuracy}

# Update the queue file with results
with open(out_path, "w") as f:
    f.write(json.dumps(eval_entry))

# Evaluate on ArabML
print("Evaluating on ArabML Tunisian Corpus...")
arabml_dataset = load_dataset("arbml/Tunisian_Dialect_Corpus", split="train", trust_remote_code=True)


def preprocess_arabml(examples):
    return tokenizer(examples["Tweet"], padding=True, truncation=True, max_length=512)


arabml_dataset = arabml_dataset.map(preprocess_arabml, batched=True)

# Measure tokenizer vocabulary coverage: the share of tokens not mapped to the unknown token
total_tokens = 0
covered_tokens = 0
for example in arabml_dataset:
    tokens = tokenizer.tokenize(example["Tweet"])
    total_tokens += len(tokens)
    covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
arabml_coverage = covered_tokens / total_tokens if total_tokens > 0 else 0

# Store results