Spaces:
Runtime error
Runtime error
| """ | |
| Initialize the leaderboard with specific models and compute their p-values. | |
| This module ensures only the specified models are included in the leaderboard | |
| and their model trace p-values are computed. | |
| """ | |
| import os | |
| import json | |
| import sys | |
| from src.evaluation.model_trace_eval import compute_model_trace_p_value | |
| from src.envs import EVAL_RESULTS_PATH | |
# HuggingFace identifiers of the only models permitted on the leaderboard.
ALLOWED_MODELS = [
    "lmsys/vicuna-7b-v1.5",
    "ibm-granite/granite-7b-base",
    "EleutherAI/llemma_7b",
]
def create_model_result_file(model_name, precision="float16"):
    """
    Create a placeholder result file for a model unless one already exists.

    The file is written into ``EVAL_RESULTS_PATH`` and contains the model's
    config (dtype, name, revision) plus an empty ``results`` mapping. No
    p-value or other metric is computed or stored by this function.

    Args:
        model_name: HuggingFace model identifier
        precision: Model precision; used in the file name and stored as the
            ``torch.<precision>`` dtype string

    Returns:
        Path of the existing or newly written result file, or None if the
        file could not be written.
    """
    sys.stderr.write(f"\n๐ง CREATING RESULT FILE FOR: {model_name}\n")
    sys.stderr.flush()

    # Create the results directory if it doesn't exist
    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)

    # Generate a safe filename ("/" and "-" both become "_")
    safe_name = model_name.replace("/", "_").replace("-", "_")
    result_file = os.path.join(EVAL_RESULTS_PATH, f"{safe_name}_{precision}.json")

    sys.stderr.write(f"๐ Result file path: {result_file}\n")
    sys.stderr.flush()

    # An existing file is kept untouched and reported as success
    if os.path.exists(result_file):
        sys.stderr.write(f"โ Result file already exists: {result_file}\n")
        sys.stderr.flush()
        return result_file

    # Basic result structure
    result_data = {
        "config": {
            "model_dtype": f"torch.{precision}",
            "model_name": model_name,
            "model_sha": "main"
        },
        "results": {
            # No perplexity - we only care about p-values
        }
    }

    # Save the result file
    try:
        with open(result_file, 'w', encoding="utf-8") as f:
            json.dump(result_data, f, indent=2)
    except Exception as e:
        # A failed write can leave an empty or truncated file behind. Remove
        # it so a later call does not mistake it for a valid existing result.
        try:
            os.remove(result_file)
        except OSError:
            # Nothing was created (or it cannot be removed); the failure is
            # still reported below.
            pass
        sys.stderr.write(f"โ Failed to create result file: {e}\n")
        sys.stderr.flush()
        return None

    sys.stderr.write(f"โ Created result file: {result_file}\n")
    sys.stderr.flush()
    return result_file
def clean_non_allowed_results():
    """
    Delete stored result files that belong to models outside the allowed list.

    Every ``.json`` file found under ``EVAL_RESULTS_PATH`` (searched
    recursively) is parsed and its ``config.model_name`` is checked with
    ``is_model_allowed``. Files without a model name are kept and reported.
    Files that cannot be read, parsed or removed are reported and skipped.
    """
    sys.stderr.write(f"\n๐งน CLEANING NON-ALLOWED RESULT FILES\n")
    sys.stderr.flush()

    if not os.path.exists(EVAL_RESULTS_PATH):
        sys.stderr.write("๐ Results directory doesn't exist, nothing to clean\n")
        sys.stderr.flush()
        return

    deleted = 0

    # Visit every JSON file below the results directory
    for dirpath, _subdirs, filenames in os.walk(EVAL_RESULTS_PATH):
        for filename in filenames:
            if not filename.endswith('.json'):
                continue

            json_path = os.path.join(dirpath, filename)

            try:
                # The owning model is recorded under config.model_name
                with open(json_path, 'r') as handle:
                    payload = json.load(handle)
                owner = payload.get("config", {}).get("model_name", "")

                if not owner:
                    sys.stderr.write(f"โ ๏ธ Skipping file with no model_name: {json_path}\n")
                elif not is_model_allowed(owner):
                    sys.stderr.write(f"๐๏ธ Removing non-allowed model result: {json_path} (model: {owner})\n")
                    os.remove(json_path)
                    deleted += 1
            except Exception as err:
                # Best effort: report the problem and move on to the next file
                sys.stderr.write(f"โ ๏ธ Error processing file {json_path}: {err}\n")

    sys.stderr.write(f"โ Removed {deleted} non-allowed result files\n")
    sys.stderr.flush()
def initialize_allowed_models():
    """
    Make sure a result file exists for each model in ``ALLOWED_MODELS``.

    Result files of non-allowed models are removed first. Then a result file
    is created (or an existing one reused) for every allowed model. A failure
    for one model is logged and does not stop the others.

    Returns:
        list: Paths of the result files that exist after initialization
    """
    sys.stderr.write(f"\n๐ INITIALIZING ALLOWED MODELS\n")
    sys.stderr.write(f"๐ Models to initialize: {ALLOWED_MODELS}\n")
    sys.stderr.flush()

    # Drop stale results of models that are no longer allowed
    clean_non_allowed_results()

    ready_paths = []
    for candidate in ALLOWED_MODELS:
        try:
            path = create_model_result_file(candidate)
        except Exception as exc:
            sys.stderr.write(f"โ Failed to initialize {candidate}: {exc}\n")
            sys.stderr.flush()
        else:
            # A None path means the file could not be written
            if path:
                ready_paths.append(path)

    sys.stderr.write(f"โ Initialized {len(ready_paths)} model result files\n")
    sys.stderr.flush()
    return ready_paths
def is_model_allowed(model_name):
    """
    Tell whether a model appears in ``ALLOWED_MODELS``.

    The comparison is an exact match against the listed identifiers.

    Args:
        model_name: HuggingFace model identifier

    Returns:
        bool: True if model is allowed
    """
    permitted = ALLOWED_MODELS
    return model_name in permitted
def get_allowed_models():
    """
    Return the allowed model names as a new list.

    The result is a shallow copy, so callers may modify it without affecting
    ``ALLOWED_MODELS``.

    Returns:
        list: List of allowed model names
    """
    return list(ALLOWED_MODELS)