| """ | |
| Data Loader: Load from HuggingFace, parse JSON files, and build tables. | |
| """ | |
| import json | |
| import pandas as pd | |
| from pathlib import Path | |
| from datasets import load_dataset | |
| # Global caches | |
| HF_DATASET_CACHE = {} | |
| LEADERBOARD_CACHE = {} | |
| DATA_DIR = Path("leaderboard_data") | |
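# Expected cache shapes once the loaders below have run (an illustrative sketch; the
# values shown are placeholders, only the keys come from the parsing code in this file):
#
#   HF_DATASET_CACHE["<leaderboard name>"] = [
#       {
#           "leaderboard": ..., "provider": ..., "model": ..., "developer": ...,
#           "params": ..., "architecture": ..., "precision": ...,
#           "results": {"<evaluation_name>": <score>, ...},
#           "raw_data": {...},
#       },
#       ...
#   ]
#   LEADERBOARD_CACHE["<leaderboard name>"] = (<pandas.DataFrame>, None)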
def load_hf_dataset_on_startup():
    """Load all splits from the HuggingFace dataset at startup.

    Returns True on success, False if the local file-system fallback should be used.
    """
    print("Loading dataset from HuggingFace...")
    try:
        dataset = load_dataset("evaleval/every_eval_ever")
        for split_name, split_data in dataset.items():
            print(f"Loading split: {split_name} ({len(split_data)} rows)")
            df = split_data.to_pandas()
            parsed_items = []
            for _, row in df.iterrows():
                # Flatten the per-evaluation scores into a {evaluation_name: score} dict
                evaluation_results = json.loads(row['evaluation_results'])
                results = {}
                for eval_result in evaluation_results:
                    eval_name = eval_result.get("evaluation_name")
                    score = eval_result.get("score_details", {}).get("score")
                    if eval_name and score is not None:
                        results[eval_name] = score
                additional_details = {}
                if pd.notna(row.get('additional_details')):
                    additional_details = json.loads(row['additional_details'])
                parsed_item = {
                    "leaderboard": row['_leaderboard'],
                    "provider": row['source_organization_name'],
                    "model": row['model_id'],
                    "developer": row['model_developer'],
                    "params": additional_details.get('params_billions'),
                    "architecture": additional_details.get('architecture', 'Unknown'),
                    "precision": additional_details.get('precision', 'Unknown'),
                    "results": results,
                    "raw_data": {
                        "schema_version": row['schema_version'],
                        "evaluation_id": row['evaluation_id'],
                        "retrieved_timestamp": row['retrieved_timestamp'],
                        "source_data": json.loads(row['source_data']),
                        "evaluation_source": {
                            "evaluation_source_name": row['evaluation_source_name'],
                            "evaluation_source_type": row['evaluation_source_type']
                        },
                        "source_metadata": {
                            "source_organization_name": row['source_organization_name'],
                            "evaluator_relationship": row['evaluator_relationship'],
                        },
                        "model_info": {
                            "name": row['model_name'],
                            "id": row['model_id'],
                            "developer": row['model_developer'],
                        },
                        "evaluation_results": evaluation_results,
                        "additional_details": additional_details
                    }
                }
                # Optional fields are only attached when the source row provides them
                if pd.notna(row.get('source_organization_url')):
                    parsed_item["raw_data"]["source_metadata"]["source_organization_url"] = row['source_organization_url']
                if pd.notna(row.get('source_organization_logo_url')):
                    parsed_item["raw_data"]["source_metadata"]["source_organization_logo_url"] = row['source_organization_logo_url']
                if pd.notna(row.get('model_inference_platform')):
                    parsed_item["raw_data"]["model_info"]["inference_platform"] = row['model_inference_platform']
                parsed_items.append(parsed_item)
            HF_DATASET_CACHE[split_name] = parsed_items
        print(f"Loaded {len(HF_DATASET_CACHE)} leaderboard(s) from HuggingFace")
        return True
    except Exception as e:
        print(f"Warning: Could not load HuggingFace dataset: {e}")
        print("Falling back to local file system...")
        return False
def parse_eval_json(file_path):
    """Parses a single JSON file to extract model, provider, and results."""
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
        leaderboard_name = data.get("evaluation_source", {}).get("evaluation_source_name", "Unknown Leaderboard")
        provider_name = data.get("source_metadata", {}).get("source_organization_name", "Unknown Provider")
        model_id = data.get("model_info", {}).get("id", "Unknown Model")
        developer_name = data.get("model_info", {}).get("developer", "Unknown Developer")
        params = data.get("model_info", {}).get("params_billions", None)
        architecture = data.get("model_info", {}).get("architecture", "Unknown")
        precision = data.get("additional_details", {}).get("precision", "Unknown")
        if precision == "Unknown":
            precision = data.get("model_info", {}).get("precision", "Unknown")
        results = {}
        if "evaluation_results" in data:
            for res in data["evaluation_results"]:
                eval_name = res.get("evaluation_name", "Unknown Metric")
                score = res.get("score_details", {}).get("score", None)
                if score is not None:
                    results[eval_name] = score
        return {
            "leaderboard": leaderboard_name,
            "provider": provider_name,
            "model": model_id,
            "developer": developer_name,
            "params": params,
            "architecture": architecture,
            "precision": precision,
            "results": results,
            "raw_data": data
        }
    except Exception as e:
        print(f"Error parsing {file_path}: {e}")
        return None
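# A minimal sketch of the JSON shape parse_eval_json() reads; the values below are
# placeholders, only the keys are taken from the lookups in the function above:
#
#   {
#       "evaluation_source": {"evaluation_source_name": "..."},
#       "source_metadata": {"source_organization_name": "..."},
#       "model_info": {"id": "...", "developer": "...", "params_billions": 7,
#                      "architecture": "...", "precision": "..."},
#       "additional_details": {"precision": "..."},
#       "evaluation_results": [
#           {"evaluation_name": "...", "score_details": {"score": 0.5}}
#       ]
#   }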
def get_available_leaderboards():
    """Returns available leaderboards from the HF cache or the local directory."""
    if HF_DATASET_CACHE:
        return list(HF_DATASET_CACHE.keys())
    if not DATA_DIR.exists():
        return []
    return [d.name for d in DATA_DIR.iterdir() if d.is_dir()]
def walk_eval_files(leaderboard_name):
    """Generator that recursively yields every JSON file under a leaderboard's directory."""
    lb_path = DATA_DIR / leaderboard_name
    if not lb_path.exists():
        return
    yield from lb_path.rglob("*.json")
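# Illustrative on-disk layout assumed by the file-system fallback (directory and file
# names are placeholders; any nesting below the leaderboard directory is allowed):
#
#   leaderboard_data/
#       <leaderboard_name>/
#           .../<eval_record>.json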
def get_eval_metadata(selected_leaderboard):
    """Extracts evaluation metadata (source info and per-metric configs) for a leaderboard."""
    if not selected_leaderboard:
        return {}
    eval_metadata = {"evals": {}, "source_info": {}}
    if selected_leaderboard in HF_DATASET_CACHE:
        parsed_items = HF_DATASET_CACHE[selected_leaderboard]
        if parsed_items:
            # Metadata is shared across a leaderboard, so the first parsed item is enough
            parsed = parsed_items[0]
            source_meta = parsed["raw_data"].get("source_metadata", {})
            source_data_list = parsed["raw_data"].get("source_data", [])
            url = source_data_list[0] if isinstance(source_data_list, list) and source_data_list else "#"
            eval_metadata["source_info"] = {
                "organization": source_meta.get("source_organization_name", "Unknown"),
                "relationship": source_meta.get("evaluator_relationship", "Unknown"),
                "url": url
            }
            if "evaluation_results" in parsed["raw_data"]:
                for res in parsed["raw_data"]["evaluation_results"]:
                    eval_name = res.get("evaluation_name", "Unknown Metric")
                    if eval_name not in eval_metadata["evals"]:
                        metric_config = res.get("metric_config", {})
                        eval_metadata["evals"][eval_name] = {
                            "description": metric_config.get("evaluation_description", "No description available"),
                            "score_type": metric_config.get("score_type", "unknown"),
                            "lower_is_better": metric_config.get("lower_is_better", False),
                            "min_score": metric_config.get("min_score"),
                            "max_score": metric_config.get("max_score"),
                            "level_names": metric_config.get("level_names", []),
                            "level_metadata": metric_config.get("level_metadata", []),
                            "has_unknown_level": metric_config.get("has_unknown_level", False)
                        }
        return eval_metadata
    # Fall back to the file system: stop after the first file that parses successfully
    for json_file in walk_eval_files(selected_leaderboard):
        parsed = parse_eval_json(json_file)
        if parsed:
            if not eval_metadata["source_info"]:
                source_meta = parsed["raw_data"].get("source_metadata", {})
                source_data_list = parsed["raw_data"].get("source_data", [])
                url = source_data_list[0] if isinstance(source_data_list, list) and source_data_list else "#"
                eval_metadata["source_info"] = {
                    "organization": source_meta.get("source_organization_name", "Unknown"),
                    "relationship": source_meta.get("evaluator_relationship", "Unknown"),
                    "url": url
                }
            if "evaluation_results" in parsed["raw_data"]:
                for res in parsed["raw_data"]["evaluation_results"]:
                    eval_name = res.get("evaluation_name", "Unknown Metric")
                    if eval_name not in eval_metadata["evals"]:
                        metric_config = res.get("metric_config", {})
                        eval_metadata["evals"][eval_name] = {
                            "description": metric_config.get("evaluation_description", "No description available"),
                            "score_type": metric_config.get("score_type", "unknown"),
                            "lower_is_better": metric_config.get("lower_is_better", False),
                            "min_score": metric_config.get("min_score"),
                            "max_score": metric_config.get("max_score"),
                            "level_names": metric_config.get("level_names", []),
                            "level_metadata": metric_config.get("level_metadata", []),
                            "has_unknown_level": metric_config.get("has_unknown_level", False)
                        }
            break
    return eval_metadata
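# Shape of the dict returned by get_eval_metadata() (illustrative; values are placeholders):
#
#   {
#       "source_info": {"organization": "...", "relationship": "...", "url": "..."},
#       "evals": {
#           "<evaluation_name>": {
#               "description": "...", "score_type": "...", "lower_is_better": False,
#               "min_score": ..., "max_score": ..., "level_names": [...],
#               "level_metadata": [...], "has_unknown_level": False,
#           },
#       },
#   }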
def build_leaderboard_table(selected_leaderboard, search_query="", progress_callback=None):
    """Builds the leaderboard DataFrame from the cache, the HF dataset, or local files."""
    if not selected_leaderboard:
        return pd.DataFrame()
    if selected_leaderboard in LEADERBOARD_CACHE:
        df, _ = LEADERBOARD_CACHE[selected_leaderboard]
    else:
        rows = []
        if selected_leaderboard in HF_DATASET_CACHE:
            if progress_callback:
                progress_callback(0, desc=f"Loading {selected_leaderboard} from cache...")
            parsed_items = HF_DATASET_CACHE[selected_leaderboard]
            for i, parsed in enumerate(parsed_items):
                if i % 100 == 0 and progress_callback:
                    progress_callback((i / len(parsed_items)), desc=f"Processing {selected_leaderboard}...")
                row = {
                    "Model": parsed["model"],
                    "Developer": parsed["developer"],
                    "Params (B)": parsed["params"],
                    "Arch": parsed["architecture"],
                    "Precision": parsed["precision"]
                }
                row.update(parsed["results"])
                rows.append(row)
        else:
            # Fall back to the file system
            if progress_callback:
                progress_callback(0, desc=f"Scanning {selected_leaderboard}...")
            all_files = list(walk_eval_files(selected_leaderboard))
            total_files = len(all_files)
            for i, json_file in enumerate(all_files):
                if i % 100 == 0 and progress_callback:
                    progress_callback((i / total_files), desc=f"Loading {selected_leaderboard}...")
                parsed = parse_eval_json(json_file)
                if parsed:
                    row = {
                        "Model": parsed["model"],
                        "Developer": parsed["developer"],
                        "Params (B)": parsed["params"],
                        "Arch": parsed["architecture"],
                        "Precision": parsed["precision"]
                    }
                    row.update(parsed["results"])
                    rows.append(row)
        if not rows:
            df = pd.DataFrame(columns=["Model", "Developer", "Params (B)", "Arch", "Precision"])
            LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
            return df
        df = pd.DataFrame(rows)
        df = df.dropna(axis=1, how='all')
        if df.empty:
            LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
            return df
        numeric_cols = df.select_dtypes(include=['float', 'int']).columns
        df[numeric_cols] = df[numeric_cols].round(2)
        # Add an Average column over the evaluation score columns (excluding Params (B))
        eval_only_cols = [c for c in numeric_cols if c not in ["Params (B)"]]
        if len(eval_only_cols) > 0:
            df["Average"] = df[eval_only_cols].mean(axis=1).round(2)
        # Column order: base columns (Model, Developer, Params, Average),
        # then all evaluation score columns sorted alphabetically,
        # then model detail columns (Arch, Precision) at the end
        base_cols = ["Model", "Developer", "Params (B)", "Average"]
        model_detail_cols = ["Arch", "Precision"]
        eval_cols = [c for c in df.columns if c not in base_cols and c not in model_detail_cols]
        base_cols = [c for c in base_cols if c in df.columns]
        model_detail_cols = [c for c in model_detail_cols if c in df.columns]
        final_cols = base_cols + sorted(eval_cols) + model_detail_cols
        df = df[final_cols]
        if "Average" in df.columns:
            df = df.sort_values("Average", ascending=False)
        LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
    return df
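# Example usage (illustrative; the leaderboard name is a placeholder):
#
#   df = build_leaderboard_table("<leaderboard_name>")
#   # Columns are ordered: Model, Developer, Params (B), Average,
#   # <evaluation columns sorted alphabetically>, Arch, Precision;
#   # rows are sorted by Average, descending.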
def clear_cache():
    """Clears the cached leaderboard tables."""
    LEADERBOARD_CACHE.clear()
def search_model_across_leaderboards(model_query):
    """Search for a model across all leaderboards and return aggregated results."""
    if not model_query or not HF_DATASET_CACHE:
        return {}, []
    model_query_lower = model_query.lower().strip()
    results = {}
    all_matches = []
    for leaderboard_name, parsed_items in HF_DATASET_CACHE.items():
        for item in parsed_items:
            model_id = item.get("model", "")
            # Check whether the query matches the model id (case-insensitive, partial match)
            if model_query_lower in model_id.lower():
                all_matches.append(model_id)
                # Exact matches get priority
                if model_id.lower() == model_query_lower or model_id == model_query:
                    if model_id not in results:
                        results[model_id] = {}
                    results[model_id][leaderboard_name] = {
                        "developer": item.get("developer"),
                        "params": item.get("params"),
                        "architecture": item.get("architecture"),
                        "precision": item.get("precision"),
                        "results": item.get("results", {})
                    }
    # If there is no exact match, fall back to collecting all partial matches
    if not results and all_matches:
        for leaderboard_name, parsed_items in HF_DATASET_CACHE.items():
            for item in parsed_items:
                model_id = item.get("model", "")
                if model_query_lower in model_id.lower():
                    if model_id not in results:
                        results[model_id] = {}
                    results[model_id][leaderboard_name] = {
                        "developer": item.get("developer"),
                        "params": item.get("params"),
                        "architecture": item.get("architecture"),
                        "precision": item.get("precision"),
                        "results": item.get("results", {})
                    }
    # Return unique matches for autocomplete, limited to 20 suggestions
    unique_matches = sorted(set(all_matches))[:20]
    return results, unique_matches
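# Return shape of search_model_across_leaderboards() (illustrative; names are placeholders):
#
#   results = {
#       "<model_id>": {
#           "<leaderboard_name>": {"developer": ..., "params": ..., "architecture": ...,
#                                  "precision": ..., "results": {...}},
#       },
#   }
#   unique_matches = ["<model_id>", ...]   # at most 20 suggestions, for autocomplete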
def get_all_model_names():
    """Get all unique model names across all leaderboards."""
    if not HF_DATASET_CACHE:
        return []
    models = set()
    for parsed_items in HF_DATASET_CACHE.values():
        for item in parsed_items:
            models.add(item.get("model", ""))
    return sorted(models)
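

if __name__ == "__main__":
    # Illustrative smoke test, not the Space's entry point: load the dataset, list the
    # available leaderboards, and build one table to exercise the pipeline end to end.
    if load_hf_dataset_on_startup():
        boards = get_available_leaderboards()
        print(f"Available leaderboards: {boards}")
        if boards:
            table = build_leaderboard_table(boards[0])
            print(table.head())
    else:
        print("HuggingFace load failed; relying on local files under", DATA_DIR)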