""" HuggingFace Operations: Upload data, create PRs, validate schemas. """ from huggingface_hub import HfApi, login import pandas as pd import json from pathlib import Path from jsonschema import validate, ValidationError, Draft7Validator # Load schema once at module level SCHEMA_PATH = Path(__file__).parent / "eval.schema.json" with open(SCHEMA_PATH, 'r') as f: EVAL_SCHEMA = json.load(f) def validate_json_against_schema(json_data): """ Validate a JSON object against eval.schema.json. Args: json_data: Dict containing the evaluation data Returns: (bool, str): (is_valid, error_message) """ try: validate(instance=json_data, schema=EVAL_SCHEMA) return True, "Schema validation passed" except ValidationError as e: # Extract the most relevant error message error_path = " → ".join(str(p) for p in e.path) if e.path else "root" return False, f"❌ Schema validation failed at '{error_path}': {e.message}" except Exception as e: return False, f"❌ Validation error: {str(e)}" def upload_to_hf_dataset(parquet_file, split_name, repo_id): """ Upload a parquet file as a new split to the HF dataset. Args: parquet_file: Path to parquet file split_name: Name of the split (leaderboard name) repo_id: HuggingFace dataset repository ID """ # TODO: Implement upload logic pass def check_hf_authentication(): """ Check if user is authenticated with HuggingFace. Returns: (bool, str): (is_authenticated, username or error_message) """ try: api = HfApi() user_info = api.whoami() return True, user_info['name'] except Exception as e: return False, "Not authenticated. Run: huggingface-cli login" def check_duplicate_pr_exists(leaderboard_name, repo_id): """ Check if a PR already exists for this leaderboard. Args: leaderboard_name: Name of the leaderboard repo_id: HuggingFace dataset repository ID Returns: (bool, str or None): (exists, pr_url if exists) """ try: api = HfApi() discussions = api.get_repo_discussions(repo_id=repo_id, repo_type="dataset") # Check for open PRs with matching title pr_title_pattern = f"add new leaderboard: {leaderboard_name.lower()}" for discussion in discussions: if discussion.is_pull_request and discussion.status == "open": if pr_title_pattern in discussion.title.lower(): pr_url = f"https://huggingface.co/datasets/{repo_id}/discussions/{discussion.num}" return True, pr_url return False, None except Exception as e: # If we can't check, assume no duplicate (fail open) print(f"Warning: Could not check for duplicate PRs: {e}") return False, None def create_pr_for_new_leaderboard(leaderboard_name, parquet_file, repo_id): """ Create a pull request to add a new leaderboard split. Args: leaderboard_name: Name of the new leaderboard parquet_file: Path to parquet file repo_id: HuggingFace dataset repository ID Returns: (success, pr_url or error_message) """ # 1. Check authentication is_auth, auth_result = check_hf_authentication() if not is_auth: return False, f"❌ {auth_result}" # 2. Check for duplicate PR has_duplicate, duplicate_url = check_duplicate_pr_exists(leaderboard_name, repo_id) if has_duplicate: return False, f"⚠️ PR already exists: {duplicate_url}" # 3. Validate parquet file exists and has data parquet_path = Path(parquet_file) if not parquet_path.exists(): return False, "❌ Parquet file not found" df = pd.read_parquet(parquet_file) if len(df) == 0: return False, "❌ Parquet file is empty" # 4. 

def check_hf_authentication():
    """
    Check if the user is authenticated with HuggingFace.

    Returns:
        (bool, str): (is_authenticated, username or error_message)
    """
    try:
        api = HfApi()
        user_info = api.whoami()
        return True, user_info['name']
    except Exception:
        return False, "Not authenticated. Run: huggingface-cli login"


def check_duplicate_pr_exists(leaderboard_name, repo_id):
    """
    Check if a PR already exists for this leaderboard.

    Args:
        leaderboard_name: Name of the leaderboard
        repo_id: HuggingFace dataset repository ID

    Returns:
        (bool, str or None): (exists, pr_url if exists)
    """
    try:
        api = HfApi()
        discussions = api.get_repo_discussions(repo_id=repo_id, repo_type="dataset")

        # Check for open PRs with a matching title
        pr_title_pattern = f"add new leaderboard: {leaderboard_name.lower()}"
        for discussion in discussions:
            if discussion.is_pull_request and discussion.status == "open":
                if pr_title_pattern in discussion.title.lower():
                    pr_url = f"https://huggingface.co/datasets/{repo_id}/discussions/{discussion.num}"
                    return True, pr_url
        return False, None
    except Exception as e:
        # If we can't check, assume no duplicate (fail open)
        print(f"Warning: Could not check for duplicate PRs: {e}")
        return False, None


def create_pr_for_new_leaderboard(leaderboard_name, parquet_file, repo_id):
    """
    Create a pull request to add a new leaderboard split.

    Args:
        leaderboard_name: Name of the new leaderboard
        parquet_file: Path to parquet file
        repo_id: HuggingFace dataset repository ID

    Returns:
        (bool, str): (success, pr_url or error_message)
    """
    # 1. Check authentication
    is_auth, auth_result = check_hf_authentication()
    if not is_auth:
        return False, f"❌ {auth_result}"

    # 2. Check for a duplicate PR
    has_duplicate, duplicate_url = check_duplicate_pr_exists(leaderboard_name, repo_id)
    if has_duplicate:
        return False, f"⚠️ PR already exists: {duplicate_url}"

    # 3. Validate that the parquet file exists and has data
    parquet_path = Path(parquet_file)
    if not parquet_path.exists():
        return False, "❌ Parquet file not found"

    df = pd.read_parquet(parquet_file)
    if len(df) == 0:
        return False, "❌ Parquet file is empty"

    # 4. Create the PR
    try:
        api = HfApi()
        commit_message = f"Add new leaderboard: {leaderboard_name}"

        # Upload the file and open a PR in a single commit
        commit_info = api.upload_file(
            path_or_fileobj=parquet_file,
            path_in_repo=f"data/{leaderboard_name}.parquet",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message=commit_message,
            create_pr=True,
        )

        # Extract the PR URL from the commit info
        pr_url = commit_info.pr_url if hasattr(commit_info, 'pr_url') else (
            f"https://huggingface.co/datasets/{repo_id}/discussions"
        )
        return True, f"PR created ({len(df)} rows): {pr_url}"
    except Exception as e:
        return False, f"❌ Failed to create PR: {str(e)}"


def validate_schema(parquet_file):
    """
    Validate that a parquet file matches the expected schema.

    Args:
        parquet_file: Path to parquet file to validate

    Returns:
        (bool, str): (is_valid, error_message)
    """
    try:
        df = pd.read_parquet(parquet_file)

        # Required columns
        required_cols = [
            '_leaderboard', '_developer', '_model', '_uuid',
            'schema_version', 'evaluation_id', 'retrieved_timestamp',
            'source_data', 'evaluation_source_name', 'evaluation_source_type',
            'source_organization_name', 'evaluator_relationship',
            'model_name', 'model_id', 'model_developer', 'evaluation_results'
        ]

        missing = [col for col in required_cols if col not in df.columns]
        if missing:
            return False, f"Missing required columns: {', '.join(missing)}"

        # Check data types (all columns should be strings)
        for col in df.columns:
            if df[col].dtype not in ['object', 'string']:
                return False, f"Column '{col}' has wrong type: {df[col].dtype} (expected string)"

        return True, "Schema validation passed"
    except Exception as e:
        return False, f"Validation error: {str(e)}"


def export_to_json(parquet_file, output_dir):
    """
    Export parquet data back to JSON files.

    Uses the parquet_to_folder function from json_to_parquet.py.

    Args:
        parquet_file: Path to parquet file
        output_dir: Directory to write JSON files to
    """
    from json_to_parquet import parquet_to_folder
    parquet_to_folder(parquet_file, output_dir)
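

# Minimal smoke test sketching how the helpers compose: validate locally first,
# then open a PR only if validation passes. The file path and repo ID below are
# illustrative placeholders, not values used by this project.
if __name__ == "__main__":
    _example_parquet = "data/example_leaderboard.parquet"  # hypothetical path
    _example_repo = "example-org/example-evals"            # hypothetical repo ID

    ok, msg = validate_schema(_example_parquet)
    print(msg)
    if ok:
        ok, result = create_pr_for_new_leaderboard(
            "example_leaderboard", _example_parquet, _example_repo
        )
        print(result)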