| """ | |
| HuggingFace Operations: Upload data, create PRs, validate schemas. | |
| """ | |
| from huggingface_hub import HfApi, login | |
| import pandas as pd | |
| import json | |
| from pathlib import Path | |
| from jsonschema import validate, ValidationError, Draft7Validator | |
| # Load schema once at module level | |
| SCHEMA_PATH = Path(__file__).parent / "eval.schema.json" | |
| with open(SCHEMA_PATH, 'r') as f: | |
| EVAL_SCHEMA = json.load(f) | |
def validate_json_against_schema(json_data):
    """
    Validate a JSON object against eval.schema.json.

    Args:
        json_data: Dict containing the evaluation data

    Returns:
        (bool, str): (is_valid, error_message)
    """
    try:
        validate(instance=json_data, schema=EVAL_SCHEMA)
        return True, "Schema validation passed"
    except ValidationError as e:
        # Extract the most relevant error message, including the path to the failing field
        error_path = " → ".join(str(p) for p in e.path) if e.path else "root"
        return False, f"❌ Schema validation failed at '{error_path}': {e.message}"
    except Exception as e:
        return False, f"❌ Validation error: {str(e)}"
def upload_to_hf_dataset(parquet_file, split_name, repo_id):
    """
    Upload a parquet file as a new split to the HF dataset.

    Args:
        parquet_file: Path to parquet file
        split_name: Name of the split (leaderboard name)
        repo_id: HuggingFace dataset repository ID
    """
    # Minimal sketch of the upload logic, mirroring create_pr_for_new_leaderboard
    # below; assumes the dataset stores one parquet file per split under data/.
    api = HfApi()
    api.upload_file(
        path_or_fileobj=str(parquet_file),
        path_in_repo=f"data/{split_name}.parquet",
        repo_id=repo_id,
        repo_type="dataset",
        commit_message=f"Add split: {split_name}",
    )
def check_hf_authentication():
    """
    Check if the user is authenticated with HuggingFace.

    Returns:
        (bool, str): (is_authenticated, username or error_message)
    """
    try:
        api = HfApi()
        user_info = api.whoami()
        return True, user_info['name']
    except Exception:
        return False, "Not authenticated. Run: huggingface-cli login"
def check_duplicate_pr_exists(leaderboard_name, repo_id):
    """
    Check if an open PR already exists for this leaderboard.

    Args:
        leaderboard_name: Name of the leaderboard
        repo_id: HuggingFace dataset repository ID

    Returns:
        (bool, str or None): (exists, pr_url if exists)
    """
    try:
        api = HfApi()
        discussions = api.get_repo_discussions(repo_id=repo_id, repo_type="dataset")
        # Check for open PRs with a matching title
        pr_title_pattern = f"add new leaderboard: {leaderboard_name.lower()}"
        for discussion in discussions:
            if discussion.is_pull_request and discussion.status == "open":
                if pr_title_pattern in discussion.title.lower():
                    pr_url = f"https://huggingface.co/datasets/{repo_id}/discussions/{discussion.num}"
                    return True, pr_url
        return False, None
    except Exception as e:
        # If we can't check, assume no duplicate (fail open)
        print(f"Warning: Could not check for duplicate PRs: {e}")
        return False, None
def create_pr_for_new_leaderboard(leaderboard_name, parquet_file, repo_id):
    """
    Create a pull request to add a new leaderboard split.

    Args:
        leaderboard_name: Name of the new leaderboard
        parquet_file: Path to parquet file
        repo_id: HuggingFace dataset repository ID

    Returns:
        (bool, str): (success, pr_url or error_message)
    """
    # 1. Check authentication
    is_auth, auth_result = check_hf_authentication()
    if not is_auth:
        return False, f"❌ {auth_result}"

    # 2. Check for a duplicate PR
    has_duplicate, duplicate_url = check_duplicate_pr_exists(leaderboard_name, repo_id)
    if has_duplicate:
        return False, f"⚠️ PR already exists: {duplicate_url}"

    # 3. Validate that the parquet file exists and has data
    parquet_path = Path(parquet_file)
    if not parquet_path.exists():
        return False, "❌ Parquet file not found"
    try:
        df = pd.read_parquet(parquet_file)
    except Exception as e:
        return False, f"❌ Could not read parquet file: {e}"
    if len(df) == 0:
        return False, "❌ Parquet file is empty"

    # 4. Upload the file and open a PR in a single commit
    try:
        api = HfApi()
        commit_message = f"Add new leaderboard: {leaderboard_name}"
        commit_info = api.upload_file(
            path_or_fileobj=parquet_file,
            path_in_repo=f"data/{leaderboard_name}.parquet",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message=commit_message,
            create_pr=True,
        )
        # Extract the PR URL from the commit info, falling back to the discussions page
        pr_url = commit_info.pr_url if hasattr(commit_info, 'pr_url') else f"https://huggingface.co/datasets/{repo_id}/discussions"
        return True, f"PR created ({len(df)} rows): {pr_url}"
    except Exception as e:
        return False, f"❌ Failed to create PR: {str(e)}"
def validate_schema(parquet_file):
    """
    Validate that a parquet file matches the expected schema.

    Args:
        parquet_file: Path to parquet file to validate

    Returns:
        (bool, str): (is_valid, error_message)
    """
    try:
        df = pd.read_parquet(parquet_file)

        # Required columns
        required_cols = [
            '_leaderboard', '_developer', '_model', '_uuid',
            'schema_version', 'evaluation_id', 'retrieved_timestamp',
            'source_data', 'evaluation_source_name', 'evaluation_source_type',
            'source_organization_name', 'evaluator_relationship',
            'model_name', 'model_id', 'model_developer',
            'evaluation_results'
        ]
        missing = [col for col in required_cols if col not in df.columns]
        if missing:
            return False, f"Missing required columns: {', '.join(missing)}"

        # Check data types (all columns should be strings)
        for col in df.columns:
            if df[col].dtype not in ['object', 'string']:
                return False, f"Column '{col}' has wrong type: {df[col].dtype} (expected string)"

        return True, "Schema validation passed"
    except Exception as e:
        return False, f"Validation error: {str(e)}"
def export_to_json(parquet_file, output_dir):
    """
    Export parquet data back to JSON files.

    Uses the parquet_to_folder function from json_to_parquet.py.

    Args:
        parquet_file: Path to parquet file
        output_dir: Directory to write JSON files to
    """
    from json_to_parquet import parquet_to_folder
    parquet_to_folder(parquet_file, output_dir)
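
# Minimal usage sketch of the validate-then-PR flow; the file name and repo id
# below are hypothetical placeholders, not values from the real project.
if __name__ == "__main__":
    example_file = "my_leaderboard.parquet"    # hypothetical input file
    example_repo = "example-org/eval-results"  # hypothetical dataset repo

    is_valid, message = validate_schema(example_file)
    print(message)
    if is_valid:
        ok, result = create_pr_for_new_leaderboard("my-leaderboard", example_file, example_repo)
        print(result)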