"""
HuggingFace Operations: Upload data, create PRs, validate schemas.
"""
from huggingface_hub import HfApi
import pandas as pd
import json
from pathlib import Path
from jsonschema import validate, ValidationError

# Load schema once at module level
SCHEMA_PATH = Path(__file__).parent / "eval.schema.json"
with open(SCHEMA_PATH, 'r') as f:
    EVAL_SCHEMA = json.load(f)


def validate_json_against_schema(json_data):
    """
    Validate a JSON object against eval.schema.json.

    Args:
        json_data: Dict containing the evaluation data

    Returns:
        (bool, str): (is_valid, error_message)
    """
    try:
        validate(instance=json_data, schema=EVAL_SCHEMA)
        return True, "Schema validation passed"
    except ValidationError as e:
        # Extract the most relevant error message
        error_path = " → ".join(str(p) for p in e.path) if e.path else "root"
        return False, f"❌ Schema validation failed at '{error_path}': {e.message}"
    except Exception as e:
        return False, f"❌ Validation error: {str(e)}"


def upload_to_hf_dataset(parquet_file, split_name, repo_id):
    """
    Upload a parquet file as a new split to the HF dataset.

    Args:
        parquet_file: Path to parquet file
        split_name: Name of the split (leaderboard name)
        repo_id: HuggingFace dataset repository ID
    """
    # Minimal sketch: mirrors the direct-upload pattern used in
    # create_pr_for_new_leaderboard below, but commits to the repo directly.
    api = HfApi()
    api.upload_file(
        path_or_fileobj=str(parquet_file),
        path_in_repo=f"data/{split_name}.parquet",
        repo_id=repo_id,
        repo_type="dataset",
        commit_message=f"Add new leaderboard: {split_name}",
    )


def check_hf_authentication():
    """
    Check if user is authenticated with HuggingFace.

    Returns:
        (bool, str): (is_authenticated, username or error_message)
    """
    try:
        api = HfApi()
        user_info = api.whoami()
        return True, user_info['name']
    except Exception:
        return False, "Not authenticated. Run: huggingface-cli login"


def check_duplicate_pr_exists(leaderboard_name, repo_id):
    """
    Check if a PR already exists for this leaderboard.

    Args:
        leaderboard_name: Name of the leaderboard
        repo_id: HuggingFace dataset repository ID

    Returns:
        (bool, str or None): (exists, pr_url if exists)
    """
    try:
        api = HfApi()
        discussions = api.get_repo_discussions(repo_id=repo_id, repo_type="dataset")
        # Check for open PRs with a matching title (case-insensitive), e.g. an open PR
        # titled "Add new leaderboard: MMLU" is a duplicate for leaderboard_name "MMLU"
        pr_title_pattern = f"add new leaderboard: {leaderboard_name.lower()}"
        for discussion in discussions:
            if discussion.is_pull_request and discussion.status == "open":
                if pr_title_pattern in discussion.title.lower():
                    pr_url = f"https://huggingface.co/datasets/{repo_id}/discussions/{discussion.num}"
                    return True, pr_url
        return False, None
    except Exception as e:
        # If we can't check, assume no duplicate (fail open)
        print(f"Warning: Could not check for duplicate PRs: {e}")
        return False, None


def create_pr_for_new_leaderboard(leaderboard_name, parquet_file, repo_id):
    """
    Create a pull request to add a new leaderboard split.

    Args:
        leaderboard_name: Name of the new leaderboard
        parquet_file: Path to parquet file
        repo_id: HuggingFace dataset repository ID

    Returns:
        (success, pr_url or error_message)
    """
    # 1. Check authentication
    is_auth, auth_result = check_hf_authentication()
    if not is_auth:
        return False, f"❌ {auth_result}"

    # 2. Check for duplicate PR
    has_duplicate, duplicate_url = check_duplicate_pr_exists(leaderboard_name, repo_id)
    if has_duplicate:
        return False, f"⚠️ PR already exists: {duplicate_url}"

    # 3. Validate that the parquet file exists and has data
    parquet_path = Path(parquet_file)
    if not parquet_path.exists():
        return False, "❌ Parquet file not found"
    df = pd.read_parquet(parquet_file)
    if len(df) == 0:
        return False, "❌ Parquet file is empty"

    # 4. Create PR
    try:
        api = HfApi()
        commit_message = f"Add new leaderboard: {leaderboard_name}"
        # Upload the parquet file and open a PR in one call
        commit_info = api.upload_file(
            path_or_fileobj=parquet_file,
            path_in_repo=f"data/{leaderboard_name}.parquet",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message=commit_message,
            create_pr=True,
        )
        # Extract PR URL from commit info
        pr_url = commit_info.pr_url if hasattr(commit_info, 'pr_url') else f"https://huggingface.co/datasets/{repo_id}/discussions"
        return True, f"PR created ({len(df)} rows): {pr_url}"
    except Exception as e:
        return False, f"❌ Failed to create PR: {str(e)}"


def validate_schema(parquet_file):
    """
    Validate that a parquet file matches the expected schema.

    Args:
        parquet_file: Path to parquet file to validate

    Returns:
        (bool, str): (is_valid, error_message)
    """
    try:
        df = pd.read_parquet(parquet_file)

        # Required columns
        required_cols = [
            '_leaderboard', '_developer', '_model', '_uuid',
            'schema_version', 'evaluation_id', 'retrieved_timestamp',
            'source_data', 'evaluation_source_name', 'evaluation_source_type',
            'source_organization_name', 'evaluator_relationship',
            'model_name', 'model_id', 'model_developer',
            'evaluation_results'
        ]
        missing = [col for col in required_cols if col not in df.columns]
        if missing:
            return False, f"Missing required columns: {', '.join(missing)}"

        # Check data types (all columns should be strings)
        for col in df.columns:
            if df[col].dtype not in ['object', 'string']:
                return False, f"Column '{col}' has wrong type: {df[col].dtype} (expected string)"

        return True, "Schema validation passed"
    except Exception as e:
        return False, f"Validation error: {str(e)}"


def export_to_json(parquet_file, output_dir):
    """
    Export parquet data back to JSON files.

    Uses the parquet_to_folder function from json_to_parquet.py.

    Args:
        parquet_file: Path to parquet file
        output_dir: Directory to write JSON files to
    """
    from json_to_parquet import parquet_to_folder
    parquet_to_folder(parquet_file, output_dir)
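

if __name__ == "__main__":
    # Smoke-test sketch. The file path, leaderboard name, and repo ID below are
    # hypothetical placeholders -- adjust them before running.
    sample_file = "data/example_leaderboard.parquet"  # hypothetical path
    ok, msg = validate_schema(sample_file)
    print(msg)
    if ok:
        success, result = create_pr_for_new_leaderboard(
            "example_leaderboard",  # hypothetical leaderboard name
            sample_file,
            "org/eval-results",     # hypothetical dataset repo ID
        )
        print(result)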