# every_eval_ever_space / hf_operations.py
# deepmage121 β€” moving to EEE hf org (commit a92080e)
"""
HuggingFace Operations: Upload data, create PRs, validate schemas.
"""
from huggingface_hub import HfApi, login
import pandas as pd
import json
from pathlib import Path
from jsonschema import validate, ValidationError, Draft7Validator
# Load the evaluation JSON schema once at import time so every call to
# validate_json_against_schema() reuses the same parsed dict.
SCHEMA_PATH = Path(__file__).parent / "eval.schema.json"
# Explicit UTF-8: JSON is UTF-8 by spec; the bare open(..., 'r') decoded with
# the locale's default encoding, which can fail on non-UTF-8 systems.
EVAL_SCHEMA = json.loads(SCHEMA_PATH.read_text(encoding="utf-8"))
def validate_json_against_schema(json_data):
    """
    Validate a JSON object against eval.schema.json.

    Args:
        json_data: Dict containing the evaluation data

    Returns:
        (bool, str): (is_valid, error_message)
    """
    try:
        validate(instance=json_data, schema=EVAL_SCHEMA)
    except ValidationError as e:
        # Point the user at the failing location inside the document.
        error_path = " β†’ ".join(str(p) for p in e.path) if e.path else "root"
        return False, f"❌ Schema validation failed at '{error_path}': {e.message}"
    except Exception as e:
        # Anything else (e.g. a malformed schema) is reported generically.
        return False, f"❌ Validation error: {str(e)}"
    return True, "Schema validation passed"
def upload_to_hf_dataset(parquet_file, split_name, repo_id):
    """
    Upload a parquet file as a new split to the HF dataset.

    Args:
        parquet_file: Path to parquet file
        split_name: Name of the split (leaderboard name)
        repo_id: HuggingFace dataset repository ID
    """
    # TODO: Implement upload logic
    # NOTE(review): currently a stub β€” returns None and performs no upload.
    # PR creation goes through create_pr_for_new_leaderboard() instead.
    pass
def check_hf_authentication():
    """
    Check if user is authenticated with HuggingFace.

    Returns:
        (bool, str): (is_authenticated, username or error_message)
    """
    try:
        api = HfApi()
        user_info = api.whoami()  # raises if no valid token is configured
        return True, user_info['name']
    except Exception:
        # `e` was previously bound but never used; any failure (missing token,
        # network error) is treated uniformly as "not logged in".
        return False, "Not authenticated. Run: huggingface-cli login"
def check_duplicate_pr_exists(leaderboard_name, repo_id):
    """
    Check if a PR already exists for this leaderboard.

    Args:
        leaderboard_name: Name of the leaderboard
        repo_id: HuggingFace dataset repository ID

    Returns:
        (bool, str or None): (exists, pr_url if exists)
    """
    try:
        # Match is done case-insensitively on the PR title.
        pr_title_pattern = f"add new leaderboard: {leaderboard_name.lower()}"
        for disc in HfApi().get_repo_discussions(repo_id=repo_id, repo_type="dataset"):
            # Only open pull requests count; closed/merged ones don't block.
            if not (disc.is_pull_request and disc.status == "open"):
                continue
            if pr_title_pattern in disc.title.lower():
                return True, f"https://huggingface.co/datasets/{repo_id}/discussions/{disc.num}"
        return False, None
    except Exception as e:
        # If we can't check, assume no duplicate (fail open)
        print(f"Warning: Could not check for duplicate PRs: {e}")
        return False, None
def create_pr_for_new_leaderboard(leaderboard_name, parquet_file, repo_id):
    """
    Create a pull request to add a new leaderboard split.

    Args:
        leaderboard_name: Name of the new leaderboard
        parquet_file: Path to parquet file
        repo_id: HuggingFace dataset repository ID

    Returns:
        (success, pr_url or error_message)
    """
    # 1. Check authentication
    is_auth, auth_result = check_hf_authentication()
    if not is_auth:
        return False, f"❌ {auth_result}"

    # 2. Check for duplicate PR
    has_duplicate, duplicate_url = check_duplicate_pr_exists(leaderboard_name, repo_id)
    if has_duplicate:
        return False, f"⚠️ PR already exists: {duplicate_url}"

    # 3. Validate parquet file exists and has data
    parquet_path = Path(parquet_file)
    if not parquet_path.exists():
        return False, "❌ Parquet file not found"
    try:
        df = pd.read_parquet(parquet_file)
    except Exception as e:
        # A corrupt/unreadable file previously raised out of this function,
        # breaking its (success, message) tuple contract; report it instead.
        return False, f"❌ Failed to create PR: {str(e)}"
    if len(df) == 0:
        return False, "❌ Parquet file is empty"

    # 4. Create PR
    try:
        api = HfApi()
        commit_message = f"Add new leaderboard: {leaderboard_name}"
        # upload_file with create_pr=True pushes the file on a fresh branch and
        # opens a pull request against the dataset repo in a single call.
        commit_info = api.upload_file(
            path_or_fileobj=parquet_file,
            path_in_repo=f"data/{leaderboard_name}.parquet",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message=commit_message,
            create_pr=True,
        )
        # Some hub versions may not expose pr_url on the returned CommitInfo;
        # fall back to the repo's discussions page.
        pr_url = commit_info.pr_url if hasattr(commit_info, 'pr_url') else f"https://huggingface.co/datasets/{repo_id}/discussions"
        return True, f"PR created ({len(df)} rows): {pr_url}"
    except Exception as e:
        return False, f"❌ Failed to create PR: {str(e)}"
def validate_schema(parquet_file):
    """
    Validate that a parquet file matches the expected schema.

    Args:
        parquet_file: Path to parquet file to validate

    Returns:
        (bool, str): (is_valid, error_message)
    """
    # Every column the evaluation pipeline expects to find in the parquet.
    required_cols = [
        '_leaderboard', '_developer', '_model', '_uuid',
        'schema_version', 'evaluation_id', 'retrieved_timestamp',
        'source_data', 'evaluation_source_name', 'evaluation_source_type',
        'source_organization_name', 'evaluator_relationship',
        'model_name', 'model_id', 'model_developer',
        'evaluation_results'
    ]
    try:
        df = pd.read_parquet(parquet_file)

        missing = [c for c in required_cols if c not in df.columns]
        if missing:
            return False, f"Missing required columns: {', '.join(missing)}"

        # Check data types (all should be strings)
        for name in df.columns:
            dtype = df[name].dtype
            if dtype not in ['object', 'string']:
                return False, f"Column '{name}' has wrong type: {dtype} (expected string)"

        return True, "Schema validation passed"
    except Exception as e:
        return False, f"Validation error: {str(e)}"
def export_to_json(parquet_file, output_dir):
    """
    Export parquet data back to JSON files.

    Uses the parquet_to_folder function from json_to_parquet.py

    Args:
        parquet_file: Path to parquet file
        output_dir: Directory to write JSON files to
    """
    # Local import: keeps json_to_parquet (and whatever it pulls in) out of
    # module load β€” presumably to avoid a circular import; TODO confirm.
    from json_to_parquet import parquet_to_folder
    parquet_to_folder(parquet_file, output_dir)