"""
HuggingFace Operations: Upload data, create PRs, validate schemas.
"""
from huggingface_hub import HfApi
import pandas as pd
import json
from pathlib import Path
from jsonschema import validate, ValidationError

# Load schema once at module level
SCHEMA_PATH = Path(__file__).parent / "eval.schema.json"
with open(SCHEMA_PATH, 'r') as f:
    EVAL_SCHEMA = json.load(f)


def validate_json_against_schema(json_data):
    """
    Validate a JSON object against eval.schema.json.

    Args:
        json_data: Dict containing the evaluation data

    Returns:
        (bool, str): (is_valid, error_message)
    """
    try:
        validate(instance=json_data, schema=EVAL_SCHEMA)
        return True, "Schema validation passed"
    except ValidationError as e:
        # Extract the most relevant error message
        error_path = " → ".join(str(p) for p in e.path) if e.path else "root"
        return False, f"❌ Schema validation failed at '{error_path}': {e.message}"
    except Exception as e:
        return False, f"❌ Validation error: {str(e)}"


def upload_to_hf_dataset(parquet_file, split_name, repo_id):
    """
    Upload a parquet file as a new split to the HF dataset.

    Args:
        parquet_file: Path to parquet file
        split_name: Name of the split (leaderboard name)
        repo_id: HuggingFace dataset repository ID
    """
    # Minimal sketch: mirrors the direct-upload pattern used in
    # create_pr_for_new_leaderboard below, but commits to the repo directly.
    api = HfApi()
    api.upload_file(
        path_or_fileobj=str(parquet_file),
        path_in_repo=f"data/{split_name}.parquet",
        repo_id=repo_id,
        repo_type="dataset",
        commit_message=f"Add new leaderboard: {split_name}",
    )


def check_hf_authentication():
    """
    Check if user is authenticated with HuggingFace.

    Returns:
        (bool, str): (is_authenticated, username or error_message)
    """
    try:
        api = HfApi()
        user_info = api.whoami()
        return True, user_info['name']
    except Exception:
        return False, "Not authenticated. Run: huggingface-cli login"


def check_duplicate_pr_exists(leaderboard_name, repo_id):
    """
    Check if a PR already exists for this leaderboard.

    Args:
        leaderboard_name: Name of the leaderboard
        repo_id: HuggingFace dataset repository ID

    Returns:
        (bool, str or None): (exists, pr_url if exists)
    """
    try:
        api = HfApi()
        discussions = api.get_repo_discussions(repo_id=repo_id, repo_type="dataset")
        # Check for open PRs with a matching title (case-insensitive), e.g. an open PR
        # titled "Add new leaderboard: MMLU" is a duplicate for leaderboard_name "MMLU"
        pr_title_pattern = f"add new leaderboard: {leaderboard_name.lower()}"
        for discussion in discussions:
            if discussion.is_pull_request and discussion.status == "open":
                if pr_title_pattern in discussion.title.lower():
                    pr_url = f"https://huggingface.co/datasets/{repo_id}/discussions/{discussion.num}"
                    return True, pr_url
        return False, None
    except Exception as e:
        # If we can't check, assume no duplicate (fail open)
        print(f"Warning: Could not check for duplicate PRs: {e}")
        return False, None


def create_pr_for_new_leaderboard(leaderboard_name, parquet_file, repo_id):
    """
    Create a pull request to add a new leaderboard split.

    Args:
        leaderboard_name: Name of the new leaderboard
        parquet_file: Path to parquet file
        repo_id: HuggingFace dataset repository ID

    Returns:
        (success, pr_url or error_message)
    """
    # 1. Check authentication
    is_auth, auth_result = check_hf_authentication()
    if not is_auth:
        return False, f"❌ {auth_result}"

    # 2. Check for duplicate PR
    has_duplicate, duplicate_url = check_duplicate_pr_exists(leaderboard_name, repo_id)
    if has_duplicate:
        return False, f"⚠️ PR already exists: {duplicate_url}"

    # 3. Validate that the parquet file exists and has data
    parquet_path = Path(parquet_file)
    if not parquet_path.exists():
        return False, "❌ Parquet file not found"
    df = pd.read_parquet(parquet_file)
    if len(df) == 0:
        return False, "❌ Parquet file is empty"

    # 4. Create PR
    try:
        api = HfApi()
        commit_message = f"Add new leaderboard: {leaderboard_name}"
        # Upload the parquet file and open a PR in one call
        commit_info = api.upload_file(
            path_or_fileobj=parquet_file,
            path_in_repo=f"data/{leaderboard_name}.parquet",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message=commit_message,
            create_pr=True,
        )
        # Extract PR URL from commit info
        pr_url = commit_info.pr_url if hasattr(commit_info, 'pr_url') else f"https://huggingface.co/datasets/{repo_id}/discussions"
        return True, f"PR created ({len(df)} rows): {pr_url}"
    except Exception as e:
        return False, f"❌ Failed to create PR: {str(e)}"


def validate_schema(parquet_file):
    """
    Validate that a parquet file matches the expected schema.

    Args:
        parquet_file: Path to parquet file to validate

    Returns:
        (bool, str): (is_valid, error_message)
    """
    try:
        df = pd.read_parquet(parquet_file)

        # Required columns
        required_cols = [
            '_leaderboard', '_developer', '_model', '_uuid',
            'schema_version', 'evaluation_id', 'retrieved_timestamp',
            'source_data', 'evaluation_source_name', 'evaluation_source_type',
            'source_organization_name', 'evaluator_relationship',
            'model_name', 'model_id', 'model_developer',
            'evaluation_results'
        ]
        missing = [col for col in required_cols if col not in df.columns]
        if missing:
            return False, f"Missing required columns: {', '.join(missing)}"

        # Check data types (all columns should be strings)
        for col in df.columns:
            if df[col].dtype not in ['object', 'string']:
                return False, f"Column '{col}' has wrong type: {df[col].dtype} (expected string)"

        return True, "Schema validation passed"
    except Exception as e:
        return False, f"Validation error: {str(e)}"


def export_to_json(parquet_file, output_dir):
    """
    Export parquet data back to JSON files.

    Uses the parquet_to_folder function from json_to_parquet.py.

    Args:
        parquet_file: Path to parquet file
        output_dir: Directory to write JSON files to
    """
    from json_to_parquet import parquet_to_folder
    parquet_to_folder(parquet_file, output_dir)
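

if __name__ == "__main__":
    # Smoke-test sketch. The file path, leaderboard name, and repo ID below are
    # hypothetical placeholders -- adjust them before running.
    sample_file = "data/example_leaderboard.parquet"  # hypothetical path
    ok, msg = validate_schema(sample_file)
    print(msg)
    if ok:
        success, result = create_pr_for_new_leaderboard(
            "example_leaderboard",  # hypothetical leaderboard name
            sample_file,
            "org/eval-results",     # hypothetical dataset repo ID
        )
        print(result)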