| """ | |
| HuggingFace Operations: Upload data, create PRs, validate schemas. | |
| """ | |
| from huggingface_hub import HfApi, login | |
| import pandas as pd | |
| import json | |
| from pathlib import Path | |
| from jsonschema import validate, ValidationError, Draft7Validator | |
| # Load schema once at module level | |
| SCHEMA_PATH = Path(__file__).parent / "eval.schema.json" | |
| with open(SCHEMA_PATH, 'r') as f: | |
| EVAL_SCHEMA = json.load(f) | |
def validate_json_against_schema(json_data):
    """
    Validate a JSON object against eval.schema.json.

    Args:
        json_data: Dict containing the evaluation data

    Returns:
        (bool, str): (is_valid, error_message)
    """
    try:
        validate(instance=json_data, schema=EVAL_SCHEMA)
        return True, "Schema validation passed"
    except ValidationError as e:
        # Extract the most relevant error message, including the path to the failing field
        error_path = " → ".join(str(p) for p in e.path) if e.path else "root"
        return False, f"❌ Schema validation failed at '{error_path}': {e.message}"
    except Exception as e:
        return False, f"❌ Validation error: {str(e)}"
def upload_to_hf_dataset(parquet_file, split_name, repo_id):
    """
    Upload a parquet file as a new split to the HF dataset.

    Args:
        parquet_file: Path to parquet file
        split_name: Name of the split (leaderboard name)
        repo_id: HuggingFace dataset repository ID
    """
    # Minimal sketch of the upload logic, mirroring create_pr_for_new_leaderboard
    # below; assumes the dataset stores one parquet file per split under data/.
    api = HfApi()
    api.upload_file(
        path_or_fileobj=str(parquet_file),
        path_in_repo=f"data/{split_name}.parquet",
        repo_id=repo_id,
        repo_type="dataset",
        commit_message=f"Add split: {split_name}",
    )
def check_hf_authentication():
    """
    Check if the user is authenticated with HuggingFace.

    Returns:
        (bool, str): (is_authenticated, username or error_message)
    """
    try:
        api = HfApi()
        user_info = api.whoami()
        return True, user_info['name']
    except Exception:
        return False, "Not authenticated. Run: huggingface-cli login"
def check_duplicate_pr_exists(leaderboard_name, repo_id):
    """
    Check if an open PR already exists for this leaderboard.

    Args:
        leaderboard_name: Name of the leaderboard
        repo_id: HuggingFace dataset repository ID

    Returns:
        (bool, str or None): (exists, pr_url if exists)
    """
    try:
        api = HfApi()
        discussions = api.get_repo_discussions(repo_id=repo_id, repo_type="dataset")
        # Check for open PRs with a matching title
        pr_title_pattern = f"add new leaderboard: {leaderboard_name.lower()}"
        for discussion in discussions:
            if discussion.is_pull_request and discussion.status == "open":
                if pr_title_pattern in discussion.title.lower():
                    pr_url = f"https://huggingface.co/datasets/{repo_id}/discussions/{discussion.num}"
                    return True, pr_url
        return False, None
    except Exception as e:
        # If we can't check, assume no duplicate (fail open)
        print(f"Warning: Could not check for duplicate PRs: {e}")
        return False, None
def create_pr_for_new_leaderboard(leaderboard_name, parquet_file, repo_id):
    """
    Create a pull request to add a new leaderboard split.

    Args:
        leaderboard_name: Name of the new leaderboard
        parquet_file: Path to parquet file
        repo_id: HuggingFace dataset repository ID

    Returns:
        (bool, str): (success, pr_url or error_message)
    """
    # 1. Check authentication
    is_auth, auth_result = check_hf_authentication()
    if not is_auth:
        return False, f"❌ {auth_result}"

    # 2. Check for a duplicate PR
    has_duplicate, duplicate_url = check_duplicate_pr_exists(leaderboard_name, repo_id)
    if has_duplicate:
        return False, f"⚠️ PR already exists: {duplicate_url}"

    # 3. Validate that the parquet file exists and has data
    parquet_path = Path(parquet_file)
    if not parquet_path.exists():
        return False, "❌ Parquet file not found"
    try:
        df = pd.read_parquet(parquet_file)
    except Exception as e:
        return False, f"❌ Could not read parquet file: {e}"
    if len(df) == 0:
        return False, "❌ Parquet file is empty"

    # 4. Upload the file and open a PR in a single commit
    try:
        api = HfApi()
        commit_message = f"Add new leaderboard: {leaderboard_name}"
        commit_info = api.upload_file(
            path_or_fileobj=parquet_file,
            path_in_repo=f"data/{leaderboard_name}.parquet",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message=commit_message,
            create_pr=True,
        )
        # Extract the PR URL from the commit info, falling back to the discussions page
        pr_url = commit_info.pr_url if hasattr(commit_info, 'pr_url') else f"https://huggingface.co/datasets/{repo_id}/discussions"
        return True, f"PR created ({len(df)} rows): {pr_url}"
    except Exception as e:
        return False, f"❌ Failed to create PR: {str(e)}"
def validate_schema(parquet_file):
    """
    Validate that a parquet file matches the expected schema.

    Args:
        parquet_file: Path to parquet file to validate

    Returns:
        (bool, str): (is_valid, error_message)
    """
    try:
        df = pd.read_parquet(parquet_file)

        # Required columns
        required_cols = [
            '_leaderboard', '_developer', '_model', '_uuid',
            'schema_version', 'evaluation_id', 'retrieved_timestamp',
            'source_data', 'evaluation_source_name', 'evaluation_source_type',
            'source_organization_name', 'evaluator_relationship',
            'model_name', 'model_id', 'model_developer',
            'evaluation_results'
        ]
        missing = [col for col in required_cols if col not in df.columns]
        if missing:
            return False, f"Missing required columns: {', '.join(missing)}"

        # Check data types (all columns should be strings)
        for col in df.columns:
            if df[col].dtype not in ['object', 'string']:
                return False, f"Column '{col}' has wrong type: {df[col].dtype} (expected string)"

        return True, "Schema validation passed"
    except Exception as e:
        return False, f"Validation error: {str(e)}"
def export_to_json(parquet_file, output_dir):
    """
    Export parquet data back to JSON files.

    Uses the parquet_to_folder function from json_to_parquet.py.

    Args:
        parquet_file: Path to parquet file
        output_dir: Directory to write JSON files to
    """
    from json_to_parquet import parquet_to_folder
    parquet_to_folder(parquet_file, output_dir)
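
# Minimal usage sketch of the validate-then-PR flow; the file name and repo id
# below are hypothetical placeholders, not values from the real project.
if __name__ == "__main__":
    example_file = "my_leaderboard.parquet"    # hypothetical input file
    example_repo = "example-org/eval-results"  # hypothetical dataset repo

    is_valid, message = validate_schema(example_file)
    print(message)
    if is_valid:
        ok, result = create_pr_for_new_leaderboard("my-leaderboard", example_file, example_repo)
        print(result)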