# pazabench space update (commit 0aa9a49, by muchai-mercy)
import json
import os
import re
from datetime import datetime, timezone
from urllib.parse import urlparse
from huggingface_hub import dataset_info
from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError
from src.display.formatting import styled_error, styled_message, styled_warning
from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
def is_dataset_on_hub(dataset_name: str, token: str = None) -> tuple[bool, str | None]:
    """Verify that *dataset_name* exists on the Hugging Face Hub and is public.

    Returns:
        (True, None) when the dataset is publicly readable, otherwise
        (False, reason) where *reason* is a message fragment intended to
        follow the dataset name in an error string.
    """
    try:
        meta = dataset_info(dataset_name, token=token)
        # A dataset that exists but is private is not usable for evaluation.
        if meta.private:
            return False, "is private. Please make the dataset publicly available on Hugging Face Hub."
        return True, None
    except RepositoryNotFoundError:
        # Must precede HfHubHTTPError: it is a subclass of it.
        return False, "was not found on the Hub!"
    except HfHubHTTPError as e:
        return False, f"could not be accessed: {e}"
    except Exception as e:
        return False, f"error checking dataset: {e}"
def is_valid_url(url: str) -> tuple[bool, str | None]:
"""Validate URL using urllib.parse with strict scheme enforcement."""
if not url or not url.strip():
return False, "URL cannot be empty."
url = url.strip()
try:
parsed = urlparse(url)
# Strict scheme validation - only http/https allowed
if parsed.scheme not in ('http', 'https'):
return False, "URL must start with http:// or https://"
# Must have a valid network location (domain)
if not parsed.netloc:
return False, "Invalid URL domain. Please provide a complete URL."
# Extract hostname (remove port if present)
hostname = parsed.hostname
if not hostname or '.' not in hostname:
return False, "Invalid URL domain. Please provide a complete URL."
# Validate hostname format (alphanumeric, dots, hyphens only)
# This blocks javascript:, data:, vbscript: and other injection schemes
hostname_parts = hostname.split('.')
for part in hostname_parts:
if not part or not all(c.isalnum() or c == '-' for c in part):
return False, "Invalid domain name in URL."
if part.startswith('-') or part.endswith('-'):
return False, "Invalid domain name in URL."
return True, None
except Exception:
return False, "Invalid URL format."
def add_language_eval_request(
    location: str,
    dataset_name: str,
    dataset_url: str,
    dataset_config: str,
    dataset_split: str,
    audio_column: str,
    text_column: str,
    license: str,
):
    """Submit a request to evaluate a new language/dataset on all models.

    Args:
        location: "HuggingFace" for a Hub dataset; any other value means an
            external URL submission.
        dataset_name: Hub dataset id ("owner/name"); required when
            location == "HuggingFace".
        dataset_url: External dataset URL; required for non-Hub submissions.
        dataset_config: Dataset config name; defaults to "default" if blank.
        dataset_split: Dataset split; defaults to "test" if blank.
        audio_column: Audio column name; defaults to "audio" if blank.
        text_column: Transcript column name; defaults to "text" if blank.
        license: Dataset license string (required).

    Returns:
        A styled HTML message (error, warning, or success) for the UI.
    """
    # License is mandatory regardless of source.
    if not license or not license.strip():
        return styled_error("Please provide a license for the dataset.")
    if location == "HuggingFace":
        # Validate HuggingFace dataset id and confirm it is public on the Hub.
        if not dataset_name or not dataset_name.strip():
            return styled_error("Please provide a dataset name.")
        dataset_name = dataset_name.strip()
        if "/" not in dataset_name:
            return styled_error("Dataset name must be in the format 'owner/dataset-name' (e.g., 'mozilla-foundation/common_voice_13_0').")
        dataset_on_hub, error = is_dataset_on_hub(dataset_name, token=TOKEN)
        if not dataset_on_hub:
            return styled_error(f'Dataset "{dataset_name}" {error}')
        dataset_identifier = dataset_name
        dataset_source = "huggingface"
    else:
        # Validate external URL (scheme, domain shape).
        if not dataset_url or not dataset_url.strip():
            return styled_error("Please provide a dataset URL.")
        valid_url, error = is_valid_url(dataset_url)
        if not valid_url:
            return styled_error(error)
        dataset_url = dataset_url.strip()
        dataset_identifier = dataset_url
        dataset_source = "external"
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    # Fall back to sensible defaults for blank optional fields.
    config = dataset_config.strip() if dataset_config and dataset_config.strip() else "default"
    split = dataset_split.strip() if dataset_split and dataset_split.strip() else "test"
    audio_col = audio_column.strip() if audio_column and audio_column.strip() else "audio"
    text_col = text_column.strip() if text_column and text_column.strip() else "text"
    # Build a filesystem-safe identifier for the request filename.
    if dataset_source == "huggingface":
        safe_dataset = dataset_identifier.replace("/", "_").replace(" ", "_")
    else:
        # For external URLs, squash everything non-alphanumeric and cap length.
        safe_dataset = re.sub(r'[^a-zA-Z0-9]', '_', dataset_identifier)[:100]
    safe_config = config.replace(" ", "_").lower()
    OUT_DIR = f"{EVAL_REQUESTS_PATH}/language_requests"
    os.makedirs(OUT_DIR, exist_ok=True)
    # Deduplicate: reject if a request for this dataset+config already exists.
    existing_files = os.listdir(OUT_DIR) if os.path.exists(OUT_DIR) else []
    for existing_file in existing_files:
        if existing_file.startswith(f"lang_eval_{safe_dataset}_{safe_config}_"):
            return styled_warning("A similar evaluation request for this dataset configuration already exists.")
    # Assemble the request payload persisted to the queue.
    eval_entry = {
        "type": "language_evaluation",
        "source": dataset_source,
        "dataset": dataset_identifier,
        "config": config,
        "split": split,
        "audio_column": audio_col,
        "text_column": text_col,
        "license": license.strip() if license else "",
        "status": "PENDING",
        "submitted_time": current_time,
    }
    # Timestamp makes the filename unique; ':' is not filesystem-safe.
    filename = f"lang_eval_{safe_dataset}_{safe_config}_{current_time.replace(':', '-')}.json"
    # FIX: the local path, log line, and path_in_repo previously contained a
    # literal placeholder instead of the generated filename, so every request
    # was written to (and uploaded as) the same bogus path.
    print(f"Creating language eval request: {filename}")
    out_path = f"{OUT_DIR}/{filename}"
    with open(out_path, "w") as f:
        f.write(json.dumps(eval_entry, indent=2))
    # Upload to the queue repo if the Hub API client is configured.
    if API:
        try:
            print("Uploading language eval request")
            API.upload_file(
                path_or_fileobj=out_path,
                path_in_repo=f"language_requests/{filename}",
                repo_id=QUEUE_REPO,
                repo_type="dataset",
                commit_message=f"Add language evaluation request for {dataset_identifier} ({config})",
            )
            os.remove(out_path)
        except Exception as e:
            print(f"Could not upload to Hub: {e}")
            # Keep the local file so the request is not lost if upload fails.
    source_label = "Hugging Face" if dataset_source == "huggingface" else "External URL"
    return styled_message(
        f"✅ Your language evaluation request has been submitted!\n\n"
        f"**Source:** {source_label}\n"
        f"**Dataset:** {dataset_identifier}\n"
        f"**Config:** {config}\n\n"
        f"We will review your request and run evaluations on all supported models."
    )