# pazabench space update (commit 0aa9a49, by muchai-mercy)
import json
import os
import re
from datetime import datetime, timezone
from urllib.parse import urlparse
from huggingface_hub import dataset_info
from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError
from src.display.formatting import styled_error, styled_message, styled_warning
from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
def is_dataset_on_hub(dataset_name: str, token: str = None) -> tuple[bool, str | None]:
    """Verify that *dataset_name* exists on the Hugging Face Hub and is public.

    Returns:
        (True, None) when the dataset is publicly readable, otherwise
        (False, reason) where *reason* is a message fragment intended to
        follow the dataset name in an error string.
    """
    try:
        meta = dataset_info(dataset_name, token=token)
        # A dataset that exists but is private is not usable for evaluation.
        if meta.private:
            return False, "is private. Please make the dataset publicly available on Hugging Face Hub."
        return True, None
    except RepositoryNotFoundError:
        # Must precede HfHubHTTPError: it is a subclass of it.
        return False, "was not found on the Hub!"
    except HfHubHTTPError as e:
        return False, f"could not be accessed: {e}"
    except Exception as e:
        return False, f"error checking dataset: {e}"
def is_valid_url(url: str) -> tuple[bool, str | None]:
"""Validate URL using urllib.parse with strict scheme enforcement."""
if not url or not url.strip():
return False, "URL cannot be empty."
url = url.strip()
try:
parsed = urlparse(url)
# Strict scheme validation - only http/https allowed
if parsed.scheme not in ('http', 'https'):
return False, "URL must start with http:// or https://"
# Must have a valid network location (domain)
if not parsed.netloc:
return False, "Invalid URL domain. Please provide a complete URL."
# Extract hostname (remove port if present)
hostname = parsed.hostname
if not hostname or '.' not in hostname:
return False, "Invalid URL domain. Please provide a complete URL."
# Validate hostname format (alphanumeric, dots, hyphens only)
# This blocks javascript:, data:, vbscript: and other injection schemes
hostname_parts = hostname.split('.')
for part in hostname_parts:
if not part or not all(c.isalnum() or c == '-' for c in part):
return False, "Invalid domain name in URL."
if part.startswith('-') or part.endswith('-'):
return False, "Invalid domain name in URL."
return True, None
except Exception:
return False, "Invalid URL format."
def add_language_eval_request(
    location: str,
    dataset_name: str,
    dataset_url: str,
    dataset_config: str,
    dataset_split: str,
    audio_column: str,
    text_column: str,
    license: str,
):
    """Submit a request to evaluate a new language/dataset on all models.

    Args:
        location: "HuggingFace" for a Hub dataset; any other value means an
            external URL submission.
        dataset_name: Hub dataset id ("owner/name"); required when
            location == "HuggingFace".
        dataset_url: External dataset URL; required for non-Hub submissions.
        dataset_config: Dataset config name; defaults to "default" if blank.
        dataset_split: Dataset split; defaults to "test" if blank.
        audio_column: Audio column name; defaults to "audio" if blank.
        text_column: Transcript column name; defaults to "text" if blank.
        license: Dataset license string (required).

    Returns:
        A styled HTML message (error, warning, or success) for the UI.
    """
    # License is mandatory regardless of source.
    if not license or not license.strip():
        return styled_error("Please provide a license for the dataset.")
    if location == "HuggingFace":
        # Validate HuggingFace dataset id and confirm it is public on the Hub.
        if not dataset_name or not dataset_name.strip():
            return styled_error("Please provide a dataset name.")
        dataset_name = dataset_name.strip()
        if "/" not in dataset_name:
            return styled_error("Dataset name must be in the format 'owner/dataset-name' (e.g., 'mozilla-foundation/common_voice_13_0').")
        dataset_on_hub, error = is_dataset_on_hub(dataset_name, token=TOKEN)
        if not dataset_on_hub:
            return styled_error(f'Dataset "{dataset_name}" {error}')
        dataset_identifier = dataset_name
        dataset_source = "huggingface"
    else:
        # Validate external URL (scheme, domain shape).
        if not dataset_url or not dataset_url.strip():
            return styled_error("Please provide a dataset URL.")
        valid_url, error = is_valid_url(dataset_url)
        if not valid_url:
            return styled_error(error)
        dataset_url = dataset_url.strip()
        dataset_identifier = dataset_url
        dataset_source = "external"
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    # Fall back to sensible defaults for blank optional fields.
    config = dataset_config.strip() if dataset_config and dataset_config.strip() else "default"
    split = dataset_split.strip() if dataset_split and dataset_split.strip() else "test"
    audio_col = audio_column.strip() if audio_column and audio_column.strip() else "audio"
    text_col = text_column.strip() if text_column and text_column.strip() else "text"
    # Build a filesystem-safe identifier for the request filename.
    if dataset_source == "huggingface":
        safe_dataset = dataset_identifier.replace("/", "_").replace(" ", "_")
    else:
        # For external URLs, squash everything non-alphanumeric and cap length.
        safe_dataset = re.sub(r'[^a-zA-Z0-9]', '_', dataset_identifier)[:100]
    safe_config = config.replace(" ", "_").lower()
    OUT_DIR = f"{EVAL_REQUESTS_PATH}/language_requests"
    os.makedirs(OUT_DIR, exist_ok=True)
    # Deduplicate: reject if a request for this dataset+config already exists.
    existing_files = os.listdir(OUT_DIR) if os.path.exists(OUT_DIR) else []
    for existing_file in existing_files:
        if existing_file.startswith(f"lang_eval_{safe_dataset}_{safe_config}_"):
            return styled_warning("A similar evaluation request for this dataset configuration already exists.")
    # Assemble the request payload persisted to the queue.
    eval_entry = {
        "type": "language_evaluation",
        "source": dataset_source,
        "dataset": dataset_identifier,
        "config": config,
        "split": split,
        "audio_column": audio_col,
        "text_column": text_col,
        "license": license.strip() if license else "",
        "status": "PENDING",
        "submitted_time": current_time,
    }
    # Timestamp makes the filename unique; ':' is not filesystem-safe.
    filename = f"lang_eval_{safe_dataset}_{safe_config}_{current_time.replace(':', '-')}.json"
    # FIX: the local path, log line, and path_in_repo previously contained a
    # literal placeholder instead of the generated filename, so every request
    # was written to (and uploaded as) the same bogus path.
    print(f"Creating language eval request: {filename}")
    out_path = f"{OUT_DIR}/{filename}"
    with open(out_path, "w") as f:
        f.write(json.dumps(eval_entry, indent=2))
    # Upload to the queue repo if the Hub API client is configured.
    if API:
        try:
            print("Uploading language eval request")
            API.upload_file(
                path_or_fileobj=out_path,
                path_in_repo=f"language_requests/{filename}",
                repo_id=QUEUE_REPO,
                repo_type="dataset",
                commit_message=f"Add language evaluation request for {dataset_identifier} ({config})",
            )
            os.remove(out_path)
        except Exception as e:
            print(f"Could not upload to Hub: {e}")
            # Keep the local file so the request is not lost if upload fails.
    source_label = "Hugging Face" if dataset_source == "huggingface" else "External URL"
    return styled_message(
        f"✅ Your language evaluation request has been submitted!\n\n"
        f"**Source:** {source_label}\n"
        f"**Dataset:** {dataset_identifier}\n"
        f"**Config:** {config}\n\n"
        f"We will review your request and run evaluations on all supported models."
    )