import json
import os
import re
from datetime import datetime, timezone
from urllib.parse import urlparse

from huggingface_hub import dataset_info
from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError

from src.display.formatting import styled_error, styled_message, styled_warning
from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN


def is_dataset_on_hub(dataset_name: str, token: str | None = None) -> tuple[bool, str | None]:
    """Check if a dataset exists on the Hugging Face Hub and is publicly available.

    Returns:
        (True, None) when the dataset is public, otherwise (False, reason)
        where ``reason`` is a sentence fragment intended to be appended after
        the dataset name in a user-facing error message.
    """
    try:
        info = dataset_info(dataset_name, token=token)
        if info.private:
            return False, "is private. Please make the dataset publicly available on Hugging Face Hub."
        return True, None
    except RepositoryNotFoundError:
        return False, "was not found on the Hub!"
    except HfHubHTTPError as e:
        return False, f"could not be accessed: {str(e)}"
    except Exception as e:
        # Catch-all so a Hub/network hiccup surfaces as a message instead of crashing the UI.
        return False, f"error checking dataset: {str(e)}"


def is_valid_url(url: str) -> tuple[bool, str | None]:
    """Validate URL using urllib.parse with strict scheme enforcement.

    Returns:
        (True, None) for a valid http(s) URL, otherwise (False, error_message).
    """
    if not url or not url.strip():
        return False, "URL cannot be empty."
    url = url.strip()
    try:
        parsed = urlparse(url)
        # Strict scheme validation - only http/https allowed
        if parsed.scheme not in ('http', 'https'):
            return False, "URL must start with http:// or https://"
        # Must have a valid network location (domain)
        if not parsed.netloc:
            return False, "Invalid URL domain. Please provide a complete URL."
        # Extract hostname (remove port if present)
        hostname = parsed.hostname
        if not hostname or '.' not in hostname:
            return False, "Invalid URL domain. Please provide a complete URL."
        # Validate hostname format (alphanumeric, dots, hyphens only)
        # This blocks javascript:, data:, vbscript: and other injection schemes
        hostname_parts = hostname.split('.')
        for part in hostname_parts:
            if not part or not all(c.isalnum() or c == '-' for c in part):
                return False, "Invalid domain name in URL."
            # RFC 952/1123: labels must not begin or end with a hyphen
            if part.startswith('-') or part.endswith('-'):
                return False, "Invalid domain name in URL."
        return True, None
    except Exception:
        return False, "Invalid URL format."


def add_language_eval_request(
    location: str,
    dataset_name: str,
    dataset_url: str,
    dataset_config: str,
    dataset_split: str,
    audio_column: str,
    text_column: str,
    license: str,  # NOTE: shadows the builtin, but the name is part of the public keyword interface
):
    """Submit a request to evaluate a new language/dataset on all models.

    Args:
        location: "HuggingFace" for a Hub dataset, anything else for an external URL.
        dataset_name: Hub id in "owner/dataset-name" form (used when location == "HuggingFace").
        dataset_url: External dataset URL (used otherwise).
        dataset_config: Optional config name; defaults to "default".
        dataset_split: Optional split name; defaults to "test".
        audio_column: Optional audio column name; defaults to "audio".
        text_column: Optional text column name; defaults to "text".
        license: Required dataset license string.

    Returns:
        A styled HTML message (error / warning / success) for display in the UI.
    """
    # Validate required fields based on location
    if not license or not license.strip():
        return styled_error("Please provide a license for the dataset.")

    if location == "HuggingFace":
        # Validate HuggingFace dataset
        if not dataset_name or not dataset_name.strip():
            return styled_error("Please provide a dataset name.")
        dataset_name = dataset_name.strip()
        if "/" not in dataset_name:
            return styled_error("Dataset name must be in the format 'owner/dataset-name' (e.g., 'mozilla-foundation/common_voice_13_0').")
        # Check if dataset exists on Hub
        dataset_on_hub, error = is_dataset_on_hub(dataset_name, token=TOKEN)
        if not dataset_on_hub:
            return styled_error(f'Dataset "{dataset_name}" {error}')
        dataset_identifier = dataset_name
        dataset_source = "huggingface"
    else:
        # Validate external URL
        if not dataset_url or not dataset_url.strip():
            return styled_error("Please provide a dataset URL.")
        valid_url, error = is_valid_url(dataset_url)
        if not valid_url:
            return styled_error(error)
        dataset_url = dataset_url.strip()
        dataset_identifier = dataset_url
        dataset_source = "external"

    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Use defaults for optional fields
    config = dataset_config.strip() if dataset_config and dataset_config.strip() else "default"
    split = dataset_split.strip() if dataset_split and dataset_split.strip() else "test"
    audio_col = audio_column.strip() if audio_column and audio_column.strip() else "audio"
    text_col = text_column.strip() if text_column and text_column.strip() else "text"

    # Create safe identifier for filename
    if dataset_source == "huggingface":
        safe_dataset = dataset_identifier.replace("/", "_").replace(" ", "_")
    else:
        # For external URLs, create a safe identifier from the URL
        safe_dataset = re.sub(r'[^a-zA-Z0-9]', '_', dataset_identifier)[:100]
    safe_config = config.replace(" ", "_").lower()

    OUT_DIR = f"{EVAL_REQUESTS_PATH}/language_requests"
    os.makedirs(OUT_DIR, exist_ok=True)

    # Check if similar request already exists (makedirs above guarantees the dir exists)
    for existing_file in os.listdir(OUT_DIR):
        if existing_file.startswith(f"lang_eval_{safe_dataset}_{safe_config}_"):
            return styled_warning("A similar evaluation request for this dataset configuration already exists.")

    # Create language evaluation request entry
    eval_entry = {
        "type": "language_evaluation",
        "source": dataset_source,
        "dataset": dataset_identifier,
        "config": config,
        "split": split,
        "audio_column": audio_col,
        "text_column": text_col,
        "license": license.strip() if license else "",
        "status": "PENDING",
        "submitted_time": current_time,
    }

    # Create unique filename (timestamp makes it unique; ':' is invalid in some filesystems)
    filename = f"lang_eval_{safe_dataset}_{safe_config}_{current_time.replace(':', '-')}.json"

    # BUG FIX: the original contained a literal "(unknown)" placeholder in the three
    # f-strings below, so every request was written to (and uploaded as) the same
    # file name and the computed `filename` was never used.
    print(f"Creating language eval request: {filename}")
    out_path = f"{OUT_DIR}/{filename}"
    with open(out_path, "w") as f:
        f.write(json.dumps(eval_entry, indent=2))

    # Upload to Hub if API is available
    if API:
        try:
            print("Uploading language eval request")
            API.upload_file(
                path_or_fileobj=out_path,
                path_in_repo=f"language_requests/{filename}",
                repo_id=QUEUE_REPO,
                repo_type="dataset",
                commit_message=f"Add language evaluation request for {dataset_identifier} ({config})",
            )
            os.remove(out_path)
        except Exception as e:
            print(f"Could not upload to Hub: {e}")
            # Keep local file if upload fails

    source_label = "Hugging Face" if dataset_source == "huggingface" else "External URL"
    return styled_message(
        f"✅ Your language evaluation request has been submitted!\n\n"
        f"**Source:** {source_label}\n"
        f"**Dataset:** {dataset_identifier}\n"
        f"**Config:** {config}\n\n"
        f"We will review your request and run evaluations on all supported models."
    )