Spaces:
Running
Running
| import json | |
| import os | |
| import re | |
| from datetime import datetime, timezone | |
| from urllib.parse import urlparse | |
| from huggingface_hub import dataset_info | |
| from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError | |
| from src.display.formatting import styled_error, styled_message, styled_warning | |
| from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO | |
def is_dataset_on_hub(dataset_name: str, token: str = None) -> tuple[bool, str | None]:
    """Report whether ``dataset_name`` resolves to a public dataset on the Hub.

    Args:
        dataset_name: Repository id passed straight to ``dataset_info``.
        token: Optional access token forwarded to ``dataset_info``.

    Returns:
        ``(True, None)`` when the dataset is found and is not private.
        Otherwise ``(False, reason)``, where ``reason`` is a sentence fragment
        describing why the dataset cannot be used.
    """
    try:
        hub_record = dataset_info(dataset_name, token=token)
        if not hub_record.private:
            return True, None
        return False, "is private. Please make the dataset publicly available on Hugging Face Hub."
    except RepositoryNotFoundError:
        # Handled before the generic HTTP error so the clearer message wins.
        return False, "was not found on the Hub!"
    except HfHubHTTPError as http_err:
        return False, f"could not be accessed: {str(http_err)}"
    except Exception as unexpected:
        # Best-effort check: any other failure is reported, never raised.
        return False, f"error checking dataset: {str(unexpected)}"
| def is_valid_url(url: str) -> tuple[bool, str | None]: | |
| """Validate URL using urllib.parse with strict scheme enforcement.""" | |
| if not url or not url.strip(): | |
| return False, "URL cannot be empty." | |
| url = url.strip() | |
| try: | |
| parsed = urlparse(url) | |
| # Strict scheme validation - only http/https allowed | |
| if parsed.scheme not in ('http', 'https'): | |
| return False, "URL must start with http:// or https://" | |
| # Must have a valid network location (domain) | |
| if not parsed.netloc: | |
| return False, "Invalid URL domain. Please provide a complete URL." | |
| # Extract hostname (remove port if present) | |
| hostname = parsed.hostname | |
| if not hostname or '.' not in hostname: | |
| return False, "Invalid URL domain. Please provide a complete URL." | |
| # Validate hostname format (alphanumeric, dots, hyphens only) | |
| # This blocks javascript:, data:, vbscript: and other injection schemes | |
| hostname_parts = hostname.split('.') | |
| for part in hostname_parts: | |
| if not part or not all(c.isalnum() or c == '-' for c in part): | |
| return False, "Invalid domain name in URL." | |
| if part.startswith('-') or part.endswith('-'): | |
| return False, "Invalid domain name in URL." | |
| return True, None | |
| except Exception: | |
| return False, "Invalid URL format." | |
def add_language_eval_request(
    location: str,
    dataset_name: str,
    dataset_url: str,
    dataset_config: str,
    dataset_split: str,
    audio_column: str,
    text_column: str,
    license: str,
):
    """Submit a request to evaluate a new language/dataset on all models.

    The request is validated, written as a JSON file under
    ``{EVAL_REQUESTS_PATH}/language_requests`` and, when ``API`` is truthy,
    uploaded to the ``QUEUE_REPO`` dataset repository. The local file is
    removed after a successful upload and kept if the upload fails.

    Args:
        location: ``"HuggingFace"`` selects the Hub dataset path; any other
            value is treated as an external URL submission.
        dataset_name: ``owner/name`` Hub dataset id (used for the Hub path).
        dataset_url: http(s) URL of the dataset (used for the external path).
        dataset_config: Optional config name; defaults to ``"default"``.
        dataset_split: Optional split name; defaults to ``"test"``.
        audio_column: Optional audio column name; defaults to ``"audio"``.
        text_column: Optional text column name; defaults to ``"text"``.
        license: Required license string for the dataset.

    Returns:
        The result of ``styled_error`` for invalid input, ``styled_warning``
        when a request for the same dataset/config already exists locally, or
        ``styled_message`` on success.
    """
    if not license or not license.strip():
        return styled_error("Please provide a license for the dataset.")

    if location == "HuggingFace":
        if not dataset_name or not dataset_name.strip():
            return styled_error("Please provide a dataset name.")
        dataset_name = dataset_name.strip()
        if "/" not in dataset_name:
            return styled_error("Dataset name must be in the format 'owner/dataset-name' (e.g., 'mozilla-foundation/common_voice_13_0').")

        # Check that the dataset exists on the Hub and is public.
        dataset_on_hub, error = is_dataset_on_hub(dataset_name, token=TOKEN)
        if not dataset_on_hub:
            return styled_error(f'Dataset "{dataset_name}" {error}')

        dataset_identifier = dataset_name
        dataset_source = "huggingface"
    else:
        if not dataset_url or not dataset_url.strip():
            return styled_error("Please provide a dataset URL.")
        valid_url, error = is_valid_url(dataset_url)
        if not valid_url:
            return styled_error(error)

        dataset_identifier = dataset_url.strip()
        dataset_source = "external"

    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Fall back to defaults for optional fields left blank.
    config = (dataset_config or "").strip() or "default"
    split = (dataset_split or "").strip() or "test"
    audio_col = (audio_column or "").strip() or "audio"
    text_col = (text_column or "").strip() or "text"

    # Build filename-safe identifiers.
    if dataset_source == "huggingface":
        safe_dataset = dataset_identifier.replace("/", "_").replace(" ", "_")
    else:
        # External URLs: collapse every non-alphanumeric character and cap
        # the length so the filename stays manageable.
        safe_dataset = re.sub(r"[^a-zA-Z0-9]", "_", dataset_identifier)[:100]
    # The config is free-form user input. Replace anything that is not safe
    # in a single path component (notably "/" and "\\") so it cannot create
    # nested paths or make the write below fail on a missing directory.
    safe_config = re.sub(r"[^a-zA-Z0-9._-]", "_", config).lower()

    out_dir = f"{EVAL_REQUESTS_PATH}/language_requests"
    os.makedirs(out_dir, exist_ok=True)

    # Reject a request only when a file for exactly this dataset/config
    # exists. Matching the full name (prefix + timestamp + extension) avoids
    # false positives where one config is a prefix of another, e.g. "en"
    # versus "en_us".
    # NOTE(review): only the local directory is inspected, and local files are
    # deleted after a successful upload; confirm the directory is synced from
    # the queue repo elsewhere if Hub-side duplicates should be detected.
    duplicate_pattern = re.compile(
        rf"lang_eval_{re.escape(safe_dataset)}_{re.escape(safe_config)}_"
        r"\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}Z\.json"
    )
    if any(duplicate_pattern.fullmatch(name) for name in os.listdir(out_dir)):
        return styled_warning("A similar evaluation request for this dataset configuration already exists.")

    eval_entry = {
        "type": "language_evaluation",
        "source": dataset_source,
        "dataset": dataset_identifier,
        "config": config,
        "split": split,
        "audio_column": audio_col,
        "text_column": text_col,
        "license": license.strip(),
        "status": "PENDING",
        "submitted_time": current_time,
    }

    # Colons are replaced because they are not portable in filenames.
    filename = f"lang_eval_{safe_dataset}_{safe_config}_{current_time.replace(':', '-')}.json"
    print(f"Creating language eval request: {filename}")

    out_path = f"{out_dir}/{filename}"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(eval_entry, f, indent=2)

    # Upload to the Hub if an API client is available.
    if API:
        try:
            print("Uploading language eval request")
            API.upload_file(
                path_or_fileobj=out_path,
                path_in_repo=f"language_requests/{filename}",
                repo_id=QUEUE_REPO,
                repo_type="dataset",
                commit_message=f"Add language evaluation request for {dataset_identifier} ({config})",
            )
            os.remove(out_path)
        except Exception as e:
            # Deliberate best-effort: keep the local file if the upload fails.
            print(f"Could not upload to Hub: {e}")

    source_label = "Hugging Face" if dataset_source == "huggingface" else "External URL"
    return styled_message(
        f"✅ Your language evaluation request has been submitted!\n\n"
        f"**Source:** {source_label}\n"
        f"**Dataset:** {dataset_identifier}\n"
        f"**Config:** {config}\n\n"
        f"We will review your request and run evaluations on all supported models."
    )