File size: 7,159 Bytes
7a31ba6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import json
import os
import re
from datetime import datetime, timezone
from urllib.parse import urlparse

from huggingface_hub import dataset_info
from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError

from src.display.formatting import styled_error, styled_message, styled_warning
from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO


def is_dataset_on_hub(dataset_name: str, token: str = None) -> tuple[bool, str | None]:
    """Return (ok, error_fragment) for a dataset on the Hugging Face Hub.

    ``ok`` is True only when the dataset exists and is public. On failure the
    second element is a phrase intended to be appended after the dataset name
    in a user-facing error message.
    """
    try:
        metadata = dataset_info(dataset_name, token=token)
        # The dataset exists, but a private repo cannot be evaluated publicly.
        if metadata.private:
            return False, "is private. Please make the dataset publicly available on Hugging Face Hub."
        return True, None
    # NOTE: RepositoryNotFoundError must be caught before the more general
    # HfHubHTTPError so "not found" gets its specific message.
    except RepositoryNotFoundError:
        return False, "was not found on the Hub!"
    except HfHubHTTPError as e:
        return False, f"could not be accessed: {str(e)}"
    except Exception as e:
        return False, f"error checking dataset: {str(e)}"


def is_valid_url(url: str) -> tuple[bool, str | None]:
    """Validate URL using urllib.parse with strict scheme enforcement."""
    if not url or not url.strip():
        return False, "URL cannot be empty."
    
    url = url.strip()
    
    try:
        parsed = urlparse(url)
        
        # Strict scheme validation - only http/https allowed
        if parsed.scheme not in ('http', 'https'):
            return False, "URL must start with http:// or https://"
        
        # Must have a valid network location (domain)
        if not parsed.netloc:
            return False, "Invalid URL domain. Please provide a complete URL."
        
        # Extract hostname (remove port if present)
        hostname = parsed.hostname
        if not hostname or '.' not in hostname:
            return False, "Invalid URL domain. Please provide a complete URL."
        
        # Validate hostname format (alphanumeric, dots, hyphens only)
        # This blocks javascript:, data:, vbscript: and other injection schemes
        hostname_parts = hostname.split('.')
        for part in hostname_parts:
            if not part or not all(c.isalnum() or c == '-' for c in part):
                return False, "Invalid domain name in URL."
            if part.startswith('-') or part.endswith('-'):
                return False, "Invalid domain name in URL."
        
        return True, None
    except Exception:
        return False, "Invalid URL format."


def add_language_eval_request(
    location: str,
    dataset_name: str,
    dataset_url: str,
    dataset_config: str,
    dataset_split: str,
    audio_column: str,
    text_column: str,
    license: str,  # NOTE: shadows the builtin, but kept for caller compatibility
):
    """Submit a request to evaluate a new language/dataset on all models.

    Args:
        location: "HuggingFace" for a Hub dataset; any other value means an
            external URL is provided via ``dataset_url``.
        dataset_name: Hub identifier in "owner/dataset-name" form (used when
            ``location == "HuggingFace"``).
        dataset_url: URL of an externally hosted dataset (used otherwise).
        dataset_config: Optional config name; blank falls back to "default".
        dataset_split: Optional split name; blank falls back to "test".
        audio_column: Optional audio column name; blank falls back to "audio".
        text_column: Optional text column name; blank falls back to "text".
        license: Required license string for the dataset.

    Returns:
        A styled HTML message (success, warning, or error) for the UI.
    """
    # License is mandatory regardless of where the dataset lives.
    if not license or not license.strip():
        return styled_error("Please provide a license for the dataset.")

    if location == "HuggingFace":
        # Validate HuggingFace dataset
        if not dataset_name or not dataset_name.strip():
            return styled_error("Please provide a dataset name.")

        dataset_name = dataset_name.strip()
        if "/" not in dataset_name:
            return styled_error("Dataset name must be in the format 'owner/dataset-name' (e.g., 'mozilla-foundation/common_voice_13_0').")

        # Check if dataset exists on Hub and is public
        dataset_on_hub, error = is_dataset_on_hub(dataset_name, token=TOKEN)
        if not dataset_on_hub:
            return styled_error(f'Dataset "{dataset_name}" {error}')

        dataset_identifier = dataset_name
        dataset_source = "huggingface"
    else:
        # Validate external URL
        if not dataset_url or not dataset_url.strip():
            return styled_error("Please provide a dataset URL.")

        valid_url, error = is_valid_url(dataset_url)
        if not valid_url:
            return styled_error(error)

        dataset_url = dataset_url.strip()
        dataset_identifier = dataset_url
        dataset_source = "external"

    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Use defaults for optional fields
    config = dataset_config.strip() if dataset_config and dataset_config.strip() else "default"
    split = dataset_split.strip() if dataset_split and dataset_split.strip() else "test"
    audio_col = audio_column.strip() if audio_column and audio_column.strip() else "audio"
    text_col = text_column.strip() if text_column and text_column.strip() else "text"

    # Create safe identifier for filename
    if dataset_source == "huggingface":
        safe_dataset = dataset_identifier.replace("/", "_").replace(" ", "_")
    else:
        # For external URLs, collapse anything non-alphanumeric and cap the
        # length so the filename stays filesystem-safe.
        safe_dataset = re.sub(r'[^a-zA-Z0-9]', '_', dataset_identifier)[:100]
    safe_config = config.replace(" ", "_").lower()

    OUT_DIR = f"{EVAL_REQUESTS_PATH}/language_requests"
    os.makedirs(OUT_DIR, exist_ok=True)

    # Reject duplicates: filenames embed dataset+config, so a shared prefix
    # (ignoring the trailing timestamp) means the same request already exists.
    for existing_file in os.listdir(OUT_DIR):
        if existing_file.startswith(f"lang_eval_{safe_dataset}_{safe_config}_"):
            return styled_warning("A similar evaluation request for this dataset configuration already exists.")

    # Create language evaluation request entry
    eval_entry = {
        "type": "language_evaluation",
        "source": dataset_source,
        "dataset": dataset_identifier,
        "config": config,
        "split": split,
        "audio_column": audio_col,
        "text_column": text_col,
        "license": license.strip() if license else "",
        "status": "PENDING",
        "submitted_time": current_time,
    }

    # Unique filename per request (colons replaced — unsafe on some filesystems).
    # BUG FIX: the computed filename was previously never used; a literal
    # placeholder path was written instead, so every request overwrote the
    # same file locally and in the Hub repo.
    filename = f"lang_eval_{safe_dataset}_{safe_config}_{current_time.replace(':', '-')}.json"

    print(f"Creating language eval request: {filename}")
    out_path = f"{OUT_DIR}/{filename}"

    with open(out_path, "w") as f:
        f.write(json.dumps(eval_entry, indent=2))

    # Upload to Hub if API is available
    if API:
        try:
            print("Uploading language eval request")
            API.upload_file(
                path_or_fileobj=out_path,
                path_in_repo=f"language_requests/{filename}",
                repo_id=QUEUE_REPO,
                repo_type="dataset",
                commit_message=f"Add language evaluation request for {dataset_identifier} ({config})",
            )
            os.remove(out_path)
        except Exception as e:
            print(f"Could not upload to Hub: {e}")
            # Keep local file if upload fails

    source_label = "Hugging Face" if dataset_source == "huggingface" else "External URL"
    return styled_message(
        f"✅ Your language evaluation request has been submitted!\n\n"
        f"**Source:** {source_label}\n"
        f"**Dataset:** {dataset_identifier}\n"
        f"**Config:** {config}\n\n"
        f"We will review your request and run evaluations on all supported models."
    )