Spaces:
Running
Running
File size: 7,159 Bytes
import json
import os
import re
from datetime import datetime, timezone
from urllib.parse import urlparse
from huggingface_hub import dataset_info
from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError
from src.display.formatting import styled_error, styled_message, styled_warning
from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
def is_dataset_on_hub(dataset_name: str, token: str = None) -> tuple[bool, str | None]:
    """Check whether *dataset_name* exists on the Hugging Face Hub and is public.

    Returns:
        ``(True, None)`` when the dataset is accessible, otherwise
        ``(False, reason)`` where *reason* is a message fragment meant to be
        appended after the dataset name by the caller.
    """
    try:
        # All Hub interaction stays inside the try so that any failure —
        # including an unexpected one — maps to a (False, reason) tuple.
        info = dataset_info(dataset_name, token=token)
        if info.private:
            return False, "is private. Please make the dataset publicly available on Hugging Face Hub."
        return True, None
    except RepositoryNotFoundError:
        return False, "was not found on the Hub!"
    except HfHubHTTPError as e:
        return False, f"could not be accessed: {str(e)}"
    except Exception as e:
        # Catch-all boundary: surface the error text rather than crashing the UI.
        return False, f"error checking dataset: {str(e)}"
def is_valid_url(url: str) -> tuple[bool, str | None]:
"""Validate URL using urllib.parse with strict scheme enforcement."""
if not url or not url.strip():
return False, "URL cannot be empty."
url = url.strip()
try:
parsed = urlparse(url)
# Strict scheme validation - only http/https allowed
if parsed.scheme not in ('http', 'https'):
return False, "URL must start with http:// or https://"
# Must have a valid network location (domain)
if not parsed.netloc:
return False, "Invalid URL domain. Please provide a complete URL."
# Extract hostname (remove port if present)
hostname = parsed.hostname
if not hostname or '.' not in hostname:
return False, "Invalid URL domain. Please provide a complete URL."
# Validate hostname format (alphanumeric, dots, hyphens only)
# This blocks javascript:, data:, vbscript: and other injection schemes
hostname_parts = hostname.split('.')
for part in hostname_parts:
if not part or not all(c.isalnum() or c == '-' for c in part):
return False, "Invalid domain name in URL."
if part.startswith('-') or part.endswith('-'):
return False, "Invalid domain name in URL."
return True, None
except Exception:
return False, "Invalid URL format."
def add_language_eval_request(
    location: str,
    dataset_name: str,
    dataset_url: str,
    dataset_config: str,
    dataset_split: str,
    audio_column: str,
    text_column: str,
    license: str,
):
    """Submit a request to evaluate a new language/dataset on all models.

    Args:
        location: ``"HuggingFace"`` for a Hub dataset; any other value means
            an external URL is being submitted.
        dataset_name: Hub dataset id (``owner/dataset-name``); required when
            ``location == "HuggingFace"``.
        dataset_url: External dataset URL; required otherwise.
        dataset_config: Optional config name (defaults to ``"default"``).
        dataset_split: Optional split name (defaults to ``"test"``).
        audio_column: Optional audio column name (defaults to ``"audio"``).
        text_column: Optional text column name (defaults to ``"text"``).
        license: Dataset license string; required.

    Returns:
        A styled HTML message (error / warning / success) for the UI.
    """
    # A license is mandatory regardless of where the dataset lives.
    if not license or not license.strip():
        return styled_error("Please provide a license for the dataset.")

    if location == "HuggingFace":
        # Validate the HuggingFace dataset id and its existence/visibility.
        if not dataset_name or not dataset_name.strip():
            return styled_error("Please provide a dataset name.")
        dataset_name = dataset_name.strip()
        if "/" not in dataset_name:
            return styled_error("Dataset name must be in the format 'owner/dataset-name' (e.g., 'mozilla-foundation/common_voice_13_0').")
        dataset_on_hub, error = is_dataset_on_hub(dataset_name, token=TOKEN)
        if not dataset_on_hub:
            return styled_error(f'Dataset "{dataset_name}" {error}')
        dataset_identifier = dataset_name
        dataset_source = "huggingface"
    else:
        # Validate the external URL (scheme + domain shape).
        if not dataset_url or not dataset_url.strip():
            return styled_error("Please provide a dataset URL.")
        valid_url, error = is_valid_url(dataset_url)
        if not valid_url:
            return styled_error(error)
        dataset_url = dataset_url.strip()
        dataset_identifier = dataset_url
        dataset_source = "external"

    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Fall back to conventional defaults for blank optional fields.
    config = dataset_config.strip() if dataset_config and dataset_config.strip() else "default"
    split = dataset_split.strip() if dataset_split and dataset_split.strip() else "test"
    audio_col = audio_column.strip() if audio_column and audio_column.strip() else "audio"
    text_col = text_column.strip() if text_column and text_column.strip() else "text"

    # Build a filesystem-safe identifier for the request filename.
    if dataset_source == "huggingface":
        safe_dataset = dataset_identifier.replace("/", "_").replace(" ", "_")
    else:
        # External URLs can contain arbitrary characters; keep [A-Za-z0-9] only
        # and cap the length so the filename stays manageable.
        safe_dataset = re.sub(r'[^a-zA-Z0-9]', '_', dataset_identifier)[:100]
    safe_config = config.replace(" ", "_").lower()

    OUT_DIR = f"{EVAL_REQUESTS_PATH}/language_requests"
    os.makedirs(OUT_DIR, exist_ok=True)

    # Reject duplicates: a request for the same dataset + config is already queued.
    # (makedirs above guarantees OUT_DIR exists, so no extra exists() check.)
    for existing_file in os.listdir(OUT_DIR):
        if existing_file.startswith(f"lang_eval_{safe_dataset}_{safe_config}_"):
            return styled_warning("A similar evaluation request for this dataset configuration already exists.")

    # Create the language evaluation request entry.
    eval_entry = {
        "type": "language_evaluation",
        "source": dataset_source,
        "dataset": dataset_identifier,
        "config": config,
        "split": split,
        "audio_column": audio_col,
        "text_column": text_col,
        "license": license.strip(),  # non-empty: validated at the top
        "status": "PENDING",
        "submitted_time": current_time,
    }

    # ':' is illegal in filenames on some platforms; the timestamp keeps names unique.
    filename = f"lang_eval_{safe_dataset}_{safe_config}_{current_time.replace(':', '-')}.json"
    # FIX: the original wrote to a literal "(unknown)" placeholder instead of
    # interpolating `filename`, so every request clobbered the same file.
    out_path = f"{OUT_DIR}/{filename}"
    print(f"Creating language eval request: {filename}")
    with open(out_path, "w") as f:
        f.write(json.dumps(eval_entry, indent=2))

    # Mirror the request into the queue repo on the Hub when a client exists.
    if API:
        try:
            print("Uploading language eval request")
            API.upload_file(
                path_or_fileobj=out_path,
                path_in_repo=f"language_requests/{filename}",  # FIX: was "(unknown)"
                repo_id=QUEUE_REPO,
                repo_type="dataset",
                commit_message=f"Add language evaluation request for {dataset_identifier} ({config})",
            )
            os.remove(out_path)
        except Exception as e:
            # Best-effort upload: keep the local file so the request isn't lost.
            print(f"Could not upload to Hub: {e}")

    source_label = "Hugging Face" if dataset_source == "huggingface" else "External URL"
    return styled_message(
        f"✅ Your language evaluation request has been submitted!\n\n"
        f"**Source:** {source_label}\n"
        f"**Dataset:** {dataset_identifier}\n"
        f"**Config:** {config}\n\n"
        f"We will review your request and run evaluations on all supported models."
    )
|