File size: 7,159 Bytes
7a31ba6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import json
import os
import re
from datetime import datetime, timezone
from urllib.parse import urlparse

from huggingface_hub import dataset_info
from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError

from src.display.formatting import styled_error, styled_message, styled_warning
from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO


def is_dataset_on_hub(dataset_name: str, token: str = None) -> tuple[bool, str | None]:
    """Return (ok, error_fragment) for a dataset on the Hugging Face Hub.

    ``ok`` is True only when the dataset exists and is public. On failure the
    second element is a phrase intended to be appended after the dataset name
    in a user-facing error message.
    """
    try:
        metadata = dataset_info(dataset_name, token=token)
        # The dataset exists, but a private repo cannot be evaluated publicly.
        if metadata.private:
            return False, "is private. Please make the dataset publicly available on Hugging Face Hub."
        return True, None
    # NOTE: RepositoryNotFoundError must be caught before the more general
    # HfHubHTTPError so "not found" gets its specific message.
    except RepositoryNotFoundError:
        return False, "was not found on the Hub!"
    except HfHubHTTPError as e:
        return False, f"could not be accessed: {str(e)}"
    except Exception as e:
        return False, f"error checking dataset: {str(e)}"


def is_valid_url(url: str) -> tuple[bool, str | None]:
    """Validate URL using urllib.parse with strict scheme enforcement."""
    if not url or not url.strip():
        return False, "URL cannot be empty."
    
    url = url.strip()
    
    try:
        parsed = urlparse(url)
        
        # Strict scheme validation - only http/https allowed
        if parsed.scheme not in ('http', 'https'):
            return False, "URL must start with http:// or https://"
        
        # Must have a valid network location (domain)
        if not parsed.netloc:
            return False, "Invalid URL domain. Please provide a complete URL."
        
        # Extract hostname (remove port if present)
        hostname = parsed.hostname
        if not hostname or '.' not in hostname:
            return False, "Invalid URL domain. Please provide a complete URL."
        
        # Validate hostname format (alphanumeric, dots, hyphens only)
        # This blocks javascript:, data:, vbscript: and other injection schemes
        hostname_parts = hostname.split('.')
        for part in hostname_parts:
            if not part or not all(c.isalnum() or c == '-' for c in part):
                return False, "Invalid domain name in URL."
            if part.startswith('-') or part.endswith('-'):
                return False, "Invalid domain name in URL."
        
        return True, None
    except Exception:
        return False, "Invalid URL format."


def add_language_eval_request(
    location: str,
    dataset_name: str,
    dataset_url: str,
    dataset_config: str,
    dataset_split: str,
    audio_column: str,
    text_column: str,
    license: str,  # NOTE: shadows the builtin, but kept for caller compatibility
):
    """Submit a request to evaluate a new language/dataset on all models.

    Args:
        location: "HuggingFace" for a Hub dataset; any other value means an
            external URL is provided via ``dataset_url``.
        dataset_name: Hub identifier in "owner/dataset-name" form (used when
            ``location == "HuggingFace"``).
        dataset_url: URL of an externally hosted dataset (used otherwise).
        dataset_config: Optional config name; blank falls back to "default".
        dataset_split: Optional split name; blank falls back to "test".
        audio_column: Optional audio column name; blank falls back to "audio".
        text_column: Optional text column name; blank falls back to "text".
        license: Required license string for the dataset.

    Returns:
        A styled HTML message (success, warning, or error) for the UI.
    """
    # License is mandatory regardless of where the dataset lives.
    if not license or not license.strip():
        return styled_error("Please provide a license for the dataset.")

    if location == "HuggingFace":
        # Validate HuggingFace dataset
        if not dataset_name or not dataset_name.strip():
            return styled_error("Please provide a dataset name.")

        dataset_name = dataset_name.strip()
        if "/" not in dataset_name:
            return styled_error("Dataset name must be in the format 'owner/dataset-name' (e.g., 'mozilla-foundation/common_voice_13_0').")

        # Check if dataset exists on Hub and is public
        dataset_on_hub, error = is_dataset_on_hub(dataset_name, token=TOKEN)
        if not dataset_on_hub:
            return styled_error(f'Dataset "{dataset_name}" {error}')

        dataset_identifier = dataset_name
        dataset_source = "huggingface"
    else:
        # Validate external URL
        if not dataset_url or not dataset_url.strip():
            return styled_error("Please provide a dataset URL.")

        valid_url, error = is_valid_url(dataset_url)
        if not valid_url:
            return styled_error(error)

        dataset_url = dataset_url.strip()
        dataset_identifier = dataset_url
        dataset_source = "external"

    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Use defaults for optional fields
    config = dataset_config.strip() if dataset_config and dataset_config.strip() else "default"
    split = dataset_split.strip() if dataset_split and dataset_split.strip() else "test"
    audio_col = audio_column.strip() if audio_column and audio_column.strip() else "audio"
    text_col = text_column.strip() if text_column and text_column.strip() else "text"

    # Create safe identifier for filename
    if dataset_source == "huggingface":
        safe_dataset = dataset_identifier.replace("/", "_").replace(" ", "_")
    else:
        # For external URLs, collapse anything non-alphanumeric and cap the
        # length so the filename stays filesystem-safe.
        safe_dataset = re.sub(r'[^a-zA-Z0-9]', '_', dataset_identifier)[:100]
    safe_config = config.replace(" ", "_").lower()

    OUT_DIR = f"{EVAL_REQUESTS_PATH}/language_requests"
    os.makedirs(OUT_DIR, exist_ok=True)

    # Reject duplicates: filenames embed dataset+config, so a shared prefix
    # (ignoring the trailing timestamp) means the same request already exists.
    for existing_file in os.listdir(OUT_DIR):
        if existing_file.startswith(f"lang_eval_{safe_dataset}_{safe_config}_"):
            return styled_warning("A similar evaluation request for this dataset configuration already exists.")

    # Create language evaluation request entry
    eval_entry = {
        "type": "language_evaluation",
        "source": dataset_source,
        "dataset": dataset_identifier,
        "config": config,
        "split": split,
        "audio_column": audio_col,
        "text_column": text_col,
        "license": license.strip() if license else "",
        "status": "PENDING",
        "submitted_time": current_time,
    }

    # Unique filename per request (colons replaced — unsafe on some filesystems).
    # BUG FIX: the computed filename was previously never used; a literal
    # placeholder path was written instead, so every request overwrote the
    # same file locally and in the Hub repo.
    filename = f"lang_eval_{safe_dataset}_{safe_config}_{current_time.replace(':', '-')}.json"

    print(f"Creating language eval request: {filename}")
    out_path = f"{OUT_DIR}/{filename}"

    with open(out_path, "w") as f:
        f.write(json.dumps(eval_entry, indent=2))

    # Upload to Hub if API is available
    if API:
        try:
            print("Uploading language eval request")
            API.upload_file(
                path_or_fileobj=out_path,
                path_in_repo=f"language_requests/{filename}",
                repo_id=QUEUE_REPO,
                repo_type="dataset",
                commit_message=f"Add language evaluation request for {dataset_identifier} ({config})",
            )
            os.remove(out_path)
        except Exception as e:
            print(f"Could not upload to Hub: {e}")
            # Keep local file if upload fails

    source_label = "Hugging Face" if dataset_source == "huggingface" else "External URL"
    return styled_message(
        f"✅ Your language evaluation request has been submitted!\n\n"
        f"**Source:** {source_label}\n"
        f"**Dataset:** {dataset_identifier}\n"
        f"**Config:** {config}\n\n"
        f"We will review your request and run evaluations on all supported models."
    )