File size: 9,560 Bytes
2b4c539
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
"""Helpers for session-scoped dataset uploads to the Hugging Face Hub."""

import asyncio
import os
import re
import uuid
from dataclasses import dataclass
from urllib.parse import quote

from fastapi import HTTPException, UploadFile
from huggingface_hub import HfApi

MAX_DATASET_UPLOAD_BYTES = 100 * 1024 * 1024
ALLOWED_DATASET_EXTENSIONS = {"csv", "json", "jsonl"}
_SAFE_FILENAME_RE = re.compile(r"[^A-Za-z0-9._-]+")
_SAFE_NAMESPACE_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]{0,95}$")


@dataclass(frozen=True)
class DatasetUpload:
    session_id: str
    repo_id: str
    repo_type: str
    private: bool
    upload_id: str
    config_name: str
    filename: str
    original_filename: str
    path_in_repo: str
    size_bytes: int
    format: str
    hub_url: str
    load_dataset_snippet: str

    def response_payload(self) -> dict[str, str | int | bool]:
        return {
            "session_id": self.session_id,
            "repo_id": self.repo_id,
            "repo_type": self.repo_type,
            "private": self.private,
            "upload_id": self.upload_id,
            "config_name": self.config_name,
            "filename": self.filename,
            "path_in_repo": self.path_in_repo,
            "size_bytes": self.size_bytes,
            "format": self.format,
            "hub_url": self.hub_url,
            "load_dataset_snippet": self.load_dataset_snippet,
        }


def sanitize_dataset_filename(filename: str | None) -> str:
    """Return a Hub-safe basename while preserving the extension.

    The extension is split off *before* unsafe characters are replaced and
    edge punctuation is trimmed. Trimming the whole name first would let a
    stem that sanitizes to nothing swallow the extension separator (for
    example ``"é.json"`` would become ``"json"`` and then ``"json.csv"``),
    which stores the file under a suffix that no longer matches its format.

    Args:
        filename: Client-supplied file name, possibly including directories.

    Returns:
        A name made of ``[A-Za-z0-9._-]`` characters with a lower-cased
        extension. A missing stem becomes ``"dataset"`` and a missing
        extension becomes ``".csv"``.
    """
    raw = os.path.basename(filename or "").strip()
    raw_stem, raw_ext = os.path.splitext(raw)

    # Sanitize the extension on its own; trailing punctuation is dropped so
    # inputs such as "data.csv-" or "data." behave as before.
    ext = _SAFE_FILENAME_RE.sub("-", raw_ext).rstrip(".-_")
    if not ext:
        ext = ".csv"

    stem = _SAFE_FILENAME_RE.sub("-", raw_stem).strip(".-_")

    # Keep stem + extension within 96 characters. The budget is clamped so an
    # over-long extension cannot turn the slice bound negative.
    max_stem_len = max(96 - len(ext), 1)
    stem = stem[:max_stem_len].strip(".-_") or "dataset"
    return f"{stem}{ext.lower()}"


def display_filename(filename: str | None, fallback: str) -> str:
    raw = os.path.basename(filename or "").strip()
    if not raw:
        return fallback
    cleaned = "".join(char for char in raw if ord(char) >= 32)
    return cleaned[:160] or fallback


def dataset_format_from_filename(filename: str) -> str:
    """Return the lower-cased extension of ``filename`` without its dot.

    Raises:
        HTTPException: 400 when the extension is not listed in
            ``ALLOWED_DATASET_EXTENSIONS``.
    """
    _, suffix = os.path.splitext(filename)
    extension = suffix.lower().lstrip(".")
    if extension in ALLOWED_DATASET_EXTENSIONS:
        return extension
    raise HTTPException(
        status_code=400,
        detail="Only .csv, .json, and .jsonl dataset files are supported.",
    )


def session_dataset_repo_id(hf_username: str | None, session_id: str) -> str:
    """Build the ``namespace/name`` repo id for a session's dataset uploads.

    Args:
        hf_username: Hub namespace that will own the repo.
        session_id: Session identifier; only its first 8 slug characters are
            used in the repo name.

    Returns:
        ``"{namespace}/ml-intern-{slug}-datasets"``. When the session id
        yields no usable characters, a random 8-character hex slug is used.

    Raises:
        HTTPException: 400 when the namespace is empty or does not match
            ``_SAFE_NAMESPACE_RE``.
    """
    namespace = (hf_username or "").strip()
    if not namespace or not _SAFE_NAMESPACE_RE.fullmatch(namespace):
        raise HTTPException(
            status_code=400,
            detail="Could not determine a valid Hugging Face namespace.",
        )

    slug = re.sub(r"[^A-Za-z0-9]+", "-", session_id or "").strip("-")
    # Strip again after truncating: cutting at 8 characters can leave a
    # trailing "-", which would produce "--" in the repo name. The Hub's
    # repo id validation rejects "--".
    slug = slug[:8].strip("-")
    if not slug:
        slug = uuid.uuid4().hex[:8]
    return f"{namespace}/ml-intern-{slug}-datasets"


async def upload_size_bytes(upload: UploadFile) -> int:
    """Return the size of the uploaded file in bytes.

    The size is read by seeking the underlying file object to its end. The
    file is then rewound to offset 0, not to its previous position. Each file
    operation is run through ``asyncio.to_thread``.
    """
    stream = upload.file
    await asyncio.to_thread(stream.seek, 0, os.SEEK_END)
    end_offset = await asyncio.to_thread(stream.tell)
    await asyncio.to_thread(stream.seek, 0)
    return int(end_offset)


async def validate_dataset_upload(upload: UploadFile) -> tuple[str, str, int]:
    """Check an upload's extension and size.

    Returns:
        A ``(safe_filename, dataset_format, size_in_bytes)`` tuple.

    Raises:
        HTTPException: 400 for an unsupported extension or an empty file,
            413 when the file is larger than ``MAX_DATASET_UPLOAD_BYTES``.
    """
    client_name = upload.filename
    # The extension check runs first so unsupported files are rejected before
    # the file is measured.
    detected_format = dataset_format_from_filename(client_name or "")
    stored_name = sanitize_dataset_filename(client_name)

    byte_count = await upload_size_bytes(upload)
    if byte_count <= 0:
        raise HTTPException(status_code=400, detail="Uploaded dataset file is empty.")
    if byte_count > MAX_DATASET_UPLOAD_BYTES:
        raise HTTPException(
            status_code=413,
            detail="Dataset upload exceeds the 100 MB limit.",
        )
    return stored_name, detected_format, byte_count


def dataset_hub_url(repo_id: str, path_in_repo: str) -> str:
    """Return the huggingface.co ``blob/main`` URL for a file in a dataset repo.

    The path is percent-encoded with ``/`` left intact; ``repo_id`` is
    inserted unchanged.
    """
    encoded = quote(path_in_repo, safe="/")
    return f"https://huggingface.co/datasets/{repo_id}/blob/main/{encoded}"


def dataset_config_name(upload_id: str) -> str:
    """Derive a dataset config name of the form ``upload_<slug>``.

    Runs of non-alphanumeric characters become ``_``, edge underscores are
    trimmed, the slug is lower-cased and then cut to 32 characters. An empty
    slug falls back to ``"dataset"``.
    """
    slug = re.sub(r"[^A-Za-z0-9]+", "_", upload_id)
    slug = slug.strip("_").lower() or "dataset"
    return f"upload_{slug[:32]}"


def dataset_config_name_from_path(path_in_repo: str) -> str:
    """Derive the dataset config name for a file path inside the repo.

    For paths of at least three segments that start with ``uploads``, the
    second segment is used as the key. Any other path uses the file's
    basename without its extension.
    """
    segments = path_in_repo.split("/")
    under_uploads = len(segments) >= 3 and segments[0] == "uploads"
    if under_uploads:
        key = segments[1]
    else:
        key, _ = os.path.splitext(os.path.basename(path_in_repo))
    return dataset_config_name(key)


def is_dataset_upload_path(path_in_repo: str) -> bool:
    """Tell whether a repo path has the form ``uploads/<dir>/<file>``.

    Returns True only when the path has exactly three segments, the first is
    ``uploads``, the other two are non-empty, and the extension is listed in
    ``ALLOWED_DATASET_EXTENSIONS``.
    """
    segments = path_in_repo.split("/")
    if len(segments) != 3:
        return False
    root, upload_dir, leaf = segments
    if root != "uploads" or not upload_dir or not leaf:
        return False
    _, suffix = os.path.splitext(path_in_repo)
    return suffix.lower().lstrip(".") in ALLOWED_DATASET_EXTENSIONS


def unique_dataset_upload_paths(paths: list[str]) -> list[str]:
    """Return the dataset upload paths from ``paths`` without duplicates.

    Paths that fail ``is_dataset_upload_path`` are dropped. The order of
    first appearance is kept.
    """
    # dict keys keep insertion order, so this de-duplicates while preserving
    # the position of each path's first occurrence.
    ordered = dict.fromkeys(
        candidate for candidate in paths if is_dataset_upload_path(candidate)
    )
    return list(ordered)


def load_dataset_snippet(repo_id: str, config_name: str) -> str:
    """Return Python source that loads one config of the repo's train split.

    The snippet imports ``load_dataset`` and calls it with ``token=True``.
    """
    import_block = "from datasets import load_dataset\n\n"
    call_line = (
        f'dataset = load_dataset("{repo_id}", "{config_name}", '
        'split="train", token=True)'
    )
    return import_block + call_line


def dataset_repo_card(repo_id: str, upload_paths: list[str]) -> bytes:
    """Render the repo README as UTF-8 bytes.

    The YAML front matter always carries the two tags. When at least one
    valid upload path is given it also gets a ``configs:`` list with one
    entry per unique path, each exposing that file as a ``train`` split.
    """
    entries = unique_dataset_upload_paths(upload_paths)
    yaml_lines = ["configs:"] if entries else []
    for entry in entries:
        yaml_lines += [
            f"- config_name: {dataset_config_name_from_path(entry)}",
            "  data_files:",
            "  - split: train",
            f'    path: "{entry}"',
        ]

    # Every line ends with a newline so the closing '---' starts on its own
    # line. With no entries the block is empty.
    configs = "".join(f"{line}\n" for line in yaml_lines)

    content = f"""---
tags:
- ml-intern
- uploaded-dataset
{configs}---

# {repo_id}

Private dataset files uploaded through ML Intern.

Files are stored under `uploads/<upload_id>/` and are attached to the
corresponding ML Intern session context by Hub reference, not by copying file
contents into the chat.

Each uploaded file is exposed as its own dataset config so files with different
schemas can coexist in the same session repo.
"""
    return content.encode("utf-8")


def dataset_context_note(upload: DatasetUpload) -> str:
    """Format the bracketed ``[SYSTEM: ...]`` note describing an upload.

    The note lists the upload's repo, config, file names, format, size and
    Hub URL, followed by its ``load_dataset`` snippet in a fenced block.
    """
    note = f"""[SYSTEM: The user uploaded a dataset file for this session.

Use this Hugging Face Hub dataset reference when the task needs the uploaded data.
Do not look for the uploaded file on local disk and do not ask the user to
upload it again unless this Hub reference fails.

- Repo ID: {upload.repo_id}
- Repo type: dataset
- Dataset config: {upload.config_name}
- File in repo: {upload.path_in_repo}
- Original filename: {upload.original_filename}
- Stored filename: {upload.filename}
- Format: {upload.format}
- Size: {upload.size_bytes} bytes
- Hub URL: {upload.hub_url}

Load it with:
```python
{upload.load_dataset_snippet}
```
]"""
    return note


async def push_dataset_upload_to_hub(
    *,
    upload: UploadFile,
    session_id: str,
    hf_username: str,
    hf_token: str,
) -> DatasetUpload:
    """Validate an uploaded dataset file and commit it to the session's Hub repo.

    Steps, in order: validate the upload, derive the repo id and target path,
    create the dataset repo as private (``exist_ok=True``), re-apply the
    private setting, list the existing repo files, upload the file bytes, and
    finally upload a regenerated ``README.md`` covering all upload paths.
    All Hub and file calls run through ``asyncio.to_thread``.

    Returns:
        A ``DatasetUpload`` describing the stored file.

    Raises:
        HTTPException: From validation of the upload or of the namespace,
            before any Hub call is made.
    """
    stored_name, detected_format, byte_count = await validate_dataset_upload(upload)
    shown_name = display_filename(upload.filename, stored_name)
    upload_id = uuid.uuid4().hex[:12]
    config_name = dataset_config_name(upload_id)
    repo_id = session_dataset_repo_id(hf_username, session_id)
    path_in_repo = f"uploads/{upload_id}/{stored_name}"

    client = HfApi(token=hf_token)
    # Keyword arguments shared by every Hub call below.
    target = {"repo_id": repo_id, "repo_type": "dataset"}

    await asyncio.to_thread(
        client.create_repo,
        private=True,
        exist_ok=True,
        **target,
    )
    await asyncio.to_thread(client.update_repo_settings, private=True, **target)
    existing_files = await asyncio.to_thread(client.list_repo_files, **target)
    # The new path is added here because the listing predates the upload.
    card_paths = unique_dataset_upload_paths([*existing_files, path_in_repo])

    # Rewind, then read the whole file into memory for the upload.
    await asyncio.to_thread(upload.file.seek, 0)
    contents = await asyncio.to_thread(upload.file.read)
    await asyncio.to_thread(
        client.upload_file,
        path_or_fileobj=contents,
        path_in_repo=path_in_repo,
        commit_message=f"Upload dataset file {stored_name}",
        **target,
    )
    await asyncio.to_thread(
        client.upload_file,
        path_or_fileobj=dataset_repo_card(repo_id, card_paths),
        path_in_repo="README.md",
        commit_message="Update ML Intern dataset upload configs",
        **target,
    )

    return DatasetUpload(
        session_id=session_id,
        repo_id=repo_id,
        repo_type="dataset",
        private=True,
        upload_id=upload_id,
        config_name=config_name,
        filename=stored_name,
        original_filename=shown_name,
        path_in_repo=path_in_repo,
        size_bytes=byte_count,
        format=detected_format,
        hub_url=dataset_hub_url(repo_id, path_in_repo),
        load_dataset_snippet=load_dataset_snippet(repo_id, config_name),
    )