ml-intern

Sleeping

File size: 9,560 Bytes

2b4c539

"""Helpers for session-scoped dataset uploads to the Hugging Face Hub."""

import asyncio
import os
import re
import uuid
from dataclasses import dataclass
from urllib.parse import quote

from fastapi import HTTPException, UploadFile
from huggingface_hub import HfApi

MAX_DATASET_UPLOAD_BYTES = 100 * 1024 * 1024
ALLOWED_DATASET_EXTENSIONS = {"csv", "json", "jsonl"}
_SAFE_FILENAME_RE = re.compile(r"[^A-Za-z0-9._-]+")
_SAFE_NAMESPACE_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]{0,95}$")


@dataclass(frozen=True)
class DatasetUpload:
    session_id: str
    repo_id: str
    repo_type: str
    private: bool
    upload_id: str
    config_name: str
    filename: str
    original_filename: str
    path_in_repo: str
    size_bytes: int
    format: str
    hub_url: str
    load_dataset_snippet: str

    def response_payload(self) -> dict[str, str | int | bool]:
        return {
            "session_id": self.session_id,
            "repo_id": self.repo_id,
            "repo_type": self.repo_type,
            "private": self.private,
            "upload_id": self.upload_id,
            "config_name": self.config_name,
            "filename": self.filename,
            "path_in_repo": self.path_in_repo,
            "size_bytes": self.size_bytes,
            "format": self.format,
            "hub_url": self.hub_url,
            "load_dataset_snippet": self.load_dataset_snippet,
        }


def sanitize_dataset_filename(filename: str | None) -> str:
    """Return a Hub-safe basename while preserving the extension."""
    raw = os.path.basename(filename or "").strip()
    if not raw:
        raw = "dataset.csv"

    safe = _SAFE_FILENAME_RE.sub("-", raw).strip(".-_")
    if not safe:
        safe = "dataset.csv"

    stem, ext = os.path.splitext(safe)
    if not stem:
        stem = "dataset"
    if not ext:
        ext = ".csv"

    max_stem_len = 96 - len(ext)
    stem = stem[:max_stem_len].strip(".-_") or "dataset"
    return f"{stem}{ext.lower()}"


def display_filename(filename: str | None, fallback: str) -> str:
    raw = os.path.basename(filename or "").strip()
    if not raw:
        return fallback
    cleaned = "".join(char for char in raw if ord(char) >= 32)
    return cleaned[:160] or fallback


def dataset_format_from_filename(filename: str) -> str:
    ext = os.path.splitext(filename)[1].lower().lstrip(".")
    if ext not in ALLOWED_DATASET_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            detail="Only .csv, .json, and .jsonl dataset files are supported.",
        )
    return ext


def session_dataset_repo_id(hf_username: str | None, session_id: str) -> str:
    namespace = (hf_username or "").strip()
    if not namespace or not _SAFE_NAMESPACE_RE.fullmatch(namespace):
        raise HTTPException(
            status_code=400,
            detail="Could not determine a valid Hugging Face namespace.",
        )

    safe_session_id = re.sub(r"[^A-Za-z0-9]+", "-", session_id).strip("-")
    if not safe_session_id:
        safe_session_id = uuid.uuid4().hex[:8]
    return f"{namespace}/ml-intern-{safe_session_id[:8]}-datasets"


async def upload_size_bytes(upload: UploadFile) -> int:
    await asyncio.to_thread(upload.file.seek, 0, os.SEEK_END)
    size = await asyncio.to_thread(upload.file.tell)
    await asyncio.to_thread(upload.file.seek, 0)
    return int(size)


async def validate_dataset_upload(upload: UploadFile) -> tuple[str, str, int]:
    dataset_format = dataset_format_from_filename(upload.filename or "")
    safe_filename = sanitize_dataset_filename(upload.filename)
    size = await upload_size_bytes(upload)
    if size <= 0:
        raise HTTPException(status_code=400, detail="Uploaded dataset file is empty.")
    if size > MAX_DATASET_UPLOAD_BYTES:
        raise HTTPException(
            status_code=413,
            detail="Dataset upload exceeds the 100 MB limit.",
        )
    return safe_filename, dataset_format, size


def dataset_hub_url(repo_id: str, path_in_repo: str) -> str:
    quoted_path = quote(path_in_repo, safe="/")
    return f"https://huggingface.co/datasets/{repo_id}/blob/main/{quoted_path}"


def dataset_config_name(upload_id: str) -> str:
    safe_upload_id = re.sub(r"[^A-Za-z0-9]+", "_", upload_id).strip("_").lower()
    if not safe_upload_id:
        safe_upload_id = "dataset"
    return f"upload_{safe_upload_id[:32]}"


def dataset_config_name_from_path(path_in_repo: str) -> str:
    parts = path_in_repo.split("/")
    if len(parts) >= 3 and parts[0] == "uploads":
        return dataset_config_name(parts[1])
    stem = os.path.splitext(os.path.basename(path_in_repo))[0]
    return dataset_config_name(stem)


def is_dataset_upload_path(path_in_repo: str) -> bool:
    parts = path_in_repo.split("/")
    if len(parts) != 3 or parts[0] != "uploads" or not parts[1] or not parts[2]:
        return False
    extension = os.path.splitext(path_in_repo)[1].lower().lstrip(".")
    return extension in ALLOWED_DATASET_EXTENSIONS


def unique_dataset_upload_paths(paths: list[str]) -> list[str]:
    seen = set()
    upload_paths = []
    for path in paths:
        if not is_dataset_upload_path(path) or path in seen:
            continue
        seen.add(path)
        upload_paths.append(path)
    return upload_paths


def load_dataset_snippet(repo_id: str, config_name: str) -> str:
    return (
        "from datasets import load_dataset\n\n"
        f'dataset = load_dataset("{repo_id}", "{config_name}", '
        'split="train", token=True)'
    )


def dataset_repo_card(repo_id: str, upload_paths: list[str]) -> bytes:
    config_lines = []
    unique_upload_paths = unique_dataset_upload_paths(upload_paths)
    if unique_upload_paths:
        config_lines.append("configs:")
        for path in unique_upload_paths:
            config_lines.extend(
                [
                    f"- config_name: {dataset_config_name_from_path(path)}",
                    "  data_files:",
                    "  - split: train",
                    f'    path: "{path}"',
                ]
            )

    configs = "\n".join(config_lines)
    if configs:
        configs = f"{configs}\n"

    content = f"""---
tags:
- ml-intern
- uploaded-dataset
{configs}---

# {repo_id}

Private dataset files uploaded through ML Intern.

Files are stored under `uploads/<upload_id>/` and are attached to the
corresponding ML Intern session context by Hub reference, not by copying file
contents into the chat.

Each uploaded file is exposed as its own dataset config so files with different
schemas can coexist in the same session repo.
"""
    return content.encode("utf-8")


def dataset_context_note(upload: DatasetUpload) -> str:
    return f"""[SYSTEM: The user uploaded a dataset file for this session.

Use this Hugging Face Hub dataset reference when the task needs the uploaded data.
Do not look for the uploaded file on local disk and do not ask the user to
upload it again unless this Hub reference fails.

- Repo ID: {upload.repo_id}
- Repo type: dataset
- Dataset config: {upload.config_name}
- File in repo: {upload.path_in_repo}
- Original filename: {upload.original_filename}
- Stored filename: {upload.filename}
- Format: {upload.format}
- Size: {upload.size_bytes} bytes
- Hub URL: {upload.hub_url}

Load it with:
```python
{upload.load_dataset_snippet}
```
]"""


async def push_dataset_upload_to_hub(
    *,
    upload: UploadFile,
    session_id: str,
    hf_username: str,
    hf_token: str,
) -> DatasetUpload:
    safe_filename, dataset_format, size = await validate_dataset_upload(upload)
    original_filename = display_filename(upload.filename, safe_filename)
    upload_id = uuid.uuid4().hex[:12]
    config_name = dataset_config_name(upload_id)
    repo_id = session_dataset_repo_id(hf_username, session_id)
    path_in_repo = f"uploads/{upload_id}/{safe_filename}"
    hub_url = dataset_hub_url(repo_id, path_in_repo)
    snippet = load_dataset_snippet(repo_id, config_name)
    api = HfApi(token=hf_token)

    await asyncio.to_thread(
        api.create_repo,
        repo_id=repo_id,
        repo_type="dataset",
        private=True,
        exist_ok=True,
    )
    await asyncio.to_thread(
        api.update_repo_settings,
        repo_id=repo_id,
        repo_type="dataset",
        private=True,
    )
    repo_files = await asyncio.to_thread(
        api.list_repo_files,
        repo_id=repo_id,
        repo_type="dataset",
    )
    upload_paths = unique_dataset_upload_paths([*repo_files, path_in_repo])
    await asyncio.to_thread(upload.file.seek, 0)
    file_bytes = await asyncio.to_thread(upload.file.read)
    await asyncio.to_thread(
        api.upload_file,
        path_or_fileobj=file_bytes,
        path_in_repo=path_in_repo,
        repo_id=repo_id,
        repo_type="dataset",
        commit_message=f"Upload dataset file {safe_filename}",
    )
    await asyncio.to_thread(
        api.upload_file,
        path_or_fileobj=dataset_repo_card(repo_id, upload_paths),
        path_in_repo="README.md",
        repo_id=repo_id,
        repo_type="dataset",
        commit_message="Update ML Intern dataset upload configs",
    )

    return DatasetUpload(
        session_id=session_id,
        repo_id=repo_id,
        repo_type="dataset",
        private=True,
        upload_id=upload_id,
        config_name=config_name,
        filename=safe_filename,
        original_filename=original_filename,
        path_in_repo=path_in_repo,
        size_bytes=size,
        format=dataset_format,
        hub_url=hub_url,
        load_dataset_snippet=snippet,
    )