Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| import io | |
| import sys | |
| from pathlib import Path | |
| from types import SimpleNamespace | |
| import httpx | |
| import pytest | |
| from fastapi import HTTPException, UploadFile | |
| from huggingface_hub.errors import HfHubHTTPError | |
| from starlette.datastructures import FormData | |
| _BACKEND_DIR = Path(__file__).resolve().parent.parent.parent / "backend" | |
| if str(_BACKEND_DIR) not in sys.path: | |
| sys.path.insert(0, str(_BACKEND_DIR)) | |
| import dataset_uploads # noqa: E402 | |
| from routes import agent # noqa: E402 | |
def _upload(filename: str, content: bytes = b"a,b\n1,2\n") -> UploadFile:
    """Build an in-memory ``UploadFile`` named *filename* holding *content*.

    The default payload is a small two-line CSV.
    """
    buffer = io.BytesIO(content)
    return UploadFile(filename=filename, file=buffer)
def _track_close(upload: UploadFile):
    """Wrap ``upload.close`` so a test can observe whether it was awaited.

    Returns a dict whose ``"closed"`` entry starts as ``False`` and becomes
    ``True`` once the replacement coroutine runs. The original ``close`` is
    still awaited afterwards, so the upload is really released.
    """
    tracker = {"closed": False}
    real_close = upload.close

    async def _recording_close():
        tracker["closed"] = True
        await real_close()

    upload.close = _recording_close
    return tracker
def _request(
    upload: UploadFile | None = None,
    headers: dict[str, str] | None = None,
):
    """Create a minimal request stand-in plus a call-tracking dict.

    The fake exposes ``headers``, ``cookies`` and an async ``form()``.
    ``form()`` flags ``"form_called"`` in the returned dict, then either
    returns form data carrying *upload* under the ``"file"`` field or raises
    ``AssertionError`` when no upload was supplied.

    Returns:
        A ``(fake_request, state)`` tuple.
    """
    tracker = {"form_called": False}

    class FakeRequest:
        def __init__(self):
            self.headers = headers or {}
            self.cookies = {}

        async def form(self, **_kwargs):
            tracker["form_called"] = True
            if upload is not None:
                return FormData([("file", upload)])
            raise AssertionError("request.form() should not be called")

    fake = FakeRequest()
    return fake, tracker
def test_sanitize_dataset_filename_strips_paths_and_unsafe_chars():
    """Path parts and unsafe characters are removed; an empty name gets a default."""
    sanitize = dataset_uploads.sanitize_dataset_filename

    cleaned = sanitize("../../bad file (final).CSV")
    assert cleaned == "bad-file-final.csv"

    fallback = sanitize("")
    assert fallback == "dataset.csv"
def test_dataset_format_rejects_unsupported_extension():
    """Unsupported or missing extensions raise ``HTTPException``."""
    detect_format = dataset_uploads.dataset_format_from_filename

    with pytest.raises(HTTPException) as unsupported:
        detect_format("notes.txt")
    assert unsupported.value.status_code == 400

    # A name with no extension at all is rejected too; only the exception
    # type is checked for this case.
    with pytest.raises(HTTPException):
        detect_format("notes")
def test_dataset_repo_card_exposes_each_upload_as_config():
    """Each uploaded file appears as a config in the generated repo card."""
    repo_files = [
        "README.md",
        "uploads/oldabc/rows.jsonl",
        "uploads/oldabc/rows.jsonl",  # listed twice on purpose, see count check
        "uploads/newdef/table.csv",
    ]

    card_bytes = dataset_uploads.dataset_repo_card(
        "alice/ml-intern-s1-datasets",
        repo_files,
    )
    card = card_bytes.decode("utf-8")

    expected_fragments = (
        "configs:",
        "- config_name: upload_oldabc",
        ' path: "uploads/oldabc/rows.jsonl"',
        "- config_name: upload_newdef",
        ' path: "uploads/newdef/table.csv"',
    )
    for fragment in expected_fragments:
        assert fragment in card

    # The duplicated path must yield exactly one config entry.
    assert card.count("- config_name: upload_oldabc") == 1
async def test_validate_dataset_upload_rejects_size_over_limit(monkeypatch):
    """A payload larger than the patched byte limit is rejected with 413."""
    byte_limit = 3
    monkeypatch.setattr(dataset_uploads, "MAX_DATASET_UPLOAD_BYTES", byte_limit)
    oversized = _upload("rows.csv", b"abcd")  # four bytes: one over the limit

    try:
        with pytest.raises(HTTPException) as rejection:
            await dataset_uploads.validate_dataset_upload(oversized)
    finally:
        await oversized.close()

    assert rejection.value.status_code == 413
async def test_push_dataset_upload_creates_private_repo_and_uploads_file(monkeypatch):
    """Pushing an upload creates a private repo, stores the file and rewrites the README.

    ``HfApi`` is swapped for a recording fake and ``uuid4`` is pinned so the
    upload id, repo paths and config names are deterministic.
    """
    expected_repo = "alice/ml-intern-12345678-datasets"
    created_apis = []

    class FakeApi:
        """Records every Hub call instead of performing it."""

        def __init__(self, token):
            self.token = token
            self.create_calls = []
            self.settings_calls = []
            self.list_calls = []
            self.upload_calls = []
            created_apis.append(self)

        def create_repo(self, **kwargs):
            self.create_calls.append(kwargs)

        def update_repo_settings(self, **kwargs):
            self.settings_calls.append(kwargs)

        def list_repo_files(self, **kwargs):
            self.list_calls.append(kwargs)
            return [
                "README.md",
                "uploads/oldupload/old.jsonl",
                "uploads/notes.txt",
            ]

        def upload_file(self, **kwargs):
            # Every non-README upload must carry the raw CSV payload.
            if kwargs["path_in_repo"] != "README.md":
                assert kwargs["path_or_fileobj"] == b"a,b\n1,2\n"
            self.upload_calls.append(kwargs)

    fixed_uuid = SimpleNamespace(hex="feedfacecafebeef")
    monkeypatch.setattr(dataset_uploads, "HfApi", FakeApi)
    monkeypatch.setattr(dataset_uploads.uuid, "uuid4", lambda: fixed_uuid)

    upload = _upload("../Data Set.CSV")
    try:
        result = await dataset_uploads.push_dataset_upload_to_hub(
            upload=upload,
            session_id="12345678-90ab-cdef-1234-567890abcdef",
            hf_username="alice",
            hf_token="hf-token",
        )
    finally:
        await upload.close()

    api = created_apis[0]
    repo_ref = {"repo_id": expected_repo, "repo_type": "dataset"}

    assert api.token == "hf-token"
    assert api.create_calls == [{**repo_ref, "private": True, "exist_ok": True}]
    assert api.settings_calls == [{**repo_ref, "private": True}]
    assert api.list_calls == [repo_ref]

    uploaded_paths = [call["path_in_repo"] for call in api.upload_calls]
    assert uploaded_paths == [
        "uploads/feedfacecafe/Data-Set.csv",
        "README.md",
    ]

    # The second upload is the regenerated README (bytes).
    readme = api.upload_calls[1]["path_or_fileobj"].decode("utf-8")
    for fragment in (
        "- config_name: upload_oldupload",
        ' path: "uploads/oldupload/old.jsonl"',
        "- config_name: upload_feedfacecafe",
        ' path: "uploads/feedfacecafe/Data-Set.csv"',
    ):
        assert fragment in readme

    assert result.repo_id == expected_repo
    assert result.config_name == "upload_feedfacecafe"
    assert result.format == "csv"

    expected_snippet = (
        "from datasets import load_dataset\n\n"
        'dataset = load_dataset("alice/ml-intern-12345678-datasets", '
        '"upload_feedfacecafe", split="train", token=True)'
    )
    assert result.load_dataset_snippet == expected_snippet
async def test_upload_route_requires_hf_token_without_parsing_upload(monkeypatch):
    """Without any HF token the route answers 401 and leaves the upload untouched."""
    monkeypatch.delenv("HF_TOKEN", raising=False)

    upload = _upload("rows.csv")
    close_state = _track_close(upload)
    request, request_state = _request(upload)
    user = {"user_id": "u1", "username": "alice"}  # no internal HF token key

    idle_session = SimpleNamespace(
        is_active=True,
        is_processing=False,
        session=SimpleNamespace(pending_approval=None),
        hf_username="alice",
    )

    async def fake_check_session_access(*_args, **_kwargs):
        return idle_session

    monkeypatch.setattr(agent, "_check_session_access", fake_check_session_access)

    try:
        with pytest.raises(HTTPException) as rejection:
            await agent.upload_session_dataset("s1", request, user)

        assert rejection.value.status_code == 401
        # The multipart body was never parsed and the route did not close it.
        assert request_state["form_called"] is False
        assert close_state["closed"] is False
    finally:
        await upload.close()
async def test_upload_route_rejects_content_length_before_parsing(monkeypatch):
    """A Content-Length one byte past limit-plus-slack yields 413 up front.

    Neither the session lookup nor ``request.form()`` may run.
    """
    too_large = (
        dataset_uploads.MAX_DATASET_UPLOAD_BYTES
        + agent.DATASET_UPLOAD_MULTIPART_SLACK_BYTES
        + 1
    )

    upload = _upload("rows.csv")
    close_state = _track_close(upload)
    request, request_state = _request(
        upload,
        headers={"content-length": str(too_large)},
    )
    user = {
        "user_id": "u1",
        "username": "alice",
        agent.INTERNAL_HF_TOKEN_KEY: "hf-token",
    }

    async def fake_check_session_access(*_args, **_kwargs):
        raise AssertionError("session access should not run for oversized uploads")

    monkeypatch.setattr(agent, "_check_session_access", fake_check_session_access)

    try:
        with pytest.raises(HTTPException) as rejection:
            await agent.upload_session_dataset("s1", request, user)

        assert rejection.value.status_code == 413
        assert request_state["form_called"] is False
        assert close_state["closed"] is False
    finally:
        await upload.close()
async def test_upload_route_rejects_busy_session_without_parsing_upload(monkeypatch):
    """A session that is still processing is refused with 409 before the form is read.

    The route must neither call ``request.form()`` nor close the upload; the
    test releases the upload itself once the checks are done.
    """
    upload = _upload("rows.csv")
    close_state = _track_close(upload)
    request, request_state = _request(upload)

    async def fake_check_session_access(*_args, **_kwargs):
        # is_processing=True is the only field that differs from the idle
        # sessions used elsewhere in this module; presumably it is the busy
        # signal the route checks.
        return SimpleNamespace(
            is_active=True,
            is_processing=True,
            session=SimpleNamespace(pending_approval=None),
            hf_username="alice",
        )

    monkeypatch.setattr(agent, "_check_session_access", fake_check_session_access)

    # try/finally so the upload is released even when an assertion below
    # fails, matching the other "without parsing" route tests in this module.
    try:
        with pytest.raises(HTTPException) as exc_info:
            await agent.upload_session_dataset(
                "s1",
                request,
                {
                    "user_id": "u1",
                    "username": "alice",
                    agent.INTERNAL_HF_TOKEN_KEY: "hf-token",
                },
            )

        assert exc_info.value.status_code == 409
        assert request_state["form_called"] is False
        assert close_state["closed"] is False
    finally:
        await upload.close()
async def test_upload_route_appends_context_note_and_persists(monkeypatch):
    """A successful upload adds one context message, persists the session and closes the file."""
    upload = _upload("rows.jsonl", b'{"text":"hi"}\n')
    close_state = _track_close(upload)
    request, request_state = _request(upload)

    messages = []
    persisted = []

    # add_message is routed straight into the local list for inspection.
    context_manager = SimpleNamespace(add_message=messages.append)
    agent_session = SimpleNamespace(
        is_active=True,
        is_processing=False,
        session=SimpleNamespace(
            pending_approval=None,
            context_manager=context_manager,
        ),
        hf_username="alice",
    )

    uploaded = dataset_uploads.DatasetUpload(
        session_id="s1",
        repo_id="alice/ml-intern-s1-datasets",
        repo_type="dataset",
        private=True,
        upload_id="abc123",
        config_name="upload_abc123",
        filename="rows.jsonl",
        original_filename="rows.jsonl",
        path_in_repo="uploads/abc123/rows.jsonl",
        size_bytes=14,
        format="jsonl",
        hub_url="https://huggingface.co/datasets/alice/ml-intern-s1-datasets/blob/main/uploads/abc123/rows.jsonl",
        load_dataset_snippet='dataset = load_dataset("json")',
    )

    async def fake_check_session_access(*_args, **_kwargs):
        return agent_session

    async def fake_push_dataset_upload_to_hub(**kwargs):
        # The route must forward the parsed upload object and the caller's token.
        assert kwargs["upload"] is upload
        assert kwargs["hf_token"] == "hf-token"
        return uploaded

    async def fake_persist_session_snapshot(value):
        persisted.append(value)

    monkeypatch.setattr(agent, "_check_session_access", fake_check_session_access)
    monkeypatch.setattr(
        agent, "push_dataset_upload_to_hub", fake_push_dataset_upload_to_hub
    )
    monkeypatch.setattr(
        agent.session_manager,
        "persist_session_snapshot",
        fake_persist_session_snapshot,
    )

    user = {
        "user_id": "u1",
        "username": "alice",
        agent.INTERNAL_HF_TOKEN_KEY: "hf-token",
    }
    response = await agent.upload_session_dataset("s1", request, user)

    assert response.repo_id == uploaded.repo_id
    assert response.config_name == uploaded.config_name
    assert response.path_in_repo == uploaded.path_in_repo

    assert len(messages) == 1
    note = messages[0]
    assert note.role == "user"
    assert note.content.startswith("[SYSTEM:")
    assert uploaded.config_name in note.content
    assert uploaded.path_in_repo in note.content

    assert persisted == [agent_session]
    assert request_state["form_called"] is True
    assert close_state["closed"] is True
async def test_upload_route_closes_upload_when_hub_upload_fails(monkeypatch):
    """An unexpected Hub failure becomes a generic 502 and the upload is still closed."""
    upload = _upload("rows.csv")
    close_state = _track_close(upload)
    request, request_state = _request(upload)

    idle_session = SimpleNamespace(
        is_active=True,
        is_processing=False,
        session=SimpleNamespace(pending_approval=None),
        hf_username="alice",
    )

    async def fake_check_session_access(*_args, **_kwargs):
        return idle_session

    async def fake_push_dataset_upload_to_hub(**_kwargs):
        raise RuntimeError("hub unavailable")

    monkeypatch.setattr(agent, "_check_session_access", fake_check_session_access)
    monkeypatch.setattr(
        agent, "push_dataset_upload_to_hub", fake_push_dataset_upload_to_hub
    )

    user = {
        "user_id": "u1",
        "username": "alice",
        agent.INTERNAL_HF_TOKEN_KEY: "hf-token",
    }
    with pytest.raises(HTTPException) as failure:
        await agent.upload_session_dataset("s1", request, user)

    error = failure.value
    assert error.status_code == 502
    assert error.detail == "Dataset upload failed. Please try again."
    assert request_state["form_called"] is True
    assert close_state["closed"] is True
async def test_upload_route_maps_hub_permission_error_safely(monkeypatch):
    """A Hub 403 surfaces as a 403 with a fixed message that omits the Hub's own text."""
    upload = _upload("rows.csv")
    close_state = _track_close(upload)
    request, request_state = _request(upload)

    idle_session = SimpleNamespace(
        is_active=True,
        is_processing=False,
        session=SimpleNamespace(pending_approval=None),
        hf_username="alice",
    )

    async def fake_check_session_access(*_args, **_kwargs):
        return idle_session

    async def fake_push_dataset_upload_to_hub(**_kwargs):
        hub_request = httpx.Request("POST", "https://huggingface.co/api/datasets")
        response = httpx.Response(
            403,
            request=hub_request,
            headers={"x-request-id": "req-123"},
        )
        # The error text carries a fake secret that must not reach the client.
        raise HfHubHTTPError(
            "403 Forbidden: token hf_secret cannot write",
            response=response,
            server_message="token hf_secret cannot write",
        )

    monkeypatch.setattr(agent, "_check_session_access", fake_check_session_access)
    monkeypatch.setattr(
        agent, "push_dataset_upload_to_hub", fake_push_dataset_upload_to_hub
    )

    user = {
        "user_id": "u1",
        "username": "alice",
        agent.INTERNAL_HF_TOKEN_KEY: "hf-token",
    }
    with pytest.raises(HTTPException) as failure:
        await agent.upload_session_dataset("s1", request, user)

    error = failure.value
    assert error.status_code == 403
    assert error.detail == (
        "Hugging Face denied permission to create or write to the dataset repo."
    )
    assert "hf_secret" not in error.detail
    assert request_state["form_called"] is True
    assert close_state["closed"] is True