"""Tests for ``/ingest`` upload, URL ingest, and collection management.""" import asyncio from unittest.mock import AsyncMock from api.routes import ingest as ingest_route from storage.job_store import create_ingest_job, mark_job_processing def test_upload_queues_job_success(client, monkeypatch): monkeypatch.setattr("api.routes.ingest.create_ingest_job", AsyncMock(return_value="job-123")) monkeypatch.setattr("api.routes.ingest.run_ingest_job", AsyncMock(return_value=None)) response = client.post( "/ingest/upload", data={"collection_name": "default"}, files=[("files", ("sample.txt", b"hello world", "text/plain"))], ) assert response.status_code == 200 body = response.json() assert body["status"] == "queued" assert body["job_id"] == "job-123" assert body["total_files"] == 1 assert body["filenames"] == ["sample.txt"] assert "Poll /jobs/job-123" in body["message"] def test_upload_rejects_unsupported_extension(client): response = client.post( "/ingest/upload", data={"collection_name": "default"}, files=[("files", ("sample.csv", b"a,b\n1,2", "text/csv"))], ) assert response.status_code == 400 assert "Unsupported file type" in response.json()["detail"] def test_upload_rejects_oversized_file(client): oversized = b"x" * (2 * 1024 * 1024) response = client.post( "/ingest/upload", data={"collection_name": "default"}, files=[("files", ("large.txt", oversized, "text/plain"))], ) assert response.status_code == 413 assert "too large" in response.json()["detail"].lower() def test_upload_returns_500_on_job_creation_error(client, monkeypatch): monkeypatch.setattr( "api.routes.ingest.create_ingest_job", AsyncMock(side_effect=RuntimeError("job store unavailable")), ) monkeypatch.setattr("api.routes.ingest.run_ingest_job", AsyncMock(return_value=None)) response = client.post( "/ingest/upload", data={"collection_name": "default"}, files=[("files", ("sample.txt", b"hello", "text/plain"))], ) assert response.status_code == 500 assert "job store unavailable" in response.json()["detail"] 


def test_download_request_headers_sec_compliant():
    """The download headers carry the given User-Agent, gzip/deflate, and a PDF Accept."""
    user_agent = "DocuAudit AI test@example.com"

    built = ingest_route._download_request_headers(user_agent)

    assert built["User-Agent"] == user_agent
    assert built["Accept-Encoding"] == "gzip, deflate"
    assert "application/pdf" in built["Accept"]


def test_ingest_url_rejects_non_http_scheme(client, monkeypatch):
    """A 400 raised by the URL downloader is passed through by ``/ingest/url``."""
    # NOTE(review): the posted URL is https and the rejection comes from the stubbed
    # downloader, so this checks error propagation rather than scheme validation.
    scheme_error = ingest_route.HTTPException(
        status_code=400,
        detail="Only http and https URLs are supported.",
    )
    monkeypatch.setattr(
        "api.routes.ingest._download_url_to_temp",
        AsyncMock(side_effect=scheme_error),
    )

    request_body = {
        "urls": ["https://example.com/file.txt"],
        "collection_name": "default",
    }
    response = client.post("/ingest/url", json=request_body)

    assert response.status_code == 400
    detail = response.json()["detail"]
    assert "http and https" in detail


def test_upload_pdf_queues_job_with_job_id(client, monkeypatch):
    """Spec: uploading a single PDF returns the job_id and the uploaded filename."""
    monkeypatch.setattr(
        "api.routes.ingest.create_ingest_job",
        AsyncMock(return_value="pdf-job-99"),
    )
    monkeypatch.setattr(
        "api.routes.ingest.run_ingest_job",
        AsyncMock(return_value=None),
    )

    pdf_upload = ("files", ("brief.pdf", b"%PDF-1.4 minimal", "application/pdf"))
    response = client.post(
        "/ingest/upload",
        data={"collection_name": "default"},
        files=[pdf_upload],
    )

    assert response.status_code == 200
    payload = response.json()
    assert payload["job_id"] == "pdf-job-99"
    assert payload["filenames"] == ["brief.pdf"]


def test_list_collections_backfills_created_at_from_jobs(client, test_settings, monkeypatch):
    """A collection without a stored created_at is still listed with one populated."""
    route = "api.routes.ingest"

    # One collection with three documents and no stored creation timestamp.
    monkeypatch.setattr(f"{route}.list_collection_names", lambda *_: ["default"])
    monkeypatch.setattr(f"{route}.collection_document_count", lambda *_: 3)
    monkeypatch.setattr(f"{route}.collection_created_at", lambda *_: None)

    # Stubs for the job-store lookup and the helper that supplies the timestamp.
    monkeypatch.setattr(
        f"{route}.earliest_job_created_at_for_collection",
        AsyncMock(return_value="2026-05-21 07:05:38"),
    )
    monkeypatch.setattr(
        f"{route}.ensure_collection_created_at",
        lambda *_args, **_kwargs: "2026-05-21T07:05:38Z",
    )

    response = client.get("/ingest/collections")

    assert response.status_code == 200
    payload = response.json()
    assert payload["total"] == 1
    first = payload["collections"][0]
    assert first["name"] == "default"
    assert first["document_count"] == 3
    assert first["created_at"] is not None


def test_job_status_polling_after_real_job_create(client, test_settings):
    """Spec: polling ``/jobs/{job_id}`` for a real job returns the expected structure."""
    db_path = test_settings.jobs_db_path

    # Create the job in the real job store, then move it to "processing".
    job_id = asyncio.run(
        create_ingest_job(
            db_path,
            collection_name="default",
            filenames=["sample.txt"],
        )
    )
    asyncio.run(mark_job_processing(db_path, job_id))

    response = client.get(f"/jobs/{job_id}")

    assert response.status_code == 200
    payload = response.json()
    assert payload["job_id"] == job_id
    assert payload["status"] == "processing"
    assert payload["total_files"] == 1
    for required_key in ("progress_percent", "errors"):
        assert required_key in payload