| """Tests for ``/ingest`` upload, URL ingest, and collection management.""" |
|
|
| import asyncio |
| from unittest.mock import AsyncMock |
|
|
| from api.routes import ingest as ingest_route |
| from storage.job_store import create_ingest_job, mark_job_processing |
|
|
|
|
def test_upload_queues_job_success(client, monkeypatch):
    """Upload one text file and check the queued-job response payload."""
    # Replace job creation and the job runner with async mocks; the mocked
    # creator hands back a fixed id that the response is expected to echo.
    fake_create = AsyncMock(return_value="job-123")
    fake_run = AsyncMock(return_value=None)
    monkeypatch.setattr("api.routes.ingest.create_ingest_job", fake_create)
    monkeypatch.setattr("api.routes.ingest.run_ingest_job", fake_run)

    text_upload = ("sample.txt", b"hello world", "text/plain")
    resp = client.post(
        "/ingest/upload",
        data={"collection_name": "default"},
        files=[("files", text_upload)],
    )

    assert resp.status_code == 200
    payload = resp.json()
    expected_fields = {
        "status": "queued",
        "job_id": "job-123",
        "total_files": 1,
        "filenames": ["sample.txt"],
    }
    for field, wanted in expected_fields.items():
        assert payload[field] == wanted
    # The message should tell the caller where to poll for progress.
    assert "Poll /jobs/job-123" in payload["message"]
|
|
|
|
def test_upload_rejects_unsupported_extension(client):
    """Upload a ``.csv`` file and expect HTTP 400 with an explanatory detail."""
    csv_upload = ("sample.csv", b"a,b\n1,2", "text/csv")
    form_fields = {"collection_name": "default"}

    resp = client.post(
        "/ingest/upload",
        data=form_fields,
        files=[("files", csv_upload)],
    )

    assert resp.status_code == 400
    detail = resp.json()["detail"]
    assert "Unsupported file type" in detail
|
|
|
|
def test_upload_rejects_oversized_file(client):
    """Upload a 2 MiB text file and expect HTTP 413 with a "too large" detail.

    NOTE(review): the size limit is not visible in this file; presumably the
    ``client`` fixture configures one below 2 MiB -- confirm.
    """
    two_mib = 2 * 1024 * 1024
    big_payload = b"x" * two_mib

    resp = client.post(
        "/ingest/upload",
        data={"collection_name": "default"},
        files=[("files", ("large.txt", big_payload, "text/plain"))],
    )

    assert resp.status_code == 413
    # Compare case-insensitively so the exact capitalisation does not matter.
    detail_lowered = resp.json()["detail"].lower()
    assert "too large" in detail_lowered
|
|
|
|
def test_upload_returns_500_on_job_creation_error(client, monkeypatch):
    """Expect HTTP 500 carrying the error text when job creation raises."""
    # The mocked creator raises; the runner is mocked as well so that no real
    # ingest work can start.
    failing_create = AsyncMock(side_effect=RuntimeError("job store unavailable"))
    monkeypatch.setattr("api.routes.ingest.create_ingest_job", failing_create)
    monkeypatch.setattr(
        "api.routes.ingest.run_ingest_job",
        AsyncMock(return_value=None),
    )

    text_upload = ("sample.txt", b"hello", "text/plain")
    resp = client.post(
        "/ingest/upload",
        data={"collection_name": "default"},
        files=[("files", text_upload)],
    )

    assert resp.status_code == 500
    detail = resp.json()["detail"]
    assert "job store unavailable" in detail
|
|
|
|
def test_download_request_headers_sec_compliant():
    """Check the headers built by ``_download_request_headers`` for a user agent."""
    built = ingest_route._download_request_headers("DocuAudit AI test@example.com")

    # The user agent passed in must come back unchanged.
    assert built["User-Agent"] == "DocuAudit AI test@example.com"
    assert built["Accept-Encoding"] == "gzip, deflate"
    accept_value = built["Accept"]
    assert "application/pdf" in accept_value
|
|
|
|
def test_ingest_url_rejects_non_http_scheme(client, monkeypatch):
    """Expect the downloader's 400 error to surface from ``/ingest/url``.

    NOTE(review): the posted URL uses ``https``; the rejection is produced by
    the mocked ``_download_url_to_temp``, not by the URL itself. This test
    therefore covers error propagation rather than real scheme validation.
    """
    scheme_error = ingest_route.HTTPException(
        status_code=400,
        detail="Only http and https URLs are supported.",
    )
    failing_download = AsyncMock(side_effect=scheme_error)
    monkeypatch.setattr("api.routes.ingest._download_url_to_temp", failing_download)

    request_body = {"urls": ["https://example.com/file.txt"], "collection_name": "default"}
    resp = client.post("/ingest/url", json=request_body)

    assert resp.status_code == 400
    detail = resp.json()["detail"]
    assert "http and https" in detail
|
|
|
|
def test_upload_pdf_queues_job_with_job_id(client, monkeypatch):
    """Spec: single PDF upload returns job_id."""
    # Both collaborators are mocked; the creator returns the id that the
    # response is expected to echo.
    stubs = (
        ("api.routes.ingest.create_ingest_job", AsyncMock(return_value="pdf-job-99")),
        ("api.routes.ingest.run_ingest_job", AsyncMock(return_value=None)),
    )
    for target, stub in stubs:
        monkeypatch.setattr(target, stub)

    pdf_upload = ("brief.pdf", b"%PDF-1.4 minimal", "application/pdf")
    resp = client.post(
        "/ingest/upload",
        data={"collection_name": "default"},
        files=[("files", pdf_upload)],
    )

    assert resp.status_code == 200
    payload = resp.json()
    assert payload["job_id"] == "pdf-job-99"
    assert payload["filenames"] == ["brief.pdf"]
|
|
|
|
def test_list_collections_backfills_created_at_from_jobs(client, test_settings, monkeypatch):
    """List collections when the stored ``created_at`` is missing.

    The stored timestamp lookup is stubbed to return ``None`` while the
    job-based lookup and ``ensure_collection_created_at`` return timestamps;
    the response must then report a non-null ``created_at``.

    NOTE(review): ``test_settings`` is requested but not referenced in the
    body; presumably it is needed for fixture side effects -- confirm.
    """
    # (patch target, replacement) pairs, applied in the listed order.
    replacements = [
        ("api.routes.ingest.list_collection_names", lambda *_: ["default"]),
        ("api.routes.ingest.collection_document_count", lambda *_: 3),
        ("api.routes.ingest.collection_created_at", lambda *_: None),
        (
            "api.routes.ingest.earliest_job_created_at_for_collection",
            AsyncMock(return_value="2026-05-21 07:05:38"),
        ),
        (
            "api.routes.ingest.ensure_collection_created_at",
            lambda *_a, **_k: "2026-05-21T07:05:38Z",
        ),
    ]
    for target, stub in replacements:
        monkeypatch.setattr(target, stub)

    resp = client.get("/ingest/collections")

    assert resp.status_code == 200
    payload = resp.json()
    assert payload["total"] == 1
    only_collection = payload["collections"][0]
    assert only_collection["name"] == "default"
    assert only_collection["document_count"] == 3
    assert only_collection["created_at"] is not None
|
|
|
|
def test_job_status_polling_after_real_job_create(client, test_settings):
    """Spec: job status polling returns correct structure."""
    # Create the job through the job-store helper (not a mock), then move it
    # to "processing" before polling. Each coroutine runs in its own
    # ``asyncio.run`` call because this test function is synchronous.
    creation = create_ingest_job(
        test_settings.jobs_db_path,
        collection_name="default",
        filenames=["sample.txt"],
    )
    job_id = asyncio.run(creation)
    asyncio.run(mark_job_processing(test_settings.jobs_db_path, job_id))

    resp = client.get(f"/jobs/{job_id}")

    assert resp.status_code == 200
    payload = resp.json()
    assert payload["job_id"] == job_id
    assert payload["status"] == "processing"
    assert payload["total_files"] == 1
    # These fields only need to be present; their values are not pinned.
    for required_key in ("progress_percent", "errors"):
        assert required_key in payload
|
|