File size: 5,417 Bytes
"""Tests for ``/ingest`` upload, URL ingest, and collection management."""
import asyncio
from unittest.mock import AsyncMock
from api.routes import ingest as ingest_route
from storage.job_store import create_ingest_job, mark_job_processing
def test_upload_queues_job_success(client, monkeypatch):
    """Uploading one text file returns 200 with a queued-job payload."""
    # Swap the route's job-creation and job-runner callables for async stubs;
    # the stubbed job id is what the response is expected to echo back.
    create_job_stub = AsyncMock(return_value="job-123")
    run_job_stub = AsyncMock(return_value=None)
    monkeypatch.setattr("api.routes.ingest.create_ingest_job", create_job_stub)
    monkeypatch.setattr("api.routes.ingest.run_ingest_job", run_job_stub)

    upload_part = ("files", ("sample.txt", b"hello world", "text/plain"))
    resp = client.post(
        "/ingest/upload",
        data={"collection_name": "default"},
        files=[upload_part],
    )

    assert resp.status_code == 200
    payload = resp.json()
    assert payload["status"] == "queued"
    assert payload["job_id"] == "job-123"
    assert payload["total_files"] == 1
    assert payload["filenames"] == ["sample.txt"]
    # The message must tell the caller which endpoint to poll.
    assert "Poll /jobs/job-123" in payload["message"]
def test_upload_rejects_unsupported_extension(client):
    """Uploading a ``.csv`` file is expected to fail with HTTP 400."""
    csv_part = ("files", ("sample.csv", b"a,b\n1,2", "text/csv"))
    resp = client.post(
        "/ingest/upload",
        data={"collection_name": "default"},
        files=[csv_part],
    )

    assert resp.status_code == 400
    error_detail = resp.json()["detail"]
    assert "Unsupported file type" in error_detail
def test_upload_rejects_oversized_file(client):
    """Uploading a 2 MiB text file is expected to fail with HTTP 413."""
    # NOTE(review): the size limit is configured outside this file; this test
    # presumes it is below 2 MiB in the test settings -- confirm.
    two_mib = 2 * 1024 * 1024
    payload_bytes = b"x" * two_mib

    resp = client.post(
        "/ingest/upload",
        data={"collection_name": "default"},
        files=[("files", ("large.txt", payload_bytes, "text/plain"))],
    )

    assert resp.status_code == 413
    # Compare case-insensitively so the check does not depend on capitalisation.
    error_detail = resp.json()["detail"].lower()
    assert "too large" in error_detail
def test_upload_returns_500_on_job_creation_error(client, monkeypatch):
    """A failure while creating the ingest job surfaces as HTTP 500."""
    # Job creation raises; the runner is stubbed out as a no-op.
    failing_create = AsyncMock(side_effect=RuntimeError("job store unavailable"))
    noop_runner = AsyncMock(return_value=None)
    monkeypatch.setattr("api.routes.ingest.create_ingest_job", failing_create)
    monkeypatch.setattr("api.routes.ingest.run_ingest_job", noop_runner)

    resp = client.post(
        "/ingest/upload",
        data={"collection_name": "default"},
        files=[("files", ("sample.txt", b"hello", "text/plain"))],
    )

    assert resp.status_code == 500
    # The underlying error text is expected to be passed through in `detail`.
    error_detail = resp.json()["detail"]
    assert "job store unavailable" in error_detail
def test_download_request_headers_sec_compliant():
    """Check the headers built by ``_download_request_headers``.

    NOTE(review): "sec" in the test name presumably refers to SEC download
    requirements -- confirm against the route module.
    """
    user_agent = "DocuAudit AI test@example.com"
    built = ingest_route._download_request_headers(user_agent)

    # The supplied user agent must be passed through unchanged.
    assert built["User-Agent"] == user_agent
    assert built["Accept-Encoding"] == "gzip, deflate"
    assert "application/pdf" in built["Accept"]
def test_ingest_url_rejects_non_http_scheme(client, monkeypatch):
    """A 400 raised by the URL downloader is returned by ``/ingest/url``.

    NOTE(review): the posted URL uses https, so the rejection asserted here
    is supplied by the stubbed ``_download_url_to_temp`` rather than by real
    scheme validation -- confirm this is the intended coverage.
    """
    scheme_error = ingest_route.HTTPException(
        status_code=400,
        detail="Only http and https URLs are supported.",
    )
    monkeypatch.setattr(
        "api.routes.ingest._download_url_to_temp",
        AsyncMock(side_effect=scheme_error),
    )

    request_body = {
        "urls": ["https://example.com/file.txt"],
        "collection_name": "default",
    }
    resp = client.post("/ingest/url", json=request_body)

    assert resp.status_code == 400
    error_detail = resp.json()["detail"]
    assert "http and https" in error_detail
def test_upload_pdf_queues_job_with_job_id(client, monkeypatch):
    """Spec: single PDF upload returns job_id."""
    # Stub job creation with a known id and make the runner a no-op.
    create_job_stub = AsyncMock(return_value="pdf-job-99")
    run_job_stub = AsyncMock(return_value=None)
    monkeypatch.setattr("api.routes.ingest.create_ingest_job", create_job_stub)
    monkeypatch.setattr("api.routes.ingest.run_ingest_job", run_job_stub)

    pdf_part = ("files", ("brief.pdf", b"%PDF-1.4 minimal", "application/pdf"))
    resp = client.post(
        "/ingest/upload",
        data={"collection_name": "default"},
        files=[pdf_part],
    )

    assert resp.status_code == 200
    payload = resp.json()
    assert payload["job_id"] == "pdf-job-99"
    assert payload["filenames"] == ["brief.pdf"]
def test_list_collections_backfills_created_at_from_jobs(client, test_settings, monkeypatch):
    """``/ingest/collections`` reports a ``created_at`` even when none is stored.

    The collection helpers are stubbed so that the stored creation time is
    ``None`` while the job-derived and ensured timestamps are available.
    """
    # One collection named "default" holding three documents.
    monkeypatch.setattr("api.routes.ingest.list_collection_names", lambda *_: ["default"])
    monkeypatch.setattr("api.routes.ingest.collection_document_count", lambda *_: 3)

    # No creation timestamp recorded on the collection itself.
    monkeypatch.setattr("api.routes.ingest.collection_created_at", lambda *_: None)

    # Timestamps made available through the job-lookup and ensure helpers.
    earliest_job_stub = AsyncMock(return_value="2026-05-21 07:05:38")
    monkeypatch.setattr(
        "api.routes.ingest.earliest_job_created_at_for_collection",
        earliest_job_stub,
    )
    monkeypatch.setattr(
        "api.routes.ingest.ensure_collection_created_at",
        lambda *_a, **_k: "2026-05-21T07:05:38Z",
    )

    resp = client.get("/ingest/collections")

    assert resp.status_code == 200
    payload = resp.json()
    assert payload["total"] == 1
    first_collection = payload["collections"][0]
    assert first_collection["name"] == "default"
    assert first_collection["document_count"] == 3
    assert first_collection["created_at"] is not None
def test_job_status_polling_after_real_job_create(client, test_settings):
    """Spec: job status polling returns correct structure."""
    # Create a job through the unpatched job-store helpers, then mark it as
    # processing. Each coroutine is driven by its own ``asyncio.run`` call.
    new_job = create_ingest_job(
        test_settings.jobs_db_path,
        collection_name="default",
        filenames=["sample.txt"],
    )
    job_id = asyncio.run(new_job)
    asyncio.run(mark_job_processing(test_settings.jobs_db_path, job_id))

    resp = client.get(f"/jobs/{job_id}")

    assert resp.status_code == 200
    payload = resp.json()
    assert payload["job_id"] == job_id
    assert payload["status"] == "processing"
    assert payload["total_files"] == 1
    # Only the presence of these keys is checked, not their values.
    assert "progress_percent" in payload
    assert "errors" in payload
|