# Document-Audit-RAG/tests/test_ingest.py
# Mayank Chugh
# Deploy DocuAudit AI to Hugging Face Space (no binaries)
# d44b33d
"""Tests for ``/ingest`` upload, URL ingest, and collection management."""
import asyncio
from unittest.mock import AsyncMock
from api.routes import ingest as ingest_route
from storage.job_store import create_ingest_job, mark_job_processing
def test_upload_queues_job_success(client, monkeypatch):
    """A supported text upload is answered with a queued-job payload.

    Both ``create_ingest_job`` and ``run_ingest_job`` on the route module are
    swapped for async mocks, so only the HTTP response is checked here.
    """
    fake_create = AsyncMock(return_value="job-123")
    fake_run = AsyncMock(return_value=None)
    monkeypatch.setattr("api.routes.ingest.create_ingest_job", fake_create)
    monkeypatch.setattr("api.routes.ingest.run_ingest_job", fake_run)

    upload = ("sample.txt", b"hello world", "text/plain")
    resp = client.post(
        "/ingest/upload",
        data={"collection_name": "default"},
        files=[("files", upload)],
    )

    assert resp.status_code == 200
    payload = resp.json()
    # Checked in a fixed order so the first mismatch is the one reported.
    expected_fields = {
        "status": "queued",
        "job_id": "job-123",
        "total_files": 1,
        "filenames": ["sample.txt"],
    }
    for field, expected in expected_fields.items():
        assert payload[field] == expected
    assert "Poll /jobs/job-123" in payload["message"]
def test_upload_rejects_unsupported_extension(client):
    """Uploading a ``.csv`` file yields HTTP 400 with an explanatory detail."""
    csv_upload = ("sample.csv", b"a,b\n1,2", "text/csv")
    form_fields = {"collection_name": "default"}

    resp = client.post(
        "/ingest/upload",
        data=form_fields,
        files=[("files", csv_upload)],
    )

    assert resp.status_code == 400
    detail = resp.json()["detail"]
    assert "Unsupported file type" in detail
def test_upload_rejects_oversized_file(client):
    """A 2 MiB text upload is refused with HTTP 413.

    NOTE(review): presumably the ``client`` fixture configures an upload
    limit below 2 MiB -- confirm against the test settings.
    """
    two_mib = 2 * 1024 * 1024
    payload_bytes = b"x" * two_mib

    resp = client.post(
        "/ingest/upload",
        data={"collection_name": "default"},
        files=[("files", ("large.txt", payload_bytes, "text/plain"))],
    )

    assert resp.status_code == 413
    # Case-insensitive match on the error detail.
    detail = resp.json()["detail"]
    assert "too large" in detail.lower()
def test_upload_returns_500_on_job_creation_error(client, monkeypatch):
    """A failure inside job creation surfaces as HTTP 500 with its message.

    ``create_ingest_job`` is patched to raise ``RuntimeError``; the runner is
    patched to a no-op async mock.
    """
    failing_create = AsyncMock(side_effect=RuntimeError("job store unavailable"))
    noop_run = AsyncMock(return_value=None)
    monkeypatch.setattr("api.routes.ingest.create_ingest_job", failing_create)
    monkeypatch.setattr("api.routes.ingest.run_ingest_job", noop_run)

    upload = ("sample.txt", b"hello", "text/plain")
    resp = client.post(
        "/ingest/upload",
        data={"collection_name": "default"},
        files=[("files", upload)],
    )

    assert resp.status_code == 500
    detail = resp.json()["detail"]
    assert "job store unavailable" in detail
def test_download_request_headers_sec_compliant():
    """``_download_request_headers`` builds the expected outbound headers.

    Verifies that the supplied user agent is passed through unchanged, that
    ``Accept-Encoding`` is ``gzip, deflate`` and that ``Accept`` mentions
    ``application/pdf``.
    """
    user_agent = "DocuAudit AI test@example.com"

    built = ingest_route._download_request_headers(user_agent)

    assert built["User-Agent"] == "DocuAudit AI test@example.com"
    assert built["Accept-Encoding"] == "gzip, deflate"
    accept_value = built["Accept"]
    assert "application/pdf" in accept_value
def test_ingest_url_rejects_non_http_scheme(client, monkeypatch):
    """A 400 raised by the URL downloader is returned to the caller.

    NOTE(review): the posted URL uses ``https``; the rejection is simulated
    by patching ``_download_url_to_temp`` to raise, so the route's real
    scheme validation is not exercised by this test.
    """
    scheme_error = ingest_route.HTTPException(
        status_code=400,
        detail="Only http and https URLs are supported.",
    )
    failing_download = AsyncMock(side_effect=scheme_error)
    monkeypatch.setattr("api.routes.ingest._download_url_to_temp", failing_download)

    request_body = {
        "urls": ["https://example.com/file.txt"],
        "collection_name": "default",
    }
    resp = client.post("/ingest/url", json=request_body)

    assert resp.status_code == 400
    detail = resp.json()["detail"]
    assert "http and https" in detail
def test_upload_pdf_queues_job_with_job_id(client, monkeypatch):
    """Spec: uploading a single PDF returns the created job's id.

    Job creation and the ingest runner are replaced with async mocks.
    """
    fake_create = AsyncMock(return_value="pdf-job-99")
    fake_run = AsyncMock(return_value=None)
    monkeypatch.setattr("api.routes.ingest.create_ingest_job", fake_create)
    monkeypatch.setattr("api.routes.ingest.run_ingest_job", fake_run)

    pdf_upload = ("brief.pdf", b"%PDF-1.4 minimal", "application/pdf")
    resp = client.post(
        "/ingest/upload",
        data={"collection_name": "default"},
        files=[("files", pdf_upload)],
    )

    assert resp.status_code == 200
    payload = resp.json()
    assert payload["job_id"] == "pdf-job-99"
    assert payload["filenames"] == ["brief.pdf"]
def test_list_collections_backfills_created_at_from_jobs(client, test_settings, monkeypatch):
    """``GET /ingest/collections`` reports a ``created_at`` even when none is stored.

    The collection helpers on the route module are stubbed: one collection
    named ``default`` with three documents and no stored creation time, an
    earliest-job timestamp, and an ``ensure_collection_created_at`` stub that
    returns an ISO timestamp.
    """

    def _names(*_):
        return ["default"]

    def _doc_count(*_):
        return 3

    def _no_created_at(*_):
        return None

    def _ensure_created_at(*_a, **_k):
        return "2026-05-21T07:05:38Z"

    earliest_job = AsyncMock(return_value="2026-05-21 07:05:38")

    monkeypatch.setattr("api.routes.ingest.list_collection_names", _names)
    monkeypatch.setattr("api.routes.ingest.collection_document_count", _doc_count)
    monkeypatch.setattr("api.routes.ingest.collection_created_at", _no_created_at)
    monkeypatch.setattr(
        "api.routes.ingest.earliest_job_created_at_for_collection",
        earliest_job,
    )
    monkeypatch.setattr(
        "api.routes.ingest.ensure_collection_created_at",
        _ensure_created_at,
    )

    resp = client.get("/ingest/collections")

    assert resp.status_code == 200
    payload = resp.json()
    assert payload["total"] == 1
    first = payload["collections"][0]
    assert first["name"] == "default"
    assert first["document_count"] == 3
    assert first["created_at"] is not None
def test_job_status_polling_after_real_job_create(client, test_settings):
    """Spec: polling ``/jobs/{job_id}`` returns the expected status structure.

    A job is created through ``create_ingest_job`` and moved to processing
    with ``mark_job_processing`` (no mocks), then fetched over HTTP.
    """
    db_path = test_settings.jobs_db_path

    # Each coroutine is driven by its own asyncio.run call.
    new_job = create_ingest_job(
        db_path,
        collection_name="default",
        filenames=["sample.txt"],
    )
    job_id = asyncio.run(new_job)
    asyncio.run(mark_job_processing(db_path, job_id))

    resp = client.get(f"/jobs/{job_id}")

    assert resp.status_code == 200
    payload = resp.json()
    assert payload["job_id"] == job_id
    assert payload["status"] == "processing"
    assert payload["total_files"] == 1
    # These two fields only need to be present; their values are not pinned.
    for required_key in ("progress_percent", "errors"):
        assert required_key in payload