File size: 5,417 Bytes
d44b33d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
"""Tests for ``/ingest`` upload, URL ingest, and collection management."""

import asyncio
from unittest.mock import AsyncMock

from api.routes import ingest as ingest_route
from storage.job_store import create_ingest_job, mark_job_processing


def test_upload_queues_job_success(client, monkeypatch):
    """A plain-text upload is accepted and answered with a queued-job payload.

    ``create_ingest_job`` and ``run_ingest_job`` are swapped for ``AsyncMock``
    stand-ins on the route module; the response is expected to echo the
    stubbed job id and the uploaded filename.
    """
    job_creator = AsyncMock(return_value="job-123")
    job_runner = AsyncMock(return_value=None)
    monkeypatch.setattr("api.routes.ingest.create_ingest_job", job_creator)
    monkeypatch.setattr("api.routes.ingest.run_ingest_job", job_runner)

    text_part = ("files", ("sample.txt", b"hello world", "text/plain"))
    form_fields = {"collection_name": "default"}
    resp = client.post("/ingest/upload", data=form_fields, files=[text_part])

    assert resp.status_code == 200
    payload = resp.json()
    expected_fields = {
        "status": "queued",
        "job_id": "job-123",
        "total_files": 1,
        "filenames": ["sample.txt"],
    }
    for field, wanted in expected_fields.items():
        assert payload[field] == wanted
    # The message should tell the caller which endpoint to poll.
    assert "Poll /jobs/job-123" in payload["message"]


def test_upload_rejects_unsupported_extension(client):
    """Uploading a ``.csv`` file is expected to be refused with HTTP 400."""
    csv_part = ("sample.csv", b"a,b\n1,2", "text/csv")
    form_fields = {"collection_name": "default"}

    resp = client.post("/ingest/upload", data=form_fields, files=[("files", csv_part)])

    assert resp.status_code == 400
    error_detail = resp.json()["detail"]
    assert "Unsupported file type" in error_detail


def test_upload_rejects_oversized_file(client):
    """A 2 MiB text upload is expected to be refused with HTTP 413.

    NOTE(review): the size limit itself is configured outside this file;
    presumably the test settings cap uploads below 2 MiB -- confirm.
    """
    two_mib = 2 * 1024 * 1024
    big_part = ("large.txt", b"x" * two_mib, "text/plain")
    form_fields = {"collection_name": "default"}

    resp = client.post("/ingest/upload", data=form_fields, files=[("files", big_part)])

    assert resp.status_code == 413
    # Match case-insensitively so the exact capitalisation of the message is free to vary.
    lowered_detail = resp.json()["detail"].lower()
    assert "too large" in lowered_detail


def test_upload_returns_500_on_job_creation_error(client, monkeypatch):
    """A failure while creating the job is expected to surface as HTTP 500.

    ``create_ingest_job`` is stubbed to raise ``RuntimeError``; the error text
    should appear in the response detail.
    """
    failing_creator = AsyncMock(side_effect=RuntimeError("job store unavailable"))
    job_runner = AsyncMock(return_value=None)
    monkeypatch.setattr("api.routes.ingest.create_ingest_job", failing_creator)
    monkeypatch.setattr("api.routes.ingest.run_ingest_job", job_runner)

    text_part = ("files", ("sample.txt", b"hello", "text/plain"))
    form_fields = {"collection_name": "default"}
    resp = client.post("/ingest/upload", data=form_fields, files=[text_part])

    assert resp.status_code == 500
    error_detail = resp.json()["detail"]
    assert "job store unavailable" in error_detail


def test_download_request_headers_sec_compliant():
    """The download header builder passes the user agent through unchanged.

    Also checks the fixed ``Accept-Encoding`` value and that PDFs are listed
    in ``Accept``.
    """
    user_agent = "DocuAudit AI test@example.com"

    built = ingest_route._download_request_headers(user_agent)

    assert built["User-Agent"] == "DocuAudit AI test@example.com"
    assert built["Accept-Encoding"] == "gzip, deflate"
    accepted_types = built["Accept"]
    assert "application/pdf" in accepted_types


def test_ingest_url_rejects_non_http_scheme(client, monkeypatch):
    """A 400 raised by the download helper is propagated by ``/ingest/url``.

    ``_download_url_to_temp`` is patched to raise the scheme-rejection
    ``HTTPException``, and the response must carry its status and detail.

    NOTE(review): the URL posted here is itself ``https`` and the rejection is
    mocked, so the real scheme check is not exercised -- confirm this is
    intended.
    """
    scheme_error = ingest_route.HTTPException(
        status_code=400,
        detail="Only http and https URLs are supported.",
    )
    rejecting_download = AsyncMock(side_effect=scheme_error)
    monkeypatch.setattr("api.routes.ingest._download_url_to_temp", rejecting_download)

    request_body = {"urls": ["https://example.com/file.txt"], "collection_name": "default"}
    resp = client.post("/ingest/url", json=request_body)

    assert resp.status_code == 400
    error_detail = resp.json()["detail"]
    assert "http and https" in error_detail


def test_upload_pdf_queues_job_with_job_id(client, monkeypatch):
    """Spec: uploading one PDF yields a response carrying the job id.

    Job creation and the ingest runner are both stubbed with ``AsyncMock``, so
    the expected id is the stubbed ``"pdf-job-99"``.
    """
    job_creator = AsyncMock(return_value="pdf-job-99")
    job_runner = AsyncMock(return_value=None)
    monkeypatch.setattr("api.routes.ingest.create_ingest_job", job_creator)
    monkeypatch.setattr("api.routes.ingest.run_ingest_job", job_runner)

    pdf_part = ("files", ("brief.pdf", b"%PDF-1.4 minimal", "application/pdf"))
    form_fields = {"collection_name": "default"}
    resp = client.post("/ingest/upload", data=form_fields, files=[pdf_part])

    assert resp.status_code == 200
    payload = resp.json()
    assert payload["job_id"] == "pdf-job-99"
    assert payload["filenames"] == ["brief.pdf"]


def test_list_collections_backfills_created_at_from_jobs(client, test_settings, monkeypatch):
    """``/ingest/collections`` reports ``created_at`` when the collection has none.

    The collection helpers on the route module are stubbed: one collection
    named ``default`` with three documents and no stored creation time, while
    the job-store lookup and ``ensure_collection_created_at`` return
    timestamps. The listing must then expose a non-null ``created_at``.
    """

    def _names(*_):
        return ["default"]

    def _doc_count(*_):
        return 3

    def _no_created_at(*_):
        return None

    def _ensured_created_at(*_a, **_k):
        return "2026-05-21T07:05:38Z"

    earliest_job_lookup = AsyncMock(return_value="2026-05-21 07:05:38")

    # Applied in the same order as the individual patches would be.
    replacements = [
        ("api.routes.ingest.list_collection_names", _names),
        ("api.routes.ingest.collection_document_count", _doc_count),
        ("api.routes.ingest.collection_created_at", _no_created_at),
        ("api.routes.ingest.earliest_job_created_at_for_collection", earliest_job_lookup),
        ("api.routes.ingest.ensure_collection_created_at", _ensured_created_at),
    ]
    for target, stub in replacements:
        monkeypatch.setattr(target, stub)

    resp = client.get("/ingest/collections")

    assert resp.status_code == 200
    payload = resp.json()
    assert payload["total"] == 1
    assert payload["collections"][0]["name"] == "default"
    assert payload["collections"][0]["document_count"] == 3
    assert payload["collections"][0]["created_at"] is not None


def test_job_status_polling_after_real_job_create(client, test_settings):
    """Spec: polling ``/jobs/{job_id}`` returns the expected status structure.

    A job is created through ``create_ingest_job`` and moved on with
    ``mark_job_processing`` (both unmocked, against ``jobs_db_path`` from the
    test settings) before the endpoint is queried.
    """
    creation = create_ingest_job(
        test_settings.jobs_db_path,
        collection_name="default",
        filenames=["sample.txt"],
    )
    job_id = asyncio.run(creation)

    # Second, separate event-loop run to flip the job into "processing".
    asyncio.run(mark_job_processing(test_settings.jobs_db_path, job_id))

    resp = client.get(f"/jobs/{job_id}")

    assert resp.status_code == 200
    payload = resp.json()
    assert payload["job_id"] == job_id
    assert payload["status"] == "processing"
    assert payload["total_files"] == 1
    for required_key in ("progress_percent", "errors"):
        assert required_key in payload