Spaces:

mayankchugh-learning
/

Document-Audit-RAG

Sleeping

App Files Files Community

Mayank Chugh committed on 7 days ago

Commit

d44b33d

0 Parent(s):

Deploy DocuAudit AI to Hugging Face Space (no binaries)

Browse files

Files changed (45) hide show

.dockerignore +29 -0
.env.example +99 -0
.python-version +1 -0
Dockerfile +29 -0
LICENSE +201 -0
README.md +186 -0
api/__init__.py +1 -0
api/config.py +135 -0
api/main.py +64 -0
api/routes/__init__.py +1 -0
api/routes/audit.py +65 -0
api/routes/ingest.py +348 -0
api/routes/jobs.py +47 -0
api/routes/query.py +179 -0
app.py +117 -0
docker-compose.yml +67 -0
main.py +13 -0
models/__init__.py +1 -0
models/requests.py +78 -0
models/responses.py +135 -0
pyproject.toml +34 -0
pytest.ini +3 -0
rag/__init__.py +6 -0
rag/chunker.py +28 -0
rag/embedder.py +44 -0
rag/hf_hub_inference.py +380 -0
rag/loader.py +35 -0
rag/retriever.py +218 -0
rag/vector_store.py +125 -0
requirements.txt +23 -0
sample.txt +16 -0
storage/__init__.py +1 -0
storage/audit_store.py +295 -0
storage/job_store.py +309 -0
streamlit_app.py +513 -0
tests/conftest.py +41 -0
tests/test_audit.py +218 -0
tests/test_config.py +21 -0
tests/test_health.py +9 -0
tests/test_ingest.py +153 -0
tests/test_jobs.py +58 -0
tests/test_query.py +229 -0
uv.lock +0 -0
workers/__init__.py +1 -0
workers/ingest_worker.py +108 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,29 @@

+# Markdown: omit from build context except repo root README.md (Dockerfile COPY README.md).
+*.md
+**/*.md
+!README.md
+.git
+.gitignore
+.env
+.venv
+venv
+__pycache__
+*.py[cod]
+*$py.class
+.pytest_cache
+.mypy_cache
+.ruff_cache
+*.egg-info
+dist
+build
+.coverage
+htmlcov
+.DS_Store
+docs
+tests
+.cursor
+terminals
+*.log
+data/chroma
+chroma

.env.example ADDED Viewed

	@@ -0,0 +1,99 @@

+# DocuAudit AI — environment template (see docs/DOCUAUDIT_AI_REQUIREMENTS.md)
+# LLM Provider: ollama | anthropic | openai | huggingface
+LLM_PROVIDER=ollama
+# OpenAI (optional)
+OPENAI_API_KEY=
+OPENAI_MODEL=gpt-4o
+OPENAI_EMBEDDING_MODEL=text-embedding-3-small
+# Anthropic (optional)
+ANTHROPIC_API_KEY=
+ANTHROPIC_MODEL=claude-3-5-sonnet-20241022
+# Hugging Face Inference API (when LLM_PROVIDER=huggingface — typical on Hugging Face Spaces)
+# Use a fine-grained token with "Make calls to Inference Providers" / Inference API where required.
+HUGGINGFACE_API_KEY=
+# Use a model your Hub gates allow (e.g. Llama 3 8B under “Meta Llama 3”, or Mistral instruct). Llama 3.1 needs its own gate. Chat tries hf-inference first, then falls back to router auto.
+#HUGGINGFACE_MODEL=mistralai/Mistral-7B-Instruct-v0.3
+#HUGGINGFACE_MODEL=meta-llama/Meta-Llama-3.1-8B-Instruct
+HUGGINGFACE_MODEL=meta-llama/Meta-Llama-3-8B-Instruct
+HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
+# Optional: huggingface_hub InferenceClient provider. Leave unset: primary hf-inference, then router auto for chat (Mistral instruct ids also try Novita).
+# Use `auto` for router-only primary client (may pick Novita and break some models).
+HUGGINGFACE_INFERENCE_PROVIDER=
+# On Hugging Face Spaces you can omit HUGGINGFACE_API_KEY if the Space provides HF_TOKEN (mapped
+# automatically when LLM_PROVIDER=huggingface). For local .env you can set HF_TOKEN instead.
+# Ollama (recommended local default)
+OLLAMA_BASE_URL=http://localhost:11434
+OLLAMA_CHAT_MODEL=llama3.1:8b
+OLLAMA_EMBEDDING_MODEL=nomic-embed-text
+# App
+APP_NAME=DocuAudit AI
+APP_VERSION=1.0.0
+DEBUG=false
+MAX_FILE_SIZE_MB=50
+# Spec name alias (optional; mapped to MAX_FILE_SIZE_MB in settings)
+MAX_UPLOAD_SIZE_MB=
+# ChromaDB
+CHROMA_PERSIST_DIRECTORY=./data/chroma
+CHROMA_PERSIST_DIR=
+CHROMA_COLLECTION_NAME=docuaudit_docs
+# Chunking
+CHUNK_SIZE=1000
+CHUNK_OVERLAP=200
+# Retrieval default (overridable per request on /query/ask via top_k)
+TOP_K_RESULTS=5
+# Audit + jobs SQLite
+AUDIT_DB_PATH=./audit.db
+JOBS_DB_PATH=./data/jobs.db
+# Limits
+MAX_DOCUMENTS_PER_BATCH=100
+# URL ingest (POST /ingest/url). SEC.gov blocks undeclared bots — use "Company Name you@email.com".
+# INGEST_USER_AGENT=DocuAudit AI you@example.com
+# Streamlit → API (Streamlit process reads these when set in the shell / OS env)
+STREAMLIT_BACKEND_URL=http://localhost:8000
+DOC_AUDI_API_BASE=http://127.0.0.1:8000
+# Read timeout (seconds) for Ask/Summarise HTTP calls; default in code is 3600 if unset
+DOC_AUDI_HTTP_READ_TIMEOUT=3600
+# --- Docker Compose (Milestone 12) ---
+# Copy this file to `.env` before `docker compose up` (Compose loads `.env` for substitution and `env_file`).
+#
+# Persistent paths below are overridden in docker-compose.yml to a single volume mount at /data:
+#   CHROMA_PERSIST_DIRECTORY=/data/chroma, AUDIT_DB_PATH=/data/audit.db, JOBS_DB_PATH=/data/jobs.db
+# You do not need to duplicate those in .env for compose unless you use a custom override file.
+#
+# Ollama from the API container cannot reach localhost on your machine; default in compose is:
+#   OLLAMA_BASE_URL=http://host.docker.internal:11434
+# (extra_hosts host-gateway is set for Linux.) Run `ollama serve` on the host, or start the bundled
+# Ollama service:  docker compose --profile ollama up -d
+# When using the compose `ollama` profile, set in .env:
+#   OLLAMA_BASE_URL=http://ollama:11434
+#
+# Compose sets DOC_AUDI_API_BASE / STREAMLIT_BACKEND_URL to http://api:8000 for the Streamlit service
+# so server-side HTTP calls reach the API on the Docker network (do not override for UI in compose).
+#
+# Optional port overrides: API_PORT=8000, STREAMLIT_PORT=8501, OLLAMA_HOST_PORT=11434
+# --- Hugging Face Spaces ---
+# Recommended for CPU Spaces (no Ollama): set in Space Settings → Repository secrets → Variables
+#   LLM_PROVIDER=huggingface
+#   HUGGINGFACE_API_KEY=<token>   OR rely on built-in HF_TOKEN (same value as a Hub token secret)
+#   HUGGINGFACE_MODEL / HUGGINGFACE_EMBEDDING_MODEL as needed
+# If the API runs in a second Space or external URL, set for the Streamlit Space:
+#   DOC_AUDI_API_BASE=https://your-api....hf.space   (or your FastAPI public URL)
+# Streamlit on Spaces must listen on port 8501 (default). Entry file: app.py (see docs/HUGGING_FACE_SPACES.md).
+# On Streamlit SDK Spaces, only Streamlit starts by default; app.py auto-starts uvicorn on 127.0.0.1:8000 when
+# SPACE_ID is set (built-in Hub env). Set DOC_AUDI_EMBED_API=0 to disable if you use a separate API URL above.
+# Repository secrets (HF_TOKEN / HUGGINGFACE_API_KEY) are copied from st.secrets into the API subprocess env.

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.11

Dockerfile ADDED Viewed

	@@ -0,0 +1,29 @@

+# Single image for API (uvicorn) and UI (Streamlit); compose overrides the command per service.
+FROM python:3.11-slim-bookworm
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PYTHONPATH=/app \
+    PIP_NO_CACHE_DIR=1 \
+    ANONYMIZED_TELEMETRY=FALSE
+WORKDIR /app
+# PyMuPDF / scientific packages ship as manylinux wheels; the only extra OS package installed below is ca-certificates (TLS trust store).
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+COPY requirements.txt .
+RUN pip install --upgrade pip && pip install -r requirements.txt
+COPY api/ api/
+COPY models/ models/
+COPY rag/ rag/
+COPY storage/ storage/
+COPY workers/ workers/
+COPY app.py streamlit_app.py main.py pyproject.toml README.md ./
+EXPOSE 8000 8501
+CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "8000"]

LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md ADDED Viewed

	@@ -0,0 +1,186 @@

+---
+title: Document-Audit RAG
+emoji: 📑
+colorFrom: blue
+colorTo: indigo
+sdk: streamlit
+sdk_version: "1.39.0"
+app_file: app.py
+---
+# DocuAudit AI
+**DocuAudit AI** is a production-oriented FastAPI backend plus optional Streamlit UI for **multi-document RAG**: upload documents, build a Chroma vector index, ask grounded questions with citations, and retain a **SQLite audit trail** of every query.
+## Architecture
+```mermaid
+flowchart LR
+  subgraph ingest [Ingestion]
+    A[PDF / TXT / MD] --> B[Loader]
+    B --> C[Chunker]
+    C --> D[Embedder]
+    D --> E[(ChromaDB)]
+  end
+  subgraph query [Query path]
+    Q[User question] --> R[Semantic search]
+    R --> E
+    R --> T[Top-K chunks]
+    T --> L[LLM]
+    L --> U[Answer + citations]
+  end
+  U --> V[(SQLite audit)]
+```
+ASCII equivalent:
+```
+PDF Upload → Parser → Chunker → Embedder → ChromaDB
+                                              ↓
+User Query → Semantic Search → Top-K Chunks → LLM → Answer + Citations
+                                              ↓
+                                       Audit Log (SQLite)
+```
+## Use cases
+- **Litigation document analysis** — trace claims to exact pages and filenames.
+- **Corporate finance review** — compare disclosures and filings under a consistent audit log.
+- **Investigation support** — bulk ingest, async jobs, and reproducible query history.
+## Deploying on Hugging Face Spaces
+- Set **`LLM_PROVIDER=huggingface`**; use **`HUGGINGFACE_API_KEY`** and/or the Space secret **`HF_TOKEN`** (see [`.env.example`](.env.example)).
+- Use root **`app.py`** as the Streamlit entry for the default Hub command.
+- Hub UI, secrets, hardware, and Streamlit SDK details: [Streamlit Spaces](https://huggingface.co/docs/hub/spaces-sdks-streamlit), [Spaces overview](https://huggingface.co/docs/hub/spaces-overview).
+- **Test locally before deploy:** `uv run python scripts/verify_huggingface_inference.py` (requires `LLM_PROVIDER=huggingface` in `.env`).
+## Quick start with Docker
+Requires [Docker Engine](https://docs.docker.com/engine/) and Compose v2. The snippet below matches the shipped **`docker-compose.yml`**: API on **8000**, Streamlit on **8501**, with Chroma and SQLite under **`/data`** inside the API container. After **`docker compose up -d`**, expect **`curl http://localhost:8000/health`** to return JSON including **`"status":"ok"`**.
+```bash
+git clone <repository-url> doc-Audi-ai
+cd doc-Audi-ai
+cp .env.example .env
+# edit .env as needed; for compose Ollama: OLLAMA_BASE_URL=http://ollama:11434
+# (with host Ollama: run `ollama serve`; compose defaults to host.docker.internal:11434)
+docker compose build
+docker compose up -d
+curl -s http://localhost:8000/health
+# http://localhost:8501 — Streamlit
+docker compose down
+```
+Optional all-in-one Ollama in Compose: `docker compose --profile ollama up -d` (then set `OLLAMA_BASE_URL=http://ollama:11434` in `.env` and recreate containers).
+## How it works (user workflow)
+Collections, ingestion vs querying, jobs vs audit, Streamlit tabs, and **per-button UI flows**: **[docs/USER_WORKFLOW.md](docs/USER_WORKFLOW.md)**.
+## Run and test (step-by-step)
+For ingestion formats, URL rules, job polling, sample `sample.txt` walkthrough, curl/PowerShell examples, and troubleshooting, see **[docs/RUN_AND_TEST_GUIDE.md](docs/RUN_AND_TEST_GUIDE.md)**.
+For SQLite vs Memcached, offline DB inspection, and the Cursor **SQLite Viewer** extension (`qwtel.sqlite-viewer`), see **[docs/SQLITE_AND_DB_INSPECTION.md](docs/SQLITE_AND_DB_INSPECTION.md)**.
+## Quick start (local, without Docker)
+Run the API with **uv** (or your preferred tool):
+```bash
+git clone <repository-url> doc-Audi-ai
+cd doc-Audi-ai
+cp .env.example .env
+uv sync
+ollama pull llama3.1:8b
+ollama pull nomic-embed-text
+uv run uvicorn api.main:app --host 0.0.0.0 --port 8000 --reload
+# Alternative to the line above: limit the reloader to the api/ and storage/ directories
+uv run uvicorn api.main:app --host 0.0.0.0 --port 8000 --reload --reload-dir api --reload-dir storage
+```
+Optional UI:
+```bash
+uv run streamlit run streamlit_app.py --server.port 8501 --server.address 0.0.0.0
+```
+## API overview
+| Method | Path | Description |
+|--------|------|-------------|
+| GET | `/health` | Liveness; returns configured app name and version |
+| POST | `/ingest/upload` | Multipart **`files`** (one or more); queues background ingest job |
+| POST | `/ingest/url` | JSON **`urls`** array (1–100); download and queue ingest |
+| GET | `/ingest/collections` | Lists collections with **`document_count`** and optional **`created_at`** |
+| DELETE | `/ingest/collection/{collection_name}` | Drops a collection; returns **`documents_removed`** |
+| GET | `/jobs` | Lists jobs with **`total`** count |
+| GET | `/jobs/{job_id}` | Job status with **`progress_percent`**, file counters, timestamps, **`errors`** |
+| POST | `/query/ask` | Grounded answer; request includes **`top_k`**, **`user_id`** |
+| POST | `/query/summarise` | Collection summary; distinct response shape (`summary`, `document_count`, …) |
+| POST | `/query` | Legacy alias of **`/query/ask`** |
+| GET | `/audit/logs` | Filterable audit index (`user_id`, `from_date`, `to_date`, pagination) |
+| GET | `/audit/logs/{query_id}` | Full stored answer and citations for one query |
+Interactive docs: `http://localhost:8000/docs`.
+## Sample request and response (`POST /query/ask`)
+Request:
+```json
+{
+  "question": "What were the key risk factors identified in the Q3 2023 financial report?",
+  "collection_name": "default",
+  "top_k": 5,
+  "user_id": "analyst_001"
+}
+```
+Response (shape; values depend on your documents and model):
+```json
+{
+  "query_id": "uuid-string",
+  "question": "What were the key risk factors identified in the Q3 2023 financial report?",
+  "answer": "… grounded text with citations …",
+  "sources": [
+    {
+      "document_name": "q3_financial_report.pdf",
+      "page_number": 12,
+      "chunk_text": "Key risk factors include …",
+      "relevance_score": 0.91
+    }
+  ],
+  "model_used": "llama3.1:8b",
+  "tokens_used": 0,
+  "response_time_ms": 1820,
+  "timestamp": "2026-05-03T12:00:00Z"
+}
+```
+## Design decisions
+- **Source citations** — High-stakes review requires every substantive claim to be tied to **document name** and **page** (where available), not a free-floating model monologue.
+- **Auditability** — Each ask/summarise persists **query id**, **user id**, timing, model id, token usage (when the provider exposes it), and serialized sources so regulators or counsel can reconstruct what the system returned.
+## Scale note
+Architecture is designed for **high-volume document ingestion** via **async background jobs** (FastAPI `BackgroundTasks`), persistent Chroma collections, and a stateless API tier that can be replicated once you add a shared vector store and job queue.
+## Tests
+Automated API tests use **pytest** with isolated temp databases; they do **not** require a running server or Ollama.
+```bash
+uv sync
+uv run pytest tests/ -q
+```
+Full guide (commands, coverage by file, mocks vs manual smoke tests, troubleshooting): **[docs/TESTING.md](docs/TESTING.md)**.
+## Configuration
+See **`.env.example`**. Common variables include `LLM_PROVIDER`, Ollama/OpenAI/Anthropic/Hugging Face keys and models, `CHROMA_PERSIST_DIRECTORY`, `AUDIT_DB_PATH`, `JOBS_DB_PATH`, and upload limits (`MAX_FILE_SIZE_MB`; **`MAX_UPLOAD_SIZE_MB`** is accepted as an alias via settings normalization).

api/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """HTTP API package: FastAPI app, settings, and route modules."""

api/config.py ADDED Viewed

	@@ -0,0 +1,135 @@

+"""Application configuration loaded from environment variables and ``.env``.
+``Settings`` is the single source of truth for LLM provider choice, Chroma paths,
+chunking limits, upload caps, and SQLite locations. Use :func:`get_settings` (cached)
+from route handlers and RAG modules instead of reading ``os.environ`` directly.
+"""
+import os
+from functools import lru_cache
+from typing import Any, Self
+from pydantic import Field, model_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict
+class Settings(BaseSettings):
+    """Pydantic-settings model for DocuAudit AI; fields map to env vars (case-insensitive)."""
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_file_encoding="utf-8",
+        extra="ignore",
+        case_sensitive=False,
+        populate_by_name=True,
+    )
+    @model_validator(mode="before")
+    @classmethod
+    def _map_max_upload_env_alias(cls, data: Any) -> Any:
+        if not isinstance(data, dict):
+            return data
+        out = dict(data)
+        if out.get("max_file_size_mb") in (None, "") and out.get("max_upload_size_mb") not in (None, ""):
+            out["max_file_size_mb"] = out.pop("max_upload_size_mb")
+        elif "max_upload_size_mb" in out and "max_file_size_mb" not in out:
+            out["max_file_size_mb"] = out.pop("max_upload_size_mb")
+        return out
+    app_name: str = Field(default="DocuAudit AI", description="FastAPI title and product name")
+    app_version: str = Field(default="1.0.0", description="Application version")
+    app_description: str = Field(
+        default=(
+            "Multi-document RAG API for high-stakes consulting environments. "
+            "Every answer is grounded in source documents with full audit trails."
+        ),
+        description="OpenAPI /docs description",
+    )
+    llm_provider: str = Field(default="ollama", description="Embedding provider")
+    openai_api_key: str | None = Field(default=None, description="OpenAI API key")
+    openai_model: str = "gpt-4o"
+    openai_embedding_model: str = "text-embedding-3-small"
+    anthropic_api_key: str = ""
+    anthropic_model: str = "claude-3-5-sonnet-20241022"
+    huggingface_api_key: str = ""
+    huggingface_model: str = Field(
+        default="meta-llama/Meta-Llama-3-8B-Instruct",
+        description=(
+            "HF chat model id (use a repo your Hub account already has access to; Llama 3.1 needs the "
+            "separate Llama 3.1 gate). Chat tries hf-inference then router auto when unset."
+        ),
+    )
+    huggingface_embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
+    huggingface_inference_provider: str | None = Field(
+        default=None,
+        description=(
+            "Optional huggingface_hub InferenceClient provider (e.g. hf-inference, together). "
+            "Unset uses hf-inference in chat code; set to `auto` for router auto-routing."
+        ),
+    )
+    ollama_base_url: str = Field(default="http://localhost:11434", description="Ollama base URL")
+    ollama_chat_model: str = "llama3.1:8b"
+    ollama_embedding_model: str = "nomic-embed-text"
+    chroma_persist_directory: str = Field(default="./data/chroma", description="Chroma persistence path")
+    chroma_persist_dir: str = Field(default="./chroma", description="Chroma persistence path")
+    chroma_collection_name: str = "docuaudit_docs"
+    chunk_size: int = Field(default=1000, ge=100, le=8000, description="Chunk size for splitting")
+    chunk_overlap: int = Field(default=200, ge=0, le=2000, description="Chunk overlap for splitting")
+    top_k_results: int = Field(default=5, ge=1, le=20, description="Default number of chunks to retrieve")
+    audit_db_path: str = "./audit.db"
+    jobs_db_path: str = Field(default="./data/jobs.db", description="SQLite path for ingest job tracking")
+    max_file_size_mb: int = Field(default=50, ge=1, le=200, description="Max upload file size (MB)")
+    max_documents_per_batch: int = Field(default=100, ge=1, le=1000, description="Max documents per batch")
+    ingest_user_agent: str = Field(
+        default="DocuAudit AI docuaudit-ingest@example.com",
+        description=(
+            "HTTP User-Agent for POST /ingest/url downloads. SEC.gov requires "
+            "'Company Name contact@email.com' with a reachable address (see sec.gov/os/accessing-edgar-data)."
+        ),
+    )
+    @model_validator(mode="after")
+    def _space_default_llm_provider(self) -> Self:
+        """Hugging Face Spaces do not run Ollama locally; use Hub inference unless the user set LLM_PROVIDER."""
+        if not (os.environ.get("SPACE_ID") or "").strip():
+            return self
+        if "LLM_PROVIDER" in os.environ:
+            return self
+        if self.llm_provider.lower() != "ollama":
+            return self
+        self.llm_provider = "huggingface"
+        return self
+    @model_validator(mode="after")
+    def _huggingface_token_from_hub_env(self) -> Self:
+        """When using the Hugging Face inference stack, accept the Hub token from standard env names.
+        Spaces often expose `HF_TOKEN` (read/write per Space secrets). Map it into `huggingface_api_key`
+        when `HUGGINGFACE_API_KEY` is unset so embedder/chat clients receive a token.
+        """
+        if self.llm_provider.lower() != "huggingface":
+            return self
+        if (self.huggingface_api_key or "").strip():
+            return self
+        for key in ("HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"):
+            token = (os.environ.get(key) or "").strip()
+            if token:
+                self.huggingface_api_key = token
+                break
+        return self
+@lru_cache
+def get_settings() -> Settings:
+    """Return the process-wide settings singleton (cleared in tests via ``cache_clear()``)."""
+    return Settings()

api/main.py ADDED Viewed

	@@ -0,0 +1,64 @@

+"""FastAPI application entry point for DocuAudit AI.
+Creates the ASGI app, registers CORS, mounts route modules (ingest, query, jobs, audit),
+and initializes SQLite audit and job stores on startup.
+Run locally::
+    uv run uvicorn api.main:app --host 0.0.0.0 --port 8000 --reload
+Health check: ``GET /health``.
+"""
+import os
+# Before any route imports that touch Chroma: disable product telemetry (avoids posthog capture() errors in logs).
+os.environ.setdefault("ANONYMIZED_TELEMETRY", "FALSE")
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from api.config import get_settings
+from storage.audit_store import init_audit_db
+from storage.job_store import init_jobs_db
+from .routes import audit, ingest, jobs, query
+_settings = get_settings()
+app = FastAPI(
+    title=_settings.app_name,
+    version=_settings.app_version,
+    description=_settings.app_description,
+)
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+app.include_router(audit.router)
+app.include_router(ingest.router)
+app.include_router(jobs.router)
+app.include_router(query.router)
+app.include_router(query.legacy_query_router)
+@app.on_event("startup")
+async def startup() -> None:
+    """Ensure audit and ingest-job SQLite schemas exist before serving traffic."""
+    settings = get_settings()
+    await init_audit_db(settings.audit_db_path)
+    await init_jobs_db(settings.jobs_db_path)
+@app.get("/health", tags=["Health"])
+def health() -> dict[str, str]:
+    """Liveness probe returning app name, version, and ``status: ok``."""
+    settings = get_settings()
+    return {
+        "status": "ok",
+        "app": settings.app_name,
+        "version": settings.app_version,
+    }

api/routes/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """FastAPI routers grouped by domain: ingest, query, jobs, and audit."""

api/routes/audit.py ADDED Viewed

	@@ -0,0 +1,65 @@

+"""Query audit log HTTP routes.
+Every successful ask/summarise call writes to SQLite via :mod:`storage.audit_store`.
+These endpoints expose paginated list and per-query detail for compliance review.
+"""
+from typing import Annotated
+from fastapi import APIRouter, Depends, HTTPException, Query, status
+from api.config import get_settings
+from models.requests import AuditListParams
+from models.responses import AuditLogDetailResponse, AuditLogsResponse
+from storage.audit_store import get_audit_event, list_audit_events
+def _audit_list_params(
+    limit: Annotated[int, Query(ge=1, le=100)] = 50,
+    offset: Annotated[int, Query(ge=0)] = 0,
+    user_id: Annotated[str | None, Query(max_length=256)] = None,
+    from_date: Annotated[str | None, Query(description="ISO 8601 lower bound")] = None,
+    to_date: Annotated[str | None, Query(description="ISO 8601 upper bound")] = None,
+) -> AuditListParams:
+    return AuditListParams(
+        limit=limit,
+        offset=offset,
+        user_id=user_id,
+        from_date=from_date,
+        to_date=to_date,
+    )
+# Every route declared on this router is served under the /audit prefix.
+router = APIRouter(prefix="/audit", tags=["audit"])
+@router.get("/logs", response_model=AuditLogsResponse)
+async def audit_logs(
+    params: Annotated[AuditListParams, Depends(_audit_list_params)],
+) -> AuditLogsResponse:
+    """Paginated audit trail with optional user and date filters."""
+    settings = get_settings()
+    logs, total = await list_audit_events(
+        settings.audit_db_path,
+        limit=params.limit,
+        offset=params.offset,
+        user_id=params.user_id,
+        from_date=params.from_date,
+        to_date=params.to_date,
+    )
+    return AuditLogsResponse(
+        logs=logs,
+        total=total,
+        limit=params.limit,
+        offset=params.offset,
+    )
+@router.get("/logs/{query_id}", response_model=AuditLogDetailResponse)
+async def audit_log_detail(query_id: str) -> AuditLogDetailResponse:
+    """Full answer and citations for one audited query."""
+    settings = get_settings()
+    event = await get_audit_event(settings.audit_db_path, query_id)
+    if event is None:
+        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Audit event not found.")
+    return event

api/routes/ingest.py ADDED Viewed

	@@ -0,0 +1,348 @@

+"""Document ingestion HTTP routes.
+Endpoints under ``/ingest`` queue background jobs that load PDF/TXT/MD files (upload or URL),
+chunk and embed them, and write vectors into a named Chroma collection. Poll ``/jobs/{id}``
+for progress. Collection listing and deletion are synchronous.
+"""
+from datetime import datetime, timezone
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+from typing import Annotated
+from urllib.parse import unquote, urlparse
+import httpx
+from fastapi import APIRouter, BackgroundTasks, File, Form, HTTPException, UploadFile, status
+from api.config import get_settings
+from models.requests import URLIngestRequest
+from models.responses import (
+    CollectionItem,
+    IngestCollectionsResponse,
+    IngestDeleteCollectionResponse,
+    IngestUploadResponse,
+    UrlIngestResponse,
+)
+from rag.vector_store import (
+    collection_created_at,
+    collection_document_count,
+    delete_collection,
+    ensure_collection_created_at,
+    list_collection_names,
+)
+from storage.job_store import create_ingest_job, earliest_job_created_at_for_collection
+from workers.ingest_worker import run_ingest_job
+# Every route declared on this router is served under the /ingest prefix.
+router = APIRouter(prefix="/ingest", tags=["ingest"])
+# Extensions accepted for uploads and URL downloads; callers compare lower-cased suffixes against it.
+_SUPPORTED_EXTENSIONS = frozenset({".pdf", ".txt", ".md"})
+# Base media type (parameters such as charset removed) -> extension, used when a URL path has no
+# supported suffix of its own.
+_CONTENT_TYPE_SUFFIX: dict[str, str] = {
+    "application/pdf": ".pdf",
+    "text/plain": ".txt",
+    "text/markdown": ".md",
+    "text/x-markdown": ".md",
+}
+def _validate_file(file: UploadFile, max_bytes: int) -> str:
+    """Check extension and size; return normalized suffix (e.g. ``.pdf``)."""
+    filename = (file.filename or "").strip()
+    if not filename:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Filename is required.")
+    suffix = Path(filename).suffix.lower()
+    if suffix not in _SUPPORTED_EXTENSIONS:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="Unsupported file type. Only PDF, TXT, and MD are accepted.",
+        )
+    file.file.seek(0, 2)
+    size = file.file.tell()
+    file.file.seek(0)
+    if size > max_bytes:
+        raise HTTPException(
+            status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
+            detail=f"File too large. Max allowed is {max_bytes // (1024 * 1024)}MB.",
+        )
+    return suffix
+def _suffix_from_url_path(url: str) -> str | None:
+    path = urlparse(url).path
+    suffix = Path(unquote(path)).suffix.lower()
+    return suffix if suffix in _SUPPORTED_EXTENSIONS else None
+def _suffix_from_content_type(content_type: str | None) -> str | None:
+    if not content_type:
+        return None
+    base = content_type.split(";")[0].strip().lower()
+    return _CONTENT_TYPE_SUFFIX.get(base)
+def _download_request_headers(user_agent: str) -> dict[str, str]:
+    """Headers for remote URL fetches (SEC.gov requires declared User-Agent + Accept-Encoding)."""
+    return {
+        "User-Agent": user_agent.strip() or "DocuAudit AI docuaudit-ingest@example.com",
+        "Accept-Encoding": "gzip, deflate",
+        "Accept": "application/pdf,text/plain,text/markdown,*/*;q=0.8",
+    }
+def _display_name_from_url(url: str, suffix: str) -> str:
+    name = Path(unquote(urlparse(url).path)).name.strip()
+    if not name or name in {"/", "."}:
+        return f"download{suffix}"
+    if Path(name).suffix.lower() not in _SUPPORTED_EXTENSIONS:
+        return f"{name}{suffix}" if not name.endswith(suffix) else name
+    return name
+async def _download_url_to_temp(url: str, max_bytes: int, user_agent: str | None = None) -> tuple[str, str]:
+    """Stream-download a URL to a temp file; return ``(path, display_name)``."""
+    parsed = urlparse(url)
+    if parsed.scheme not in ("http", "https"):
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="Only http and https URLs are supported.",
+        )
+    ua = user_agent or get_settings().ingest_user_agent
+    timeout = httpx.Timeout(60.0, connect=10.0)
+    limits = httpx.Limits(max_keepalive_connections=5, max_connections=5)
+    headers = _download_request_headers(ua)
+    try:
+        async with httpx.AsyncClient(timeout=timeout, limits=limits, follow_redirects=True) as client:
+            async with client.stream("GET", url, headers=headers) as response:
+                response.raise_for_status()
+                content_type = response.headers.get("content-type")
+                suffix = _suffix_from_url_path(url) or _suffix_from_content_type(content_type)
+                if not suffix:
+                    raise HTTPException(
+                        status_code=status.HTTP_400_BAD_REQUEST,
+                        detail=(
+                            "Could not determine file type from the URL path or Content-Type. "
+                            "Provide a .pdf, .txt, or .md resource with matching content-type."
+                        ),
+                    )
+                display_name = _display_name_from_url(url, suffix)
+                total = 0
+                with NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+                    temp_path = tmp.name
+                    async for chunk in response.aiter_bytes(chunk_size=65536):
+                        total += len(chunk)
+                        if total > max_bytes:
+                            raise HTTPException(
+                                status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
+                                detail=f"Download too large. Max allowed is {max_bytes // (1024 * 1024)}MB.",
+                            )
+                        tmp.write(chunk)
+    except HTTPException:
+        raise
+    except httpx.HTTPStatusError as exc:
+        code = exc.response.status_code if exc.response else "unknown"
+        detail = f"Remote server returned HTTP {code}."
+        if code == 403 and "sec.gov" in parsed.netloc.lower():
+            detail += (
+                " SEC.gov requires a declared User-Agent ('Company Name you@email.com'). "
+                "Set INGEST_USER_AGENT in .env (see sec.gov/os/accessing-edgar-data)."
+            )
+        raise HTTPException(
+            status_code=status.HTTP_502_BAD_GATEWAY,
+            detail=detail,
+        ) from exc
+    except httpx.RequestError as exc:
+        raise HTTPException(
+            status_code=status.HTTP_502_BAD_GATEWAY,
+            detail=f"Failed to download URL: {exc}",
+        ) from exc
+    return temp_path, display_name
+def _parse_created_at(raw: str | None) -> datetime | None:
+    if not raw:
+        return None
+    s = raw.strip()
+    if s.endswith("Z"):
+        s = s[:-1] + "+00:00"
+    try:
+        dt = datetime.fromisoformat(s)
+        if dt.tzinfo is None:
+            return dt.replace(tzinfo=timezone.utc)
+        return dt
+    except ValueError:
+        return None
+@router.post("/upload", response_model=IngestUploadResponse)
+async def upload_endpoint(
+    background_tasks: BackgroundTasks,
+    files: list[UploadFile] = File(..., description="One or more PDF, TXT, or MD files"),
+    collection_name: Annotated[str, Form(min_length=1, max_length=256)] = "default",
+) -> IngestUploadResponse:
+    """Accept multipart file uploads, validate, and queue a background ingest job."""
+    settings = get_settings()
+    if not files:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="At least one file is required.")
+    if len(files) > settings.max_documents_per_batch:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail=f"Too many files in one request (max {settings.max_documents_per_batch}).",
+        )
+    max_bytes = settings.max_file_size_mb * 1024 * 1024
+    temp_paths: list[tuple[str, str]] = []
+    filenames: list[str] = []
+    try:
+        for file in files:
+            suffix = _validate_file(file, max_bytes)
+            display_name = (file.filename or "upload").strip()
+            file_bytes = await file.read()
+            with NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+                tmp.write(file_bytes)
+                temp_paths.append((tmp.name, display_name))
+            filenames.append(display_name)
+            await file.close()
+        job_id = await create_ingest_job(
+            settings.jobs_db_path,
+            collection_name=collection_name.strip(),
+            filenames=filenames,
+        )
+        background_tasks.add_task(
+            run_ingest_job,
+            job_id,
+            temp_paths,
+            collection_name.strip(),
+            settings.jobs_db_path,
+            settings.chroma_persist_directory,
+        )
+        return IngestUploadResponse(
+            job_id=job_id,
+            status="queued",
+            total_files=len(filenames),
+            filenames=filenames,
+            message=f"Documents queued for processing. Poll /jobs/{job_id} for status.",
+        )
+    except HTTPException:
+        for path, _ in temp_paths:
+            Path(path).unlink(missing_ok=True)
+        raise
+    except Exception as exc:
+        for path, _ in temp_paths:
+            Path(path).unlink(missing_ok=True)
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(exc)) from exc
+@router.post("/url", response_model=UrlIngestResponse)
+async def ingest_url_endpoint(
+    background_tasks: BackgroundTasks,
+    payload: URLIngestRequest,
+) -> UrlIngestResponse:
+    """Download one or more HTTP(S) documents and queue them for ingestion."""
+    settings = get_settings()
+    max_bytes = settings.max_file_size_mb * 1024 * 1024
+    url_strings = [str(u).strip() for u in payload.urls]
+    if len(url_strings) > settings.max_documents_per_batch:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail=f"Too many URLs in one request (max {settings.max_documents_per_batch}).",
+        )
+    downloaded: list[tuple[str, str]] = []
+    try:
+        for url_str in url_strings:
+            temp_path, display_name = await _download_url_to_temp(
+                url_str, max_bytes, user_agent=settings.ingest_user_agent
+            )
+            downloaded.append((temp_path, display_name))
+        coll = (payload.collection_name or "default").strip()
+        job_id = await create_ingest_job(
+            settings.jobs_db_path,
+            collection_name=coll,
+            filenames=[name for _, name in downloaded],
+        )
+        background_tasks.add_task(
+            run_ingest_job,
+            job_id,
+            downloaded,
+            coll,
+            settings.jobs_db_path,
+            settings.chroma_persist_directory,
+        )
+        return UrlIngestResponse(
+            job_id=job_id,
+            status="queued",
+            total_urls=len(downloaded),
+            message="URLs queued for download and processing.",
+        )
+    except HTTPException:
+        for path, _ in downloaded:
+            Path(path).unlink(missing_ok=True)
+        raise
+    except Exception as exc:
+        for path, _ in downloaded:
+            Path(path).unlink(missing_ok=True)
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(exc)) from exc
+@router.get("/collections", response_model=IngestCollectionsResponse)
+async def list_collections_endpoint() -> IngestCollectionsResponse:
+    """List Chroma collections with document counts and creation timestamps."""
+    settings = get_settings()
+    try:
+        names = list_collection_names(settings.chroma_persist_directory)
+    except Exception as exc:
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(exc)) from exc
+    items: list[CollectionItem] = []
+    for n in names:
+        cnt = collection_document_count(settings.chroma_persist_directory, n)
+        raw_created = collection_created_at(settings.chroma_persist_directory, n)
+        if not raw_created:
+            job_fallback = await earliest_job_created_at_for_collection(settings.jobs_db_path, n)
+            raw_created = ensure_collection_created_at(
+                settings.chroma_persist_directory,
+                n,
+                fallback=job_fallback,
+            )
+        items.append(
+            CollectionItem(
+                name=n,
+                document_count=cnt,
+                created_at=_parse_created_at(raw_created),
+            )
+        )
+    return IngestCollectionsResponse(collections=items, total=len(items))
+@router.delete("/collection/{collection_name}", response_model=IngestDeleteCollectionResponse)
+async def delete_collection_endpoint(collection_name: str) -> IngestDeleteCollectionResponse:
+    """Remove a Chroma collection and all embedded chunks."""
+    if not collection_name.strip():
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="collection_name is required.")
+    settings = get_settings()
+    name = collection_name.strip()
+    try:
+        existing = list_collection_names(settings.chroma_persist_directory)
+        if name not in existing:
+            raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Collection not found.")
+        removed = delete_collection(settings.chroma_persist_directory, name)
+    except HTTPException:
+        raise
+    except Exception as exc:
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(exc)) from exc
+    return IngestDeleteCollectionResponse(
+        message=f"Collection '{name}' deleted successfully.",
+        documents_removed=removed,
+    )

api/routes/jobs.py ADDED Viewed

	@@ -0,0 +1,47 @@

+"""Ingest job status and listing.
+Jobs are created by upload/URL ingest routes and updated by :mod:`workers.ingest_worker`.
+"""
+from typing import Annotated
+from fastapi import APIRouter, Depends, HTTPException, Query, status
+from api.config import get_settings
+from models.requests import JobsListParams
+from models.responses import JobListResponse, JobStatusResponse
+from storage.job_store import get_job_status, list_ingest_jobs
+def _jobs_list_params(
+    limit: Annotated[int, Query(ge=1, le=100)] = 10,
+    offset: Annotated[int, Query(ge=0)] = 0,
+) -> JobsListParams:
+    return JobsListParams(limit=limit, offset=offset)
+# No prefix here: the job routes below spell out their full /jobs paths.
+router = APIRouter(tags=["jobs"])
+@router.get("/jobs", response_model=JobListResponse)
+async def list_jobs(
+    params: Annotated[JobsListParams, Depends(_jobs_list_params)],
+) -> JobListResponse:
+    """Paginated list of ingest jobs (newest first)."""
+    settings = get_settings()
+    jobs, total = await list_ingest_jobs(
+        settings.jobs_db_path,
+        limit=params.limit,
+        offset=params.offset,
+    )
+    return JobListResponse(jobs=jobs, total=total)
+@router.get("/jobs/{job_id}", response_model=JobStatusResponse)
+async def get_job(job_id: str) -> JobStatusResponse:
+    """Poll a single job by id (404 if unknown)."""
+    settings = get_settings()
+    job = await get_job_status(settings.jobs_db_path, job_id)
+    if job is None:
+        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found.")
+    return job

api/routes/query.py ADDED Viewed

	@@ -0,0 +1,179 @@

+"""Grounded Q&A and summarisation routes.
+``POST /query/ask`` retrieves top-K chunks from Chroma, calls the configured LLM with
+citations enforced in the prompt, persists an audit row, and returns answer + sources.
+``POST /query/summarise`` uses a retrieval-oriented query then a summary-focused prompt.
+``POST /query`` is a legacy alias for ``/query/ask``.
+"""
+import time
+from datetime import datetime, timezone
+from uuid import uuid4
+from fastapi import APIRouter, HTTPException, status
+from api.config import Settings, get_settings
+from models.requests import QueryRequest, SummariseRequest
+from models.responses import AskQueryResponse, SourceCitation, SummariseQueryResponse
+from rag.embedder import create_embedding_function
+from rag.retriever import (
+    SUMMARY_RETRIEVAL_QUERY,
+    RetrievedChunk,
+    answer_with_grounding,
+    retrieve_chunks,
+    summarise_with_grounding,
+)
+from rag.vector_store import collection_document_count, get_vector_store
+from storage.audit_store import persist_query_audit
+# Every route declared on this router is served under the /query prefix.
+router = APIRouter(prefix="/query", tags=["query"])
+def _model_used_label(settings: Settings) -> str:
+    provider = settings.llm_provider.lower()
+    if provider == "openai":
+        return settings.openai_model
+    if provider == "ollama":
+        return settings.ollama_chat_model
+    if provider == "anthropic":
+        return settings.anthropic_model
+    if provider == "huggingface":
+        return settings.huggingface_model
+    return f"{provider}:unknown"
+def _chunks_to_citations(chunks: list[RetrievedChunk]) -> list[SourceCitation]:
+    citations: list[SourceCitation] = []
+    for chunk in chunks:
+        page = chunk.page if chunk.page is not None else 0
+        score = float(chunk.score) if chunk.score is not None else 0.0
+        citations.append(
+            SourceCitation(
+                document_name=chunk.source or "unknown",
+                page_number=page,
+                chunk_text=chunk.text,
+                relevance_score=score,
+            )
+        )
+    return citations
+async def _run_ask(
+    settings: Settings,
+    payload: QueryRequest,
+) -> AskQueryResponse:
+    """Retrieve, generate grounded answer, audit, and build the API response."""
+    top_k = payload.top_k
+    t0 = time.perf_counter()
+    embedding_function = create_embedding_function()
+    vector_store = get_vector_store(
+        persist_directory=settings.chroma_persist_directory,
+        collection_name=payload.collection_name or "default",
+        embedding_function=embedding_function,
+    )
+    chunks = retrieve_chunks(vector_store, payload.question, top_k)
+    answer, tokens_used = answer_with_grounding(settings, payload.question, chunks)
+    elapsed_ms = int((time.perf_counter() - t0) * 1000)
+    citations = _chunks_to_citations(chunks)
+    query_id = str(uuid4())
+    ts = datetime.now(timezone.utc)
+    response = AskQueryResponse(
+        query_id=query_id,
+        question=payload.question,
+        answer=answer,
+        sources=citations,
+        model_used=_model_used_label(settings),
+        tokens_used=tokens_used,
+        response_time_ms=elapsed_ms,
+        timestamp=ts,
+    )
+    await persist_query_audit(
+        settings.audit_db_path,
+        query_id=query_id,
+        action="query",
+        user_id=payload.user_id,
+        question=payload.question,
+        collection_name=payload.collection_name or "default",
+        answer=answer,
+        sources=citations,
+        model_used=response.model_used,
+        tokens_used=tokens_used,
+        response_time_ms=elapsed_ms,
+        kind="ask",
+    )
+    return response
+async def _run_summarise(
+    settings: Settings,
+    payload: SummariseRequest,
+) -> SummariseQueryResponse:
+    """Retrieve with focus or default overview query, summarise, and audit."""
+    top_k = settings.top_k_results
+    retrieval_query = (payload.focus or "").strip() or SUMMARY_RETRIEVAL_QUERY
+    audit_question = payload.focus.strip() if payload.focus and payload.focus.strip() else "Summarise collection"
+    t0 = time.perf_counter()
+    embedding_function = create_embedding_function()
+    vector_store = get_vector_store(
+        persist_directory=settings.chroma_persist_directory,
+        collection_name=payload.collection_name,
+        embedding_function=embedding_function,
+    )
+    chunks = retrieve_chunks(vector_store, retrieval_query, top_k)
+    summary, tokens_used = summarise_with_grounding(settings, focus=payload.focus, chunks=chunks)
+    elapsed_ms = int((time.perf_counter() - t0) * 1000)
+    citations = _chunks_to_citations(chunks)
+    doc_count = collection_document_count(settings.chroma_persist_directory, payload.collection_name)
+    query_id = str(uuid4())
+    ts = datetime.now(timezone.utc)
+    response = SummariseQueryResponse(
+        query_id=query_id,
+        summary=summary,
+        document_count=doc_count,
+        sources=citations,
+        timestamp=ts,
+    )
+    await persist_query_audit(
+        settings.audit_db_path,
+        query_id=query_id,
+        action="summarise",
+        user_id=payload.user_id,
+        question=audit_question,
+        collection_name=payload.collection_name,
+        answer=summary,
+        sources=citations,
+        model_used=_model_used_label(settings),
+        tokens_used=tokens_used,
+        response_time_ms=elapsed_ms,
+        kind="summarise",
+    )
+    return response
+@router.post("/ask", response_model=AskQueryResponse)
+async def ask_endpoint(payload: QueryRequest) -> AskQueryResponse:
+    """Grounded question answering against a Chroma collection."""
+    settings = get_settings()
+    try:
+        return await _run_ask(settings, payload)
+    except Exception as exc:
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(exc)) from exc
+@router.post("/summarise", response_model=SummariseQueryResponse)
+async def summarise_endpoint(payload: SummariseRequest) -> SummariseQueryResponse:
+    """Collection-wide summary with optional focus for retrieval."""
+    settings = get_settings()
+    try:
+        return await _run_summarise(settings, payload)
+    except Exception as exc:
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(exc)) from exc
+legacy_query_router = APIRouter(tags=["query"])
+@legacy_query_router.post("/query", response_model=AskQueryResponse)
+async def query_post_compat(payload: QueryRequest) -> AskQueryResponse:
+    """Same behavior as POST /query/ask; kept for older clients and docs that used POST /query."""
+    return await ask_endpoint(payload)

app.py ADDED Viewed

	@@ -0,0 +1,117 @@

+"""Hugging Face Spaces default entry (Streamlit SDK expects `app.py`).
+Local development can still use `streamlit run streamlit_app.py`; Docker Compose uses `app.py`
+so the same entry path works on the Hub and in containers.
+On Hugging Face Streamlit Spaces only `streamlit run app.py` is started — no separate uvicorn
+process — so we spawn the FastAPI app on 127.0.0.1:8000 when `SPACE_ID` is present (see Hub
+built-in env vars). Set `DOC_AUDI_EMBED_API=0` to disable. Use `DOC_AUDI_EMBED_API=1` to force
+embedding elsewhere (e.g. demos).
+"""
+from __future__ import annotations
+import atexit
+import os
+import socket
+import subprocess
+import sys
+import time
+# Handle of the uvicorn child started by this module, or None when none has been started.
+_uvicorn_proc: subprocess.Popen[bytes] | None = None
+# Set once the atexit hook that stops the child has been registered.
+_cleanup_registered = False
+def _port_accepting_connections(host: str, port: int) -> bool:
+    try:
+        with socket.create_connection((host, port), timeout=0.3):
+            return True
+    except OSError:
+        return False
+def _want_embedded_api() -> bool:
+    if os.environ.get("DOC_AUDI_EMBED_API", "").lower() in ("0", "false", "no"):
+        return False
+    if os.environ.get("DOC_AUDI_EMBED_API", "").lower() in ("1", "true", "yes"):
+        return True
+    return bool(os.environ.get("SPACE_ID"))
+def _propagate_streamlit_secrets_to_environ() -> None:
+    """Copy Hub tokens from Streamlit secrets into os.environ for the embedded uvicorn child.
+    On Hugging Face Streamlit Spaces, repository secrets are often available as ``st.secrets``
+    but are not always present in ``os.environ``. ``subprocess.Popen`` only forwards the
+    process environment, so the API would miss ``HF_TOKEN`` / ``HUGGINGFACE_API_KEY`` otherwise.
+    """
+    try:
+        import streamlit as st
+    except ImportError:
+        return
+    secrets = getattr(st, "secrets", None)
+    if secrets is None:
+        return
+    for key in ("HF_TOKEN", "HUGGINGFACE_API_KEY", "HUGGING_FACE_HUB_TOKEN"):
+        if (os.environ.get(key) or "").strip():
+            continue
+        try:
+            raw = secrets[key]
+        except Exception:
+            continue
+        if raw is not None and str(raw).strip():
+            os.environ[key] = str(raw).strip()
+def _maybe_start_embedded_uvicorn() -> None:
+    """Start uvicorn in-process when running on HF Spaces (or when DOC_AUDI_EMBED_API=1)."""
+    global _uvicorn_proc, _cleanup_registered
+    if not _want_embedded_api():
+        return
+    _propagate_streamlit_secrets_to_environ()
+    if _port_accepting_connections("127.0.0.1", 8000):
+        return
+    if _uvicorn_proc is not None and _uvicorn_proc.poll() is None:
+        for _ in range(120):
+            if _port_accepting_connections("127.0.0.1", 8000):
+                return
+            time.sleep(0.05)
+        return
+    cmd = [
+        sys.executable,
+        "-m",
+        "uvicorn",
+        "api.main:app",
+        "--host",
+        "127.0.0.1",
+        "--port",
+        "8000",
+    ]
+    _uvicorn_proc = subprocess.Popen(cmd)
+    proc = _uvicorn_proc
+    if not _cleanup_registered:
+        def _cleanup(p: subprocess.Popen[bytes] = proc) -> None:
+            if p.poll() is None:
+                p.terminate()
+                try:
+                    p.wait(timeout=10)
+                except subprocess.TimeoutExpired:
+                    p.kill()
+        atexit.register(_cleanup)
+        _cleanup_registered = True
+    for _ in range(120):
+        if _port_accepting_connections("127.0.0.1", 8000):
+            return
+        time.sleep(0.05)
+# Script entry: launch (or find) the API first, then import and run the Streamlit UI.
+_maybe_start_embedded_uvicorn()
+from streamlit_app import main  # noqa: E402 — start API before importing Streamlit stack
+main()

docker-compose.yml ADDED Viewed

	@@ -0,0 +1,67 @@

+# Requires a project `.env` (copy from `.env.example`) for `env_file` and variable substitution.
+name: docuaudit-ai
+# Shared build/image definition merged into both application services via the `*app` alias.
+x-app: &app
+  build: .
+  image: docuaudit-ai:${IMAGE_TAG:-local}
+services:
+  # FastAPI backend; storage paths are pinned to the /data volume below.
+  api:
+    <<: *app
+    command: uvicorn api.main:app --host 0.0.0.0 --port 8000
+    ports:
+      - "${API_PORT:-8000}:8000"
+    env_file:
+      - .env
+    environment:
+      CHROMA_PERSIST_DIRECTORY: /data/chroma
+      AUDIT_DB_PATH: /data/audit.db
+      JOBS_DB_PATH: /data/jobs.db
+      # Defaults to an Ollama instance on the Docker host unless OLLAMA_BASE_URL is set.
+      OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-http://host.docker.internal:11434}
+    volumes:
+      - docuaudit_data:/data
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    # Probes GET /health from inside the container using the image's Python interpreter.
+    healthcheck:
+      test:
+        [
+          "CMD",
+          "python",
+          "-c",
+          "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8000/health', timeout=5)",
+        ]
+      interval: 15s
+      timeout: 5s
+      retries: 5
+      start_period: 40s
+  # Streamlit UI; starts only after the api service reports healthy and reaches it at http://api:8000.
+  streamlit:
+    <<: *app
+    command: >
+      streamlit run app.py
+      --server.port=8501
+      --server.address=0.0.0.0
+      --server.headless=true
+      --browser.gatherUsageStats=false
+    ports:
+      - "${STREAMLIT_PORT:-8501}:8501"
+    env_file:
+      - .env
+    environment:
+      DOC_AUDI_API_BASE: http://api:8000
+      STREAMLIT_BACKEND_URL: http://api:8000
+    depends_on:
+      api:
+        condition: service_healthy
+  # Optional local Ollama server; only created when the `ollama` profile is enabled.
+  ollama:
+    image: ollama/ollama:latest
+    profiles: ["ollama"]
+    ports:
+      - "${OLLAMA_HOST_PORT:-11434}:11434"
+    volumes:
+      - ollama_data:/root/.ollama
+# Named volumes: application data (Chroma + SQLite files) and Ollama's model directory.
+volumes:
+  docuaudit_data:
+  ollama_data:

main.py ADDED Viewed

	@@ -0,0 +1,13 @@

+"""Minimal CLI placeholder (not used by Docker or Hugging Face entrypoints).
+Production entrypoints: ``api.main:app`` (FastAPI) and ``app.py`` / ``streamlit_app.py`` (UI).
+"""
+def main() -> None:
+    """Print a hello message when run as ``python main.py``."""
+    print("Hello from doc-audi-ai!")
+if __name__ == "__main__":
+    main()

models/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """API contract models: request payloads and response DTOs."""

models/requests.py ADDED Viewed

	@@ -0,0 +1,78 @@

+"""Pydantic request bodies and query-parameter models for the HTTP API.
+Used by FastAPI route handlers for validation and OpenAPI schema generation.
+"""
+from typing import Optional
+from pydantic import BaseModel, ConfigDict, Field, HttpUrl
+class QueryRequest(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+    question: str = Field(min_length=5, max_length=2000, description="Natural language question")
+    collection_name: Optional[str] = Field(
+        default="default",
+        min_length=1,
+        max_length=256,
+        description="Chroma collection to search",
+    )
+    top_k: int = Field(default=5, ge=1, le=20, description="Number of chunks to retrieve")
+    user_id: str = Field(default="anonymous", max_length=256, description="Caller id for audit filtering")
+class SummariseRequest(BaseModel):
+    # Validated payload for a summarise request; unknown keys are rejected (extra="forbid").
+    model_config = ConfigDict(extra="forbid")
+    # Non-nullable here, defaulting to "default"; 1-256 characters.
+    collection_name: str = Field(
+        default="default",
+        min_length=1,
+        max_length=256,
+        description="Chroma collection to summarise",
+    )
+    # Optional free-text scope; None means no specific focus was supplied.
+    focus: str | None = Field(
+        default=None,
+        max_length=8000,
+        description="Optional angle or scope for retrieval and the summary",
+    )
+    user_id: str = Field(default="anonymous", max_length=256, description="Caller id for audit filtering")
+class URLIngestRequest(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+    urls: list[HttpUrl] = Field(
+        min_length=1,
+        max_length=100,
+        description="One or more HTTP(S) URLs to PDF, TXT, or Markdown documents",
+    )
+    collection_name: Optional[str] = Field(
+        default="default",
+        min_length=1,
+        max_length=256,
+        description="Target Chroma collection name",
+    )
+class JobsListParams(BaseModel):
+    # Pagination parameters for listing jobs; unknown keys are rejected (extra="forbid").
+    model_config = ConfigDict(extra="forbid")
+    # Page size: 1-100, default 10.
+    limit: int = Field(default=10, ge=1, le=100, description="Max jobs to return")
+    # Zero-based starting offset; negative values are rejected.
+    offset: int = Field(default=0, ge=0, description="Offset for pagination")
+class AuditListParams(BaseModel):
+    # Pagination and filter parameters for listing audit logs; unknown keys are rejected.
+    model_config = ConfigDict(extra="forbid")
+    # Page size: 1-100, default 50.
+    limit: int = Field(default=50, ge=1, le=100, description="Max log entries to return")
+    offset: int = Field(default=0, ge=0, description="Offset for pagination")
+    # None means no user filter.
+    user_id: str | None = Field(default=None, max_length=256, description="Filter by user id")
+    # NOTE(review): both date bounds are typed as plain ``str``, so this model does not
+    # validate that they are ISO 8601; any parsing must happen in the consumer.
+    from_date: str | None = Field(
+        default=None,
+        description="ISO 8601 datetime lower bound (inclusive) on timestamp",
+    )
+    to_date: str | None = Field(
+        default=None,
+        description="ISO 8601 datetime upper bound (inclusive) on timestamp",
+    )

models/responses.py ADDED Viewed

	@@ -0,0 +1,135 @@

+"""Pydantic response models returned by FastAPI routes.
+Shared shape: :class:`SourceCitation` appears on ask, summarise, and audit detail responses.
+"""
+from datetime import datetime
+from pydantic import BaseModel, Field
+# --- Shared citations (spec-shaped) ---
+class SourceCitation(BaseModel):
+    # One cited excerpt; all four fields are required.
+    # Used as the element type of the ``sources`` lists on the ask, summarise and
+    # audit-detail response models in this module.
+    document_name: str
+    page_number: int
+    chunk_text: str
+    relevance_score: float
+# --- Query: ask ---
+class AskQueryResponse(BaseModel):
+    # Response shape for an "ask" query: the question, the generated answer and its citations.
+    query_id: str
+    question: str
+    answer: str
+    # Defaults to a fresh empty list when no citations are supplied.
+    sources: list[SourceCitation] = Field(default_factory=list)
+    model_used: str
+    tokens_used: int
+    # Latency in milliseconds, per the field name.
+    response_time_ms: int
+    timestamp: datetime
+# --- Query: summarise ---
+class SummariseQueryResponse(BaseModel):
+    # Response shape for a summarise query: the summary text plus its citations.
+    query_id: str
+    summary: str
+    # NOTE(review): presumably the number of distinct documents behind the summary — confirm in the route.
+    document_count: int
+    # Defaults to a fresh empty list when no citations are supplied.
+    sources: list[SourceCitation] = Field(default_factory=list)
+    timestamp: datetime
+# --- Ingest ---
+class IngestUploadResponse(BaseModel):
+    # Response shape for a file-upload ingest request; all fields are required.
+    # ``job_id`` identifies the ingest job (the jobs models in this module carry the same key).
+    job_id: str
+    status: str
+    total_files: int
+    filenames: list[str]
+    message: str
+class UrlIngestResponse(BaseModel):
+    # Response shape for a URL ingest request; mirrors IngestUploadResponse but reports
+    # a URL count and carries no filename list.
+    job_id: str
+    status: str
+    total_urls: int
+    message: str
+class CollectionItem(BaseModel):
+    # One collection entry as listed by IngestCollectionsResponse.
+    name: str
+    document_count: int
+    # Optional; defaults to None when the creation time is not provided.
+    created_at: datetime | None = None
+class IngestCollectionsResponse(BaseModel):
+    # Collection listing: the items plus a required ``total`` count.
+    # ``collections`` defaults to a fresh empty list.
+    collections: list[CollectionItem] = Field(default_factory=list)
+    total: int
+class IngestDeleteCollectionResponse(BaseModel):
+    # Result of deleting a collection: a human-readable message and the removed-document count.
+    message: str
+    documents_removed: int
+# --- Jobs ---
+class JobStatusResponse(BaseModel):
+    # Detailed status of a single ingest job.
+    job_id: str
+    status: str
+    total_files: int
+    processed_files: int
+    failed_files: int
+    # NOTE(review): presumably 0-100; the range is not constrained by this model.
+    progress_percent: int
+    # Nullable but required: callers must pass the key, using None when the time is unknown.
+    started_at: datetime | None
+    completed_at: datetime | None
+    # Defaults to a fresh empty list.
+    errors: list[str] = Field(default_factory=list)
+class JobListItem(BaseModel):
+    # Compact job summary used as the element type of JobListResponse.jobs.
+    job_id: str
+    status: str
+    total_files: int
+    # Optional here (defaults to None), unlike the required nullable field on JobStatusResponse.
+    completed_at: datetime | None = None
+class JobListResponse(BaseModel):
+    # Job listing: the items plus a required ``total`` count.
+    # ``jobs`` defaults to a fresh empty list.
+    jobs: list[JobListItem] = Field(default_factory=list)
+    total: int
+# --- Audit ---
+class AuditLogEntry(BaseModel):
+    # One row of the audit-log listing. It carries an answer summary and a source count;
+    # the full answer and citations live on AuditLogDetailResponse.
+    query_id: str
+    user_id: str
+    question: str
+    answer_summary: str
+    sources_count: int
+    # Nullable but required (no default is declared).
+    model_used: str | None
+    timestamp: datetime
+class AuditLogsResponse(BaseModel):
+    # Paginated audit-log listing: the entries plus total, limit and offset.
+    # ``logs`` defaults to a fresh empty list; the three counters are required.
+    logs: list[AuditLogEntry] = Field(default_factory=list)
+    total: int
+    limit: int
+    offset: int
+class AuditLogDetailResponse(BaseModel):
+    # Full audit record for one query, including the complete answer and its citations.
+    query_id: str
+    user_id: str
+    question: str
+    full_answer: str
+    # Defaults to a fresh empty list when no citations are supplied.
+    sources: list[SourceCitation] = Field(default_factory=list)
+    # Both fields below are nullable but required (no default is declared).
+    model_used: str | None
+    tokens_used: int | None
+    timestamp: datetime

pyproject.toml ADDED Viewed

	@@ -0,0 +1,34 @@

+[project]
+name = "doc-audi-ai"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "fastapi==0.111.0",
+    "langchain==0.2.0",
+    "langchain-openai==0.1.7",
+    "langchain-community==0.2.0",
+    "langchain-chroma==0.1.4",
+    "langchain-text-splitters==0.2.0",
+    "langchain-anthropic==0.1.15",
+    "langchain-ollama==0.1.3",
+    "chromadb==0.5.0",
+    # Chroma 0.5 calls posthog.capture(distinct_id, event, props); posthog 6+ removed that API (breaks telemetry + spams stderr).
+    "posthog>=3.7.0,<4",
+    "openai==1.30.1",
+    "anthropic==0.28.1",
+    "pydantic-settings==2.3.4",
+    "pymupdf==1.25.5",
+    "python-multipart==0.0.9",
+    "aiosqlite>=0.21.0",
+    "httpx>=0.27.0",
+    "uvicorn[standard]==0.29.0",
+    "huggingface-hub>=1.13.0",
+    "langchain-huggingface>=0.0.3",
+    "streamlit>=1.39.0",
+    "pytest>=8.4.2",
+    "pytest-asyncio>=1.2.0",
+    "onnxruntime==1.23.2 ; sys_platform == 'darwin' and platform_machine == 'x86_64'",
+    "torch==2.2.2 ; sys_platform == 'darwin' and platform_machine == 'x86_64'",
+]

pytest.ini ADDED Viewed

	@@ -0,0 +1,3 @@

+[pytest]
+testpaths = tests
+python_files = test_*.py

rag/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+"""RAG pipeline: load → chunk → embed → store → retrieve → generate.
+Submodules: :mod:`loader`, :mod:`chunker`, :mod:`embedder`, :mod:`vector_store`,
+:mod:`retriever`, and :mod:`hf_hub_inference` for Hugging Face Hub compatibility.
+"""

rag/chunker.py ADDED Viewed

	@@ -0,0 +1,28 @@

+"""Split loaded documents into overlapping chunks for embedding.
+Chunk size and overlap come from :func:`api.config.get_settings`. Each chunk receives
+``chunk_index``, ``source``, and ``page`` metadata.
+"""
+from langchain_core.documents import Document
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from api.config import get_settings
+def chunk_documents(
+    documents: list[Document],
+) -> list[Document]:
+    """Recursive character split of all input documents."""
+    settings = get_settings()
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=settings.chunk_size,
+        chunk_overlap=settings.chunk_overlap,
+        separators=["\n\n", "\n", ". ", " ", ""],
+    )
+    chunks = splitter.split_documents(documents)
+    for idx, chunk in enumerate(chunks):
+        chunk.metadata["chunk_index"] = idx
+        chunk.metadata.setdefault("source", "unknown")
+        chunk.metadata.setdefault("page", 0)
+    return chunks

rag/embedder.py ADDED Viewed

	@@ -0,0 +1,44 @@

+"""Factory for LangChain embedding backends (OpenAI, Ollama, Hugging Face).
+The active provider is ``Settings.llm_provider``. Used by ingest and query paths when
+opening or querying Chroma collections.
+"""
+from langchain_core.embeddings import Embeddings
+from langchain_ollama import OllamaEmbeddings
+from langchain_openai import OpenAIEmbeddings
+from pydantic import SecretStr
+from api.config import get_settings
+from rag.hf_hub_inference import HubInferenceEmbeddings
+def create_embedding_function() -> Embeddings:
+    """Return an ``Embeddings`` implementation matching the configured LLM provider."""
+    settings = get_settings()
+    provider = settings.llm_provider.lower()
+    if provider == "openai":
+        if not settings.openai_api_key:
+            raise ValueError("OPENAI_API_KEY is required when LLM_PROVIDER=openai")
+        return OpenAIEmbeddings(
+            model=settings.openai_embedding_model,
+            api_key=SecretStr(settings.openai_api_key),
+        )
+    if provider == "huggingface":
+        if not settings.huggingface_api_key:
+            raise ValueError(
+                "A Hugging Face token is required when LLM_PROVIDER=huggingface "
+                "(set HUGGINGFACE_API_KEY or HF_TOKEN / HUGGING_FACE_HUB_TOKEN on Spaces)."
+            )
+        return HubInferenceEmbeddings(
+            model=settings.huggingface_embedding_model,
+            api_token=settings.huggingface_api_key,
+        )
+    if provider == "ollama":
+        return OllamaEmbeddings(
+            model=settings.ollama_embedding_model,
+            base_url=settings.ollama_base_url,
+        )
+    raise ValueError(f"Unsupported LLM_PROVIDER: {settings.llm_provider}")

rag/hf_hub_inference.py ADDED Viewed

	@@ -0,0 +1,380 @@

+"""Hugging Face Inference API via ``huggingface_hub.InferenceClient``.
+``langchain_huggingface`` 0.0.x uses ``InferenceClient.post()``, which was removed in
+``huggingface_hub`` 1.x. Chat tries ``InferenceClient.chat_completion`` on the primary
+provider, then (for repo ids containing ``mistral`` when primary is not Novita) Novita,
+which often maps those weights to conversational chat only. On router errors or local
+``ValueError`` (Hub sometimes omits ``pipeline_tag``), we fall back to ``text_generation``
+providers, then the classic **api-inference** ``POST /models/{id}`` JSON API.
+"""
+from __future__ import annotations
+from typing import Any, List, Optional
+import httpx
+import numpy as np
+from langchain_core.embeddings import Embeddings
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage, ToolMessage
+from langchain_core.outputs import ChatGeneration, ChatResult
+from langchain_core.pydantic_v1 import Field, root_validator
+from huggingface_hub import InferenceClient, constants
+from huggingface_hub.errors import BadRequestError, HfHubHTTPError
+def _lc_messages_to_hf_chat(messages: List[BaseMessage]) -> list[dict[str, str]]:
+    """Map LangChain messages to Hugging Face ``chat_completion`` message dicts."""
+    out: list[dict[str, str]] = []
+    for m in messages:
+        content = m.content if isinstance(m.content, str) else str(m.content)
+        if isinstance(m, SystemMessage):
+            out.append({"role": "system", "content": content})
+        elif isinstance(m, HumanMessage):
+            out.append({"role": "user", "content": content})
+        elif isinstance(m, AIMessage):
+            out.append({"role": "assistant", "content": content})
+        elif isinstance(m, ToolMessage):
+            out.append({"role": "user", "content": f"[tool result]\n{content}"})
+        else:
+            out.append({"role": "user", "content": content})
+    return out
+def _messages_to_text_generation_prompt(repo_id: str, messages: List[BaseMessage]) -> str:
+    """Build a single prompt for causal / text-generation APIs (instruct templates)."""
+    blocks: list[str] = []
+    for m in messages:
+        content = m.content if isinstance(m.content, str) else str(m.content)
+        if isinstance(m, SystemMessage):
+            blocks.append(content)
+        elif isinstance(m, HumanMessage):
+            blocks.append(content)
+        elif isinstance(m, AIMessage):
+            blocks.append(content)
+        elif isinstance(m, ToolMessage):
+            blocks.append(f"[tool]\n{content}")
+        else:
+            blocks.append(content)
+    body = "\n\n".join(blocks)
+    rid = repo_id.lower()
+    if "mistral" in rid:
+        return f"<s>[INST] {body} [/INST]"
+    return f"{body}\n\nAssistant:\n"
+def _chat_completion_text_and_usage(out: Any) -> tuple[str, dict[str, int] | None]:
+    """Extract assistant text and optional token usage from ``ChatCompletionOutput``."""
+    choices = getattr(out, "choices", None) or []
+    if not choices:
+        return (str(out).strip(), None)
+    msg = getattr(choices[0], "message", None)
+    text = (getattr(msg, "content", None) or "").strip() if msg is not None else ""
+    usage_meta: dict[str, int] | None = None
+    u = getattr(out, "usage", None)
+    if u is not None:
+        usage_meta = {}
+        tt = getattr(u, "total_tokens", None)
+        pt = getattr(u, "prompt_tokens", None)
+        ct = getattr(u, "completion_tokens", None)
+        if tt is not None:
+            usage_meta["total_tokens"] = int(tt)
+        if pt is not None:
+            usage_meta["input_tokens"] = int(pt)
+        if ct is not None:
+            usage_meta["output_tokens"] = int(ct)
+        if not usage_meta:
+            usage_meta = None
+    return text, usage_meta
+def _legacy_api_text_generation(
+    model_id: str,
+    api_token: str,
+    prompt: str,
+    *,
+    max_new_tokens: int,
+    temperature: float,
+    stop: list[str] | None,
+) -> str:
+    """Classic HF Inference API (bypasses strict ``InferenceClient`` task checks)."""
+    url = f"{constants.INFERENCE_ENDPOINT.rstrip('/')}/models/{model_id}"
+    parameters: dict[str, Any] = {
+        "max_new_tokens": max_new_tokens,
+        "temperature": temperature,
+        "return_full_text": False,
+    }
+    if stop:
+        parameters["stop"] = stop
+    body = {"inputs": prompt, "parameters": parameters}
+    headers = {"Authorization": f"Bearer {api_token}"}
+    timeout = httpx.Timeout(60.0, read=300.0)
+    with httpx.Client(timeout=timeout) as client:
+        resp = client.post(url, json=body, headers=headers)
+        try:
+            resp.raise_for_status()
+        except httpx.HTTPStatusError as exc:
+            _raise_legacy_inference_http_error(model_id, exc)
+    data = resp.json()
+    if isinstance(data, dict) and data.get("error"):
+        raise RuntimeError(str(data["error"]))
+    if isinstance(data, list) and data:
+        first = data[0]
+        if isinstance(first, dict) and "generated_text" in first:
+            return str(first["generated_text"]).strip()
+    if isinstance(data, dict) and "generated_text" in data:
+        return str(data["generated_text"]).strip()
+    raise RuntimeError(f"Unexpected legacy inference response: {data!r}")
+class LegacyInferenceNotFoundError(RuntimeError):
+    """Classic ``api-inference`` returned 404 for this model id (weights not on that route).
+    Raised by :func:`_raise_legacy_inference_http_error` with user-facing guidance;
+    ``HubInferenceChatModel._generate`` re-raises it unchanged instead of wrapping it.
+    """
+def _raise_legacy_inference_http_error(model_id: str, exc: httpx.HTTPStatusError) -> None:
+    if exc.response.status_code == 404:
+        raise LegacyInferenceNotFoundError(
+            f"Hugging Face legacy inference returned 404 for model {model_id!r}. "
+            "The classic api-inference route often no longer serves this checkpoint, and router chat "
+            "can 404 as well depending on provider health. Try "
+            "HUGGINGFACE_MODEL=meta-llama/Meta-Llama-3-8B-Instruct (or another id your token can call), "
+            "another model id your token can reach, or Ollama/local inference."
+        ) from exc
+    raise exc
+class HubInferenceEmbeddings(Embeddings):
+    """Embeddings through ``InferenceClient.feature_extraction``."""
+    def __init__(self, *, model: str, api_token: str) -> None:
+        self._model = model
+        self._client = InferenceClient(model=model, token=api_token or None)
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        out: list[list[float]] = []
+        for text in texts:
+            t = text.replace("\n", " ")
+            raw = self._client.feature_extraction(t, model=self._model)
+            vec = np.asarray(raw, dtype=np.float32)
+            if vec.ndim > 1:
+                vec = vec.mean(axis=0)
+            out.append(vec.flatten().tolist())
+        return out
+    def embed_query(self, text: str) -> List[float]:
+        return self.embed_documents([text])[0]
+class HubInferenceChatModel(BaseChatModel):
+    """HF Inference: ``chat_completion`` when supported, else ``text_generation`` fallback."""
+    repo_id: str = Field(..., description="Hugging Face model id for inference")
+    huggingfacehub_api_token: str = Field(..., repr=False)
+    temperature: float = Field(default=0.2)
+    max_new_tokens: int = Field(default=2048)
+    inference_provider: Optional[str] = Field(
+        default=None,
+        description=(
+            "huggingface_hub provider id. Default is hf-inference (avoids Novita-only mappings). "
+            "Set to `auto` for router auto-routing (provider=None)."
+        ),
+    )
+    class Config:
+        """Pydantic v1 config."""
+        arbitrary_types_allowed = True
+    # Primary InferenceClient; built by ``_build_client`` unless the caller supplies one.
+    client: Any = Field(default=None, exclude=True)
+    @root_validator(skip_on_failure=True)
+    def _build_client(cls, values: dict) -> dict:
+        """Normalise ``inference_provider`` and create the primary client.
+        A caller-supplied ``client`` is kept as is. Otherwise a missing or blank provider
+        is stored as ``"hf-inference"``, ``"auto"`` (any case) is stored as ``"auto"`` and
+        builds the client with ``provider=None``, and any other value is used verbatim.
+        """
+        if values.get("client") is not None:
+            return values
+        raw = values.get("inference_provider")
+        if isinstance(raw, str):
+            raw = raw.strip() or None
+        # Auto-routing often picks Novita for Mistral instruct; Novita maps that model to
+        # "conversational" only, so text_generation fails. Default to HF's inference proxy.
+        if raw is None:
+            client_provider: str | None = "hf-inference"
+            stored = "hf-inference"
+        elif raw.lower() == "auto":
+            client_provider = None
+            stored = "auto"
+        else:
+            client_provider = raw
+            stored = raw
+        values["inference_provider"] = stored
+        values["client"] = InferenceClient(
+            model=values["repo_id"],
+            token=values.get("huggingfacehub_api_token") or None,
+            provider=client_provider,
+        )
+        return values
+    def _chat_inference_clients(self) -> list[InferenceClient]:
+        """Ordered ``InferenceClient`` instances for ``chat_completion``.
+        - Primary client (usually ``hf-inference`` when unset).
+        - For Mistral instruct ids, Novita often exposes **conversational** chat while HF task checks
+          or ``hf-inference`` reject the same repo.
+        - When primary is ``hf-inference``, append **router auto** (``provider=None``): many models
+          (e.g. Llama 3.1 Instruct) return *Model not supported by provider hf-inference* on the
+          serverless HF proxy but work via the inference router to another provider.
+        """
+        token = self.huggingfacehub_api_token or None
+        rid = self.repo_id
+        clients: list[InferenceClient] = [self.client]
+        ip = (self.inference_provider or "").strip().lower()
+        if "mistral" in rid.lower() and ip != "novita":
+            clients.append(InferenceClient(model=rid, token=token, provider="novita"))
+        if ip == "hf-inference":
+            clients.append(InferenceClient(model=rid, token=token, provider=None))
+        return clients
+    @property
+    def _llm_type(self) -> str:
+        """Identifier string for this chat model implementation."""
+        return "hf-hub-inference"
+    @property
+    def _identifying_params(self) -> dict[str, Any]:
+        """Model id, sampling settings and provider; the API token is not included."""
+        return {
+            "repo_id": self.repo_id,
+            "temperature": self.temperature,
+            "max_new_tokens": self.max_new_tokens,
+            "inference_provider": self.inference_provider,
+        }
+    def _text_generation_fallback(self, messages: List[BaseMessage], stop: Optional[List[str]]) -> str:
+        """Generate text through ``text_generation`` providers, then the legacy HTTP API.
+        Providers are tried in order: the configured one (``auto`` means ``provider=None``;
+        ``hf-inference`` is not added twice), then ``hf-inference``, then router auto,
+        with duplicates removed. Any exception from a provider moves on to the next. If
+        all fail, the legacy API is called and its exception, chained from the last
+        provider error, is what propagates.
+        """
+        prompt = _messages_to_text_generation_prompt(self.repo_id, messages)
+        token = self.huggingfacehub_api_token
+        rid = self.repo_id
+        chain_raw: list[str | None] = []
+        p = (self.inference_provider or "").strip()
+        if p.lower() == "auto":
+            chain_raw.append(None)
+        elif p and p.lower() != "hf-inference":
+            chain_raw.append(p)
+        chain_raw.append("hf-inference")
+        chain_raw.append(None)
+        # De-duplicate while keeping order; None (router auto) is keyed as "__auto__".
+        chain: list[str | None] = []
+        seen: set[str] = set()
+        for prov in chain_raw:
+            key = prov if prov is not None else "__auto__"
+            if key in seen:
+                continue
+            seen.add(key)
+            chain.append(prov)
+        last: Exception | None = None
+        for prov in chain:
+            try:
+                cli = InferenceClient(model=rid, token=token, provider=prov)
+                raw = cli.text_generation(
+                    prompt,
+                    model=rid,
+                    max_new_tokens=self.max_new_tokens,
+                    temperature=self.temperature,
+                    stop=stop,
+                    return_full_text=False,
+                )
+                return (raw if isinstance(raw, str) else str(raw)).strip()
+            except Exception as exc:
+                # Deliberately broad: any provider failure just advances to the next one.
+                last = exc
+                continue
+        try:
+            return _legacy_api_text_generation(
+                rid,
+                token,
+                prompt,
+                max_new_tokens=self.max_new_tokens,
+                temperature=self.temperature,
+                stop=stop,
+            )
+        except Exception as legacy_exc:
+            if last is not None:
+                # Prefer the legacy endpoint error (e.g. explicit 404 guidance) over the last
+                # provider text_generation failure (often a task-mapping ValueError).
+                raise legacy_exc from last
+            raise legacy_exc
+    def _generate(
+        self,
+        messages: List[BaseMessage],
+        stop: Optional[List[str]] = None,
+        run_manager: Optional[Any] = None,
+        **kwargs: Any,
+    ) -> ChatResult:
+        """Produce one chat generation, trying chat clients first and text generation last.
+        Each client from ``_chat_inference_clients`` is tried in order. Recoverable
+        failures (selected ``BadRequestError`` messages, HTTP 404/410, ``ValueError``)
+        move on to the next client; other ``BadRequestError`` / ``HfHubHTTPError`` are
+        re-raised immediately. If every client fails and the last error says the model
+        was not found, a ``RuntimeError`` with gated-model guidance is raised. Otherwise
+        ``_text_generation_fallback`` runs: ``LegacyInferenceNotFoundError`` propagates
+        unchanged and any other failure is wrapped in ``RuntimeError``. Token usage is
+        only attached on the chat path. ``run_manager`` and ``kwargs`` are not used.
+        """
+        chat_payload = _lc_messages_to_hf_chat(messages)
+        last_chat_err: BaseException | None = None
+        for cli in self._chat_inference_clients():
+            try:
+                out = cli.chat_completion(
+                    chat_payload,
+                    model=self.repo_id,
+                    max_tokens=self.max_new_tokens,
+                    temperature=self.temperature,
+                    stop=stop,
+                )
+                text, usage_meta = _chat_completion_text_and_usage(out)
+                message = AIMessage(content=text, usage_metadata=usage_meta)
+                return ChatResult(generations=[ChatGeneration(message=message)])
+            except BadRequestError as exc:
+                last_chat_err = exc
+                err = str(exc).lower()
+                if (
+                    "not a chat model" in err
+                    or "model_not_supported" in err
+                    or "not supported by provider" in err
+                    # Defer to post-loop handling so we can explain gated / unknown ids without masking
+                    # earlier recoverable errors from another client.
+                    or "model_not_found" in err
+                    or ("does not exist" in err and "model" in err)
+                ):
+                    continue
+                raise
+            except HfHubHTTPError as exc:
+                last_chat_err = exc
+                code = getattr(exc.response, "status_code", None)
+                # Novita/router may 404 a model or route; try remaining clients then completion fallbacks.
+                if code in (404, 410):
+                    continue
+                raise
+            except ValueError as exc:
+                # e.g. hf-inference _check_supported_task when Hub model card has no pipeline_tag
+                last_chat_err = exc
+                continue
+        # Only the error from the last client tried decides whether the gated-model hint is raised.
+        if last_chat_err is not None and isinstance(last_chat_err, BadRequestError):
+            le = str(last_chat_err).lower()
+            if "model_not_found" in le or (
+                "does not exist" in le and ("model" in le or "requested model" in le)
+            ):
+                raise RuntimeError(
+                    f"Inference router could not use chat model {self.repo_id!r} "
+                    "(common for gated models: open the model page on the Hugging Face Hub, accept the "
+                    "license, ensure your API token has read access to that model, then retry)."
+                ) from last_chat_err
+        try:
+            text = self._text_generation_fallback(messages, stop)
+        except LegacyInferenceNotFoundError:
+            raise
+        except Exception as exc:
+            hint = (
+                f"Hugging Face chat_completion failed for {self.repo_id!r} on all tried providers; "
+                "text_generation / legacy fallbacks also failed. "
+                "Accept the model license on the Hub, check your token, or set "
+                "HUGGINGFACE_INFERENCE_PROVIDER=auto to use only router routing."
+            )
+            if last_chat_err is not None:
+                raise RuntimeError(f"{hint} Last chat error: {last_chat_err!r}") from exc
+            raise RuntimeError(hint) from exc
+        # The text-generation path returns a bare string, so no usage metadata is available.
+        message = AIMessage(content=text, usage_metadata=None)
+        return ChatResult(generations=[ChatGeneration(message=message)])

rag/loader.py ADDED Viewed

	@@ -0,0 +1,35 @@

+"""Load raw documents from disk into LangChain ``Document`` objects.
+Supports PDF (PyMuPDF), plain text, and Markdown. Each document gets ``source`` and
+``page`` metadata for downstream chunking and citations.
+"""
+from pathlib import Path
+from langchain_core.documents import Document
+from langchain_community.document_loaders import PyMuPDFLoader, TextLoader
+def load_documents(paths: str | list[str]) -> list[Document]:
+    """Load one or more files; raise ``ValueError`` for unsupported extensions."""
+    normalized_paths = [paths] if isinstance(paths, str) else paths
+    all_docs: list[Document] = []
+    for path_str in normalized_paths:
+        path = Path(path_str)
+        suffix = path.suffix.lower()
+        if suffix == ".pdf":
+            loader = PyMuPDFLoader(str(path_str))
+        elif suffix in {".txt", ".md"}:
+            loader = TextLoader(str(path_str), encoding="utf-8")
+        else:
+            raise ValueError(f"Unsupported file type: {suffix or 'unknown'}")
+        documents = loader.load()
+        for doc in documents:
+            doc.metadata.setdefault("source", path.name)
+            doc.metadata.setdefault("page", 0)
+        all_docs.extend(documents)
+    return all_docs

rag/retriever.py ADDED Viewed

	@@ -0,0 +1,218 @@

+"""Semantic retrieval and grounded LLM generation for ask and summarise flows.
+Pipeline: similarity search on Chroma → relevance filter → provider-specific chat model
+→ answer with citations. Prompt templates enforce document-only answers for consulting use.
+"""
+from dataclasses import dataclass
+from langchain_chroma import Chroma
+from langchain_core.language_models import BaseChatModel
+from langchain_core.messages import HumanMessage, SystemMessage
+from langchain_ollama import ChatOllama
+from langchain_openai import ChatOpenAI
+from pydantic import SecretStr
+try:
+    from langchain_anthropic import ChatAnthropic
+except ImportError:
+    ChatAnthropic = None  # type: ignore[assignment]
+from api.config import Settings
+from rag.hf_hub_inference import HubInferenceChatModel
+NO_MATCH_ANSWER = "I cannot find this information in the uploaded documents."
+MIN_RELEVANCE_SCORE = 0.15
+# Verbatim from DOCUAUDIT_AI_REQUIREMENTS.md (placeholders filled at runtime).
+DOCUAUDIT_ASK_TEMPLATE = """You are DocuAudit AI, an expert document analyst for consulting environments.
+RULES:
+1. Answer ONLY based on the provided document excerpts below.
+2. If the answer is not in the documents, say: "I cannot find this information in the uploaded documents."
+3. ALWAYS cite your sources: mention the document name and page number for every claim.
+4. Be precise and professional. This is a high-stakes consulting environment.
+5. Do not speculate or add information not present in the documents.
+DOCUMENT EXCERPTS:
+{context}
+QUESTION: {question}
+ANSWER (with source citations):
+"""
+@dataclass
+class RetrievedChunk:
+    """One search hit with metadata needed for prompts and API citations."""
+    # Chunk body, taken from the document's ``page_content``.
+    text: str
+    # Relevance score from the similarity search; the grounding functions keep chunks
+    # whose score is None and drop those below MIN_RELEVANCE_SCORE.
+    score: float | None
+    # ``source`` metadata as a string; "unknown" when the metadata has none.
+    source: str
+    # ``page`` / ``chunk_index`` metadata passed through ``_to_int_or_none``.
+    page: int | None
+    chunk_index: int | None
+def retrieve_chunks(vector_store: Chroma, question: str, k: int) -> list[RetrievedChunk]:
+    """Top-K similarity search with relevance scores from Chroma/LangChain."""
+    results = vector_store.similarity_search_with_relevance_scores(question, k=k)
+    chunks: list[RetrievedChunk] = []
+    for doc, score in results:
+        metadata = doc.metadata or {}
+        chunks.append(
+            RetrievedChunk(
+                text=doc.page_content,
+                score=score,
+                source=str(metadata.get("source", "unknown")),
+                page=_to_int_or_none(metadata.get("page")),
+                chunk_index=_to_int_or_none(metadata.get("chunk_index")),
+            )
+        )
+    return chunks
+SUMMARY_RETRIEVAL_QUERY = (
+    "Overview of the document: main topics, key definitions, obligations, risks, and conclusions."
+)
+def answer_with_grounding(settings: Settings, question: str, chunks: list[RetrievedChunk]) -> tuple[str, int]:
+    """Generate a cited answer from chunks; return ``(answer_text, token_count)``."""
+    ranked_chunks = [chunk for chunk in chunks if chunk.score is None or chunk.score >= MIN_RELEVANCE_SCORE]
+    if not ranked_chunks:
+        return NO_MATCH_ANSWER, 0
+    llm = _create_chat_model(settings)
+    prompt_context = _format_context(ranked_chunks)
+    user_content = DOCUAUDIT_ASK_TEMPLATE.format(context=prompt_context, question=question)
+    messages = [HumanMessage(content=user_content)]
+    response = llm.invoke(messages)
+    answer = _extract_message_text(response).strip()
+    tokens = _extract_usage_tokens(response)
+    return (answer or NO_MATCH_ANSWER), tokens
+def summarise_with_grounding(
+    settings: Settings,
+    *,
+    focus: str | None,
+    chunks: list[RetrievedChunk],
+) -> tuple[str, int]:
+    """Produce a structured summary grounded in retrieved excerpts."""
+    ranked_chunks = [chunk for chunk in chunks if chunk.score is None or chunk.score >= MIN_RELEVANCE_SCORE]
+    if not ranked_chunks:
+        return NO_MATCH_ANSWER, 0
+    llm = _create_chat_model(settings)
+    prompt_context = _format_context(ranked_chunks)
+    user_instruction = (
+        focus.strip()
+        if focus and focus.strip()
+        else "Summarise the main themes, structure, and important details. Use bullet points where helpful."
+    )
+    messages = [
+        SystemMessage(
+            content=(
+                "You write accurate summaries using only the provided document excerpts. "
+                "Do not invent facts. If the excerpts are insufficient, say what is missing."
+            )
+        ),
+        HumanMessage(
+            content=(
+                f"Summary request: {user_instruction}\n\n"
+                f"Document excerpts:\n{prompt_context}\n\n"
+                "Return a structured, concise summary grounded in the excerpts above."
+            )
+        ),
+    ]
+    response = llm.invoke(messages)
+    answer = _extract_message_text(response).strip()
+    tokens = _extract_usage_tokens(response)
+    return (answer or NO_MATCH_ANSWER), tokens
+def _create_chat_model(settings: Settings) -> BaseChatModel:
+    provider = settings.llm_provider.lower()
+    if provider == "openai":
+        if not settings.openai_api_key:
+            raise ValueError("OPENAI_API_KEY is required when LLM_PROVIDER=openai")
+        return ChatOpenAI(model=settings.openai_model, api_key=SecretStr(settings.openai_api_key))
+    if provider == "ollama":
+        return ChatOllama(model=settings.ollama_chat_model, base_url=settings.ollama_base_url)
+    if provider == "anthropic":
+        if ChatAnthropic is None:
+            raise ValueError("langchain-anthropic is not installed for LLM_PROVIDER=anthropic")
+        if not settings.anthropic_api_key:
+            raise ValueError("ANTHROPIC_API_KEY is required when LLM_PROVIDER=anthropic")
+        return ChatAnthropic(model=settings.anthropic_model, api_key=SecretStr(settings.anthropic_api_key))
+    if provider == "huggingface":
+        if not settings.huggingface_api_key:
+            raise ValueError(
+                "A Hugging Face token is required when LLM_PROVIDER=huggingface "
+                "(set HUGGINGFACE_API_KEY or HF_TOKEN / HUGGING_FACE_HUB_TOKEN on Spaces)."
+            )
+        return HubInferenceChatModel(
+            repo_id=settings.huggingface_model,
+            huggingfacehub_api_token=settings.huggingface_api_key,
+            temperature=0.2,
+            max_new_tokens=2048,
+            inference_provider=settings.huggingface_inference_provider,
+        )
+    raise ValueError(f"Unsupported LLM_PROVIDER: {settings.llm_provider}")
+def _format_context(chunks: list[RetrievedChunk]) -> str:
+    lines: list[str] = []
+    for idx, chunk in enumerate(chunks, start=1):
+        lines.append(
+            f"[{idx}] source={chunk.source}, page={chunk.page}, chunk={chunk.chunk_index}, score={chunk.score}\n"
+            f"{chunk.text}"
+        )
+    return "\n\n".join(lines)
+def _to_int_or_none(value: object) -> int | None:
+    try:
+        if value is None:
+            return None
+        return int(value)
+    except (TypeError, ValueError):
+        return None
+def _extract_usage_tokens(response: object) -> int:
+    um = getattr(response, "usage_metadata", None)
+    if isinstance(um, dict):
+        total = um.get("total_tokens")
+        if total is not None:
+            return int(total)
+        inp = int(um.get("input_tokens", 0) or 0)
+        out = int(um.get("output_tokens", 0) or 0)
+        return inp + out
+    rm = getattr(response, "response_metadata", None) or {}
+    if isinstance(rm, dict):
+        tu = rm.get("token_usage")
+        if isinstance(tu, dict):
+            if tu.get("total_tokens") is not None:
+                return int(tu["total_tokens"])
+            return int(tu.get("prompt_tokens", 0) or 0) + int(tu.get("completion_tokens", 0) or 0)
+    return 0
+def _extract_message_text(response: object) -> str:
+    content = getattr(response, "content", "")
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        text_parts: list[str] = []
+        for item in content:
+            if isinstance(item, str):
+                text_parts.append(item)
+            elif isinstance(item, dict) and "text" in item:
+                text_parts.append(str(item["text"]))
+        return "\n".join(part for part in text_parts if part)
+    return str(content)

rag/vector_store.py ADDED Viewed

	@@ -0,0 +1,125 @@

+"""ChromaDB persistence and LangChain ``Chroma`` vector store helpers.
+Collections are named per ingest target; documents are stored with UUID chunk ids.
+Telemetry is disabled at the client level for quieter logs in production.
+"""
+from datetime import datetime, timezone
+from pathlib import Path
+from uuid import uuid4
+import chromadb
+from chromadb.config import Settings
+from langchain_chroma import Chroma
+from langchain_core.documents import Document
+from langchain_core.embeddings import Embeddings
+# Chroma settings object reused by every client this module builds; anonymized telemetry is off.
+_CHROMA_CLIENT_SETTINGS = Settings(anonymized_telemetry=False)
+def _utc_now_iso() -> str:
+    return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
+def _chroma_client(persist_directory: str) -> chromadb.PersistentClient:
+    Path(persist_directory).mkdir(parents=True, exist_ok=True)
+    return chromadb.PersistentClient(path=persist_directory, settings=_CHROMA_CLIENT_SETTINGS)
+def get_vector_store(
+    persist_directory: str,
+    collection_name: str,
+    embedding_function: Embeddings,
+) -> Chroma:
+    """Open or create a persisted Chroma collection wired to the given embedder."""
+    client = _chroma_client(persist_directory)
+    try:
+        client.get_collection(name=collection_name)
+    except Exception:
+        client.get_or_create_collection(
+            name=collection_name,
+            metadata={"created_at": _utc_now_iso()},
+        )
+    return Chroma(
+        collection_name=collection_name,
+        embedding_function=embedding_function,
+        persist_directory=persist_directory,
+        client_settings=_CHROMA_CLIENT_SETTINGS,
+    )
+def add_documents(vector_store: Chroma, chunks: list[Document]) -> list[str]:
+    """Embed and insert chunks; return the generated vector ids."""
+    document_ids = [str(uuid4()) for _ in chunks]
+    vector_store.add_documents(documents=chunks, ids=document_ids)
+    return document_ids
+def list_collection_names(persist_directory: str) -> list[str]:
+    """Sorted list of collection names in the persist directory."""
+    client = _chroma_client(persist_directory)
+    return sorted(c.name for c in client.list_collections())
+def delete_collection(persist_directory: str, collection_name: str) -> int:
+    """Delete a collection and return the number of documents that were removed (best effort)."""
+    client = _chroma_client(persist_directory)
+    removed = 0
+    try:
+        col = client.get_collection(name=collection_name)
+        removed = int(col.count())
+    except Exception:
+        removed = 0
+    client.delete_collection(name=collection_name)
+    return removed
+def collection_document_count(persist_directory: str, collection_name: str) -> int:
+    """Number of vectors in a collection, or 0 if the collection does not exist."""
+    client = _chroma_client(persist_directory)
+    try:
+        col = client.get_collection(name=collection_name)
+        return int(col.count())
+    except Exception:
+        return 0
+def collection_created_at(persist_directory: str, collection_name: str) -> str | None:
+    """Return collection metadata ``created_at`` if present (Chroma-specific)."""
+    client = _chroma_client(persist_directory)
+    try:
+        col = client.get_collection(name=collection_name)
+        meta = getattr(col, "metadata", None) or {}
+        if isinstance(meta, dict):
+            raw = meta.get("created_at") or meta.get("created")
+            if raw is not None:
+                return str(raw)
+    except Exception:
+        pass
+    return None
+def ensure_collection_created_at(
+    persist_directory: str,
+    collection_name: str,
+    *,
+    fallback: str | None = None,
+) -> str | None:
+    """Persist ``created_at`` on the Chroma collection when missing; never overwrites an existing value."""
+    client = _chroma_client(persist_directory)
+    try:
+        col = client.get_collection(name=collection_name)
+    except Exception:
+        return None
+    meta = getattr(col, "metadata", None) or {}
+    if not isinstance(meta, dict):
+        meta = {}
+    raw = meta.get("created_at") or meta.get("created")
+    if raw is not None:
+        return str(raw)
+    value = fallback or _utc_now_iso()
+    updated = dict(meta)
+    updated["created_at"] = value
+    col.modify(metadata=updated)
+    return value

requirements.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+fastapi==0.111.0
+uvicorn[standard]==0.29.0
+pydantic-settings==2.3.4
+langchain==0.2.0
+langchain-openai==0.1.7
+langchain-community==0.2.0
+langchain-chroma==0.1.4
+langchain-text-splitters==0.2.0
+langchain-anthropic==0.1.15
+langchain-ollama==0.1.3
+chromadb==0.5.0
+posthog>=3.7.0,<4
+openai==1.30.1
+anthropic==0.28.1
+pymupdf==1.25.5
+python-multipart==0.0.9
+aiosqlite
+httpx>=0.27.0
+huggingface-hub
+langchain-huggingface
+streamlit>=1.39.0
+pytest>=8.4.2
+pytest-asyncio>=1.2.0

sample.txt ADDED Viewed

	@@ -0,0 +1,16 @@

+Doc-Audi-AI RAG Smoke Test Document
+Project: Doc-Audi-AI
+Environment: Lightning AI deployment with Ollama embeddings.
+This sample document is used to test ingestion and retrieval.
+The system should split this file into chunks, generate embeddings, and store vectors in Chroma.
+Key facts:
+- The project supports file ingestion for PDF, TXT, and MD formats.
+- The default collection name for tests is "default".
+- A typical retrieval question is: "What is this document about?"
+- Another test question is: "Which file formats are supported?"
+Expected behavior:
+If ingestion succeeds, querying should return text snippets from this document with relevance scores.

storage/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Persistence layer: SQLite audit log and ingest job tracking."""

storage/audit_store.py ADDED Viewed

	@@ -0,0 +1,295 @@

+"""SQLite persistence for query and summarise audit events.
+Schema is created/migrated on first use. Stores full answers, citation JSON, token usage,
+and optional filters (user_id, date range) for list endpoints.
+"""
+import json
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+from uuid import uuid4
+import aiosqlite
+from models.responses import AuditLogDetailResponse, AuditLogEntry, SourceCitation
+def _utc_now_iso() -> str:
+    return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
+def _parse_ts(value: object) -> datetime:
+    if value is None or value == "":
+        return datetime.now(timezone.utc)
+    s = str(value).strip()
+    if s.endswith("Z"):
+        s = s[:-1] + "+00:00"
+    try:
+        dt = datetime.fromisoformat(s)
+        if dt.tzinfo is None:
+            return dt.replace(tzinfo=timezone.utc)
+        return dt
+    except ValueError:
+        return datetime.now(timezone.utc)
+async def _migrate_audit_columns(conn: aiosqlite.Connection) -> None:
+    cursor = await conn.execute("PRAGMA table_info(audit_events)")
+    rows = await cursor.fetchall()
+    col_names = {str(r[1]) for r in rows}
+    alters: list[str] = []
+    if "user_id" not in col_names:
+        alters.append("ALTER TABLE audit_events ADD COLUMN user_id TEXT NOT NULL DEFAULT 'anonymous'")
+    if "model_used" not in col_names:
+        alters.append("ALTER TABLE audit_events ADD COLUMN model_used TEXT")
+    if "tokens_used" not in col_names:
+        alters.append("ALTER TABLE audit_events ADD COLUMN tokens_used INTEGER")
+    if "response_time_ms" not in col_names:
+        alters.append("ALTER TABLE audit_events ADD COLUMN response_time_ms INTEGER")
+    if "answer_summary" not in col_names:
+        alters.append("ALTER TABLE audit_events ADD COLUMN answer_summary TEXT")
+    if "kind" not in col_names:
+        alters.append("ALTER TABLE audit_events ADD COLUMN kind TEXT NOT NULL DEFAULT 'ask'")
+    for stmt in alters:
+        await conn.execute(stmt)
+    if alters:
+        await conn.commit()
+async def init_audit_db(db_path: str) -> None:
+    """Create ``audit_events`` table and apply additive column migrations."""
+    db_file = Path(db_path)
+    db_file.parent.mkdir(parents=True, exist_ok=True)
+    async with aiosqlite.connect(db_file.as_posix()) as conn:
+        await conn.execute(
+            """
+            CREATE TABLE IF NOT EXISTS audit_events (
+                event_id TEXT PRIMARY KEY,
+                action TEXT NOT NULL,
+                question TEXT NOT NULL,
+                collection_name TEXT NOT NULL,
+                answer TEXT,
+                status TEXT NOT NULL,
+                message TEXT NOT NULL,
+                sources_json TEXT NOT NULL,
+                results_json TEXT NOT NULL,
+                created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
+                user_id TEXT NOT NULL DEFAULT 'anonymous',
+                model_used TEXT,
+                tokens_used INTEGER,
+                response_time_ms INTEGER,
+                answer_summary TEXT,
+                kind TEXT NOT NULL DEFAULT 'ask'
+            )
+            """
+        )
+        await conn.commit()
+        await _migrate_audit_columns(conn)
+def _summary_from_answer(answer: str, max_len: int = 280) -> str:
+    text = (answer or "").strip()
+    if len(text) <= max_len:
+        return text
+    return text[: max_len - 1].rstrip() + "…"
+def _sources_to_citations(raw: list[dict[str, Any]]) -> list[SourceCitation]:
+    out: list[SourceCitation] = []
+    for item in raw:
+        if not isinstance(item, dict):
+            continue
+        if "document_name" in item:
+            doc = str(item.get("document_name", ""))
+            page = int(item.get("page_number", 0) or 0)
+            chunk = str(item.get("chunk_text", ""))
+            score = float(item.get("relevance_score", 0.0) or 0.0)
+        else:
+            doc = str(item.get("source", item.get("document_name", "")))
+            p = item.get("page_number", item.get("page"))
+            try:
+                page = int(p) if p is not None else 0
+            except (TypeError, ValueError):
+                page = 0
+            chunk = str(item.get("chunk_text", item.get("excerpt", item.get("text", ""))))
+            s = item.get("relevance_score", item.get("score"))
+            try:
+                score = float(s) if s is not None else 0.0
+            except (TypeError, ValueError):
+                score = 0.0
+        out.append(
+            SourceCitation(
+                document_name=doc or "unknown",
+                page_number=page,
+                chunk_text=chunk,
+                relevance_score=score,
+            )
+        )
+    return out
+async def persist_query_audit(
+    db_path: str,
+    *,
+    query_id: str,
+    action: str,
+    user_id: str,
+    question: str,
+    collection_name: str,
+    answer: str,
+    sources: list[SourceCitation],
+    model_used: str,
+    tokens_used: int,
+    response_time_ms: int,
+    status: str = "success",
+    message: str = "ok",
+    kind: str = "ask",
+) -> str:
+    """Insert one audit row after a successful ask or summarise; returns ``query_id``."""
+    await init_audit_db(db_path)
+    sources_payload = [s.model_dump(mode="json") for s in sources]
+    summary = _summary_from_answer(answer)
+    created = _utc_now_iso()
+    async with aiosqlite.connect(db_path) as conn:
+        await conn.execute(
+            """
+            INSERT INTO audit_events (
+                event_id, action, question, collection_name, answer, status, message,
+                sources_json, results_json, created_at, user_id, model_used, tokens_used,
+                response_time_ms, answer_summary, kind
+            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, '[]', ?, ?, ?, ?, ?, ?, ?)
+            """,
+            (
+                query_id,
+                action,
+                question,
+                collection_name,
+                answer,
+                status,
+                message,
+                json.dumps(sources_payload),
+                created,
+                user_id,
+                model_used,
+                tokens_used,
+                response_time_ms,
+                summary,
+                kind,
+            ),
+        )
+        await conn.commit()
+    return query_id
+async def count_audit_events(
+    db_path: str,
+    *,
+    user_id: str | None = None,
+    from_date: str | None = None,
+    to_date: str | None = None,
+) -> int:
+    await init_audit_db(db_path)
+    where, params = _audit_filters(user_id, from_date, to_date)
+    async with aiosqlite.connect(db_path) as conn:
+        cur = await conn.execute(f"SELECT COUNT(*) AS c FROM audit_events {where}", params)
+        row = await cur.fetchone()
+    return int(row[0]) if row else 0
+def _audit_filters(user_id: str | None, from_date: str | None, to_date: str | None) -> tuple[str, list[Any]]:
+    clauses: list[str] = []
+    params: list[Any] = []
+    if user_id:
+        clauses.append("user_id = ?")
+        params.append(user_id)
+    if from_date:
+        clauses.append("datetime(created_at) >= datetime(?)")
+        params.append(from_date)
+    if to_date:
+        clauses.append("datetime(created_at) <= datetime(?)")
+        params.append(to_date)
+    if not clauses:
+        return "", []
+    return "WHERE " + " AND ".join(clauses), params
+async def list_audit_events(
+    db_path: str,
+    *,
+    limit: int,
+    offset: int,
+    user_id: str | None = None,
+    from_date: str | None = None,
+    to_date: str | None = None,
+) -> tuple[list[AuditLogEntry], int]:
+    """Paginated audit list with optional user and ISO datetime filters."""
+    await init_audit_db(db_path)
+    where, fparams = _audit_filters(user_id, from_date, to_date)
+    total = await count_audit_events(db_path, user_id=user_id, from_date=from_date, to_date=to_date)
+    async with aiosqlite.connect(db_path) as conn:
+        conn.row_factory = aiosqlite.Row
+        cursor = await conn.execute(
+            f"""
+            SELECT event_id, user_id, question, answer, answer_summary, sources_json, model_used, created_at
+            FROM audit_events
+            {where}
+            ORDER BY datetime(created_at) DESC, rowid DESC
+            LIMIT ? OFFSET ?
+            """,
+            [*fparams, limit, offset],
+        )
+        rows = await cursor.fetchall()
+    logs: list[AuditLogEntry] = []
+    for row in rows:
+        src_raw = json.loads(row["sources_json"] or "[]")
+        if not isinstance(src_raw, list):
+            src_raw = []
+        summary_cell = row["answer_summary"]
+        summary_text = str(summary_cell).strip() if summary_cell else ""
+        if not summary_text:
+            summary_text = _summary_from_answer(str(row["answer"] or ""))
+        logs.append(
+            AuditLogEntry(
+                query_id=str(row["event_id"]),
+                user_id=str(row["user_id"] or "anonymous"),
+                question=str(row["question"]),
+                answer_summary=summary_text,
+                sources_count=len(src_raw),
+                model_used=row["model_used"],
+                timestamp=_parse_ts(row["created_at"]),
+            )
+        )
+    return logs, total
+async def get_audit_event(db_path: str, query_id: str) -> AuditLogDetailResponse | None:
+    """Full audit record for one ``query_id``, or ``None`` if missing."""
+    await init_audit_db(db_path)
+    async with aiosqlite.connect(db_path) as conn:
+        conn.row_factory = aiosqlite.Row
+        cursor = await conn.execute(
+            """
+            SELECT event_id, user_id, question, answer, sources_json, model_used, tokens_used, created_at
+            FROM audit_events
+            WHERE event_id = ?
+            """,
+            (query_id,),
+        )
+        row = await cursor.fetchone()
+    if row is None:
+        return None
+    src_raw = json.loads(row["sources_json"] or "[]")
+    if not isinstance(src_raw, list):
+        src_raw = []
+    citations = _sources_to_citations(src_raw)
+    return AuditLogDetailResponse(
+        query_id=str(row["event_id"]),
+        user_id=str(row["user_id"] or "anonymous"),
+        question=str(row["question"]),
+        full_answer=str(row["answer"] or ""),
+        sources=citations,
+        model_used=row["model_used"],
+        tokens_used=row["tokens_used"],
+        timestamp=_parse_ts(row["created_at"]),
+    )

storage/job_store.py ADDED Viewed

	@@ -0,0 +1,309 @@

+"""SQLite tracking for asynchronous document ingest jobs.
+Jobs move through ``queued`` → ``processing`` → ``completed`` or ``failed``. Progress
+fields support multi-file batches and per-file error messages.
+"""
+import json
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+from uuid import uuid4
+import aiosqlite
+from models.responses import JobListItem, JobStatusResponse
+def _utc_now_iso() -> str:
+    return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
+async def _migrate_jobs_columns(conn: aiosqlite.Connection) -> None:
+    cursor = await conn.execute("PRAGMA table_info(ingest_jobs)")
+    rows = await cursor.fetchall()
+    col_names = {str(r[1]) for r in rows}
+    alters: list[str] = []
+    if "total_files" not in col_names:
+        alters.append("ALTER TABLE ingest_jobs ADD COLUMN total_files INTEGER NOT NULL DEFAULT 1")
+    if "processed_files" not in col_names:
+        alters.append("ALTER TABLE ingest_jobs ADD COLUMN processed_files INTEGER NOT NULL DEFAULT 0")
+    if "failed_files" not in col_names:
+        alters.append("ALTER TABLE ingest_jobs ADD COLUMN failed_files INTEGER NOT NULL DEFAULT 0")
+    if "filenames_json" not in col_names:
+        alters.append("ALTER TABLE ingest_jobs ADD COLUMN filenames_json TEXT NOT NULL DEFAULT '[]'")
+    if "errors_json" not in col_names:
+        alters.append("ALTER TABLE ingest_jobs ADD COLUMN errors_json TEXT NOT NULL DEFAULT '[]'")
+    if "started_at" not in col_names:
+        alters.append("ALTER TABLE ingest_jobs ADD COLUMN started_at TEXT")
+    if "completed_at" not in col_names:
+        alters.append("ALTER TABLE ingest_jobs ADD COLUMN completed_at TEXT")
+    for stmt in alters:
+        await conn.execute(stmt)
+    if alters:
+        await conn.commit()
+    await _backfill_job_filenames(conn)
+async def _backfill_job_filenames(conn: aiosqlite.Connection) -> None:
+    conn.row_factory = aiosqlite.Row
+    cursor = await conn.execute("SELECT job_id, filename, filenames_json, total_files FROM ingest_jobs")
+    rows = await cursor.fetchall()
+    for row in rows:
+        raw = row["filenames_json"] or "[]"
+        try:
+            parsed: Any = json.loads(raw)
+        except json.JSONDecodeError:
+            parsed = []
+        if not parsed and row["filename"]:
+            await conn.execute(
+                """
+                UPDATE ingest_jobs
+                SET filenames_json = ?, total_files = CASE WHEN total_files IS NULL OR total_files < 1 THEN 1 ELSE total_files END
+                WHERE job_id = ?
+                """,
+                (json.dumps([row["filename"]]), row["job_id"]),
+            )
+    await conn.commit()
+async def init_jobs_db(db_path: str) -> None:
+    """Create ``ingest_jobs`` table and apply additive column migrations."""
+    db_file = Path(db_path)
+    db_file.parent.mkdir(parents=True, exist_ok=True)
+    async with aiosqlite.connect(db_file.as_posix()) as conn:
+        await conn.execute(
+            """
+            CREATE TABLE IF NOT EXISTS ingest_jobs (
+                job_id TEXT PRIMARY KEY,
+                status TEXT NOT NULL,
+                collection_name TEXT NOT NULL,
+                filename TEXT NOT NULL,
+                message TEXT NOT NULL DEFAULT '',
+                document_ids_json TEXT NOT NULL DEFAULT '[]',
+                created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
+                updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
+                total_files INTEGER NOT NULL DEFAULT 1,
+                processed_files INTEGER NOT NULL DEFAULT 0,
+                failed_files INTEGER NOT NULL DEFAULT 0,
+                filenames_json TEXT NOT NULL DEFAULT '[]',
+                errors_json TEXT NOT NULL DEFAULT '[]',
+                started_at TEXT,
+                completed_at TEXT
+            )
+            """
+        )
+        await conn.commit()
+        await _migrate_jobs_columns(conn)
+async def create_ingest_job(
+    db_path: str,
+    *,
+    collection_name: str,
+    filenames: list[str],
+) -> str:
+    """Insert a new queued job; return the generated ``job_id``."""
+    if not filenames:
+        raise ValueError("filenames must not be empty")
+    job_id = str(uuid4())
+    primary = filenames[0]
+    names_json = json.dumps(filenames)
+    total = len(filenames)
+    await init_jobs_db(db_path)
+    async with aiosqlite.connect(db_path) as conn:
+        await conn.execute(
+            """
+            INSERT INTO ingest_jobs (
+                job_id, status, collection_name, filename, message, document_ids_json,
+                total_files, processed_files, failed_files, filenames_json, errors_json
+            ) VALUES (?, 'queued', ?, ?, '', '[]', ?, 0, 0, ?, '[]')
+            """,
+            (job_id, collection_name, primary, total, names_json),
+        )
+        await conn.commit()
+    return job_id
+async def mark_job_processing(db_path: str, job_id: str) -> None:
+    await init_jobs_db(db_path)
+    started = _utc_now_iso()
+    async with aiosqlite.connect(db_path) as conn:
+        await conn.execute(
+            """
+            UPDATE ingest_jobs
+            SET status = 'processing', message = 'Ingestion in progress.', started_at = COALESCE(started_at, ?),
+                updated_at = CURRENT_TIMESTAMP
+            WHERE job_id = ?
+            """,
+            (started, job_id),
+        )
+        await conn.commit()
+async def update_job_progress(
+    db_path: str,
+    job_id: str,
+    *,
+    processed_files: int,
+    failed_files: int,
+    errors: list[str],
+    message: str | None = None,
+) -> None:
+    await init_jobs_db(db_path)
+    async with aiosqlite.connect(db_path) as conn:
+        await conn.execute(
+            """
+            UPDATE ingest_jobs
+            SET processed_files = ?, failed_files = ?, errors_json = ?,
+                message = COALESCE(?, message), updated_at = CURRENT_TIMESTAMP
+            WHERE job_id = ?
+            """,
+            (processed_files, failed_files, json.dumps(errors), message, job_id),
+        )
+        await conn.commit()
+async def complete_ingest_job(
+    db_path: str,
+    job_id: str,
+    *,
+    document_ids: list[str],
+    message: str,
+) -> None:
+    await init_jobs_db(db_path)
+    completed = _utc_now_iso()
+    async with aiosqlite.connect(db_path) as conn:
+        await conn.execute(
+            """
+            UPDATE ingest_jobs
+            SET status = 'completed', message = ?, document_ids_json = ?,
+                completed_at = ?, updated_at = CURRENT_TIMESTAMP
+            WHERE job_id = ?
+            """,
+            (message, json.dumps(document_ids), completed, job_id),
+        )
+        await conn.commit()
+async def fail_ingest_job(db_path: str, job_id: str, *, message: str, errors: list[str] | None = None) -> None:
+    await init_jobs_db(db_path)
+    completed = _utc_now_iso()
+    err_json = json.dumps(errors or [message])
+    async with aiosqlite.connect(db_path) as conn:
+        await conn.execute(
+            """
+            UPDATE ingest_jobs
+            SET status = 'failed', message = ?, errors_json = ?, completed_at = ?,
+                updated_at = CURRENT_TIMESTAMP
+            WHERE job_id = ?
+            """,
+            (message, err_json, completed, job_id),
+        )
+        await conn.commit()
+async def get_job_status(db_path: str, job_id: str) -> JobStatusResponse | None:
+    """Job status DTO for API, including computed ``progress_percent``."""
+    await init_jobs_db(db_path)
+    async with aiosqlite.connect(db_path) as conn:
+        conn.row_factory = aiosqlite.Row
+        cursor = await conn.execute(
+            """
+            SELECT job_id, status, total_files, processed_files, failed_files, errors_json,
+                   started_at, completed_at, message
+            FROM ingest_jobs
+            WHERE job_id = ?
+            """,
+            (job_id,),
+        )
+        row = await cursor.fetchone()
+    if row is None:
+        return None
+    data = dict(row)
+    total = int(data["total_files"] or 0)
+    processed = int(data["processed_files"] or 0)
+    failed = int(data["failed_files"] or 0)
+    denom = total if total > 0 else 1
+    progress = int(min(100, max(0, round((processed + failed) / denom * 100))))
+    errors = json.loads(data.get("errors_json") or "[]")
+    if not isinstance(errors, list):
+        errors = [str(errors)]
+    errors_str = [str(e) for e in errors]
+    return JobStatusResponse(
+        job_id=str(data["job_id"]),
+        status=str(data["status"]),
+        total_files=total,
+        processed_files=processed,
+        failed_files=failed,
+        progress_percent=progress,
+        started_at=_parse_dt(data.get("started_at")),
+        completed_at=_parse_dt(data.get("completed_at")),
+        errors=errors_str,
+    )
+async def earliest_job_created_at_for_collection(db_path: str, collection_name: str) -> str | None:
+    """Earliest ingest job timestamp for a collection (SQLite ``created_at`` string)."""
+    await init_jobs_db(db_path)
+    async with aiosqlite.connect(db_path) as conn:
+        conn.row_factory = aiosqlite.Row
+        cursor = await conn.execute(
+            """
+            SELECT MIN(created_at) AS earliest
+            FROM ingest_jobs
+            WHERE collection_name = ?
+            """,
+            (collection_name,),
+        )
+        row = await cursor.fetchone()
+    if row is None or row["earliest"] is None:
+        return None
+    return str(row["earliest"])
+async def list_ingest_jobs(db_path: str, *, limit: int, offset: int) -> tuple[list[JobListItem], int]:
+    """Recent jobs summary list and total count for pagination."""
+    await init_jobs_db(db_path)
+    async with aiosqlite.connect(db_path) as conn:
+        conn.row_factory = aiosqlite.Row
+        cur_total = await conn.execute("SELECT COUNT(*) AS c FROM ingest_jobs")
+        total_row = await cur_total.fetchone()
+        total = int(total_row["c"]) if total_row else 0
+        cursor = await conn.execute(
+            """
+            SELECT job_id, status, total_files, completed_at
+            FROM ingest_jobs
+            ORDER BY datetime(updated_at) DESC, rowid DESC
+            LIMIT ? OFFSET ?
+            """,
+            (limit, offset),
+        )
+        rows = await cursor.fetchall()
+    items = [
+        JobListItem(
+            job_id=str(r["job_id"]),
+            status=str(r["status"]),
+            total_files=int(r["total_files"] or 0),
+            completed_at=_parse_dt(r["completed_at"]),
+        )
+        for r in rows
+    ]
+    return items, total
+def _parse_dt(value: object) -> datetime | None:
+    if value is None or value == "":
+        return None
+    s = str(value).strip()
+    if not s:
+        return None
+    if s.endswith("Z"):
+        s = s[:-1] + "+00:00"
+    try:
+        dt = datetime.fromisoformat(s)
+        if dt.tzinfo is None:
+            return dt.replace(tzinfo=timezone.utc)
+        return dt
+    except ValueError:
+        return None

streamlit_app.py ADDED Viewed

	@@ -0,0 +1,513 @@

+"""Streamlit UI for doc-audi-ai — talks to the FastAPI backend only."""
+from __future__ import annotations
+import os
+import time
+from typing import Any
+import httpx
+import streamlit as st
+# Base URL of the FastAPI backend; the DOC_AUDI_API_BASE environment variable overrides it.
+DEFAULT_API_BASE = os.environ.get("DOC_AUDI_API_BASE", "http://127.0.0.1:8000")
+# httpx read timeout for Ask/Summarise: embeddings + LLM on CPU or cold Ollama often exceeds 10 minutes.
+# Read timeout (seconds) used when DOC_AUDI_HTTP_READ_TIMEOUT is unset or not a number.
+_HTTP_READ_TIMEOUT_DEFAULT_S = 3600.0
+# Lower and upper bounds (seconds) the configured read timeout is clamped to.
+_HTTP_READ_TIMEOUT_MIN_S = 60.0
+_HTTP_READ_TIMEOUT_MAX_S = 7200.0
+def _http_read_timeout_seconds() -> float:
+    raw = os.environ.get(
+        "DOC_AUDI_HTTP_READ_TIMEOUT",
+        str(int(_HTTP_READ_TIMEOUT_DEFAULT_S)),
+    )
+    try:
+        read_s = float(raw)
+    except ValueError:
+        read_s = _HTTP_READ_TIMEOUT_DEFAULT_S
+    return max(_HTTP_READ_TIMEOUT_MIN_S, min(read_s, _HTTP_READ_TIMEOUT_MAX_S))
+def _http_timeout() -> httpx.Timeout:
+    """LLM + embeddings can exceed a few minutes on CPU or cold Ollama; Streamlit uses this, not Uvicorn."""
+    read_s = _http_read_timeout_seconds()
+    return httpx.Timeout(connect=20.0, read=read_s, write=120.0, pool=30.0)
+def _fmt_timeout_hint() -> str:
+    cap = int(_http_read_timeout_seconds())
+    lo, hi = int(_HTTP_READ_TIMEOUT_MIN_S), int(_HTTP_READ_TIMEOUT_MAX_S)
+    return (
+        f"The UI stops waiting after **{cap}s** per request (set **DOC_AUDI_HTTP_READ_TIMEOUT**, "
+        f"allowed **{lo}–{hi}** s). "
+        "Ensure `ollama serve` is running; cold models or CPU inference can exceed a few minutes."
+    )
+def _api_base() -> str:
+    """Resolve API base URL. Whitespace-only sidebar input must not win over default (breaks httpx)."""
+    raw = st.session_state.get("api_base")
+    if raw is None:
+        return DEFAULT_API_BASE.rstrip("/")
+    s = str(raw).strip()
+    if not s:
+        return DEFAULT_API_BASE.rstrip("/")
+    return s.rstrip("/")
+def _client() -> httpx.Client:
+    """Create a new httpx client bound to the current API base URL and the UI timeout settings."""
+    return httpx.Client(base_url=_api_base(), timeout=_http_timeout())
+def _fmt_api_error(exc: httpx.HTTPStatusError) -> str:
+    try:
+        body = exc.response.json()
+    except Exception:
+        return f"HTTP {exc.response.status_code}: {exc.response.text[:500]}"
+    detail = body.get("detail")
+    if isinstance(detail, list):
+        parts = []
+        for item in detail:
+            if isinstance(item, dict):
+                loc = item.get("loc", ())
+                msg = item.get("msg", "")
+                parts.append(f"{'/'.join(str(x) for x in loc)}: {msg}")
+            else:
+                parts.append(str(item))
+        return f"HTTP {exc.response.status_code}: " + "; ".join(parts)
+    if detail is not None:
+        return f"HTTP {exc.response.status_code}: {detail}"
+    return f"HTTP {exc.response.status_code}"
+def _fmt_request_error(exc: httpx.RequestError) -> str:
+    """Human-readable transport errors (connection, timeouts, TLS, etc.)."""
+    base = _api_base()
+    if isinstance(exc, httpx.ReadTimeout):
+        return (
+            f"**Read timeout** — `{base}` did not send a full response in time (embeddings/LLM can be slow). "
+            f"{_fmt_timeout_hint()}"
+        )
+    if isinstance(exc, httpx.ConnectTimeout):
+        return (
+            f"**Connect timeout** — could not open TCP to `{base}` in time. "
+            "Confirm the FastAPI process is listening (`uv run uvicorn api.main:app --host 0.0.0.0 --port 8000`)."
+        )
+    if isinstance(exc, httpx.ConnectError):
+        return (
+            f"**Connection failed** — nothing is accepting HTTP at `{base}`: {exc}. "
+            "Start the API, or fix **API base URL** / **`DOC_AUDI_API_BASE`** (use `http://127.0.0.1:8000` from the same machine, not `0.0.0.0`)."
+        )
+    if isinstance(exc, httpx.TimeoutException):
+        return f"**Timeout** ({type(exc).__name__}): {exc}. {_fmt_timeout_hint()}"
+    return f"**Request error** ({type(exc).__name__}): {exc}. Backend: `{base}`."
+def _post_query_ask(
+    client: httpx.Client,
+    *,
+    question: str,
+    collection_name: str,
+    top_k: int = 5,
+    user_id: str = "anonymous",
+) -> httpx.Response:
+    """POST /query/ask (falls back to POST /query on older servers)."""
+    body: dict[str, object] = {
+        "question": question.strip(),
+        "collection_name": collection_name,
+        "top_k": top_k,
+        "user_id": user_id,
+    }
+    r = client.post("/query/ask", json=body)
+    if r.status_code == 404:
+        r = client.post("/query", json=body)
+    return r
+def _get_audit_logs(
+    client: httpx.Client,
+    *,
+    limit: int,
+    offset: int,
+    user_id: str | None = None,
+    from_date: str | None = None,
+    to_date: str | None = None,
+) -> httpx.Response:
+    params: dict[str, object] = {"limit": limit, "offset": offset}
+    if user_id:
+        params["user_id"] = user_id
+    if from_date:
+        params["from_date"] = from_date
+    if to_date:
+        params["to_date"] = to_date
+    r = client.get("/audit/logs", params=params)
+    if r.status_code == 404:
+        r = client.get("/audit", params=params)
+    return r
+def _get_audit_event_detail(client: httpx.Client, event_id: str) -> httpx.Response:
+    r = client.get(f"/audit/logs/{event_id}")
+    if r.status_code == 404:
+        r = client.get(f"/audit/{event_id}")
+    return r
+def _health_check() -> tuple[bool, str]:
+    try:
+        with _client() as c:
+            r = c.get("/health")
+            r.raise_for_status()
+            data = r.json()
+            return True, str(data)
+    except httpx.HTTPStatusError as e:
+        return False, _fmt_api_error(e)
+    except httpx.RequestError as e:
+        return False, _fmt_request_error(e)
+    except Exception as e:
+        return False, str(e)
+def main() -> None:
+    """Render the whole Streamlit page.
+    Layout: a sidebar with the backend URL and a connection test, then five tabs
+    (Upload, Jobs, Ask, Summarise, Audit). Every action opens a fresh client via
+    ``_client()``, calls the backend, and reports HTTP errors and transport errors
+    with ``st.error`` and anything else with ``st.exception``.
+    """
+    st.set_page_config(page_title="doc-audi-ai", layout="wide")
+    # Seed the session key used by the sidebar text input before that widget is created.
+    if "api_base" not in st.session_state:
+        st.session_state.api_base = DEFAULT_API_BASE
+    st.title("doc-audi-ai")
+    st.caption("Ingest, query, and audit via the FastAPI backend.")
+    st.caption(f"Requests go to: `{_api_base()}`")
+    # --- Sidebar: backend URL, effective timeout, connection test ---
+    with st.sidebar:
+        st.subheader("Backend")
+        st.text_input(
+            "API base URL",
+            key="api_base",
+            placeholder=DEFAULT_API_BASE,
+            help=f"Default: {DEFAULT_API_BASE}. Clear the field to use the default.",
+        )
+        st.caption(
+            f"Ask/Summarise wait up to **{int(_http_read_timeout_seconds())}s** per request "
+            f"(env `DOC_AUDI_HTTP_READ_TIMEOUT`, range {int(_HTTP_READ_TIMEOUT_MIN_S)}–{int(_HTTP_READ_TIMEOUT_MAX_S)})."
+        )
+        if st.button("Test connection"):
+            ok, msg = _health_check()
+            if ok:
+                st.success(msg)
+            else:
+                st.error(msg)
+    tab_upload, tab_jobs, tab_ask, tab_sum, tab_audit = st.tabs(
+        ["Upload", "Jobs", "Ask", "Summarise", "Audit"]
+    )
+    # --- Upload tab: file upload, URL ingest, collection listing and deletion ---
+    with tab_upload:
+        st.subheader("Upload document")
+        col_u1, col_u2 = st.columns(2)
+        with col_u1:
+            up_collection = st.text_input("Collection", value="default", key="up_col")
+            uploaded = st.file_uploader("PDF, TXT, or Markdown", type=["pdf", "txt", "md"], key="up_file")
+        with col_u2:
+            if st.button("Submit upload", key="btn_upload", disabled=uploaded is None):
+                if uploaded is None:
+                    st.warning("Choose a file first.")
+                else:
+                    try:
+                        # Multipart form: one "files" part plus the target collection name.
+                        files = {"files": (uploaded.name, uploaded.getvalue(), uploaded.type or "application/octet-stream")}
+                        data = {"collection_name": up_collection}
+                        with _client() as c:
+                            r = c.post("/ingest/upload", files=files, data=data)
+                            r.raise_for_status()
+                            out = r.json()
+                        st.success(out.get("message", "Queued"))
+                        st.json(out)
+                        # Remember the job id so the Jobs tab can pre-fill its "Job ID" field.
+                        if out.get("job_id"):
+                            st.session_state["last_job_id"] = out["job_id"]
+                    except httpx.HTTPStatusError as e:
+                        st.error(_fmt_api_error(e))
+                    except httpx.RequestError as e:
+                        st.error(_fmt_request_error(e))
+                    except Exception as e:
+                        st.exception(e)
+        st.subheader("Ingest from URL")
+        url_col = st.columns([3, 1])
+        with url_col[0]:
+            ingest_url = st.text_input("Document URL (http/https)", key="ingest_url")
+        with url_col[1]:
+            url_collection = st.text_input("Collection", value="default", key="url_col")
+        if st.button("Queue URL ingest", key="btn_url"):
+            if not ingest_url.strip():
+                st.warning("Enter a URL.")
+            else:
+                try:
+                    with _client() as c:
+                        # The endpoint takes a list of URLs; the UI submits exactly one.
+                        r = c.post(
+                            "/ingest/url",
+                            json={"urls": [ingest_url.strip()], "collection_name": url_collection},
+                        )
+                        r.raise_for_status()
+                        out = r.json()
+                    st.success(out.get("message", "Queued"))
+                    st.json(out)
+                    if out.get("job_id"):
+                        st.session_state["last_job_id"] = out["job_id"]
+                except httpx.HTTPStatusError as e:
+                    st.error(_fmt_api_error(e))
+                except httpx.RequestError as e:
+                    st.error(_fmt_request_error(e))
+                except Exception as e:
+                    st.exception(e)
+        st.subheader("Collections")
+        if st.button("Refresh collections", key="btn_collections"):
+            try:
+                with _client() as c:
+                    r = c.get("/ingest/collections")
+                    r.raise_for_status()
+                    cols = r.json()
+                rows = cols.get("collections", [])
+                # Fall back to the number of returned rows when the payload has no "total".
+                st.write(f"{cols.get('total', len(rows))} collection(s).")
+                if rows:
+                    st.dataframe(rows, hide_index=True, use_container_width=True)
+                else:
+                    st.info("No collections yet.")
+            except httpx.HTTPStatusError as e:
+                st.error(_fmt_api_error(e))
+            except httpx.RequestError as e:
+                st.error(_fmt_request_error(e))
+            except Exception as e:
+                st.exception(e)
+        del_name = st.text_input("Delete collection name (optional)", key="del_col")
+        if st.button("Delete collection", key="btn_del_col"):
+            if not del_name.strip():
+                st.warning("Enter a collection name.")
+            else:
+                try:
+                    with _client() as c:
+                        r = c.delete(f"/ingest/collection/{del_name.strip()}")
+                        r.raise_for_status()
+                    del_body = r.json()
+                    st.success(del_body.get("message", "Deleted"))
+                    if "documents_removed" in del_body:
+                        st.caption(f"Documents removed: **{del_body['documents_removed']}**")
+                except httpx.HTTPStatusError as e:
+                    st.error(_fmt_api_error(e))
+                except httpx.RequestError as e:
+                    st.error(_fmt_request_error(e))
+                except Exception as e:
+                    st.exception(e)
+    # --- Jobs tab: paged job list, single-job lookup, polling ---
+    with tab_jobs:
+        st.subheader("Job list")
+        j1, j2 = st.columns(2)
+        with j1:
+            j_limit = st.number_input("Limit", min_value=1, max_value=100, value=20, key="j_lim")
+        with j2:
+            j_offset = st.number_input("Offset", min_value=0, value=0, key="j_off")
+        if st.button("List jobs", key="btn_jobs"):
+            try:
+                with _client() as c:
+                    r = c.get("/jobs", params={"limit": int(j_limit), "offset": int(j_offset)})
+                    r.raise_for_status()
+                    payload = r.json()
+                jobs: list[dict[str, Any]] = payload.get("jobs", [])
+                st.caption(f"Total jobs (matching filters): **{payload.get('total', len(jobs))}**")
+                if jobs:
+                    st.dataframe(jobs, hide_index=True, use_container_width=True)
+                else:
+                    st.info("No jobs in this window.")
+            except httpx.HTTPStatusError as e:
+                st.error(_fmt_api_error(e))
+            except httpx.RequestError as e:
+                st.error(_fmt_request_error(e))
+            except Exception as e:
+                st.exception(e)
+        st.subheader("Job detail")
+        # Pre-fill with the id of the most recent upload / URL ingest from this session, if any.
+        default_job = st.session_state.get("last_job_id", "")
+        job_id = st.text_input("Job ID", value=default_job, key="job_id_in")
+        c1, c2 = st.columns(2)
+        with c1:
+            fetch_job = st.button("Fetch job", key="btn_job_one")
+        with c2:
+            poll_job = st.button("Poll until completed/failed", key="btn_job_poll")
+        if fetch_job and job_id.strip():
+            try:
+                with _client() as c:
+                    r = c.get(f"/jobs/{job_id.strip()}")
+                    r.raise_for_status()
+                    detail = r.json()
+                st.json(detail)
+            except httpx.HTTPStatusError as e:
+                st.error(_fmt_api_error(e))
+            except httpx.RequestError as e:
+                st.error(_fmt_request_error(e))
+            except Exception as e:
+                st.exception(e)
+        if poll_job and job_id.strip():
+            # Placeholder that is overwritten with the latest poll result on each iteration.
+            status_ph = st.empty()
+            try:
+                with _client() as c:
+                    # Up to 120 polls, sleeping one second after each non-terminal result.
+                    for i in range(120):
+                        r = c.get(f"/jobs/{job_id.strip()}")
+                        r.raise_for_status()
+                        body = r.json()
+                        st_ = body.get("status", "")
+                        status_ph.write(f"Poll {i + 1}: **{st_}** — {body.get('progress_percent', 0)}%")
+                        if st_ in ("completed", "failed"):
+                            st.json(body)
+                            break
+                        time.sleep(1)
+                    else:
+                        # for/else: reached only when no poll returned a terminal status.
+                        status_ph.write("Stopped after 120 attempts (~2 min).")
+                        st.json(body)
+            except httpx.HTTPStatusError as e:
+                st.error(_fmt_api_error(e))
+            except httpx.RequestError as e:
+                st.error(_fmt_request_error(e))
+            except Exception as e:
+                st.exception(e)
+    # --- Ask tab: grounded question answering over one collection ---
+    with tab_ask:
+        st.subheader("Ask a question")
+        q_col = st.text_input("Collection", value="default", key="ask_col")
+        question = st.text_area("Question", height=120, key="ask_q")
+        if st.button("Ask", key="btn_ask"):
+            if not question.strip():
+                st.warning("Enter a question.")
+            else:
+                try:
+                    with st.spinner(
+                        "Calling the API (embeddings + LLM can take several minutes on a slow machine; "
+                        "ensure Ollama is running). Timeout is controlled by DOC_AUDI_HTTP_READ_TIMEOUT…"
+                    ):
+                        with _client() as c:
+                            r = _post_query_ask(
+                                c,
+                                question=question,
+                                collection_name=q_col,
+                            )
+                            r.raise_for_status()
+                            ans = r.json()
+                    st.success(f"Query id: `{ans.get('query_id', '')}`")
+                    if ans.get("answer"):
+                        st.markdown("### Answer")
+                        st.markdown(ans["answer"])
+                    else:
+                        st.warning(
+                            "The API returned no **answer** text. "
+                            "Check the collection has ingested chunks, LLM env, and expand **Raw response** below."
+                        )
+                    # A missing or null "sources" value is treated as an empty list.
+                    src = ans.get("sources") or []
+                    if src:
+                        with st.expander(f"Sources ({len(src)})"):
+                            st.json(src)
+                    else:
+                        st.caption("No sources in this response (empty retrieval or model returned nothing).")
+                    with st.expander("Raw response (debug)"):
+                        st.json(ans)
+                except httpx.HTTPStatusError as e:
+                    st.error(_fmt_api_error(e))
+                except httpx.RequestError as e:
+                    st.error(_fmt_request_error(e))
+                except Exception as e:
+                    st.exception(e)
+    # --- Summarise tab: collection summary with an optional focus ---
+    with tab_sum:
+        st.subheader("Summarise collection")
+        s_col = st.text_input("Collection", value="default", key="sum_col")
+        focus = st.text_input("Optional focus / angle", value="", key="sum_focus")
+        if st.button("Summarise", key="btn_sum"):
+            try:
+                body: dict[str, Any] = {"collection_name": s_col}
+                # "focus" is only sent when the user typed something non-blank.
+                if focus.strip():
+                    body["focus"] = focus.strip()
+                with st.spinner("Calling summarise (can take 1–2 minutes on a cold model)…"):
+                    with _client() as c:
+                        r = c.post("/query/summarise", json=body)
+                        r.raise_for_status()
+                        ans = r.json()
+                st.success(f"Query id: `{ans.get('query_id', '')}` · documents: **{ans.get('document_count', '')}**")
+                # Accept either response key for the summary text.
+                summary_text = ans.get("summary") or ans.get("answer")
+                if summary_text:
+                    st.markdown("### Summary")
+                    st.markdown(summary_text)
+                else:
+                    st.warning("No summary text in the response; see **Raw response** below.")
+                src = ans.get("sources") or []
+                if src:
+                    with st.expander(f"Sources ({len(src)})"):
+                        st.json(src)
+                with st.expander("Raw response (debug)"):
+                    st.json(ans)
+            except httpx.HTTPStatusError as e:
+                st.error(_fmt_api_error(e))
+            except httpx.RequestError as e:
+                st.error(_fmt_request_error(e))
+            except Exception as e:
+                st.exception(e)
+    # --- Audit tab: paged audit list and single-record detail ---
+    with tab_audit:
+        st.subheader("Audit log")
+        a1, a2 = st.columns(2)
+        with a1:
+            a_limit = st.number_input("Limit", min_value=1, max_value=100, value=20, key="a_lim")
+        with a2:
+            a_offset = st.number_input("Offset", min_value=0, value=0, key="a_off")
+        if st.button("List audit events", key="btn_audit_list"):
+            try:
+                with _client() as c:
+                    r = _get_audit_logs(
+                        c,
+                        limit=int(a_limit),
+                        offset=int(a_offset),
+                    )
+                    r.raise_for_status()
+                    payload = r.json()
+                # The list may arrive under "logs" or under "events"; accept either key.
+                events = payload.get("logs", payload.get("events", []))
+                st.caption(f"Total matching: **{payload.get('total', len(events))}**")
+                if events:
+                    st.dataframe(events, hide_index=True, use_container_width=True)
+                    ids = [
+                        e.get("query_id") or e.get("event_id")
+                        for e in events
+                        if isinstance(e, dict) and (e.get("query_id") or e.get("event_id"))
+                    ]
+                    # Keep the listed ids in session state so the detail selector below can offer them.
+                    if ids:
+                        st.session_state["_audit_ids"] = ids
+                else:
+                    st.info("No audit events.")
+            except httpx.HTTPStatusError as e:
+                st.error(_fmt_api_error(e))
+            except httpx.RequestError as e:
+                st.error(_fmt_request_error(e))
+            except Exception as e:
+                st.exception(e)
+        st.subheader("Audit event detail")
+        ids_for_select = st.session_state.get("_audit_ids", [])
+        pick = ""
+        if ids_for_select:
+            pick = st.selectbox("Event ID", options=[""] + list(ids_for_select), key="audit_pick")
+        manual_id = st.text_input("Or enter query / event ID", key="audit_manual")
+        # A manually typed id takes precedence over the selectbox choice.
+        ev_id = (manual_id.strip() or (pick or "").strip()).strip()
+        if st.button("Load detail", key="btn_audit_detail") and ev_id:
+            try:
+                with _client() as c:
+                    r = _get_audit_event_detail(c, ev_id)
+                    r.raise_for_status()
+                    st.json(r.json())
+            except httpx.HTTPStatusError as e:
+                st.error(_fmt_api_error(e))
+            except httpx.RequestError as e:
+                st.error(_fmt_request_error(e))
+            except Exception as e:
+                st.exception(e)
+# Build the page only when this module is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()

tests/conftest.py ADDED Viewed

	@@ -0,0 +1,41 @@

+"""Pytest fixtures: isolated temp DB/Chroma paths and a patched FastAPI test client."""
+import sys
+from pathlib import Path
+import pytest
+from fastapi.testclient import TestClient
+# Directory one level above this file's folder; put it on sys.path (once) so the
+# project imports that follow resolve regardless of the working directory.
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
+from api.config import Settings
+from api.main import app
+@pytest.fixture
+def test_settings(tmp_path) -> Settings:
+    return Settings(
+        llm_provider="ollama",
+        chroma_persist_directory=str(tmp_path / "chroma"),
+        audit_db_path=str(tmp_path / "audit.db"),
+        jobs_db_path=str(tmp_path / "jobs.db"),
+        max_file_size_mb=1,
+        top_k_results=3,
+    )
+@pytest.fixture
+def settings(test_settings) -> Settings:
+    """Alias for audit tests that name the fixture `settings`; returns the same object as `test_settings`."""
+    return test_settings
+@pytest.fixture
+def client(test_settings, monkeypatch):
+    """Yield a ``TestClient`` for the app with ``get_settings`` patched to return ``test_settings``.
+    The name is patched in ``api.main`` and in each of the ingest, query, audit and jobs
+    route modules. The client is used as a context manager around the ``yield``.
+    """
+    monkeypatch.setattr("api.main.get_settings", lambda: test_settings)
+    for route_mod in ("ingest", "query", "audit", "jobs"):
+        # The default argument binds the settings object when the lambda is created.
+        monkeypatch.setattr(f"api.routes.{route_mod}.get_settings", lambda ts=test_settings: ts)
+    with TestClient(app) as test_client:
+        yield test_client

tests/test_audit.py ADDED Viewed

	@@ -0,0 +1,218 @@

+"""Tests for audit log list, detail, filters, and post-query persistence."""
+import asyncio
+from unittest.mock import AsyncMock
+from uuid import uuid4
+import pytest
+from fastapi.testclient import TestClient
+from api.config import Settings
+from api.main import app
+from models.responses import SourceCitation
+from rag.retriever import RetrievedChunk
+from storage.audit_store import persist_query_audit
+def _seed_audit(settings: Settings, question: str = "What are key risks?", user_id: str = "analyst_001") -> str:
+    query_id = str(uuid4())
+    asyncio.run(
+        persist_query_audit(
+            settings.audit_db_path,
+            query_id=query_id,
+            action="query",
+            user_id=user_id,
+            question=question,
+            collection_name="default",
+            answer="Grounded answer text for audit trail.",
+            sources=[
+                SourceCitation(
+                    document_name="report.pdf",
+                    page_number=3,
+                    chunk_text="Risk disclosure excerpt.",
+                    relevance_score=0.9,
+                )
+            ],
+            model_used="ollama:llama3.1:8b",
+            tokens_used=120,
+            response_time_ms=50,
+            kind="ask",
+        )
+    )
+    return query_id
+def test_audit_logs_and_detail_success(client, settings):
+    query_id = _seed_audit(settings)
+    list_response = client.get("/audit/logs?limit=10&offset=0")
+    assert list_response.status_code == 200
+    body = list_response.json()
+    assert "logs" in body
+    assert body["total"] >= 1
+    assert any(entry["query_id"] == query_id for entry in body["logs"])
+    detail_response = client.get(f"/audit/logs/{query_id}")
+    assert detail_response.status_code == 200
+    detail = detail_response.json()
+    assert detail["query_id"] == query_id
+    assert detail["question"] == "What are key risks?"
+    assert detail["full_answer"] == "Grounded answer text for audit trail."
+    assert len(detail["sources"]) == 1
+    assert detail["sources"][0]["document_name"] == "report.pdf"
+def test_audit_logs_filter_by_user_id(client, settings):
+    q1 = _seed_audit(settings, question="Q one", user_id="user_a")
+    _seed_audit(settings, question="Q two", user_id="user_b")
+    r = client.get("/audit/logs", params={"user_id": "user_a", "limit": 50, "offset": 0})
+    assert r.status_code == 200
+    body = r.json()
+    ids = {e["query_id"] for e in body["logs"]}
+    assert q1 in ids
+    assert all(e["user_id"] == "user_a" for e in body["logs"])
+def test_audit_logs_filter_by_from_date(client, settings):
+    query_id = str(uuid4())
+    asyncio.run(
+        persist_query_audit(
+            settings.audit_db_path,
+            query_id=query_id,
+            action="query",
+            user_id="u",
+            question="Future dated row",
+            collection_name="default",
+            answer="A",
+            sources=[],
+            model_used="m",
+            tokens_used=0,
+            response_time_ms=1,
+            kind="ask",
+        )
+    )
+    r = client.get("/audit/logs", params={"from_date": "2099-01-01T00:00:00Z", "limit": 50, "offset": 0})
+    assert r.status_code == 200
+    body = r.json()
+    assert query_id not in {e["query_id"] for e in body["logs"]}
+def test_audit_logs_filter_by_to_date(client, settings):
+    """Spec: date filtering on /audit/logs (upper bound)."""
+    query_id = str(uuid4())
+    asyncio.run(
+        persist_query_audit(
+            settings.audit_db_path,
+            query_id=query_id,
+            action="query",
+            user_id="u",
+            question="Recent row",
+            collection_name="default",
+            answer="B",
+            sources=[],
+            model_used="m",
+            tokens_used=0,
+            response_time_ms=1,
+            kind="ask",
+        )
+    )
+    r = client.get("/audit/logs", params={"to_date": "2000-01-01T00:00:00Z", "limit": 50, "offset": 0})
+    assert r.status_code == 200
+    body = r.json()
+    assert query_id not in {e["query_id"] for e in body["logs"]}
+def test_ask_is_logged_after_query_ask(client, monkeypatch):
+    """Spec: ask is logged after POST /query/ask."""
+    chunks = [
+        RetrievedChunk(
+            text="Audit trail test chunk.",
+            score=0.9,
+            source="audit-test.txt",
+            page=1,
+            chunk_index=0,
+        )
+    ]
+    monkeypatch.setattr("api.routes.query.create_embedding_function", lambda: object())
+    monkeypatch.setattr("api.routes.query.get_vector_store", lambda **_: object())
+    monkeypatch.setattr("api.routes.query.retrieve_chunks", lambda *_: chunks)
+    monkeypatch.setattr(
+        "api.routes.query.answer_with_grounding",
+        lambda *_: ("Answer stored in audit.", 11),
+    )
+    ask = client.post(
+        "/query/ask",
+        json={
+            "question": "What should appear in the audit log?",
+            "collection_name": "default",
+            "user_id": "audit_user",
+        },
+    )
+    assert ask.status_code == 200
+    query_id = ask.json()["query_id"]
+    detail = client.get(f"/audit/logs/{query_id}")
+    assert detail.status_code == 200
+    body = detail.json()
+    assert body["user_id"] == "audit_user"
+    assert body["full_answer"] == "Answer stored in audit."
+    assert body["question"] == "What should appear in the audit log?"
+def test_summarise_is_logged_after_query_summarise(client, monkeypatch):
+    """Spec: summarise is logged after POST /query/summarise."""
+    chunks = [
+        RetrievedChunk(
+            text="Summary source chunk.",
+            score=0.85,
+            source="summary.md",
+            page=2,
+            chunk_index=0,
+        )
+    ]
+    monkeypatch.setattr("api.routes.query.create_embedding_function", lambda: object())
+    monkeypatch.setattr("api.routes.query.get_vector_store", lambda **_: object())
+    monkeypatch.setattr("api.routes.query.retrieve_chunks", lambda *_: chunks)
+    monkeypatch.setattr(
+        "api.routes.query.summarise_with_grounding",
+        lambda *_, **__: ("Collection summary for audit.", 7),
+    )
+    monkeypatch.setattr("api.routes.query.collection_document_count", lambda *_: 2)
+    summarise = client.post(
+        "/query/summarise",
+        json={"collection_name": "default", "focus": "key themes", "user_id": "sum_user"},
+    )
+    assert summarise.status_code == 200
+    query_id = summarise.json()["query_id"]
+    detail = client.get(f"/audit/logs/{query_id}")
+    assert detail.status_code == 200
+    assert detail.json()["full_answer"] == "Collection summary for audit."
+    assert detail.json()["user_id"] == "sum_user"
+def test_audit_logs_validation_error_for_bad_limit(client):
+    """``limit=0`` on /audit/logs is rejected with HTTP 422."""
+    response = client.get("/audit/logs?limit=0&offset=0")
+    assert response.status_code == 422
+def test_audit_detail_not_found(client):
+    """An unknown audit id gives HTTP 404 with "not found" in the error detail."""
+    response = client.get("/audit/logs/does-not-exist")
+    assert response.status_code == 404
+    assert "not found" in response.json()["detail"].lower()
+def test_audit_logs_returns_500_on_store_failure(settings, monkeypatch):
+    monkeypatch.setattr("api.main.get_settings", lambda: settings)
+    monkeypatch.setattr("api.routes.audit.get_settings", lambda: settings)
+    monkeypatch.setattr(
+        "api.routes.audit.list_audit_events",
+        AsyncMock(side_effect=RuntimeError("audit store failure")),
+    )
+    with TestClient(app, raise_server_exceptions=False) as test_client:
+        response = test_client.get("/audit/logs")
+    assert response.status_code == 500

tests/test_config.py ADDED Viewed

	@@ -0,0 +1,21 @@

+"""Settings behaviour for Hugging Face Spaces and Hub tokens."""
+from api.config import Settings
+def test_space_id_without_llm_provider_env_uses_huggingface_and_hf_token(monkeypatch):
+    monkeypatch.setenv("SPACE_ID", "author/repo")
+    monkeypatch.delenv("LLM_PROVIDER", raising=False)
+    monkeypatch.delenv("HUGGINGFACE_API_KEY", raising=False)
+    monkeypatch.setenv("HF_TOKEN", "hf_test_token")
+    s = Settings(_env_file=None)
+    assert s.llm_provider == "huggingface"
+    assert s.huggingface_api_key == "hf_test_token"
+def test_space_id_respects_explicit_llm_provider_ollama(monkeypatch):
+    monkeypatch.setenv("SPACE_ID", "author/repo")
+    monkeypatch.setenv("LLM_PROVIDER", "ollama")
+    monkeypatch.delenv("HUGGINGFACE_API_KEY", raising=False)
+    s = Settings(_env_file=None)
+    assert s.llm_provider == "ollama"

tests/test_health.py ADDED Viewed

	@@ -0,0 +1,9 @@

+"""Smoke test for the liveness endpoint."""
+def test_health_returns_ok(client):
+    response = client.get("/health")
+    assert response.status_code == 200
+    body = response.json()
+    assert body["status"] == "ok"
+    assert "app" in body
+    assert "version" in body

tests/test_ingest.py ADDED Viewed

	@@ -0,0 +1,153 @@

+"""Tests for ``/ingest`` upload, URL ingest, and collection management."""
+import asyncio
+from unittest.mock import AsyncMock
+from api.routes import ingest as ingest_route
+from storage.job_store import create_ingest_job, mark_job_processing
+def test_upload_queues_job_success(client, monkeypatch):
+    monkeypatch.setattr("api.routes.ingest.create_ingest_job", AsyncMock(return_value="job-123"))
+    monkeypatch.setattr("api.routes.ingest.run_ingest_job", AsyncMock(return_value=None))
+    response = client.post(
+        "/ingest/upload",
+        data={"collection_name": "default"},
+        files=[("files", ("sample.txt", b"hello world", "text/plain"))],
+    )
+    assert response.status_code == 200
+    body = response.json()
+    assert body["status"] == "queued"
+    assert body["job_id"] == "job-123"
+    assert body["total_files"] == 1
+    assert body["filenames"] == ["sample.txt"]
+    assert "Poll /jobs/job-123" in body["message"]
+def test_upload_rejects_unsupported_extension(client):
+    response = client.post(
+        "/ingest/upload",
+        data={"collection_name": "default"},
+        files=[("files", ("sample.csv", b"a,b\n1,2", "text/csv"))],
+    )
+    assert response.status_code == 400
+    assert "Unsupported file type" in response.json()["detail"]
+def test_upload_rejects_oversized_file(client):
+    oversized = b"x" * (2 * 1024 * 1024)
+    response = client.post(
+        "/ingest/upload",
+        data={"collection_name": "default"},
+        files=[("files", ("large.txt", oversized, "text/plain"))],
+    )
+    assert response.status_code == 413
+    assert "too large" in response.json()["detail"].lower()
+def test_upload_returns_500_on_job_creation_error(client, monkeypatch):
+    monkeypatch.setattr(
+        "api.routes.ingest.create_ingest_job",
+        AsyncMock(side_effect=RuntimeError("job store unavailable")),
+    )
+    monkeypatch.setattr("api.routes.ingest.run_ingest_job", AsyncMock(return_value=None))
+    response = client.post(
+        "/ingest/upload",
+        data={"collection_name": "default"},
+        files=[("files", ("sample.txt", b"hello", "text/plain"))],
+    )
+    assert response.status_code == 500
+    assert "job store unavailable" in response.json()["detail"]
+def test_download_request_headers_sec_compliant():
+    headers = ingest_route._download_request_headers("DocuAudit AI test@example.com")
+    assert headers["User-Agent"] == "DocuAudit AI test@example.com"
+    assert headers["Accept-Encoding"] == "gzip, deflate"
+    assert "application/pdf" in headers["Accept"]
+def test_ingest_url_rejects_non_http_scheme(client, monkeypatch):
+    monkeypatch.setattr(
+        "api.routes.ingest._download_url_to_temp",
+        AsyncMock(
+            side_effect=ingest_route.HTTPException(status_code=400, detail="Only http and https URLs are supported.")
+        ),
+    )
+    response = client.post(
+        "/ingest/url",
+        json={"urls": ["https://example.com/file.txt"], "collection_name": "default"},
+    )
+    assert response.status_code == 400
+    assert "http and https" in response.json()["detail"]
+def test_upload_pdf_queues_job_with_job_id(client, monkeypatch):
+    """Spec: single PDF upload returns job_id."""
+    monkeypatch.setattr("api.routes.ingest.create_ingest_job", AsyncMock(return_value="pdf-job-99"))
+    monkeypatch.setattr("api.routes.ingest.run_ingest_job", AsyncMock(return_value=None))
+    response = client.post(
+        "/ingest/upload",
+        data={"collection_name": "default"},
+        files=[("files", ("brief.pdf", b"%PDF-1.4 minimal", "application/pdf"))],
+    )
+    assert response.status_code == 200
+    body = response.json()
+    assert body["job_id"] == "pdf-job-99"
+    assert body["filenames"] == ["brief.pdf"]
+def test_list_collections_backfills_created_at_from_jobs(client, test_settings, monkeypatch):
+    monkeypatch.setattr(
+        "api.routes.ingest.list_collection_names",
+        lambda *_: ["default"],
+    )
+    monkeypatch.setattr("api.routes.ingest.collection_document_count", lambda *_: 3)
+    monkeypatch.setattr("api.routes.ingest.collection_created_at", lambda *_: None)
+    monkeypatch.setattr(
+        "api.routes.ingest.earliest_job_created_at_for_collection",
+        AsyncMock(return_value="2026-05-21 07:05:38"),
+    )
+    monkeypatch.setattr(
+        "api.routes.ingest.ensure_collection_created_at",
+        lambda *_a, **_k: "2026-05-21T07:05:38Z",
+    )
+    response = client.get("/ingest/collections")
+    assert response.status_code == 200
+    body = response.json()
+    assert body["total"] == 1
+    assert body["collections"][0]["name"] == "default"
+    assert body["collections"][0]["document_count"] == 3
+    assert body["collections"][0]["created_at"] is not None
+def test_job_status_polling_after_real_job_create(client, test_settings):
+    """Spec: job status polling returns correct structure."""
+    job_id = asyncio.run(
+        create_ingest_job(
+            test_settings.jobs_db_path,
+            collection_name="default",
+            filenames=["sample.txt"],
+        )
+    )
+    asyncio.run(mark_job_processing(test_settings.jobs_db_path, job_id))
+    response = client.get(f"/jobs/{job_id}")
+    assert response.status_code == 200
+    body = response.json()
+    assert body["job_id"] == job_id
+    assert body["status"] == "processing"
+    assert body["total_files"] == 1
+    assert "progress_percent" in body
+    assert "errors" in body

tests/test_jobs.py ADDED Viewed

	@@ -0,0 +1,58 @@

+"""Tests for ingest job listing and status endpoints."""
+import asyncio
+from storage.job_store import create_ingest_job, update_job_progress
+def test_get_job_status_returns_spec_shape(client, test_settings):
+    job_id = asyncio.run(
+        create_ingest_job(
+            test_settings.jobs_db_path,
+            collection_name="default",
+            filenames=["report.pdf", "notes.txt"],
+        )
+    )
+    asyncio.run(
+        update_job_progress(
+            test_settings.jobs_db_path,
+            job_id,
+            processed_files=1,
+            failed_files=0,
+            errors=[],
+            message="Processing first file",
+        )
+    )
+    response = client.get(f"/jobs/{job_id}")
+    assert response.status_code == 200
+    body = response.json()
+    assert body["job_id"] == job_id
+    assert body["status"] in ("queued", "processing", "completed", "failed")
+    assert body["total_files"] == 2
+    assert body["processed_files"] == 1
+    assert body["failed_files"] == 0
+    assert 0 <= body["progress_percent"] <= 100
+    assert isinstance(body["errors"], list)
+def test_list_jobs_includes_total(client, test_settings):
+    job_id = asyncio.run(
+        create_ingest_job(
+            test_settings.jobs_db_path,
+            collection_name="default",
+            filenames=["sample.txt"],
+        )
+    )
+    response = client.get("/jobs", params={"limit": 10, "offset": 0})
+    assert response.status_code == 200
+    body = response.json()
+    assert body["total"] >= 1
+    assert any(j["job_id"] == job_id for j in body["jobs"])
+def test_get_job_not_found_returns_404(client):
+    response = client.get("/jobs/nonexistent-job-id")
+    assert response.status_code == 404
+    assert "not found" in response.json()["detail"].lower()

tests/test_query.py ADDED Viewed

	@@ -0,0 +1,229 @@

+"""Tests for ``/query/ask``, ``/query/summarise``, and legacy ``POST /query``."""
+from unittest.mock import AsyncMock
+from rag.retriever import NO_MATCH_ANSWER, RetrievedChunk
+def test_ask_returns_grounded_answer_with_sources(client, monkeypatch):
+    chunks = [
+        RetrievedChunk(
+            text="Audi has strategic EV expansion plans.",
+            score=0.92,
+            source="strategy.md",
+            page=1,
+            chunk_index=0,
+        )
+    ]
+    monkeypatch.setattr("api.routes.query.create_embedding_function", lambda: object())
+    monkeypatch.setattr("api.routes.query.get_vector_store", lambda **_: object())
+    monkeypatch.setattr("api.routes.query.retrieve_chunks", lambda *_: chunks)
+    monkeypatch.setattr("api.routes.query.answer_with_grounding", lambda *_: ("Audi is expanding EV investment.", 42))
+    monkeypatch.setattr("api.routes.query.persist_query_audit", AsyncMock(return_value="evt-1"))
+    response = client.post(
+        "/query/ask",
+        json={
+            "question": "What is Audi doing in EV markets worldwide?",
+            "collection_name": "default",
+            "top_k": 3,
+            "user_id": "tester",
+        },
+    )
+    assert response.status_code == 200
+    body = response.json()
+    assert body["answer"] == "Audi is expanding EV investment."
+    assert "query_id" in body
+    assert body["question"].startswith("What is Audi")
+    assert len(body["sources"]) == 1
+    assert body["sources"][0]["document_name"] == "strategy.md"
+    assert body["sources"][0]["page_number"] == 1
+    assert body["tokens_used"] == 42
+    assert "response_time_ms" in body
+    assert "model_used" in body
+def test_ask_respects_top_k_in_retrieve_call(client, monkeypatch):
+    captured: dict[str, object] = {}
+    def capture_retrieve(vs, question, k):
+        captured["k"] = k
+        return []
+    monkeypatch.setattr("api.routes.query.create_embedding_function", lambda: object())
+    monkeypatch.setattr("api.routes.query.get_vector_store", lambda **_: object())
+    monkeypatch.setattr("api.routes.query.retrieve_chunks", capture_retrieve)
+    monkeypatch.setattr("api.routes.query.answer_with_grounding", lambda *_: ("No match answer", 0))
+    monkeypatch.setattr("api.routes.query.persist_query_audit", AsyncMock())
+    response = client.post(
+        "/query/ask",
+        json={"question": "What is known about the topic here?", "collection_name": "default", "top_k": 7},
+    )
+    assert response.status_code == 200
+    assert captured.get("k") == 7
+def test_ask_empty_collection_returns_no_match_message(client, monkeypatch):
+    """Spec: query on empty collection returns appropriate message."""
+    monkeypatch.setattr("api.routes.query.create_embedding_function", lambda: object())
+    monkeypatch.setattr("api.routes.query.get_vector_store", lambda **_: object())
+    monkeypatch.setattr("api.routes.query.retrieve_chunks", lambda *_: [])
+    monkeypatch.setattr("api.routes.query.persist_query_audit", AsyncMock())
+    response = client.post(
+        "/query/ask",
+        json={
+            "question": "What does the document say about revenue?",
+            "collection_name": "default",
+            "top_k": 5,
+        },
+    )
+    assert response.status_code == 200
+    assert response.json()["answer"] == NO_MATCH_ANSWER
+    assert response.json()["sources"] == []
+def test_ask_low_relevance_chunks_returns_no_match_message(client, monkeypatch):
+    low_score_chunks = [
+        RetrievedChunk(
+            text="Unrelated fragment.",
+            score=0.05,
+            source="noise.txt",
+            page=1,
+            chunk_index=0,
+        )
+    ]
+    monkeypatch.setattr("api.routes.query.create_embedding_function", lambda: object())
+    monkeypatch.setattr("api.routes.query.get_vector_store", lambda **_: object())
+    monkeypatch.setattr("api.routes.query.retrieve_chunks", lambda *_: low_score_chunks)
+    monkeypatch.setattr("api.routes.query.persist_query_audit", AsyncMock())
+    response = client.post(
+        "/query/ask",
+        json={"question": "What are the key risk factors?", "collection_name": "default"},
+    )
+    assert response.status_code == 200
+    assert response.json()["answer"] == NO_MATCH_ANSWER
+def test_ask_returns_422_for_invalid_payload(client):
+    response = client.post("/query/ask", json={"collection_name": "default"})
+    assert response.status_code == 422
+def test_ask_returns_422_for_short_question(client):
+    response = client.post(
+        "/query/ask",
+        json={"question": "hi", "collection_name": "default"},
+    )
+    assert response.status_code == 422
+def test_ask_returns_500_when_retrieval_fails(client, monkeypatch):
+    monkeypatch.setattr("api.routes.query.create_embedding_function", lambda: object())
+    monkeypatch.setattr("api.routes.query.get_vector_store", lambda **_: object())
+    monkeypatch.setattr("api.routes.query.retrieve_chunks", lambda *_: (_ for _ in ()).throw(RuntimeError("retrieval failed")))
+    response = client.post(
+        "/query/ask",
+        json={"question": "What happened in the documents?", "collection_name": "default"},
+    )
+    assert response.status_code == 500
+    assert "retrieval failed" in response.json()["detail"]
+def test_summarise_returns_summary_payload(client, monkeypatch):
+    """Spec: /query/summarise returns summary payload when collection has documents."""
+    chunks = [
+        RetrievedChunk(
+            text="Revenue grew year over year.",
+            score=0.9,
+            source="report.txt",
+            page=2,
+            chunk_index=0,
+        )
+    ]
+    monkeypatch.setattr("api.routes.query.create_embedding_function", lambda: object())
+    monkeypatch.setattr("api.routes.query.get_vector_store", lambda **_: object())
+    monkeypatch.setattr("api.routes.query.retrieve_chunks", lambda *_: chunks)
+    monkeypatch.setattr("api.routes.query.summarise_with_grounding", lambda *_, **__: ("Executive summary text.", 25))
+    monkeypatch.setattr("api.routes.query.collection_document_count", lambda *_: 3)
+    monkeypatch.setattr("api.routes.query.persist_query_audit", AsyncMock())
+    response = client.post(
+        "/query/summarise",
+        json={"collection_name": "default", "focus": "financial highlights", "user_id": "analyst"},
+    )
+    assert response.status_code == 200
+    body = response.json()
+    assert body["summary"] == "Executive summary text."
+    assert body["document_count"] == 3
+    assert "query_id" in body
+    assert len(body["sources"]) == 1
+    assert body["sources"][0]["document_name"] == "report.txt"
+def test_summarise_returns_500_when_audit_persist_fails(client, monkeypatch):
+    chunks = [
+        RetrievedChunk(
+            text="Revenue and risks are discussed in the report.",
+            score=0.88,
+            source="report.txt",
+            page=None,
+            chunk_index=2,
+        )
+    ]
+    monkeypatch.setattr("api.routes.query.create_embedding_function", lambda: object())
+    monkeypatch.setattr("api.routes.query.get_vector_store", lambda **_: object())
+    monkeypatch.setattr("api.routes.query.retrieve_chunks", lambda *_: chunks)
+    monkeypatch.setattr("api.routes.query.summarise_with_grounding", lambda *_, **__: ("Summary output", 10))
+    monkeypatch.setattr("api.routes.query.collection_document_count", lambda *_: 5)
+    monkeypatch.setattr(
+        "api.routes.query.persist_query_audit",
+        AsyncMock(side_effect=RuntimeError("audit write failed")),
+    )
+    response = client.post(
+        "/query/summarise",
+        json={"collection_name": "default", "focus": "summarise risks", "user_id": "u1"},
+    )
+    assert response.status_code == 500
+    assert "audit write failed" in response.json()["detail"]
+def test_legacy_query_endpoint_matches_ask(client, monkeypatch):
+    chunks = [
+        RetrievedChunk(
+            text="Clause about indemnity.",
+            score=0.8,
+            source="contract.md",
+            page=4,
+            chunk_index=1,
+        )
+    ]
+    monkeypatch.setattr("api.routes.query.create_embedding_function", lambda: object())
+    monkeypatch.setattr("api.routes.query.get_vector_store", lambda **_: object())
+    monkeypatch.setattr("api.routes.query.retrieve_chunks", lambda *_: chunks)
+    monkeypatch.setattr("api.routes.query.answer_with_grounding", lambda *_: ("Indemnity is capped.", 5))
+    monkeypatch.setattr("api.routes.query.persist_query_audit", AsyncMock())
+    payload = {
+        "question": "What are the indemnity limits in the contract?",
+        "collection_name": "default",
+        "top_k": 3,
+    }
+    ask = client.post("/query/ask", json=payload)
+    legacy = client.post("/query", json=payload)
+    assert ask.status_code == 200
+    assert legacy.status_code == 200
+    assert legacy.json()["answer"] == ask.json()["answer"]
+    assert "query_id" in legacy.json()
+    assert legacy.json()["sources"][0]["document_name"] == "contract.md"

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

workers/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Background workers (ingest pipeline)."""

workers/ingest_worker.py ADDED Viewed

	@@ -0,0 +1,108 @@

+"""Background ingest worker invoked from FastAPI ``BackgroundTasks``.
+For each temp file: load → chunk → embed → add to Chroma, then update job progress in SQLite.
+Temp files are always deleted in a ``finally`` block.
+"""
+import asyncio
+from pathlib import Path
+from rag.chunker import chunk_documents
+from rag.embedder import create_embedding_function
+from rag.loader import load_documents
+from rag.vector_store import add_documents, get_vector_store
+from storage.job_store import (
+    complete_ingest_job,
+    fail_ingest_job,
+    mark_job_processing,
+    update_job_progress,
+)
+def _ingest_one_file_sync(temp_path: str, collection_name: str, chroma_persist_directory: str) -> tuple[list[str], int]:
+    """Blocking ingest for one path; returns ``(chunk_vector_ids, chunk_count)``."""
+    documents = load_documents(temp_path)
+    chunks = chunk_documents(documents)
+    if not chunks:
+        raise ValueError("No content to ingest.")
+    embedding_function = create_embedding_function()
+    vector_store = get_vector_store(
+        persist_directory=chroma_persist_directory,
+        collection_name=collection_name,
+        embedding_function=embedding_function,
+    )
+    document_ids = add_documents(vector_store, chunks)
+    return document_ids, len(chunks)
+async def run_ingest_job(
+    job_id: str,
+    files: list[tuple[str, str]],
+    collection_name: str,
+    jobs_db_path: str,
+    chroma_persist_directory: str,
+) -> None:
+    """
+    Process one or more temp files for a single job. ``files`` is (temp_path, display_name).
+    """
+    all_doc_ids: list[str] = []
+    errors: list[str] = []
+    processed = 0
+    failed = 0
+    total = len(files)
+    if total == 0:
+        await fail_ingest_job(jobs_db_path, job_id, message="No files to ingest.")
+        return
+    try:
+        await mark_job_processing(jobs_db_path, job_id)
+        for temp_path, display_name in files:
+            try:
+                doc_ids, num_chunks = await asyncio.to_thread(
+                    _ingest_one_file_sync,
+                    temp_path,
+                    collection_name,
+                    chroma_persist_directory,
+                )
+                all_doc_ids.extend(doc_ids)
+                processed += 1
+                await update_job_progress(
+                    jobs_db_path,
+                    job_id,
+                    processed_files=processed,
+                    failed_files=failed,
+                    errors=errors,
+                    message=f"Ingested {display_name} ({num_chunks} chunks).",
+                )
+            except Exception as exc:
+                failed += 1
+                errors.append(f"{display_name}: {exc}")
+                await update_job_progress(
+                    jobs_db_path,
+                    job_id,
+                    processed_files=processed,
+                    failed_files=failed,
+                    errors=errors,
+                    message=f"Failed on {display_name}: {exc}",
+                )
+            finally:
+                Path(temp_path).unlink(missing_ok=True)
+        if processed == 0:
+            await fail_ingest_job(
+                jobs_db_path,
+                job_id,
+                message="All files failed ingestion.",
+                errors=errors,
+            )
+            return
+        chunk_note = f"{len(all_doc_ids)} chunk vector(s) across {processed} file(s)."
+        await complete_ingest_job(
+            jobs_db_path,
+            job_id,
+            document_ids=all_doc_ids,
+            message=f"Ingestion completed. {chunk_note}",
+        )
+    except Exception as exc:
+        await fail_ingest_job(jobs_db_path, job_id, message=str(exc), errors=errors + [str(exc)])