Spaces:

mayankchugh-learning
/

Document-Audit-RAG

Sleeping

App Files Files Community

Mayank Chugh committed 29 days ago

Commit

c841e94

1 Parent(s): 97a5277

Implement Milestone 7 by introducing background ingestion jobs. Update environment configuration to include job database path, enhance job tracking with new endpoints, and refactor ingestion logic to support asynchronous processing. Modify response models to include job details and ensure proper initialization of the jobs database on startup.

Browse files

Files changed (10) hide show

.env.example +1 -0
api/config.py +9 -4
api/main.py +2 -0
api/routes/ingest.py +26 -20
api/routes/jobs.py +32 -8
models/requests.py +1 -0
models/responses.py +21 -0
storage/job_store.py +120 -0
workers/__init__.py +0 -0
workers/ingest_worker.py +61 -0

.env.example CHANGED Viewed

@@ -5,6 +5,7 @@ EMBEDDING_MODEL_NAME=nomic-embed-text
 OLLAMA_BASE_URL=http://localhost:11434
 OPENAI_API_KEY=
 CHROMA_PERSIST_DIRECTORY=./data/chroma
 CHUNK_SIZE=1000
 CHUNK_OVERLAP=150
 RETRIEVAL_K=4

 OLLAMA_BASE_URL=http://localhost:11434
 OPENAI_API_KEY=
 CHROMA_PERSIST_DIRECTORY=./data/chroma
+JOBS_DB_PATH=./data/jobs.db
 CHUNK_SIZE=1000
 CHUNK_OVERLAP=150
 RETRIEVAL_K=4

api/config.py CHANGED Viewed

@@ -5,7 +5,12 @@ from pydantic_settings import BaseSettings, SettingsConfigDict
 class Settings(BaseSettings):
-    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")
     app_name: str = Field(default="doc-audi-ai", description="The name of the application")
     app_version: str = Field(default="0.1.0", description="The version of the application")
@@ -36,12 +41,12 @@ class Settings(BaseSettings):
     top_k_results: int = Field(default=4, ge=1, le=20, description="Number of chunks to retrieve")
     audit_db_path: str = "./audit.db"
     max_file_size_mb: int = Field(default=50, ge=1, le=200, description="Max upload file size")
     max_documents_per_batch: int = Field(default=100, ge=1, le=1000, description="Max documents per batch")
-    model_config = SettingsConfigDict(env_file=".env", case_sensitive=False)
 @lru_cache
 def get_settings() -> Settings:
     return Settings()

 class Settings(BaseSettings):
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_file_encoding="utf-8",
+        extra="ignore",
+        case_sensitive=False,
+    )
     app_name: str = Field(default="doc-audi-ai", description="The name of the application")
     app_version: str = Field(default="0.1.0", description="The version of the application")
     top_k_results: int = Field(default=4, ge=1, le=20, description="Number of chunks to retrieve")
     audit_db_path: str = "./audit.db"
+    jobs_db_path: str = Field(default="./data/jobs.db", description="SQLite path for ingest job tracking")
     max_file_size_mb: int = Field(default=50, ge=1, le=200, description="Max upload file size")
     max_documents_per_batch: int = Field(default=100, ge=1, le=1000, description="Max documents per batch")
 @lru_cache
 def get_settings() -> Settings:
+    """Return the application settings, constructing them once and reusing the cached instance."""
     return Settings()

api/main.py CHANGED Viewed

@@ -2,6 +2,7 @@ from fastapi import FastAPI
 from api.config import get_settings
 from storage.audit_store import init_audit_db
 from .routes import audit, ingest, jobs, query
 app = FastAPI()
@@ -16,6 +17,7 @@ app.include_router(query.router)
 async def startup() -> None:
     settings = get_settings()
     await init_audit_db(settings.audit_db_path)
 @app.get("/health", tags=["Health"])
 def health() -> dict[str, str]:

 from api.config import get_settings
 from storage.audit_store import init_audit_db
+from storage.job_store import init_jobs_db
 from .routes import audit, ingest, jobs, query
 app = FastAPI()
 async def startup() -> None:
+    """Initialise the audit database and the ingest-jobs database at their configured paths."""
     settings = get_settings()
     await init_audit_db(settings.audit_db_path)
+    await init_jobs_db(settings.jobs_db_path)
 @app.get("/health", tags=["Health"])
 def health() -> dict[str, str]:

api/routes/ingest.py CHANGED Viewed

@@ -2,14 +2,12 @@ from pathlib import Path
 from tempfile import NamedTemporaryFile
 from typing import Annotated
-from fastapi import APIRouter, File, Form, HTTPException, UploadFile, status
 from api.config import get_settings
 from models.responses import IngestUploadResponse
-from rag.chunker import chunk_documents
-from rag.embedder import create_embedding_function
-from rag.loader import load_documents
-from rag.vector_store import add_documents, get_vector_store
 router = APIRouter(prefix="/ingest", tags=["ingest"])
@@ -42,12 +40,14 @@ def _validate_file(file: UploadFile, max_bytes: int) -> str:
 @router.post("/upload", response_model=IngestUploadResponse)
 async def upload_endpoint(
     file: UploadFile = File(..., description="PDF/TXT/MD document to ingest"),
     collection_name: Annotated[str, Form(min_length=1, max_length=256)] = "default",
 ) -> IngestUploadResponse:
     settings = get_settings()
     max_bytes = settings.max_file_size_mb * 1024 * 1024
     suffix = _validate_file(file, max_bytes)
     temp_path = ""
     try:
@@ -56,28 +56,34 @@ async def upload_endpoint(
             temp_path = tmp.name
             tmp.write(file_bytes)
-        documents = load_documents(temp_path)
-        chunks = chunk_documents(documents)
-        if not chunks:
-            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="No content to ingest.")
-        embedding_function = create_embedding_function()
-        vector_store = get_vector_store(
-            persist_directory=settings.chroma_persist_directory,
             collection_name=collection_name,
-            embedding_function=embedding_function,
         )
-        document_ids = add_documents(vector_store, chunks)
         return IngestUploadResponse(
-            status="success",
-            message=f"Ingested {len(chunks)} chunks into '{collection_name}'.",
-            document_ids=document_ids,
         )
     except HTTPException:
         raise
     except Exception as exc:
         raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(exc)) from exc
     finally:
         await file.close()
-        if temp_path:
-            Path(temp_path).unlink(missing_ok=True)

 from tempfile import NamedTemporaryFile
 from typing import Annotated
+from fastapi import APIRouter, BackgroundTasks, File, Form, HTTPException, UploadFile, status
 from api.config import get_settings
 from models.responses import IngestUploadResponse
+from storage.job_store import create_ingest_job
+from workers.ingest_worker import run_ingest_job
 router = APIRouter(prefix="/ingest", tags=["ingest"])
 @router.post("/upload", response_model=IngestUploadResponse)
 async def upload_endpoint(
+    background_tasks: BackgroundTasks,
     file: UploadFile = File(..., description="PDF/TXT/MD document to ingest"),
     collection_name: Annotated[str, Form(min_length=1, max_length=256)] = "default",
 ) -> IngestUploadResponse:
     settings = get_settings()
     max_bytes = settings.max_file_size_mb * 1024 * 1024
     suffix = _validate_file(file, max_bytes)
+    display_name = (file.filename or "upload").strip()
     temp_path = ""
     try:
             temp_path = tmp.name
             tmp.write(file_bytes)
+        job_id = await create_ingest_job(
+            settings.jobs_db_path,
             collection_name=collection_name,
+            filename=display_name,
+        )
+        background_tasks.add_task(
+            run_ingest_job,
+            job_id,
+            temp_path,
+            collection_name,
+            settings.jobs_db_path,
+            settings.chroma_persist_directory,
         )
         return IngestUploadResponse(
+            status="queued",
+            message=f"Ingestion job accepted. Poll GET /jobs/{job_id} for status.",
+            job_id=job_id,
+            document_ids=[],
         )
     except HTTPException:
+        if temp_path:
+            Path(temp_path).unlink(missing_ok=True)
         raise
     except Exception as exc:
+        if temp_path:
+            Path(temp_path).unlink(missing_ok=True)
         raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(exc)) from exc
     finally:
         await file.close()

api/routes/jobs.py CHANGED Viewed

@@ -1,9 +1,11 @@
 from typing import Annotated
-from fastapi import APIRouter, Depends, Query
 from models.requests import JobsListParams
-from models.responses import JobListResponse
 def _jobs_list_params(
@@ -12,14 +14,36 @@ def _jobs_list_params(
 ) -> JobsListParams:
     return JobsListParams(limit=limit, offset=offset)
 router = APIRouter(tags=["jobs"])
-@router.get("/jobs", response_model=JobListResponse )
-def jobs_placeholder(
     params: Annotated[JobsListParams, Depends(_jobs_list_params)],
 ) -> JobListResponse:
     return JobListResponse(
-        status="placeholder",
-        message="Jobs not implemented yet.",
-        jobs=[],
-    )

 from typing import Annotated
+from fastapi import APIRouter, Depends, HTTPException, Query, status
+from api.config import get_settings
 from models.requests import JobsListParams
+from models.responses import IngestJobDetailResponse, JobListResponse, JobSummary
+from storage.job_store import get_ingest_job, list_ingest_jobs
 def _jobs_list_params(
 ) -> JobsListParams:
     return JobsListParams(limit=limit, offset=offset)
 router = APIRouter(tags=["jobs"])
+@router.get("/jobs", response_model=JobListResponse)
+async def list_jobs(
     params: Annotated[JobsListParams, Depends(_jobs_list_params)],
 ) -> JobListResponse:
+    """Return one page of tracked ingest jobs.
+
+    ``params.limit`` and ``params.offset`` are forwarded to
+    ``list_ingest_jobs``; every returned row is validated into a
+    ``JobSummary`` before being wrapped in the response envelope.
+    """
+    settings = get_settings()
+    rows = await list_ingest_jobs(
+        settings.jobs_db_path,
+        limit=params.limit,
+        offset=params.offset,
+    )
+    jobs = [JobSummary.model_validate(row) for row in rows]
     return JobListResponse(
+        status="success",
+        message=f"Returned {len(jobs)} job(s).",
+        jobs=jobs,
+    )
+@router.get("/jobs/{job_id}", response_model=IngestJobDetailResponse)
+async def get_job(job_id: str) -> IngestJobDetailResponse:
+    """Return the stored details of a single ingest job.
+
+    Raises:
+        HTTPException: 404 when ``get_ingest_job`` finds no row for ``job_id``.
+    """
+    settings = get_settings()
+    job = await get_ingest_job(settings.jobs_db_path, job_id)
+    if job is None:
+        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found.")
+    return IngestJobDetailResponse(
+        status="success",
+        message="Job found.",
+        job=job,
+    )

models/requests.py CHANGED Viewed

@@ -15,6 +15,7 @@ class IngestUploadRequest(BaseModel):
 class JobsListParams(BaseModel):
     model_config = ConfigDict(extra="forbid")
     limit: int = Field(default=10, ge=1, le=100, description="The limit of the jobs to list")
 class AuditListParams(BaseModel):
     model_config = ConfigDict(extra="forbid")

 class JobsListParams(BaseModel):
     model_config = ConfigDict(extra="forbid")
     limit: int = Field(default=10, ge=1, le=100, description="The limit of the jobs to list")
+    offset: int = Field(default=0, ge=0, description="The offset of the jobs to list")
 class AuditListParams(BaseModel):
     model_config = ConfigDict(extra="forbid")

models/responses.py CHANGED Viewed

@@ -22,17 +22,38 @@ class QueryResponse(BaseModel):
 class IngestUploadResponse(BaseModel):
     status: str
     message: str
     document_ids: list[str] = Field(default_factory=list)
 class JobSummary(BaseModel):
     job_id: str
     status: str
 class JobListResponse(BaseModel):
     status: str
     message: str
     jobs: list[JobSummary] = Field(default_factory=list)
 class AuditEvent(BaseModel):
     event_id: str
     action: str

 class IngestUploadResponse(BaseModel):
+    """Response body returned by the ingest upload endpoint."""
     status: str
     message: str
+    # Identifier of the ingest job created for the upload; required (no default).
+    job_id: str
     document_ids: list[str] = Field(default_factory=list)
 class JobSummary(BaseModel):
+    """Short description of one job as it appears in a job listing."""
     job_id: str
     status: str
+    # The three fields below are optional and default to None when absent.
+    collection_name: str | None = None
+    filename: str | None = None
+    created_at: str | None = None
 class JobListResponse(BaseModel):
+    """Envelope carrying a status, a message and a list of job summaries."""
     status: str
     message: str
     jobs: list[JobSummary] = Field(default_factory=list)
+class IngestJobDetail(BaseModel):
+    """Full record of a single ingest job, including its result document ids."""
+    job_id: str
+    status: str
+    collection_name: str
+    filename: str
+    message: str
+    document_ids: list[str] = Field(default_factory=list)
+    # Timestamps are carried as plain strings, not datetime objects.
+    created_at: str
+    updated_at: str
+class IngestJobDetailResponse(BaseModel):
+    """Envelope for a single-job lookup; ``job`` defaults to None when not supplied."""
+    status: str
+    message: str
+    job: IngestJobDetail | None = None
 class AuditEvent(BaseModel):
     event_id: str
     action: str

storage/job_store.py ADDED Viewed

	@@ -0,0 +1,120 @@

+import json
+from pathlib import Path
+from typing import Any
+from uuid import uuid4
+import aiosqlite
+from models.responses import IngestJobDetail
+async def init_jobs_db(db_path: str) -> None:
+    """Make sure the jobs database file location and ``ingest_jobs`` table exist.
+
+    Missing parent directories of ``db_path`` are created first; the table is
+    then created with ``CREATE TABLE IF NOT EXISTS`` so repeated calls are safe.
+    """
+    target = Path(db_path)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    create_table_sql = """
+            CREATE TABLE IF NOT EXISTS ingest_jobs (
+                job_id TEXT PRIMARY KEY,
+                status TEXT NOT NULL,
+                collection_name TEXT NOT NULL,
+                filename TEXT NOT NULL,
+                message TEXT NOT NULL DEFAULT '',
+                document_ids_json TEXT NOT NULL DEFAULT '[]',
+                created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
+                updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
+            )
+            """
+    async with aiosqlite.connect(target.as_posix()) as db:
+        await db.execute(create_table_sql)
+        await db.commit()
+async def create_ingest_job(
+    db_path: str,
+    *,
+    collection_name: str,
+    filename: str,
+) -> str:
+    """Insert a new ingest job row in the ``queued`` state and return its id.
+
+    The id is a freshly generated UUID4 string. The table is initialised
+    before the insert so the call works on a database that has no schema yet.
+    """
+    new_job_id = str(uuid4())
+    insert_sql = """
+            INSERT INTO ingest_jobs (
+                job_id, status, collection_name, filename, message, document_ids_json
+            ) VALUES (?, 'queued', ?, ?, '', '[]')
+            """
+    values = (new_job_id, collection_name, filename)
+    await init_jobs_db(db_path)
+    async with aiosqlite.connect(db_path) as db:
+        await db.execute(insert_sql, values)
+        await db.commit()
+    return new_job_id
+async def update_ingest_job(
+    db_path: str,
+    job_id: str,
+    *,
+    status: str,
+    message: str | None = None,
+    document_ids: list[str] | None = None,
+) -> None:
+    """Update the status of an ingest job and refresh its ``updated_at`` stamp.
+
+    A ``message`` of ``None`` keeps the stored message (via ``COALESCE``).
+    ``document_ids`` is written, JSON-encoded, only when it is not ``None``;
+    otherwise the stored list is left as it is.
+    """
+    await init_jobs_db(db_path)
+    async with aiosqlite.connect(db_path) as db:
+        # Choose the statement and its parameters, then run a single execute.
+        if document_ids is None:
+            statement = """
+                UPDATE ingest_jobs
+                SET status = ?, message = COALESCE(?, message),
+                    updated_at = CURRENT_TIMESTAMP
+                WHERE job_id = ?
+                """
+            bindings = (status, message, job_id)
+        else:
+            statement = """
+                UPDATE ingest_jobs
+                SET status = ?, message = COALESCE(?, message), document_ids_json = ?,
+                    updated_at = CURRENT_TIMESTAMP
+                WHERE job_id = ?
+                """
+            bindings = (status, message, json.dumps(document_ids), job_id)
+        await db.execute(statement, bindings)
+        await db.commit()
+async def get_ingest_job(db_path: str, job_id: str) -> IngestJobDetail | None:
+    """Fetch one ingest job by id, or return ``None`` when no row matches.
+
+    The stored ``document_ids_json`` column is decoded into the
+    ``document_ids`` list before the row is validated as an ``IngestJobDetail``.
+    """
+    select_sql = """
+            SELECT job_id, status, collection_name, filename, message, document_ids_json, created_at, updated_at
+            FROM ingest_jobs
+            WHERE job_id = ?
+            """
+    await init_jobs_db(db_path)
+    async with aiosqlite.connect(db_path) as db:
+        db.row_factory = aiosqlite.Row
+        result = await db.execute(select_sql, (job_id,))
+        found = await result.fetchone()
+    if found is None:
+        return None
+    record = dict(found)
+    encoded_ids = record.pop("document_ids_json")
+    # An empty stored value is treated as an empty list.
+    record["document_ids"] = json.loads(encoded_ids or "[]")
+    return IngestJobDetail.model_validate(record)
+async def list_ingest_jobs(db_path: str, *, limit: int, offset: int) -> list[dict[str, Any]]:
+    """Return a page of ingest jobs as plain dicts, most recently updated first.
+
+    Rows are ordered by ``updated_at`` descending, with ``rowid`` descending as
+    the tie-breaker, and sliced with ``LIMIT``/``OFFSET``.
+    """
+    page_sql = """
+            SELECT job_id, status, collection_name, filename, created_at
+            FROM ingest_jobs
+            ORDER BY datetime(updated_at) DESC, rowid DESC
+            LIMIT ? OFFSET ?
+            """
+    await init_jobs_db(db_path)
+    async with aiosqlite.connect(db_path) as db:
+        db.row_factory = aiosqlite.Row
+        result = await db.execute(page_sql, (limit, offset))
+        fetched = await result.fetchall()
+    return list(map(dict, fetched))

workers/__init__.py ADDED Viewed

File without changes

workers/ingest_worker.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import asyncio
+from pathlib import Path
+from rag.chunker import chunk_documents
+from rag.embedder import create_embedding_function
+from rag.loader import load_documents
+from rag.vector_store import add_documents, get_vector_store
+from storage.job_store import update_ingest_job
+def _ingest_sync(temp_path: str, collection_name: str, chroma_persist_directory: str) -> tuple[list[str], int]:
+    """Load, chunk and store one document; return ``(document_ids, chunk_count)``.
+
+    Raises:
+        ValueError: when chunking the loaded document yields nothing.
+    """
+    pieces = chunk_documents(load_documents(temp_path))
+    if not pieces:
+        raise ValueError("No content to ingest.")
+    store = get_vector_store(
+        persist_directory=chroma_persist_directory,
+        collection_name=collection_name,
+        embedding_function=create_embedding_function(),
+    )
+    stored_ids = add_documents(store, pieces)
+    return stored_ids, len(pieces)
+async def run_ingest_job(
+    job_id: str,
+    temp_path: str,
+    collection_name: str,
+    jobs_db_path: str,
+    chroma_persist_directory: str,
+) -> None:
+    """Run one ingest job and record its outcome in the jobs database.
+
+    The job is marked ``processing``, the blocking ingest runs in a worker
+    thread via ``asyncio.to_thread``, and the job is then marked ``completed``
+    (with the stored document ids) or ``failed`` (with the error text).
+    The temporary file at ``temp_path`` is always removed at the end.
+
+    Exceptions derived from ``Exception`` are not re-raised: they are logged
+    and written to the job row instead.
+    """
+    import logging
+
+    logger = logging.getLogger(__name__)
+    try:
+        await update_ingest_job(
+            jobs_db_path,
+            job_id,
+            status="processing",
+            message="Ingestion in progress.",
+        )
+        document_ids, num_chunks = await asyncio.to_thread(
+            _ingest_sync,
+            temp_path,
+            collection_name,
+            chroma_persist_directory,
+        )
+        await update_ingest_job(
+            jobs_db_path,
+            job_id,
+            status="completed",
+            message=f"Ingested {num_chunks} chunks.",
+            document_ids=document_ids,
+        )
+    except Exception as exc:
+        # This function reports failures only through the job row, so keep
+        # the traceback in the logs as well.
+        logger.exception("Ingest job %s failed", job_id)
+        try:
+            await update_ingest_job(
+                jobs_db_path,
+                job_id,
+                status="failed",
+                # Some exceptions stringify to ""; fall back to the class name
+                # so a failed job never shows a blank reason.
+                message=str(exc) or type(exc).__name__,
+            )
+        except Exception:
+            # The jobs database may itself be the cause; log the secondary
+            # error instead of letting it escape with nothing recorded.
+            logger.exception("Could not record failure for ingest job %s", job_id)
+    finally:
+        Path(temp_path).unlink(missing_ok=True)