Hammad712 commited on
Commit
d2654d6
·
0 Parent(s):

Added ingestion code

Browse files
.dockerignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__
2
+ venv/
3
+ .env
4
+ .git
5
+ .gitignore
6
+ *.pyc
7
+ *.pyo
8
+ *.pyd
9
+ .DS_Store
.env.example ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Example env vars. Do NOT commit real secrets.
2
+ VOYAGE_API_KEY=
3
+ QDRANT_API_KEY=
4
+ QDRANT_URL=
5
+ GOOGLE_API_KEY=
6
+ DRY_RUN=1
.gitignore ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --- Python ---
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # --- Virtual Environments ---
7
+ # Common names for virtual envs
8
+ venv/
9
+ env/
10
+ .env/
11
+ .venv/
12
+
13
+ # --- Environment Variables (CRITICAL) ---
14
+ # Never commit secrets or api keys
15
+ .env
16
+ .env.local
17
+ .env.development.local
18
+ .env.test.local
19
+ .env.production.local
20
+
21
+ # --- Distribution / Packaging ---
22
+ dist/
23
+ build/
24
+ *.egg-info/
25
+
26
+ # --- Testing & Coverage ---
27
+ .pytest_cache/
28
+ .coverage
29
+ htmlcov/
30
+ coverage.xml
31
+
32
+ # --- Jupyter Notebooks (if applicable) ---
33
+ .ipynb_checkpoints
34
+
35
+ # --- IDE / Editors ---
36
+ .vscode/
37
+ .idea/
38
+ *.swp
39
+
40
+ # --- Databases ---
41
+ # Ignore local SQLite databases so you don't overwrite prod data or commit binary blobs
42
+ *.sqlite3
43
+ *.db
44
+
45
+ # --- Logs ---
46
+ *.log
47
+ logs/
Dockerfile ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use an official lightweight Python image
2
+ FROM python:3.10-slim
3
+
4
+ # Set environment variables
5
+ # PYTHONDONTWRITEBYTECODE: Prevents Python from writing pyc files to disc
6
+ # PYTHONUNBUFFERED: Ensures logs are flushed immediately
7
+ ENV PYTHONDONTWRITEBYTECODE=1 \
8
+ PYTHONUNBUFFERED=1 \
9
+ # Point Hugging Face cache to a writable directory
10
+ HF_HOME=/app/.cache/huggingface
11
+
12
+ # Set the working directory
13
+ WORKDIR /app
14
+
15
+ # Create a non-root user with a specific UID (1000) for security & HF compatibility
16
+ # and give them ownership of the /app directory
17
+ RUN useradd -m -u 1000 user && \
18
+ chown -R user:user /app
19
+
20
+ # Switch to the non-root user
21
+ USER user
22
+
23
+ # Set up the PATH to include the user's local bin (where pip installs tools)
24
+ ENV PATH="/home/user/.local/bin:$PATH"
25
+
26
+ # Copy the requirements file first to leverage Docker cache
27
+ COPY --chown=user:user requirements.txt .
28
+
29
+ # Install dependencies
30
+ RUN pip install --no-cache-dir --upgrade pip && \
31
+ pip install --no-cache-dir -r requirements.txt
32
+
33
+ # Copy the rest of the application code
34
+ COPY --chown=user:user . .
35
+
36
+ # Expose port 7860 (Required for Hugging Face Spaces)
37
+ EXPOSE 7860
38
+
39
+ # Command to run the application
40
+ # Note: Ensure your main file is named 'main.py' and the app instance is 'app'
41
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ PDF Extraction FastAPI
2
+ ======================
3
+
4
+ This repository provides a FastAPI wrapper around a PDF visual/text extraction pipeline.
5
+
6
+ Structure
7
+ - `app/` — application package
8
+ - `main.py` — FastAPI app and endpoints
9
+ - `services/pipeline_service.py` — pipeline logic (model calls + Qdrant ingestion)
10
+ - `services/qdrant_service.py` — markdown chunking + Qdrant ingest
11
+ - `utils.py` — helpers
12
+
13
+ Quickstart
14
+ 1. Install system dependency for `pdf2image` (Debian/Ubuntu):
15
+
16
+ ```bash
17
+ sudo apt-get update && sudo apt-get install -y poppler-utils
18
+ ```
19
+
20
+ 2. Create virtualenv and install Python deps:
21
+
22
+ ```bash
23
+ python3 -m venv .venv
24
+ source .venv/bin/activate
25
+ pip install -r requirements.txt
26
+ ```
27
+
28
+ 3. Run the app:
29
+
30
+ ```bash
31
+ uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
32
+ ```
33
+
34
+ 4. Use the `/process/pdf/stream` endpoint to upload a PDF (Swagger UI at `/docs`); progress is streamed back as server-sent events.
35
+
36
+ Notes
37
+ - The pipeline performs real model calls and Qdrant ingestion when API keys are configured (see `.env.example`); without keys the clients are left uninitialised and calls fail fast at startup.
38
+ - To ingest into Qdrant, provide your credentials; chunking and ingestion live in `app/services/qdrant_service.py`.
app/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """App package for PDF extraction service."""
2
+
3
+ __all__ = ["main", "pipeline", "qdrant_ingest", "schemas", "utils"]
app/core/config.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Dict
3
+
4
+ from dotenv import load_dotenv
5
+ load_dotenv()
6
+
7
+
8
def load_config() -> Dict[str, str]:
    """Snapshot the service configuration from environment variables.

    Missing secrets/URLs fall back to empty strings; the collection name
    and batch size fall back to their documented defaults, so callers can
    rely on every key being present in the returned mapping.
    """
    env = os.environ.get
    return {
        "GOOGLE_API_KEY": env("GOOGLE_API_KEY", ""),
        "VOYAGE_API_KEY": env("VOYAGE_API_KEY", ""),
        "QDRANT_URL": env("QDRANT_URL", ""),
        "QDRANT_API_KEY": env("QDRANT_API_KEY", ""),
        "QDRANT_COLLECTION": env("QDRANT_COLLECTION", "mercurygse"),
        "QDRANT_BATCH_SIZE": env("QDRANT_BATCH_SIZE", "256"),
    }
app/main.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from .routes import router as api_router
3
+ from .services import model_client
4
+ from .core import config as core_config
5
+ import os
6
+ import logging
7
+
8
+ logger = logging.getLogger("pdf_extraction")
9
+ if not logger.handlers:
10
+ # simple default handler
11
+ h = logging.StreamHandler()
12
+ fmt = logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s")
13
+ h.setFormatter(fmt)
14
+ logger.addHandler(h)
15
+ logger.setLevel(logging.INFO)
16
+
17
+ app = FastAPI(title="PDF Extraction Service")
18
+
19
+
20
@app.on_event("startup")
def startup_event():
    """Eagerly initialise the external service clients at application boot.

    Only the *presence* of each credential is logged — secret values are
    never written to the log stream.
    """
    cfg = core_config.load_config()
    logger.info("GOOGLE_API_KEY set: %s", bool(cfg.get("GOOGLE_API_KEY")))
    logger.info("VOYAGE_API_KEY set: %s", bool(cfg.get("VOYAGE_API_KEY")))
    logger.info("QDRANT_URL set: %s", bool(cfg.get("QDRANT_URL")))
    logger.info("QDRANT_API_KEY set: %s", bool(cfg.get("QDRANT_API_KEY")))

    gen_client = model_client.init_genai_client(cfg.get("GOOGLE_API_KEY"))
    if not gen_client:
        logger.warning("GenAI client not initialized - missing key or import failure")
    else:
        logger.info("GenAI client initialized successfully")

    embedder = model_client.init_embeddings(cfg.get("VOYAGE_API_KEY"))
    if not embedder:
        logger.warning("Embeddings client not initialized - missing key or import failure")
    else:
        logger.info("Embeddings client initialized successfully")

    vector_db = model_client.init_qdrant_client(cfg.get("QDRANT_URL"), cfg.get("QDRANT_API_KEY"))
    if not vector_db:
        logger.warning("Qdrant client not initialized - missing URL/API key or import failure")
    else:
        logger.info("Qdrant client initialized successfully")
47
+
48
+
49
+ app.include_router(api_router)
50
+
51
+
52
@app.get("/", tags=["root"])
def read_root():
    """Landing endpoint confirming the service is up."""
    greeting = {"message": "Welcome to the PDF Extraction Service"}
    return greeting
55
+
56
+
app/routes/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter
2
+
3
+ router = APIRouter()
4
+
5
+ from . import process, health # noqa: E402,F401
6
+
7
+ router.include_router(process.router, prefix="/process", tags=["process"])
8
+ router.include_router(health.router, prefix="/health", tags=["health"])
app/routes/health.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter
2
+
3
+ router = APIRouter()
4
+
5
+
6
@router.get("/live")
async def live():
    """Liveness probe: the process is running and serving requests."""
    payload = {"status": "ok"}
    return payload
9
+
10
+
11
@router.get("/ready")
async def ready():
    """Readiness probe: the service reports itself able to take traffic."""
    payload = {"status": "ready"}
    return payload
app/routes/ingest.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, HTTPException
2
+ import os
3
+ from typing import Optional
4
+ from ..services.qdrant_service import chunk_markdown_by_page, ingest_chunks_into_qdrant
5
+
6
+ router = APIRouter()
7
+
8
+
9
@router.post("/")
async def ingest(report_path: str, collection: Optional[str] = "mercurygse"):
    """Chunk an on-disk markdown report and push the chunks into Qdrant.

    NOTE(review): ``report_path`` is taken verbatim from the client, so any
    server-readable file path can be probed/ingested — confirm this endpoint
    is not exposed to untrusted callers.
    """
    if not os.path.exists(report_path):
        raise HTTPException(status_code=404, detail='Report not found')
    page_chunks = chunk_markdown_by_page(report_path)
    outcome = ingest_chunks_into_qdrant(page_chunks, collection_name=collection)
    return {"chunks": len(page_chunks), "ingest_result": outcome}
app/routes/process.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, UploadFile, File, HTTPException
2
+ from fastapi.responses import FileResponse, StreamingResponse
3
+ import os
4
+ import json
5
+ import uuid
6
+ import queue
7
+ import threading
8
+ from typing import Optional
9
+ from ..utils import save_upload_file_tmp
10
+ from ..services.pipeline_service import run_pipeline
11
+ import logging
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ router = APIRouter()
16
@router.post("/pdf/stream")
async def process_pdf_stream(file: UploadFile = File(...), max_pages: Optional[int] = None):
    """Run the extraction pipeline on an uploaded PDF, streaming progress as SSE.

    The upload is saved to a temp file, the pipeline runs on a daemon thread,
    and progress events are forwarded to the client as `text/event-stream`
    lines (`data: {...}\\n\\n`). The temp file is removed when the stream ends.

    Raises:
        HTTPException(400): when the upload is missing a filename or is not a PDF.
    """
    # Guard against a missing filename as well as a wrong extension:
    # UploadFile.filename may be None, and None.lower() would raise.
    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail='Only PDF uploads are supported')
    tmp_path, filename = save_upload_file_tmp(file)

    q = queue.Queue()
    job_id = str(uuid.uuid4())
    logger.info("Received upload %s -> %s; job=%s", file.filename, tmp_path, job_id)

    def progress_hook(ev: dict):
        # Tag every pipeline event with the job id before queueing it.
        ev_out = {"job_id": job_id, **ev}
        q.put(ev_out)

    def worker():
        # Run the pipeline off the event loop; report completion/failure via the queue.
        try:
            run_pipeline(tmp_path, max_pages=max_pages, progress_hook=progress_hook, doc_id=job_id, original_filename=filename)
            q.put({"job_id": job_id, "event": "worker_done"})
        except Exception as e:
            q.put({"job_id": job_id, "event": "error", "error": str(e)})

    thread = threading.Thread(target=worker, daemon=True)
    thread.start()

    def event_generator():
        try:
            while True:
                try:
                    ev = q.get(timeout=0.5)
                except queue.Empty:
                    # FIX: was `except Exception`, which would also hide real
                    # errors; only a timed-out poll should fall through here.
                    if not thread.is_alive():
                        break
                    continue
                # SSE format
                s = f"data: {json.dumps(ev)}\n\n"
                yield s.encode('utf-8')
            # drain any remaining events
            while not q.empty():
                ev = q.get()
                s = f"data: {json.dumps(ev)}\n\n"
                yield s.encode('utf-8')
        finally:
            # Best-effort cleanup of the temp upload once the stream closes.
            try:
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)
            except OSError:
                pass

    return StreamingResponse(event_generator(), media_type='text/event-stream')
65
+
66
+
67
@router.get("/report")
async def download_report(path: str):
    """Serve a previously generated markdown report from disk.

    NOTE(review): ``path`` comes straight from the client — confirm exposure
    before deploying to untrusted networks.
    """
    if os.path.exists(path):
        return FileResponse(path, media_type='text/markdown', filename=os.path.basename(path))
    raise HTTPException(status_code=404, detail='Report not found')
app/schemas/models.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+ from typing import List, Optional
3
+
4
+
5
class RouterOutput(BaseModel):
    """Router-agent verdict for one page image (shape mirrors ROUTER_PROMPT)."""
    route: str  # "complex" | "simple"
    contains_visual: bool
    visual_types: List[str]  # e.g. "map", "chart", "diagram", "complex_table", ...
    reason: str = Field(..., min_length=8)  # short plain-English justification
    confidence: float = Field(..., ge=0.0, le=1.0)
11
+
12
+
13
class KeyComponent(BaseModel):
    """A single labelled element extracted from a diagram page."""
    name: str  # label text, or "[ILLEGIBLE]"
    description: str  # verbatim descriptor or short spatial hint
    extraction_confidence: Optional[float] = Field(None, ge=0.0, le=1.0)
17
+
18
+
19
class DiagramExtraction(BaseModel):
    """Structured output for a 'complex' page (shape mirrors COMPLEX_PROMPT)."""
    schema_id: str = Field("diagram_v1")  # schema version tag
    pdf_page: int  # 1-based page index; overwritten by the pipeline
    printed_page: Optional[str]  # page number printed on the page, if legible
    title: str
    category: str  # normalised against ALLOWED_COMPLEX_CATEGORIES downstream
    summary: str  # two-sentence factual summary
    key_components: List[KeyComponent] = Field(default_factory=list)
    relationships: str  # visible relationships, or "[NONE]"
    raw_text: str  # remaining visible text, verbatim
    extraction_confidence: float = Field(..., ge=0.0, le=1.0)
30
+
31
+
32
class SimpleExtraction(BaseModel):
    """Structured output for a text-dominant page (shape mirrors SIMPLE_PROMPT)."""
    schema_id: str = Field("simple_v1")  # schema version tag
    pdf_page: int  # 1-based page index; overwritten by the pipeline
    printed_page: Optional[str]  # page number printed on the page, if legible
    topic: str
    summary: str  # two-sentence summary strictly from visible text
    content_markdown: str  # full page transcription as Markdown
    important_dates_or_entities: List[str] = Field(default_factory=list)  # exact strings seen
    extraction_confidence: float = Field(..., ge=0.0, le=1.0)
app/services/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """Service package initialiser.
2
+
3
+ Avoid importing submodules at package import time to prevent circular
4
+ imports (modules should import specific submodules directly where needed).
5
+ """
6
+
7
+ __all__ = ["model_client", "qdrant_service", "pipeline_service"]
app/services/model_client.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Model client and embedding factories.
3
+ Read API keys from environment variables.
4
+ """
5
+ import os
6
+ from typing import Optional
7
+ from ..core import config as core_config
8
+
9
+ genai_client = None
10
+ embeddings = None
11
+ qdrant_client = None
12
+
13
+
14
def init_genai_client(api_key: Optional[str] = None):
    """Create (or clear) the module-level Google GenAI client.

    Falls back to the configured GOOGLE_API_KEY when no key is passed.
    Returns the client, or None when the key is missing or the SDK
    cannot be imported.
    """
    global genai_client
    try:
        from google import genai
        key = api_key
        if key is None:
            key = core_config.load_config().get("GOOGLE_API_KEY")
        genai_client = genai.Client(api_key=key) if key else None
    except Exception:
        # Optional dependency / config failure: leave the client unset.
        genai_client = None
    return genai_client
25
+
26
+
27
def init_embeddings(voyage_api_key: Optional[str] = None):
    """Create the module-level VoyageAI embeddings client.

    Resolves the API key from the argument, falling back to the configured
    VOYAGE_API_KEY. Returns the embeddings object, or None when no key is
    available or the dependency cannot be imported.
    """
    global embeddings
    try:
        from langchain_voyageai import VoyageAIEmbeddings
        if voyage_api_key is None:
            cfg = core_config.load_config()
            voyage_api_key = cfg.get("VOYAGE_API_KEY")
        if voyage_api_key:
            # FIX: use direct assignment instead of os.environ.setdefault so an
            # explicitly supplied key always wins over a stale value already
            # present in the environment (setdefault silently kept the old one).
            os.environ["VOYAGE_API_KEY"] = voyage_api_key
            embeddings = VoyageAIEmbeddings(model="voyage-3-large")
            return embeddings
    except Exception:
        # Optional dependency / config failure: treat as "not configured".
        pass
    return None
41
+
42
+
43
def init_qdrant_client(url: Optional[str] = None, api_key: Optional[str] = None):
    """Create the module-level Qdrant client.

    Missing url/api_key values fall back to the loaded configuration.
    Returns the client on success, otherwise None (no URL, or the
    qdrant-client package is unavailable).
    """
    global qdrant_client
    try:
        from qdrant_client import QdrantClient
        if url is None or api_key is None:
            cfg = core_config.load_config()
            url = url if url is not None else cfg.get("QDRANT_URL")
            api_key = api_key if api_key is not None else cfg.get("QDRANT_API_KEY")
        if url:
            qdrant_client = QdrantClient(url=url, api_key=api_key, prefer_grpc=False)
            return qdrant_client
    except Exception:
        qdrant_client = None
    return None
59
+
60
+
61
class ModelClient:
    """Simple wrapper that exposes current clients as properties.

    The module keeps module-level references (genai_client, embeddings, qdrant_client)
    and this wrapper exposes them dynamically so other modules can import
    `model_client` and access attributes like `model_client.genai_client`.
    """

    @property
    def genai_client(self):
        # Re-read the module global on every access so late initialisation
        # (e.g. at FastAPI startup) is visible to earlier importers.
        return genai_client

    @property
    def embeddings(self):
        # Module global; None until init_embeddings() succeeds.
        return embeddings

    @property
    def qdrant_client(self):
        # Module global; None until init_qdrant_client() succeeds.
        return qdrant_client

    def init_all(self):
        """Initialise all three clients from environment-derived config."""
        init_genai_client()
        init_embeddings()
        init_qdrant_client()


# Shared singleton used throughout the app (e.g. `model_client.genai_client`).
model_client = ModelClient()
app/services/pipeline_service.py ADDED
@@ -0,0 +1,756 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Full pipeline service adapted from the user's original script.
3
+
4
+ This module expects API clients to be available via `app.services.model_client`.
5
+ Per project configuration, this pipeline performs real model calls and Qdrant ingestion.
6
+ """
7
+ import os
8
+ import time
9
+ import random
10
+ import re
11
+ import gc
12
+ import queue
13
+ import threading
14
+ from typing import List, Optional, Callable, Any, Dict, Tuple
15
+ from concurrent.futures import ThreadPoolExecutor, as_completed
16
+ from threading import BoundedSemaphore, Event, Lock
17
+
18
+ from pdf2image import convert_from_path, pdfinfo_from_path
19
+ from pydantic import BaseModel, Field
20
+ from tqdm import tqdm
21
+
22
+ from ..schemas import models as schema_models
23
+ from . import model_client
24
+ from google import genai as _genai_module
25
+ from google.genai import types as genai_types
26
+ from .. import utils as app_utils
27
+ from . import qdrant_service
28
+ import tempfile
29
+ import logging
30
+
31
+ # note: don't capture clients at import time; use the factory `model_client` to get live instances
32
+
33
+ # ---------------------------
34
+ # Configuration (tuned for faster processing)
35
+ # ---------------------------
36
+ ROUTER_WORKERS = int(os.environ.get("ROUTER_WORKERS", 16))
37
+ SIMPLE_WORKERS = int(os.environ.get("SIMPLE_WORKERS", 12))
38
+ COMPLEX_WORKERS = int(os.environ.get("COMPLEX_WORKERS", 6))
39
+
40
+ FLASH_CONCURRENCY = SIMPLE_WORKERS
41
+ PRO_CONCURRENCY = COMPLEX_WORKERS
42
+
43
+ FLASH_MIN_INTERVAL = float(os.environ.get("FLASH_MIN_INTERVAL", 0.05))
44
+ PRO_MIN_INTERVAL = float(os.environ.get("PRO_MIN_INTERVAL", 0.20))
45
+
46
+ RETRY_ATTEMPTS = int(os.environ.get("RETRY_ATTEMPTS", 3))
47
+ # Circuit breaker tuning (env override)
48
+ CIRCUIT_THRESHOLD = int(os.environ.get("CIRCUIT_THRESHOLD", 8))
49
+ CIRCUIT_WINDOW = float(os.environ.get("CIRCUIT_WINDOW", 60.0))
50
+
51
+ # logger for this module
52
+ logger = logging.getLogger("pdf_extraction.pipeline")
53
+ if not logger.handlers:
54
+ ch = logging.StreamHandler()
55
+ ch.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s"))
56
+ logger.addHandler(ch)
57
+ logger.setLevel(logging.INFO)
58
+
59
+ # Token-bucket rate limiter settings (tunable via env to match Colab expectations)
60
+ FLASH_RATE = float(os.environ.get("FLASH_RATE", 4.0)) # calls per second for flash family
61
+ PRO_RATE = float(os.environ.get("PRO_RATE", 1.0)) # calls per second for pro family
62
+
63
+
64
class TokenBucket:
    """Simple thread-safe token bucket limiter.

    - rate: tokens added per second
    - capacity: maximum tokens (defaults to `rate`, i.e. ~1 second of burst)
    """
    def __init__(self, rate: float, capacity: Optional[float] = None):
        self.rate = float(rate)
        self.capacity = float(capacity or rate)
        self._tokens = self.capacity  # start full so an initial burst is allowed
        self._last = time.time()
        self._lock = threading.Lock()

    def _refill(self, now: float) -> None:
        # Credit tokens for the time elapsed since the last refill, capped
        # at capacity. Caller must hold self._lock.
        self._tokens = min(self.capacity, self._tokens + (now - self._last) * self.rate)
        self._last = now

    def wait(self):
        """Block until one token is available, then consume it.

        FIX over the original implementation: after sleeping we re-acquire
        the lock, refill, and re-check instead of blindly decrementing.
        The old code never credited the tokens accrued while sleeping and
        could let several concurrent waiters all "consume" the same token,
        clamping the bucket to zero instead of actually pacing callers.
        """
        while True:
            with self._lock:
                self._refill(time.time())
                if self._tokens >= 1.0:
                    self._tokens -= 1.0
                    return
                # Time until the next whole token accrues.
                needed = (1.0 - self._tokens) / self.rate
            # Sleep outside the lock so other threads can refill/consume.
            time.sleep(needed)
94
+
95
+
96
+ # instantiate global rate limiters
97
+ flash_rate_limiter = TokenBucket(FLASH_RATE, capacity=max(FLASH_RATE, 1.0))
98
+ pro_rate_limiter = TokenBucket(PRO_RATE, capacity=max(PRO_RATE, 1.0))
99
+
100
+ # Semaphores and locks
101
+ flash_sema = BoundedSemaphore(FLASH_CONCURRENCY)
102
+ pro_sema = BoundedSemaphore(PRO_CONCURRENCY)
103
+ flash_lock = Lock()
104
+ pro_lock = Lock()
105
+ _last_flash = 0.0
106
+ _last_pro = 0.0
107
+
108
+ # Simple in-memory circuit breaker: model_name -> (consecutive_failures, last_failure_time)
109
+ _circuit_breaker: Dict[str, Tuple[int, float]] = {}
110
+
111
def flash_wait():
    """Enforce the minimum spacing between consecutive flash-family calls.

    Sleeps (while holding the lock, which serialises callers) until at
    least FLASH_MIN_INTERVAL has elapsed since the previous call.
    """
    global _last_flash
    with flash_lock:
        remaining = FLASH_MIN_INTERVAL - (time.time() - _last_flash)
        if remaining > 0:
            time.sleep(remaining)
        _last_flash = time.time()
119
+
120
def pro_wait():
    """Enforce the minimum spacing between consecutive pro-family calls.

    Sleeps (while holding the lock, which serialises callers) until at
    least PRO_MIN_INTERVAL has elapsed since the previous call.
    """
    global _last_pro
    with pro_lock:
        remaining = PRO_MIN_INTERVAL - (time.time() - _last_pro)
        if remaining > 0:
            time.sleep(remaining)
        _last_pro = time.time()
128
+
129
+ # ---------------------------
130
+ # Category taxonomy (strict)
131
+ # ---------------------------
132
+ ALLOWED_COMPLEX_CATEGORIES = {
133
+ "Labeled Equipment Diagram",
134
+ "Exploded Parts Diagram",
135
+ "Technical Schematic",
136
+ "Flowchart",
137
+ "Process Diagram",
138
+ "Wiring Diagram",
139
+ "Choropleth Map",
140
+ "Geographic Reference Map",
141
+ "Infographic",
142
+ "Complex Table",
143
+ "Annotated Photograph",
144
+ "Safety Label Diagram",
145
+ }
146
+
147
+ # alias schema classes
148
+ RouterOutput = schema_models.RouterOutput
149
+ KeyComponent = schema_models.KeyComponent
150
+ DiagramExtraction = schema_models.DiagramExtraction
151
+ SimpleExtraction = schema_models.SimpleExtraction
152
+
153
+ # Prompts (kept as in user input)
154
+ ROUTER_PROMPT = r"""
155
+ SYSTEM:
156
+ You are the ROUTER AGENT that classifies ONE PAGE IMAGE into either 'complex' or 'simple' based only on visible visuals and visible text.
157
+ Do NOT guess. Use "[ILLEGIBLE]" for unreadable text.
158
+
159
+ OUTPUT EXACTLY one JSON object and NOTHING else with these fields:
160
+ {
161
+ "route": "complex" | "simple",
162
+ "contains_visual": true | false,
163
+ "visual_types": ["map","infographic","chart","diagram","complex_table","photo","logo","other"],
164
+ "reason": "<8-120 characters plain English>",
165
+ "confidence": 0.00
166
+ }
167
+
168
+ If confidence < 0.70 set "route" to "complex".
169
+ """
170
+
171
+ COMPLEX_PROMPT = r"""
172
+ SYSTEM:
173
+ You are a Technical Diagram & Visual Extraction Specialist.
174
+ You will be given ONE PAGE IMAGE (diagram, map, flowchart, complex table, infographic, or annotated photo).
175
+ Produce EXACTLY one JSON matching the schema below and NOTHING else.
176
+
177
+ Rules:
178
+ - Transcribe ALL visible labels, legend entries, axis ticks and captions verbatim. Use "[ILLEGIBLE]" for unreadable fragments.
179
+ - Do NOT invent values, units, or relationships not visible.
180
+ - Choose EXACTLY ONE category from the provided list (do not create new names).
181
+ - Provide 'printed_page' if a printed page number is visible on the page (e.g., 'PAGE 2' or '4'); otherwise use null or "[ILLEGIBLE]".
182
+ - Provide extraction_confidence 0.00–1.00 reflecting overall certainty.
183
+
184
+ ALLOWED CATEGORIES:
185
+ Labeled Equipment Diagram, Exploded Parts Diagram, Technical Schematic, Flowchart,
186
+ Process Diagram, Wiring Diagram, Choropleth Map, Geographic Reference Map, Infographic,
187
+ Complex Table, Annotated Photograph, Safety Label Diagram, Other
188
+
189
+ SCHEMA:
190
+ {
191
+ "schema_id":"diagram_v1",
192
+ "pdf_page": <integer - program will supply; model may also include printed_page string or [ILLEGIBLE]>,
193
+ "printed_page": "<string|null>",
194
+ "title": "<string>",
195
+ "category": "<one of the allowed categories>",
196
+ "summary": "<2-sentence factual summary>",
197
+ "key_components":[
198
+ {"name":"<label or [ILLEGIBLE]>","description":"<verbatim descriptor or short spatial hint>","extraction_confidence":0.00}
199
+ ],
200
+ "relationships":"<explicit relationships visible or '[NONE]'>",
201
+ "raw_text":"<all remaining visible text verbatim or [ILLEGIBLE]>",
202
+ "extraction_confidence": 0.00
203
+ }
204
+ """
205
+
206
+ SIMPLE_PROMPT = r"""
207
+ SYSTEM:
208
+ You are a Document Transcription Specialist. You will be given ONE PAGE IMAGE primarily containing readable text (paragraphs, headings, simple tables).
209
+ Produce EXACTLY one JSON matching the schema below and NOTHING else.
210
+
211
+ Rules:
212
+ - Transcribe text verbatim. Use "[ILLEGIBLE]" for unreadable fragments.
213
+ - Convert simple 1-row-per-record tables into Markdown tables.
214
+ - Provide 'printed_page' if visible; otherwise null or "[ILLEGIBLE]".
215
+ - Provide extraction_confidence 0.00–1.00.
216
+
217
+ SCHEMA:
218
+ {
219
+ "schema_id":"simple_v1",
220
+ "pdf_page": <integer - program will supply>,
221
+ "printed_page":"<string|null>",
222
+ "topic":"<string>",
223
+ "summary":"<2-sentence summary strictly from visible text>",
224
+ "content_markdown":"<full page transcribed into Markdown>",
225
+ "important_dates_or_entities":["<exact strings seen>"],
226
+ "extraction_confidence": 0.00
227
+ }
228
+ """
229
+
230
+
231
+ # Helpers: JSON substring extraction & pydantic-agnostic parse
232
def extract_json_substring(raw_text: str) -> str:
    """Return the outermost ``{...}`` span of *raw_text*, or the input unchanged.

    Model responses often wrap their JSON in prose or code fences; this trims
    the text to the first '{' through the last '}'. If no such span exists
    (including empty input), the original string is returned so the caller's
    JSON parser can produce a meaningful error.
    """
    if not raw_text:
        return raw_text
    # FIX/idiom: str.find/rfind return -1 instead of raising, so the original
    # try/except around .index() and the always-true `start >= 0` check go away.
    start = raw_text.find("{")
    end = raw_text.rfind("}")
    if start != -1 and end > start:
        return raw_text[start:end + 1]
    return raw_text
243
+
244
+
245
def parse_with_schema(schema_cls: Any, raw_json_str: str):
    """Parse *raw_json_str* with a pydantic model class, v2-first.

    Tries the pydantic-v2 entry point (`model_validate_json`) and falls back
    to the v1 API (`parse_raw`). A failure of the fallback propagates to the
    caller.
    """
    try:
        return schema_cls.model_validate_json(raw_json_str)
    except Exception:
        # pydantic v1 fallback; its exception (if any) is the one raised.
        return schema_cls.parse_raw(raw_json_str)
255
+
256
+
257
+ # Safe API call with backoff & rate shaping
258
def safe_generate_content(model_name: str, contents: list, config_obj: Any = None, is_flash: bool = False, is_pro: bool = False):
    """Make a model call with retries, spacing, semaphores and provider-aware backoff.

    This function attempts to parse provider RetryInfo / Retry-After hints from
    the exception (when available) and prefers that delay over the local
    exponential backoff. It still records failures for the circuit-breaker.

    Args:
        model_name: GenAI model to call; also the circuit-breaker key.
        contents: Prompt/content list forwarded to ``models.generate_content``.
        config_obj: Optional generation config forwarded as-is.
        is_flash: Apply flash-family pacing (flash_wait) and concurrency bound (flash_sema).
        is_pro: Apply pro-family pacing (pro_wait) and concurrency bound (pro_sema).

    Returns:
        The raw provider response object.

    Raises:
        RuntimeError: when the GenAI client is unconfigured, or the circuit is open.
        Exception: the last provider error once RETRY_ATTEMPTS is exhausted.
    """

    def _parse_retry_after(exc: Exception) -> Optional[float]:
        """Extract seconds from common Retry-After / RetryInfo patterns in exceptions."""
        # 1) Try to read common response-like attributes
        resp = getattr(exc, "response", None) or getattr(exc, "http_response", None)
        if resp is not None:
            headers = getattr(resp, "headers", None) or getattr(resp, "header", None)
            if headers and isinstance(headers, dict):
                ra = headers.get("Retry-After") or headers.get("retry-after")
                if ra:
                    try:
                        return float(ra)
                    except Exception:
                        pass
        # 2) Parse textual RetryInfo (e.g. "retryDelay": "9s") from str(exc)
        s = str(exc)
        m = re.search(r"retryDelay[\"']?\s*[:=]\s*[\"']?(\d+(?:\.\d+)?)s", s, flags=re.IGNORECASE)
        if m:
            try:
                return float(m.group(1))
            except Exception:
                pass
        m2 = re.search(r"Retry-After\s*[:=]?\s*(\d+(?:\.\d+)?)(?:s|\s|$)", s, flags=re.IGNORECASE)
        if m2:
            try:
                return float(m2.group(1))
            except Exception:
                pass
        return None

    # quick fail if client not configured to avoid poisoning circuit-breaker
    if model_client.genai_client is None:
        raise RuntimeError("GenAI client not configured. Ensure GOOGLE_API_KEY is set and the app was restarted.")

    base_delay = 0.5  # seed for the exponential backoff on rate-limit errors
    for attempt in range(1, RETRY_ATTEMPTS + 1):
        try:
            # simple circuit breaker per-model: refuse calls while the model has
            # accumulated too many recent consecutive failures.
            info = _circuit_breaker.get(model_name)
            if info:
                failures, last_time = info
                if failures >= CIRCUIT_THRESHOLD and (time.time() - last_time) < CIRCUIT_WINDOW:
                    logger.warning("Circuit open for %s (failures=%s, last=%s)", model_name, failures, last_time)
                    # NOTE(review): this raise happens *inside* the try, so it is
                    # caught by the handler below, counted as another failure and
                    # retried — confirm that is intended rather than failing fast.
                    raise RuntimeError(f"Circuit open for {model_name}")
            if is_flash:
                flash_wait()  # min-interval spacing between flash calls
                with flash_sema:  # bound flash concurrency
                    resp = model_client.genai_client.models.generate_content(model=model_name, contents=contents, config=config_obj)
            elif is_pro:
                pro_wait()  # min-interval spacing between pro calls
                with pro_sema:  # bound pro concurrency
                    resp = model_client.genai_client.models.generate_content(model=model_name, contents=contents, config=config_obj)
            else:
                # unshaped path: no pacing or concurrency bound applied
                resp = model_client.genai_client.models.generate_content(model=model_name, contents=contents, config=config_obj)
            # success -> reset circuit breaker for this model
            if model_name in _circuit_breaker:
                _circuit_breaker.pop(model_name, None)
                logger.info("Circuit breaker reset for %s after successful call", model_name)
            return resp
        except Exception as e:
            # record failure
            failures, last_time = _circuit_breaker.get(model_name, (0, 0.0))
            failures += 1
            _circuit_breaker[model_name] = (failures, time.time())
            logger.warning("Model %s failure recorded (count=%s): %s", model_name, failures, e)

            # Try to honor provider's Retry-After / RetryInfo if present
            retry_seconds = _parse_retry_after(e)
            s = str(e).lower()
            if any(k in s for k in ("429", "rate", "quota", "resource exhausted")):
                # compute backoff: prefer provider-specified retry, otherwise exponential
                if retry_seconds and retry_seconds > 0:
                    wait = max(retry_seconds, 0.5)
                else:
                    wait = base_delay * (2 ** (attempt - 1)) + random.uniform(0.05, 0.3)
                if attempt < RETRY_ATTEMPTS:
                    logger.warning("%s rate-limited. Attempt %s/%s - sleeping %.2fs (provider_retry=%s)", model_name, attempt, RETRY_ATTEMPTS, wait, retry_seconds)
                    time.sleep(wait)
                    continue
            # transient server/connection errors
            if attempt < RETRY_ATTEMPTS:
                wait = 0.3 + random.uniform(0, 0.5)
                logger.warning("%s transient error. Retry %s/%s after %.2fs... Error: %s", model_name, attempt, RETRY_ATTEMPTS, wait, e)
                time.sleep(wait)
                continue
            raise
351
+
352
+
353
+ # validate_and_retry wrapper
354
def validate_and_retry(call_fn: Callable[[], Any], schema_cls: Any, page_index: int, min_confidence: float = 0.60, max_attempts: int = 3) -> Tuple[dict, str]:
    """Call the model via *call_fn*, parse/validate its JSON reply, and retry.

    Retries (up to ``max_attempts``) when the reply fails to parse, reports a
    confidence below ``min_confidence``, or has a suspiciously short summary.

    Returns:
        (validated data dict, raw model text).

    Raises:
        RuntimeError: when every attempt fails to parse/validate.
    """
    last_raw = None
    last_error: Optional[Exception] = None
    for attempt in range(1, max_attempts + 1):
        resp = call_fn()
        raw = getattr(resp, "text", None) or str(resp)
        last_raw = raw
        candidate = extract_json_substring(raw)
        try:
            parsed_obj = parse_with_schema(schema_cls, candidate)
            # Support both pydantic v2 (model_dump) and v1 (dict).
            data = parsed_obj.model_dump() if hasattr(parsed_obj, "model_dump") else parsed_obj.dict()
            # The model cannot know the true PDF page index; always overwrite.
            data["pdf_page"] = page_index + 1
            conf = data.get("extraction_confidence") or data.get("confidence")
            if conf is None:
                return data, raw
            try:
                conf = float(conf)
            except (TypeError, ValueError):
                # FIX: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit.
                conf = 0.0
            if schema_cls is DiagramExtraction:
                # Clamp hallucinated categories to the strict taxonomy.
                if data.get("category", "") not in ALLOWED_COMPLEX_CATEGORIES:
                    data["category"] = "Other"
            if "printed_page" in data:
                # Normalise "no visible page number" to None.
                if not data["printed_page"] or data["printed_page"] == "[ILLEGIBLE]":
                    data["printed_page"] = None
            if conf < min_confidence:
                if attempt < max_attempts:
                    time.sleep(0.2 * attempt + random.uniform(0.02, 0.1))
                    continue
                # Final attempt: return the low-confidence result rather than fail.
                return data, raw
            if "summary" in data and isinstance(data["summary"], str):
                if len(data["summary"].strip()) < 20 and attempt < max_attempts:
                    time.sleep(0.15 + random.uniform(0, 0.1))
                    continue
            return data, raw
        except Exception as e:
            # FIX: use the module logger instead of print(), and remember the
            # exception — the original built the final error message from `e`
            # after the except block, where Python 3 has already unbound it.
            last_error = e
            logger.warning("parsing failed for page %s attempt %s: %s", page_index + 1, attempt, e)
            if attempt < max_attempts:
                time.sleep(0.3 * attempt + random.uniform(0.02, 0.2))
                continue
    raw_excerpt = (last_raw or "")[:1000]
    raise RuntimeError(f"Parsing/validation failed after {max_attempts} attempts for page {page_index+1}. Raw excerpt (first 1000 chars):\n{raw_excerpt}\nError: {last_error}")
397
+
398
+
399
+ # Markdown normalization
400
def normalize_markdown(md: str) -> str:
    """Normalize extracted markdown: promote ALL-CAPS lines to `##` headings and
    short trailing-colon lines to `###` headings; other lines pass through.

    Args:
        md: raw markdown/plain text from the extraction model.

    Returns:
        The normalized markdown (no trailing newline added).
    """
    def smart_title(text: str) -> str:
        # Title-case each word, but keep short all-caps tokens (acronyms like "PDF").
        return " ".join(w if (w.isupper() and len(w) <= 4) else w.capitalize() for w in text.split())

    # Heading heuristic: starts with an uppercase letter/digit, then 3+ chars of
    # caps/digits/punctuation, with at least 3 alphabetic characters overall.
    heading_re = re.compile(r'^[A-Z0-9][A-Z0-9 \-\/\(\)\.]{3,}$')
    normalized = []
    for line in md.splitlines():
        s = line.strip()
        if not s:
            normalized.append("")
        elif heading_re.match(s) and sum(1 for c in s if c.isalpha()) >= 3:
            normalized.append("## " + smart_title(s))
        elif s.endswith(":") and len(s) < 80:
            normalized.append("### " + s.rstrip(":"))
        else:
            # Preserve the original line (including its leading whitespace).
            normalized.append(line)
    return "\n".join(normalized)
427
+
428
+
429
+ # Worker functions (using genai client)
430
def get_image(pdf_path: str, page_index: int):
    """Render a single PDF page (0-based `page_index`) to a PIL image.

    Best-effort: returns None when rendering fails for any reason
    (corrupt page, missing poppler, out-of-range index).
    """
    try:
        pages = convert_from_path(
            pdf_path,
            first_page=page_index + 1,
            last_page=page_index + 1,
            fmt="jpeg",
        )
    except Exception:
        return None
    return pages[0] if pages else None
436
+
437
+
438
def router_worker(pdf_path: str, page_index: int) -> Dict:
    """Classify one page as 'simple' or 'complex' with the flash router model.

    Returns a dict {"page_index", "route", "raw"}. Any failure (image load,
    unparseable model output) falls back to route="complex" so ambiguous pages
    get the more thorough extraction path.
    """
    img = get_image(pdf_path, page_index)
    result = {"page_index": page_index, "route": "complex", "raw": None}
    if img is None:
        return result

    def call():
        cfg = {
            "response_mime_type": "application/json",
            "response_json_schema": RouterOutput.model_json_schema(),
            "temperature": 0.0
        }
        return safe_generate_content(model_name="gemini-2.0-flash", contents=[img, ROUTER_PROMPT], config_obj=cfg, is_flash=True)

    try:
        resp = call()
        raw = getattr(resp, "text", None) or str(resp)
        result["raw"] = raw
        try:
            parsed = parse_with_schema(RouterOutput, extract_json_substring(raw))
            out = parsed.model_dump() if hasattr(parsed, "model_dump") else parsed.dict()
        except Exception:
            # Unparseable router output: assume the worst case (complex page).
            out = {"route": "complex", "contains_visual": True, "visual_types": ["other"], "reason": "parse_failed", "confidence": 0.0}
        result["route"] = out.get("route", "complex")
        return result
    finally:
        # Release the page image promptly; PIL images can be large.
        try:
            img.close()
        except Exception:
            pass
        gc.collect()
465
+
466
+
467
def simple_worker(pdf_path: str, page_index: int) -> Dict:
    """Extract a text-only ("simple") page with the flash model.

    Returns {"page_index", "type": "SIMPLE", "data", "error", "raw"}; on failure
    `data` is None and `error` carries the message.
    """
    img = get_image(pdf_path, page_index)
    out = {"page_index": page_index, "type": "SIMPLE", "data": None, "error": None, "raw": None}
    if img is None:
        out["error"] = "image_load_failed"
        return out

    def call():
        cfg = genai_types.GenerateContentConfig(
            response_mime_type="application/json",
            response_json_schema=SimpleExtraction.model_json_schema(),
            # No "thinking" budget: simple pages are cheap, deterministic extraction.
            thinking_config=genai_types.ThinkingConfig(thinking_budget=0),
            media_resolution="media_resolution_medium",
            temperature=0.0
        )
        return safe_generate_content(model_name="gemini-2.5-flash-preview-09-2025", contents=[img, SIMPLE_PROMPT], config_obj=cfg, is_flash=True)

    try:
        data, raw = validate_and_retry(call, SimpleExtraction, page_index, min_confidence=0.6, max_attempts=RETRY_ATTEMPTS)
        data["pdf_page"] = page_index + 1
        if "printed_page" not in data or data.get("printed_page") in ("", "[ILLEGIBLE]"):
            data["printed_page"] = None
        if "content_markdown" in data and isinstance(data["content_markdown"], str):
            data["content_markdown"] = normalize_markdown(data["content_markdown"])
        out["data"] = data
        out["raw"] = raw
        return out
    except Exception as e:
        out["error"] = str(e)
        out["raw"] = None
        return out
    finally:
        # Release the page image promptly; PIL images can be large.
        try:
            img.close()
        except Exception:
            pass
        gc.collect()
500
+
501
+
502
def complex_worker(pdf_path: str, page_index: int) -> Dict:
    """Extract a diagram/complex page with the pro model (high media resolution).

    Returns {"page_index", "type": "COMPLEX", "data", "error", "raw"}; on failure
    `data` is None and `error` carries the message.
    """
    img = get_image(pdf_path, page_index)
    out = {"page_index": page_index, "type": "COMPLEX", "data": None, "error": None, "raw": None}
    if img is None:
        out["error"] = "image_load_failed"
        return out

    def call():
        cfg = genai_types.GenerateContentConfig(
            response_mime_type="application/json",
            response_json_schema=DiagramExtraction.model_json_schema(),
            # Low thinking + high media resolution: diagrams need pixels, not reasoning.
            thinking_config=genai_types.ThinkingConfig(thinking_level="low"),
            media_resolution="media_resolution_high",
            temperature=0.0
        )
        return safe_generate_content(model_name="gemini-3-pro-preview", contents=[img, COMPLEX_PROMPT], config_obj=cfg, is_pro=True)

    try:
        data, raw = validate_and_retry(call, DiagramExtraction, page_index, min_confidence=0.6, max_attempts=RETRY_ATTEMPTS)
        if data.get("category") not in ALLOWED_COMPLEX_CATEGORIES:
            data["category"] = "Other"
        data["pdf_page"] = page_index + 1
        if "printed_page" not in data or data.get("printed_page") in ("", "[ILLEGIBLE]"):
            data["printed_page"] = None
        out["data"] = data
        out["raw"] = raw
        return out
    except Exception as e:
        out["error"] = str(e)
        return out
    finally:
        # Release the page image promptly; PIL images can be large.
        try:
            img.close()
        except Exception:
            pass
        gc.collect()
534
+
535
+
536
# Producer / Consumer (streaming) shared state.
# router_producer (background thread) enqueues page indices here and
# consumer_processor (main thread) drains them into extraction workers.
simple_queue = queue.Queue()    # indices of pages routed as plain-text pages
complex_queue = queue.Queue()   # indices of pages routed as diagram/complex pages
router_finished = Event()       # set once every page has been routed
540
+
541
+
542
def router_producer(pdf_path: str, total_pages: int):
    """Route every page concurrently, feeding simple_queue / complex_queue.

    Always sets `router_finished` — even if routing raises — so the consumer
    loop can terminate; previously an exception from a future left the event
    unset and the consumer spun forever. A page whose router future fails is
    sent to the complex queue (fail-safe toward the thorough path).
    """
    print(" [Router] Scanning pages and routing...")
    try:
        with ThreadPoolExecutor(max_workers=ROUTER_WORKERS) as ex:
            futures = {ex.submit(router_worker, pdf_path, i): i for i in range(total_pages)}
            for fut in as_completed(futures):
                try:
                    res = fut.result()
                    idx = res["page_index"]
                    route = res.get("route", "complex")
                except Exception:
                    # Unroutable page: treat as complex rather than dropping it.
                    idx, route = futures[fut], "complex"
                if route == "complex":
                    complex_queue.put(idx)
                else:
                    simple_queue.put(idx)
        print(" [Router] Done.")
    finally:
        router_finished.set()
556
+
557
+
558
def consumer_processor(pdf_path: str, results: list):
    """Drain routed page indices and run the matching extraction worker.

    Polls both queues until the router signals completion and both queues are
    empty, then waits for every submitted extraction future and appends each
    result dict to `results` (failed futures become type="FAILED" entries).
    """
    print(" [Consumer] Starting workers...")

    def _drain(q, worker, ex, futures):
        # Submit every currently-queued page index; EAFP (queue.Empty) instead
        # of empty()/get_nowait(), which is racy in principle.
        while True:
            try:
                idx = q.get_nowait()
            except queue.Empty:
                return
            futures.append(ex.submit(worker, pdf_path, idx))

    with ThreadPoolExecutor(max_workers=SIMPLE_WORKERS + COMPLEX_WORKERS) as ex:
        futures = []
        while True:
            # Safe termination: the router sets the event only after its last put.
            if router_finished.is_set() and simple_queue.empty() and complex_queue.empty():
                break
            _drain(simple_queue, simple_worker, ex, futures)
            _drain(complex_queue, complex_worker, ex, futures)
            time.sleep(0.03)
        for fut in tqdm(as_completed(futures), total=len(futures), unit="page"):
            try:
                r = fut.result()
            except Exception as e:
                r = {"page_index": None, "type": "FAILED", "data": None, "error": str(e)}
            results.append(r)
    print(" [Consumer] All tasks finished.")
579
+
580
+
581
def save_results(results: List[dict], out_md: str = "final_report.md"):
    """Write a human-readable Markdown report for all page-extraction results.

    Args:
        results: per-page worker dicts ({"page_index", "type", "data", "error", ...}).
        out_md: destination path for the Markdown report.

    Returns:
        The absolute path of the written report.
    """
    # Drop entries with no page index (failed futures) and order by page.
    results_sorted = sorted([r for r in results if r.get("page_index") is not None], key=lambda x: x["page_index"])
    with open(out_md, "w", encoding="utf-8") as f:
        f.write("# Extraction Report\n\n")
        f.write(f"**Generated:** {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}\n\n")
        f.write("---\n\n")
        f.write("## Table of Contents\n\n")
        for r in results_sorted:
            p = r["page_index"] + 1
            typ = r.get("type", "UNKNOWN")
            title = ""
            if r.get("data") and isinstance(r["data"], dict):
                # Prefer the complex-page title, fall back to the simple-page topic.
                title = r["data"].get("title") or r["data"].get("topic") or ""
            f.write(f"- [{typ} — Page {p}]{' — ' + title if title else ''}\n")
        f.write("\n---\n\n")
        for r in results_sorted:
            p = r["page_index"] + 1
            typ = r.get("type", "UNKNOWN")
            f.write(f"## Page {p} — {typ}\n\n")
            f.write(f"- **PDF page index:** {r['page_index']+1}\n")
            if r.get("data"):
                data = r["data"]
                printed = data.get("printed_page")
                confidence = data.get("extraction_confidence", data.get("confidence", None))
                f.write(f"- **Printed page:** {printed if printed else 'N/A'}\n")
                f.write(f"- **Extraction confidence:** {confidence if confidence is not None else 'N/A'}\n\n")
                if typ == "COMPLEX" or data.get("schema_id") == "diagram_v1":
                    # Diagram/complex page layout.
                    f.write(f"### Title\n\n{data.get('title','(no title)')}\n\n")
                    f.write(f"### Category\n\n{data.get('category','Other')}\n\n")
                    f.write(f"### Summary\n\n{data.get('summary','(no summary)')}\n\n")
                    if data.get("key_components"):
                        f.write("### Key Components\n\n")
                        for comp in data.get("key_components", []):
                            name = comp.get("name", "(no name)")
                            desc = comp.get("description", "")
                            conf = comp.get("extraction_confidence", None)
                            f.write(f"- **{name}** — {desc}" + (f" (confidence: {conf})" if conf is not None else "") + "\n")
                        f.write("\n")
                    f.write("### Relationships / Notes\n\n")
                    f.write(f"{data.get('relationships','[NONE]')}\n\n")
                    if data.get("raw_text"):
                        f.write("### Raw Text (verbatim)\n\n")
                        # Quote every line of the raw text as a Markdown blockquote.
                        f.write("> " + "\n> ".join(str(data.get("raw_text","")).splitlines()) + "\n\n")
                    else:
                        f.write("### Raw Text (verbatim)\n\nN/A\n\n")
                elif typ == "SIMPLE" or data.get("schema_id") == "simple_v1":
                    # Plain-text page layout.
                    f.write(f"### Topic\n\n{data.get('topic','(no topic)')}\n\n")
                    f.write(f"### Summary\n\n{data.get('summary','(no summary)')}\n\n")
                    f.write("### Content\n\n")
                    content_md = data.get("content_markdown", "")
                    if content_md:
                        f.write(content_md + "\n\n")
                    else:
                        f.write("(no content)\n\n")
                    if data.get("important_dates_or_entities"):
                        f.write("### Important Dates / Entities\n\n")
                        for ent in data.get("important_dates_or_entities", []):
                            f.write(f"- {ent}\n")
                        f.write("\n")
                    else:
                        f.write("### Important Dates / Entities\n\nN/A\n\n")
                else:
                    # Unknown schema: dump scalar fields, then any markdown content.
                    f.write("### Extracted Fields\n\n")
                    for k, v in data.items():
                        if k in ("content_markdown", "raw_text"):
                            continue
                        f.write(f"- **{k}**: {v}\n")
                    f.write("\n")
                    if data.get("content_markdown"):
                        f.write("### Content\n\n")
                        f.write(data.get("content_markdown") + "\n\n")
            else:
                f.write("### Extraction failed or returned no data\n\n")
                f.write(f"**Error:** {r.get('error')}\n\n")
            f.write("\n---\n\n")
    print(f"Saved Markdown: {os.path.abspath(out_md)}")
    print("Note: raw model outputs are not saved to disk by design.")
    return os.path.abspath(out_md)
659
+
660
+
661
def run_pipeline(
    pdf_path: str,
    max_pages: Optional[int] = None,
    out_md: Optional[str] = None,
    progress_hook: Optional[Callable[[dict], None]] = None,
    doc_id: Optional[str] = None,
    original_filename: Optional[str] = None,
):
    """End-to-end pipeline: route + extract pages, write a Markdown report,
    then chunk and ingest it into Qdrant.

    Args:
        pdf_path: path to the source PDF (deleted after successful ingestion).
        max_pages: optional cap on the number of pages processed.
        out_md: report path; a temp file in app_utils.DATA_DIR when None.
        progress_hook: optional callback receiving {"event": ..., ...} dicts.
        doc_id: optional document UUID persisted in the metadata store.
        original_filename: original upload name persisted in the metadata store.

    Returns:
        {"report_path", "pages_processed", "results", "ingest"}.

    Raises:
        FileNotFoundError: if `pdf_path` does not exist.
        Exception: re-raises any chunking/ingestion failure after emitting an
            "error" progress event.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"{pdf_path} not found")
    info = pdfinfo_from_path(pdf_path)
    total_pages = info.get("Pages", 0)
    pages_to_process = total_pages if max_pages is None else min(max_pages, total_pages)
    print(f"Processing {pages_to_process}/{total_pages} pages from {os.path.basename(pdf_path)}")
    results = []

    if progress_hook:
        progress_hook({"event": "started", "pages_total": pages_to_process, "pdf": os.path.basename(pdf_path)})

    # Producer routes pages in a background thread; consumer extracts in this one.
    t = threading.Thread(target=router_producer, args=(pdf_path, pages_to_process))
    t.start()
    try:
        consumer_processor(pdf_path, results)
    finally:
        # Always reap the router thread, even if the consumer raised.
        t.join()

    # Save the Markdown report to a temp file when no path was provided.
    if out_md is None:
        fd, tmp_md = tempfile.mkstemp(prefix="report_", suffix=".md", dir=app_utils.DATA_DIR)
        os.close(fd)
        out_md = tmp_md

    report_path = save_results(results, out_md=out_md)
    pages_processed = len([r for r in results if r.get("page_index") is not None])

    if progress_hook:
        progress_hook({"event": "report_saved", "report_path": report_path, "pages_processed": pages_processed})

    # Chunk the report and ingest into Qdrant.
    try:
        if progress_hook:
            progress_hook({"event": "chunking_started", "report_path": report_path})
        chunks = qdrant_service.chunk_markdown_by_page(report_path)
        if progress_hook:
            progress_hook({"event": "chunking_finished", "chunks": len(chunks)})

        if progress_hook:
            progress_hook({"event": "ingest_started", "collection": os.environ.get("QDRANT_COLLECTION", "manual_pages")})

        # Batch size from env (default 256); fall back silently on bad values.
        try:
            batch_size = int(os.environ.get("QDRANT_BATCH_SIZE", 256))
        except Exception:
            batch_size = 256
        ingest_res = qdrant_service.ingest_chunks_into_qdrant(
            chunks,
            collection_name=os.environ.get("QDRANT_COLLECTION", "manual_pages"),
            batch_size=batch_size,
            progress_hook=progress_hook,
        )

        if progress_hook:
            progress_hook({"event": "ingest_finished", "result": ingest_res})

        # On successful ingestion, persist metadata and remove temp artifacts.
        if isinstance(ingest_res, dict) and ingest_res.get("ingested"):
            try:
                if doc_id or original_filename:
                    entry = {"uuid": doc_id or "", "original_filename": original_filename or os.path.basename(pdf_path), "report": report_path, "created_at": time.time()}
                    app_utils.append_metadata_entry(entry)
            except Exception as e:
                print(f"Warning: failed to append metadata: {e}")
            # Best-effort cleanup of the uploaded PDF and the report file.
            try:
                if os.path.exists(pdf_path):
                    os.remove(pdf_path)
                if os.path.exists(report_path):
                    os.remove(report_path)
            except Exception as e:
                print(f"Warning: failed to remove temp files: {e}")

    except Exception as e:
        if progress_hook:
            progress_hook({"event": "error", "error": str(e)})
        raise

    if progress_hook:
        progress_hook({"event": "completed", "pages_processed": pages_processed, "ingest_result": ingest_res})

    return {"report_path": report_path, "pages_processed": pages_processed, "results": results, "ingest": ingest_res}
app/services/qdrant_service.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import time
3
+ import random
4
+ from typing import List, Dict, Optional, Callable
5
+ import os
6
+ from .model_client import init_qdrant_client, init_embeddings
7
+
8
# Chunking regex: matches the "## Page N — TYPE" headings emitted by
# save_results; (?m) makes ^/$ match per line so split() keeps the headers.
PAGE_SPLIT_RE = re.compile(r'(?m)^(##\s+Page\s+\d+.*)$')
10
+
11
+
12
def chunk_markdown_by_page(md_path: str) -> List[Dict]:
    """Split a report markdown file into one chunk per '## Page N — TYPE' section.

    The preamble before the first page heading (title, TOC) is emitted as a
    page-0 chunk. Each chunk dict has keys: id, page, page_type, text,
    char_length. Headers without a parseable page number previously all
    collapsed to the colliding id 'page_None'; they now get unique ids.
    """
    with open(md_path, 'r', encoding='utf-8') as f:
        md = f.read()
    parts = PAGE_SPLIT_RE.split(md)
    chunks: List[Dict] = []
    preamble = parts[0].strip()
    if preamble:
        chunks.append({
            'id': 'page_0', 'page': 0, 'page_type': None, 'text': preamble, 'char_length': len(preamble)
        })
    # After split() with a capturing group, parts alternate:
    # [preamble, header1, body1, header2, body2, ...]
    for i in range(1, len(parts), 2):
        header = parts[i].strip()
        body = parts[i + 1].strip() if i + 1 < len(parts) else ''
        m = re.search(r'Page\s+(\d+)', header)
        page_num = int(m.group(1)) if m else None
        upper = header.upper()
        if 'SIMPLE' in upper:
            page_type = 'SIMPLE'
        elif 'COMPLEX' in upper:
            page_type = 'COMPLEX'
        else:
            page_type = None
        full_text = f"{header}\n\n{body}".strip()
        chunks.append({
            # Unique id even when the page number is missing from the header.
            'id': f'page_{page_num}' if page_num is not None else f'page_unknown_{i // 2}',
            'page': page_num,
            'page_type': page_type,
            'text': full_text,
            'char_length': len(full_text),
        })
    return chunks
37
+
38
+
39
def ingest_chunks_into_qdrant(
    chunks: List[Dict],
    collection_name: str = 'manual_pages',
    batch_size: int = 256,
    progress_hook: Optional[Callable[[dict], None]] = None,
    retry_attempts: int = 3,
) -> Dict:
    """Ingest page chunks into Qdrant using Voyage embeddings via langchain.

    Upserts in batches of `batch_size` with retries on transient failures and
    calls `progress_hook` after each batch. Page-0 / unnumbered preamble chunks
    are skipped. Returns {'ingested': n, 'collection': name} on success or
    {'error': msg} on failure (no dry-run support).
    """
    qc = init_qdrant_client()
    emb = init_embeddings()
    if qc is None or emb is None:
        return {'error': 'qdrant-or-embeddings-missing'}

    try:
        # Lazy import heavy libraries so module import stays cheap.
        from langchain_qdrant import QdrantVectorStore
        from langchain_core.documents import Document

        # Probe the embedding dimension with a tiny query (needed to create the collection).
        try:
            vector_size = len(emb.embed_query('sample size'))
        except Exception:
            vector_size = None

        # Create the collection if it does not exist yet.
        try:
            existing = [c.name for c in qc.get_collections().collections]
        except Exception:
            existing = []
        if collection_name not in existing and vector_size is not None:
            from qdrant_client.models import VectorParams, Distance
            qc.create_collection(collection_name=collection_name, vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE))

        # Build documents, skipping the page-0 preamble chunk.
        docs_all = [
            Document(
                page_content=c['text'],
                metadata={'chunk_id': c['id'], 'page': c['page'], 'page_type': c.get('page_type'), 'char_length': c.get('char_length')},
            )
            for c in chunks
            if c.get('page') not in (None, 0)
        ]

        total_docs = len(docs_all)
        if total_docs == 0:
            return {'ingested': 0, 'collection': collection_name}

        store = QdrantVectorStore(client=qc, collection_name=collection_name, embedding=emb)

        ingested = 0
        for i in range(0, total_docs, batch_size):
            batch_docs = docs_all[i:i + batch_size]
            # NOTE: add_documents embeds internally via `emb`; the original code
            # also pre-computed embeddings here and threw them away, which
            # doubled the embedding API cost for every batch. Do not pre-embed.
            last_err = None
            for attempt in range(1, retry_attempts + 1):
                try:
                    store.add_documents(batch_docs)
                    break
                except Exception as e:
                    last_err = e
                    if attempt < retry_attempts:
                        time.sleep(0.4 * attempt + random.uniform(0, 0.2))
                        continue
                    raise RuntimeError(f"Failed to ingest batch starting at {i}: {last_err}") from e

            ingested += len(batch_docs)
            if progress_hook:
                progress_hook({
                    'event': 'ingest_batch',
                    'batch_index': i // batch_size,
                    'batch_size': len(batch_docs),
                    'total_docs': total_docs,
                    'ingested_so_far': ingested,
                })

        return {'ingested': ingested, 'collection': collection_name}
    except Exception as e:
        # Callers treat any dict with an 'error' key as a failed ingestion.
        return {'error': str(e)}
app/utils.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ from typing import Tuple
4
+ import json
5
+ import threading
6
+
7
# Lightweight JSON "database" for upload metadata, stored next to the package.
DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data"))
os.makedirs(DATA_DIR, exist_ok=True)  # ensure the data dir exists at import time
METADATA_PATH = os.path.join(DATA_DIR, "metadata.json")
_metadata_lock = threading.Lock()  # serializes read-modify-write of metadata.json
12
+
13
+
14
def save_upload_file_tmp(upload_file) -> Tuple[str, str]:
    """Save a FastAPI UploadFile to a temporary file; return (tmp_path, filename).

    Streams the upload in 1 MiB chunks instead of reading it fully into memory,
    so large PDFs do not inflate process RSS. The caller is responsible for
    deleting the temp file when done.
    """
    suffix = os.path.splitext(upload_file.filename)[1]
    fd, tmp_path = tempfile.mkstemp(suffix=suffix)
    with os.fdopen(fd, "wb") as out:
        while True:
            chunk = upload_file.file.read(1024 * 1024)
            if not chunk:
                break
            out.write(chunk)
    return tmp_path, upload_file.filename
22
+
23
+
24
def append_metadata_entry(entry: dict):
    """Append an entry to the metadata JSON file (a list of entries). Thread-safe.

    A missing, unreadable, or non-list file is treated as empty rather than
    crashing, so a single corrupt write cannot break subsequent appends.
    Note: the lock serializes threads in this process only, not other processes.
    """
    with _metadata_lock:
        data = []
        if os.path.exists(METADATA_PATH):
            try:
                with open(METADATA_PATH, "r", encoding="utf-8") as f:
                    data = json.load(f)
            except Exception:
                data = []
        # Guard against a corrupted file containing a non-list JSON value,
        # which would make data.append() raise AttributeError.
        if not isinstance(data, list):
            data = []
        data.append(entry)
        with open(METADATA_PATH, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
37
+
38
+
39
def read_metadata() -> list:
    """Return all metadata entries; [] when the file is missing, unreadable,
    or does not contain a JSON list."""
    with _metadata_lock:
        if not os.path.exists(METADATA_PATH):
            return []
        try:
            with open(METADATA_PATH, "r", encoding="utf-8") as f:
                data = json.load(f)
        except Exception:
            return []
        # Callers iterate the result, so never return a non-list JSON value.
        return data if isinstance(data, list) else []
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ pydantic
4
+ pdf2image
5
+ qdrant-client
6
+ langchain-voyageai
7
+ google-genai
8
+ langchain
9
+ python-multipart
10
+ python-dotenv
11
+ langchain_qdrant
scripts/check_genai_key.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Quick script to validate Google GenAI API key and model access.

Run from repository root (inside venv) like:

    python3 scripts/check_genai_key.py

It will attempt to initialize the genai client using the same code in
`app.services.model_client` and make a lightweight request to validate the
key. The script prints a clear message for success / failure and includes
any provider error details.

Exit codes: 0 = key/model access verified, 1 = no API key configured,
2 = import or client-initialization failure, 3 = API calls failed.
"""
import importlib
import traceback
import sys

print("Checking GenAI API key and model access...")

# Import via importlib so a broken package import is reported cleanly.
try:
    mc = importlib.import_module("app.services.model_client")
except Exception as e:
    print("Failed to import app.services.model_client:", e)
    traceback.print_exc()
    sys.exit(2)

# Try to initialize client (reads env var GOOGLE_API_KEY)
client = None
try:
    c = mc.init_genai_client()
    # the init_genai_client returns the client and also assigns mc.genai_client
    client = c or getattr(mc, "genai_client", None)
except Exception as e:
    print("init_genai_client raised exception:", e)
    traceback.print_exc()
    sys.exit(2)

if not client:
    print("No GenAI client configured. Please set GOOGLE_API_KEY in environment.")
    sys.exit(1)

print("GenAI client created. Attempting a lightweight API call to verify key and model access...")

# Try to call a safe method. We attempt to use models.list() or models.get()
# if available, otherwise fall back to a small generate_content call.
try:
    models_api = getattr(client, "models", None)
    if models_api is None:
        print("Client has no .models attribute; cannot proceed.")
        sys.exit(3)

    # Prefer listing models if available (cheapest check).
    if hasattr(models_api, "list"):
        try:
            res = models_api.list()
            print("Models list call succeeded. Sample output:")
            print(res)
            sys.exit(0)
        except Exception as e:
            print("models.list() failed (continuing to try other checks):", e)

    if hasattr(models_api, "get"):
        try:
            # Try to fetch a commonly available model
            model_name = "gemini-2.0-flash"
            res = models_api.get(model=model_name)
            print(f"models.get('{model_name}') succeeded:")
            print(res)
            sys.exit(0)
        except Exception as e:
            print("models.get() failed (continuing to try generate_content):", e)

    # Fallback: small generate_content call (may consume quota)
    # Use a very small prompt
    try:
        prompt = "Ping"
        print("Calling models.generate_content with a tiny prompt (may hit quota)...")
        resp = models_api.generate_content(model="gemini-2.0-flash", contents=[{"type": "text", "text": prompt}])
        text = getattr(resp, "text", None) or str(resp)
        print("generate_content succeeded, response preview:")
        print(text[:1000])
        sys.exit(0)
    except Exception as e:
        print("generate_content failed:", e)
        traceback.print_exc()
        # Inspect exception for structured error info
        try:
            err_str = str(e)
            print("Exception text:\n", err_str)
        except Exception:
            pass
        sys.exit(3)

# NOTE: sys.exit raises SystemExit (a BaseException), so the successful exits
# above are NOT swallowed by this handler.
except Exception as e:
    print("Unexpected error while validating key:", e)
    traceback.print_exc()
    sys.exit(2)
+ sys.exit(2)