Spaces:

LeomordKaly
/

secureagentrag-api

Running

App Files Files Community

LeomordKaly committed 6 days ago

Commit

09fee34

verified ·

1 Parent(s): b6574e8

deploy: phase 3 BYOK backend (Dockerfile.hf, FastAPI on 7860)

Browse files

Files changed (2) hide show

core/extraction.py +190 -0
interfaces/api.py +153 -2

core/extraction.py ADDED Viewed

	@@ -0,0 +1,190 @@

+"""Structured-data extraction: document text -> JSON against a field schema.
+This is the *extraction mode* (Tier X) — the second face of the platform next to
+RAG Q&A. Instead of "ask a question, get a cited answer," the caller supplies a
+small **field schema** (name + type + description per field) and gets a single
+validated JSON object back. No retrieval, no vector DB — just parse → one
+``json_mode`` LLM call → validate.
+It reuses the same inference router as the RAG pipeline, so the visitor's BYOK
+key powers the call and the same sensitivity-routing applies (HIGH-sensitivity
+docs stay local on a self-hosted deploy). Kept framework-free so it is unit
+testable without FastAPI.
+"""
+from __future__ import annotations
+import json
+import re
+from dataclasses import dataclass
+from utils.logging import get_logger
+# Module-level logger obtained from the project's logging helper.
+logger = get_logger(__name__)
+# Bound the document text fed to the model so a long PDF cannot blow the
+# token budget / rate limit. Extraction targets a handful of fields, so the
+# salient content is almost always near the top of the document.
+# Applied as a plain character slice (``text[:MAX_EXTRACTION_CHARS]``) when the
+# prompt is built, so anything past this offset is never shown to the model.
+MAX_EXTRACTION_CHARS = 12_000
+# Field types we coerce to. Anything else is treated as a string.
+# ``ExtractionField.safe_type`` performs that fallback after lower-casing and
+# stripping the declared type.
+_ALLOWED_TYPES = frozenset({"string", "number", "integer", "boolean", "date"})
+@dataclass(frozen=True)
+class ExtractionField:
+    """One field to pull out of a document.
+    Attributes:
+        name: JSON key to emit (e.g. ``"total_amount"``).
+        type: One of ``string`` / ``number`` / ``integer`` / ``boolean`` /
+            ``date``. Unknown types fall back to ``string``.
+        description: Plain-language hint that tells the model what to look for
+            (e.g. "the grand total including VAT, as a number").
+    """
+    name: str
+    type: str = "string"
+    description: str = ""
+    def safe_type(self) -> str:
+        t = (self.type or "string").lower().strip()
+        return t if t in _ALLOWED_TYPES else "string"
+def normalise_fields(raw_fields: list[dict]) -> list[ExtractionField]:
+    """Coerce a list of raw field dicts into validated ``ExtractionField`` objects.
+    Drops entries without a usable ``name``; caps the count so a caller cannot
+    request hundreds of fields in one prompt. Raises ``ValueError`` when nothing
+    usable remains.
+    """
+    out: list[ExtractionField] = []
+    for f in raw_fields or []:
+        if not isinstance(f, dict):
+            continue
+        name = str(f.get("name", "")).strip()
+        if not name:
+            continue
+        out.append(
+            ExtractionField(
+                name=name,
+                type=str(f.get("type", "string")),
+                description=str(f.get("description", "")).strip(),
+            )
+        )
+        if len(out) >= 25:  # hard cap — keep the prompt + output bounded
+            break
+    if not out:
+        raise ValueError("no usable fields in the extraction schema")
+    return out
+def build_extraction_prompt(text: str, fields: list[ExtractionField]) -> str:
+    """Build a strict JSON-only extraction prompt."""
+    field_lines = "\n".join(
+        f'- "{f.name}" ({f.safe_type()}): {f.description or "extract this field"}' for f in fields
+    )
+    keys = ", ".join(f'"{f.name}"' for f in fields)
+    return (
+        "You are a precise document data-extraction engine. Extract the fields "
+        "below from the DOCUMENT and return a SINGLE valid JSON object — nothing "
+        "else, no markdown fences, no commentary.\n\n"
+        "RULES:\n"
+        "1. Output exactly these keys and no others: " + keys + ".\n"
+        "2. Use the field type as a hint. Numbers as JSON numbers, booleans as "
+        "true/false, dates as ISO-8601 strings (YYYY-MM-DD) when possible.\n"
+        "3. If a field is not present in the document, set its value to null. "
+        "Do NOT invent values.\n"
+        "4. Answer in the document's own language for free-text values "
+        "(Arabic documents -> Arabic values).\n\n"
+        f"FIELDS:\n{field_lines}\n\n"
+        f"DOCUMENT:\n{text[:MAX_EXTRACTION_CHARS]}\n\n"
+        "Return ONLY the JSON object:"
+    )
+def parse_extraction_response(raw: str, fields: list[ExtractionField]) -> dict:
+    """Parse the model's JSON, keep only the requested keys, coerce types.
+    Robust to a model that wraps the JSON in ``` fences or adds a ``<think>``
+    preamble. Always returns a dict with **every** requested key present
+    (missing -> ``None``), so the caller gets a stable shape.
+    """
+    cleaned = re.sub(r"<think>.*?</think>", "", raw or "", flags=re.DOTALL | re.IGNORECASE)
+    cleaned = cleaned.strip()
+    # Strip a leading ```json / ``` fence if present.
+    if cleaned.startswith("```"):
+        cleaned = cleaned.split("\n", 1)[1] if "\n" in cleaned else ""
+        if cleaned.rstrip().endswith("```"):
+            cleaned = cleaned.rsplit("```", 1)[0]
+    cleaned = cleaned.strip()
+    # Fall back to the first {...} block if there is still surrounding prose.
+    if not cleaned.startswith("{"):
+        m = re.search(r"\{.*\}", cleaned, flags=re.DOTALL)
+        cleaned = m.group(0) if m else "{}"
+    try:
+        data = json.loads(cleaned)
+    except json.JSONDecodeError:
+        logger.warning("extraction_json_parse_failed", preview=cleaned[:120])
+        data = {}
+    if not isinstance(data, dict):
+        data = {}
+    result: dict = {}
+    for f in fields:
+        result[f.name] = _coerce(data.get(f.name), f.safe_type())
+    return result
+def _coerce(value: object, typ: str):
+    """Best-effort coerce a raw JSON value to the requested field type."""
+    if value is None:
+        return None
+    try:
+        if typ == "integer":
+            return int(float(str(value).replace(",", "").strip()))
+        if typ == "number":
+            return float(str(value).replace(",", "").strip())
+        if typ == "boolean":
+            if isinstance(value, bool):
+                return value
+            return str(value).strip().lower() in ("true", "yes", "1", "نعم")
+        # string / date — return as-is string
+        return value if isinstance(value, (str, int, float, bool)) else str(value)
+    except (ValueError, TypeError):
+        return value  # keep the raw value rather than dropping data
+async def extract_fields(
+    text: str,
+    fields: list[ExtractionField],
+    *,
+    prefer_cloud: bool = True,
+    sensitivity_level: str = "low",
+) -> dict:
+    """Run one ``json_mode`` extraction call and return the validated result.
+    Returns a dict: ``{"fields": {...}, "model": str, "provider": str,
+    "latency_ms": float, "raw": str}``. Never raises on a bad LLM response —
+    returns all-null fields so the caller always gets a stable shape.
+    """
+    from core.agents.router import call_llm_with_decision
+    prompt = build_extraction_prompt(text, fields)
+    raw, decision, response = await call_llm_with_decision(
+        prompt,
+        system_prompt="You output only valid JSON. No prose, no markdown fences.",
+        sensitivity_level=sensitivity_level,
+        prefer_cloud=prefer_cloud,
+        json_mode=True,
+    )
+    parsed = parse_extraction_response(raw or "", fields)
+    return {
+        "fields": parsed,
+        "model": decision.model if decision else "unknown",
+        "provider": decision.provider if decision else "unknown",
+        "latency_ms": response.latency_ms if response else 0.0,
+        "raw": raw or "",
+    }

interfaces/api.py CHANGED Viewed

@@ -417,8 +417,9 @@ if _FASTAPI_AVAILABLE:
         async def byok_corpus() -> dict:
             """Summarise the base demo corpus -- source files + metadata.
-            Scrolls the root tenant Qdrant collection (the 10 hand-curated
-            demo docs) and groups points by ``source_file``. Returns one
             row per file with the chunk count, sensitivity label, and
             roles -- never the chunk text. Visitor uploads under
             ``documents_sess_<sid>`` are NOT included (those live in the
@@ -1190,6 +1191,156 @@ if _FASTAPI_AVAILABLE:
                 "deleted_chunks": deleted,
             }
     @app.post("/query", response_model=QueryResponse, tags=["rag"])
     async def query_endpoint(
         body: QueryRequest,

         async def byok_corpus() -> dict:
             """Summarise the base demo corpus -- source files + metadata.
+            Scrolls the root tenant Qdrant collection (the hand-curated demo
+            docs — English RBAC + Arabic Egypt) and groups points by
+            ``source_file``. Returns one
             row per file with the chunk count, sensitivity label, and
             roles -- never the chunk text. Visitor uploads under
             ``documents_sess_<sid>`` are NOT included (those live in the
                 "deleted_chunks": deleted,
             }
+        # ── BYOK extraction mode (doc -> structured JSON) ────────────────
+        # The platform's second face next to RAG Q&A: upload a document + a
+        # field schema, get back one validated JSON object. No retrieval, no
+        # vector DB — parse -> one json_mode LLM call -> validate. Reuses the
+        # inference router (visitor's BYOK key powers it; sensitivity routing
+        # applies) and lands on the same audit chain. See ADR-041 / Tier X.
+        from fastapi import Form
+        @app.post("/byok/extract", tags=["byok"])
+        async def byok_extract(
+            request: _FastApiRequest,
+            file: Annotated[UploadFile, File(...)],
+            fields: Annotated[str, Form(...)],
+            creds: Annotated[ByokCreds, Depends(extract_byok)],
+        ) -> dict:
+            """Extract a caller-defined field schema from an uploaded document.
+            ``fields`` is a JSON string: ``[{"name","type","description"}, ...]``.
+            Returns ``{fields: {...}, model, provider, latency_ms}``. Same
+            throttle / BYOK-runtime contract as ``/byok/chat``.
+            """
+            from core.extraction import extract_fields, normalise_fields
+            # Parse + validate the schema first (cheap, fail fast).
+            try:
+                raw_fields = json.loads(fields)
+                if not isinstance(raw_fields, list):
+                    raise ValueError("fields must be a JSON array")
+                schema = normalise_fields(raw_fields)
+            except (json.JSONDecodeError, ValueError) as exc:
+                raise HTTPException(
+                    status.HTTP_400_BAD_REQUEST,
+                    detail={"reason": "bad_schema", "error": str(exc)},
+                ) from exc
+            # Throttle owner-key fallback exactly like chat.
+            if not creds.byok_active():
+                throttle = get_owner_key_throttle()
+                ok, meta = throttle.allow(client_ip_from_request(request))
+                if not ok:
+                    raise HTTPException(
+                        status.HTTP_429_TOO_MANY_REQUESTS,
+                        detail={
+                            "reason": meta["reason"],
+                            "retry_after_seconds": meta["retry_after"],
+                            "hint": (
+                                "Owner-key fallback exhausted for this IP. Paste "
+                                "your own LLM key to continue — never stored."
+                            ),
+                        },
+                    )
+            # Validate ext + size (mirror the upload caps).
+            filename = file.filename or "upload"
+            ext = ("." + filename.rsplit(".", 1)[-1].lower()) if "." in filename else ""
+            allowed = {e.lower() for e in settings.byok_upload_allowed_extensions}
+            if ext not in allowed:
+                raise HTTPException(
+                    status.HTTP_400_BAD_REQUEST,
+                    detail={"reason": "unsupported_extension", "allowed": sorted(allowed)},
+                )
+            max_bytes = int(settings.byok_upload_max_bytes)
+            buf = bytearray()
+            while True:
+                chunk = await file.read(64 * 1024)
+                if not chunk:
+                    break
+                buf.extend(chunk)
+                if len(buf) > max_bytes:
+                    raise HTTPException(
+                        status.HTTP_413_CONTENT_TOO_LARGE,
+                        detail={"reason": "file_too_large", "limit_bytes": max_bytes},
+                    )
+            if not buf:
+                raise HTTPException(status.HTTP_400_BAD_REQUEST, detail={"reason": "empty_file"})
+            # Spool + parse to text via the existing loaders.
+            import os as _os
+            import tempfile as _tempfile
+            from ingestion.loaders import load_document
+            safe_name = (
+                "".join(c if (c.isalnum() or c in "._-") else "_" for c in filename) or "upload"
+            )
+            tmp_dir = _tempfile.mkdtemp(prefix=f"byok_extract_{creds.session_id}_")
+            tmp_path = _os.path.join(tmp_dir, safe_name)
+            _t0 = __import__("time").perf_counter()
+            try:
+                with open(tmp_path, "wb") as fh:
+                    fh.write(bytes(buf))
+                try:
+                    docs = await asyncio.to_thread(load_document, tmp_path)
+                except Exception as exc:
+                    raise HTTPException(
+                        status.HTTP_422_UNPROCESSABLE_ENTITY,
+                        detail={"reason": "parse_failed", "error": str(exc)},
+                    ) from exc
+                text = "\n\n".join(d.text for d in docs if d.text).strip()
+                if not text:
+                    raise HTTPException(
+                        status.HTTP_422_UNPROCESSABLE_ENTITY,
+                        detail={
+                            "reason": "no_text",
+                            "hint": "No extractable text (scanned image PDFs need OCR).",
+                        },
+                    )
+                _byok_tok = set_byok_runtime(_byok_runtime_for(creds))
+                try:
+                    result = await extract_fields(
+                        text, schema, prefer_cloud=True, sensitivity_level="low"
+                    )
+                finally:
+                    reset_byok_runtime(_byok_tok)
+            finally:
+                try:
+                    _os.remove(tmp_path)
+                    _os.rmdir(tmp_dir)
+                except OSError:
+                    pass
+            elapsed_ms = (__import__("time").perf_counter() - _t0) * 1000
+            try:
+                audit_logger.log_query(
+                    user_id=f"demo-{creds.session_id}",
+                    org_id=_DEMO_ORG_ID,
+                    query=f"[extract] {safe_name} ({len(schema)} fields)",
+                    response_summary=f"extracted {len(result['fields'])} fields",
+                    sensitivity="low",
+                    status="success",
+                    latency_ms=elapsed_ms,
+                    action_hint="extract",
+                    byok_used=creds.has_user_key(),
+                    synth_provider=result["provider"],
+                    synth_model=result["model"],
+                )
+            except Exception as exc:  # pragma: no cover - defensive
+                logger.warning("byok_extract_audit_failed", error=str(exc))
+            return {
+                "session_id": creds.session_id,
+                "filename": safe_name,
+                "byok_used": creds.has_user_key(),
+                "fields": result["fields"],
+                "provider": result["provider"],
+                "model": result["model"],
+                "latency_ms": elapsed_ms,
+            }
     @app.post("/query", response_model=QueryResponse, tags=["rag"])
     async def query_endpoint(
         body: QueryRequest,