from __future__ import annotations

from collections.abc import Mapping
from dataclasses import dataclass, field
from typing import List, Dict

from services.model_router import ModelRouter
from services.json_parser import extract_json
from config.prompts import DOCUMENT_EXTRACT_SYSTEM, DOCUMENT_VISION_PROMPT


@dataclass
class DocumentConcepts:
    """Structured concepts pulled out of a document by the model."""

    topics: List[str]
    definitions: List[Dict[str, str]]
    facts: List[str]
    formulae: List[str]
    ocr_text: str = ""  # raw OCR output (if from image)


def _concepts_from_response(raw, *, context: str, with_ocr: bool) -> DocumentConcepts:
    """Parse a raw model response into a ``DocumentConcepts``.

    Args:
        raw: Whatever ``ModelRouter.understand`` returned; it is handed
            unchanged to ``extract_json``.
        context: Suffix inserted into error messages to identify the calling
            path (``""`` for text, ``" from image"`` for vision).
        with_ocr: When true, copy the ``"ocr_text"`` key into the result;
            otherwise ``ocr_text`` keeps its dataclass default.

    Raises:
        ValueError: If ``extract_json`` raises ``ValueError``, or if the parsed
            payload is not a JSON object.
    """
    try:
        data = extract_json(raw)
    except ValueError as exc:
        raise ValueError(
            f"DocumentAgent: could not parse JSON{context}. {exc}"
        ) from exc

    # extract_json is defined elsewhere; guard against it handing back a list
    # or scalar so callers see ValueError rather than an AttributeError from
    # the ``.get`` calls below.
    if not isinstance(data, Mapping):
        raise ValueError(
            f"DocumentAgent: expected a JSON object{context}, "
            f"got {type(data).__name__}."
        )

    # ``or`` normalises missing keys, explicit JSON nulls and any other falsy
    # value to the empty default.
    return DocumentConcepts(
        topics=data.get("topics") or [],
        definitions=data.get("definitions") or [],
        facts=data.get("facts") or [],
        formulae=data.get("formulae") or [],
        ocr_text=(data.get("ocr_text") or "") if with_ocr else "",
    )


class DocumentAgent:
    """
    Uses MiniCPM-V for all document understanding.
    - Text input:  MiniCPM in text mode → concept extraction
    - Image input: MiniCPM in vision mode → OCR + concept extraction in one call
    Nemotron is never called here.
    """

    def __init__(self, router: ModelRouter):
        """Store the router whose ``understand`` method serves every request."""
        self._router = router

    def extract(self, raw_text: str) -> DocumentConcepts:
        """Text path: MiniCPM extracts structured concepts from text.

        Args:
            raw_text: Document text, appended to the extraction system prompt.

        Returns:
            The extracted concepts; ``ocr_text`` is left empty on this path.

        Raises:
            ValueError: If the model response cannot be parsed into a JSON
                object.
        """
        prompt = f"{DOCUMENT_EXTRACT_SYSTEM}\n\nExtract concepts from:\n\n{raw_text}"
        raw = self._router.understand(prompt=prompt)
        return _concepts_from_response(raw, context="", with_ocr=False)

    def extract_from_image(self, image_b64: str) -> DocumentConcepts:
        """
        Image path: MiniCPM does OCR + concept extraction in a single vision call.
        Returns concepts AND the raw OCR text (stored in ocr_text for downstream use).

        Args:
            image_b64: Image payload forwarded to the router as ``image_b64``.

        Raises:
            ValueError: If the model response cannot be parsed into a JSON
                object.
        """
        raw = self._router.understand(
            prompt=DOCUMENT_VISION_PROMPT, image_b64=image_b64
        )
        return _concepts_from_response(raw, context=" from image", with_ocr=True)