Spaces:
Running
Running
| from __future__ import annotations | |
| from dataclasses import dataclass, field | |
| from typing import List, Dict | |
| from services.model_router import ModelRouter | |
| from services.json_parser import extract_json | |
| from config.prompts import DOCUMENT_EXTRACT_SYSTEM, DOCUMENT_VISION_PROMPT | |
@dataclass
class DocumentConcepts:
    """Structured concepts extracted from a document.

    Declared as a dataclass so that it can be built with keyword arguments
    (``DocumentConcepts(topics=..., definitions=..., ...)``); without the
    decorator the annotated fields generate no ``__init__`` and such a call
    raises ``TypeError``.
    """

    topics: List[str]
    # NOTE(review): the key names inside each definition dict are not visible
    # in this module — confirm against the extraction prompt.
    definitions: List[Dict[str, str]]
    facts: List[str]
    formulae: List[str]
    ocr_text: str = ""  # raw OCR output (if from image)
class DocumentAgent:
    """
    Uses MiniCPM-V for all document understanding.

    - Text input: MiniCPM in text mode → concept extraction
    - Image input: MiniCPM in vision mode → OCR + concept extraction in one call

    Nemotron is never called here.
    """

    def __init__(self, router: ModelRouter):
        """Keep the router; both extraction paths call its ``understand`` method."""
        self._router = router

    def extract(self, raw_text: str) -> DocumentConcepts:
        """Text path: MiniCPM extracts structured concepts from text.

        Args:
            raw_text: Document text appended to the extraction system prompt.

        Returns:
            The extracted concepts; ``ocr_text`` is left at its default ``""``.

        Raises:
            ValueError: If the model reply cannot be parsed into a JSON object.
        """
        prompt = f"{DOCUMENT_EXTRACT_SYSTEM}\n\nExtract concepts from:\n\n{raw_text}"
        raw = self._router.understand(prompt=prompt)
        data = self._parse_reply(raw, "DocumentAgent: could not parse JSON.")
        return self._to_concepts(data, include_ocr=False)

    def extract_from_image(self, image_b64: str) -> DocumentConcepts:
        """
        Image path: MiniCPM does OCR + concept extraction in a single vision call.

        Returns concepts AND the raw OCR text (stored in ocr_text for downstream use).

        Args:
            image_b64: Image payload forwarded to the router as ``image_b64``.

        Raises:
            ValueError: If the model reply cannot be parsed into a JSON object.
        """
        raw = self._router.understand(prompt=DOCUMENT_VISION_PROMPT, image_b64=image_b64)
        data = self._parse_reply(raw, "DocumentAgent: could not parse JSON from image.")
        return self._to_concepts(data, include_ocr=True)

    @staticmethod
    def _parse_reply(raw, failure_prefix: str) -> dict:
        """Parse a model reply with ``extract_json`` and require a JSON object."""
        try:
            data = extract_json(raw)
        except ValueError as exc:
            raise ValueError(f"{failure_prefix} {exc}") from exc
        # A non-object payload (e.g. a top-level array) would otherwise fail
        # later with an opaque AttributeError on ``.get``; surface it as the
        # same ValueError callers already handle for parse failures.
        if not isinstance(data, dict):
            raise ValueError(
                f"{failure_prefix} expected a JSON object, got {type(data).__name__}"
            )
        return data

    @staticmethod
    def _to_concepts(data: dict, *, include_ocr: bool) -> DocumentConcepts:
        """Build ``DocumentConcepts`` from parsed JSON, defaulting absent or null fields."""
        # ``data.get(key) or default`` (rather than ``data.get(key, default)``)
        # also covers a key that is present with a null/empty value, so the
        # list fields are never None.
        ocr_text = (data.get("ocr_text") or "") if include_ocr else ""
        return DocumentConcepts(
            topics=data.get("topics") or [],
            definitions=data.get("definitions") or [],
            facts=data.get("facts") or [],
            formulae=data.get("formulae") or [],
            ocr_text=ocr_text,
        )