Study-with-ChampAI / agents /document_agent.py
SolusOps's picture
feat: agents package
dc124db verified
Raw
History Blame Contribute Delete
2.24 kB
from __future__ import annotations
from dataclasses import dataclass, field
from typing import List, Dict
from services.model_router import ModelRouter
from services.json_parser import extract_json
from config.prompts import DOCUMENT_EXTRACT_SYSTEM, DOCUMENT_VISION_PROMPT
@dataclass
class DocumentConcepts:
    """Structured concepts extracted from a document.

    Every field has a default, so an empty result can be created with
    ``DocumentConcepts()``. The list fields use ``default_factory`` so that
    each instance gets its own list rather than sharing one mutable default.
    """

    topics: List[str] = field(default_factory=list)
    definitions: List[Dict[str, str]] = field(default_factory=list)
    facts: List[str] = field(default_factory=list)
    formulae: List[str] = field(default_factory=list)
    ocr_text: str = ""  # raw OCR output (if from image)
class DocumentAgent:
    """
    Uses MiniCPM-V for all document understanding.
      - Text input:  MiniCPM in text mode → concept extraction
      - Image input: MiniCPM in vision mode → OCR + concept extraction in one call
    Nemotron is never called here.
    """

    def __init__(self, router: ModelRouter):
        """Keep the router through which every model call of this agent goes."""
        self._router = router

    def extract(self, raw_text: str) -> DocumentConcepts:
        """Text path: MiniCPM extracts structured concepts from text.

        Args:
            raw_text: Document text, embedded verbatim into the prompt.

        Returns:
            The extracted concepts; ``ocr_text`` is left at its default.

        Raises:
            ValueError: If the model reply cannot be parsed, or the parsed
                value is not a JSON object.
        """
        prompt = f"{DOCUMENT_EXTRACT_SYSTEM}\n\nExtract concepts from:\n\n{raw_text}"
        raw = self._router.understand(prompt=prompt)
        data = self._parse_payload(raw, "DocumentAgent: could not parse JSON.")
        return self._build_concepts(data)

    def extract_from_image(self, image_b64: str) -> DocumentConcepts:
        """
        Image path: MiniCPM does OCR + concept extraction in a single vision call.
        Returns concepts AND the raw OCR text (stored in ocr_text for downstream use).

        Args:
            image_b64: Image payload forwarded unchanged to the router.

        Raises:
            ValueError: If the model reply cannot be parsed, or the parsed
                value is not a JSON object.
        """
        raw = self._router.understand(prompt=DOCUMENT_VISION_PROMPT, image_b64=image_b64)
        data = self._parse_payload(raw, "DocumentAgent: could not parse JSON from image.")
        ocr_text = data.get("ocr_text")
        # A key that is present but null would otherwise put None into a str field.
        return self._build_concepts(data, ocr_text="" if ocr_text is None else ocr_text)

    @staticmethod
    def _parse_payload(raw, error_prefix: str) -> Dict[str, object]:
        """Parse a model reply with ``extract_json`` and require a JSON object."""
        try:
            data = extract_json(raw)
        except ValueError as exc:
            raise ValueError(f"{error_prefix} {exc}") from exc
        # The field lookups below need a mapping; a top-level array or scalar
        # would otherwise fail with AttributeError instead of ValueError.
        if not isinstance(data, dict):
            raise ValueError(
                f"{error_prefix} expected a JSON object, got {type(data).__name__}"
            )
        return data

    @staticmethod
    def _as_list(value: object) -> list:
        """Normalise a parsed JSON value to a list.

        ``None`` (missing key or JSON ``null``) becomes ``[]``, a list is
        returned as-is, and any other value is wrapped in a one-element list.
        """
        if value is None:
            return []
        if isinstance(value, list):
            return value
        return [value]

    def _build_concepts(self, data: Dict[str, object], ocr_text: str = "") -> DocumentConcepts:
        """Assemble a ``DocumentConcepts`` from the parsed reply."""
        return DocumentConcepts(
            topics=self._as_list(data.get("topics")),
            definitions=self._as_list(data.get("definitions")),
            facts=self._as_list(data.get("facts")),
            formulae=self._as_list(data.get("formulae")),
            ocr_text=ocr_text,
        )