File size: 2,240 Bytes
dc124db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from __future__ import annotations
from dataclasses import dataclass, field
from typing import List, Dict
from services.model_router import ModelRouter
from services.json_parser import extract_json
from config.prompts import DOCUMENT_EXTRACT_SYSTEM, DOCUMENT_VISION_PROMPT

@dataclass
class DocumentConcepts:
    """Structured concepts extracted from a single document.

    The four list fields are required; ``ocr_text`` is optional and defaults
    to an empty string.
    """

    # Topic strings found in the document.
    topics: List[str]
    # One string-to-string mapping per definition.
    definitions: List[Dict[str, str]]
    # Fact statements, one string each.
    facts: List[str]
    # Formulae, one string each.
    formulae: List[str]
    # Raw OCR output; only populated when the source was an image.
    ocr_text: str = ""

class DocumentAgent:
    """
    Turns raw document input into a DocumentConcepts record.

    Every model call goes through ``ModelRouter.understand``; per the original
    design notes this is the MiniCPM-V path and Nemotron is never called here.
    - Text input: one text-mode call → concept extraction
    - Image input: one vision call → OCR + concept extraction in one call
    """

    def __init__(self, router: ModelRouter):
        """Store the router that every understand() call is sent through."""
        self._router = router

    def extract(self, raw_text: str) -> DocumentConcepts:
        """
        Text path: extract structured concepts from plain text.

        Args:
            raw_text: Document text appended to the extraction system prompt.

        Returns:
            DocumentConcepts with ``ocr_text`` left at its default ("").

        Raises:
            ValueError: If the model reply cannot be parsed into a JSON object.
        """
        prompt = f"{DOCUMENT_EXTRACT_SYSTEM}\n\nExtract concepts from:\n\n{raw_text}"
        raw = self._router.understand(prompt=prompt)
        data = self._parse_reply(raw, "DocumentAgent: could not parse JSON.")
        return self._to_concepts(data)

    def extract_from_image(self, image_b64: str) -> DocumentConcepts:
        """
        Image path: OCR + concept extraction in a single vision call.

        Args:
            image_b64: Image payload forwarded to the router unchanged
                (presumably base64-encoded, going by the name).

        Returns:
            DocumentConcepts including the ``ocr_text`` entry of the reply,
            kept for downstream use.

        Raises:
            ValueError: If the model reply cannot be parsed into a JSON object.
        """
        raw = self._router.understand(prompt=DOCUMENT_VISION_PROMPT, image_b64=image_b64)
        data = self._parse_reply(raw, "DocumentAgent: could not parse JSON from image.")
        return self._to_concepts(data, include_ocr=True)

    @staticmethod
    def _parse_reply(raw, failure_prefix: str) -> dict:
        """Parse a model reply with extract_json and require a JSON object."""
        try:
            data = extract_json(raw)
        except ValueError as exc:
            raise ValueError(f"{failure_prefix} {exc}") from exc
        # A list/str/number payload has no .get(); report it as a parse
        # failure (ValueError) instead of leaking an AttributeError.
        if not isinstance(data, dict):
            raise ValueError(
                f"{failure_prefix} Expected a JSON object, got {type(data).__name__}."
            )
        return data

    @staticmethod
    def _to_concepts(data: dict, include_ocr: bool = False) -> DocumentConcepts:
        """Build DocumentConcepts from a parsed reply, defaulting absent fields."""
        # `or []` also covers keys that are present but null, which a plain
        # .get(key, []) would pass through as None.
        concepts = DocumentConcepts(
            topics=data.get("topics") or [],
            definitions=data.get("definitions") or [],
            facts=data.get("facts") or [],
            formulae=data.get("formulae") or [],
        )
        if include_ocr:
            concepts.ocr_text = data.get("ocr_text") or ""
        return concepts