File size: 2,924 Bytes
dc79584
 
ecf7bf7
 
 
 
 
 
 
 
dc79584
 
 
ecf7bf7
 
 
 
 
 
 
 
 
678c3d1
ecf7bf7
 
 
 
 
dc79584
ecf7bf7
 
 
 
 
 
 
 
678c3d1
dc79584
ecf7bf7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
678c3d1
ecf7bf7
 
678c3d1
ecf7bf7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
678c3d1
ecf7bf7
dc79584
 
ecf7bf7
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os
import logging
from typing import List
from pathlib import Path

from pdf2image import convert_from_path
from PIL import Image

from chandra.model import InferenceManager
from chandra.model.schema import BatchInputItem

# Module-wide logger; every message in this file is emitted under the
# "ocr_engine" logger name.
logger = logging.getLogger("ocr_engine")

# Cached InferenceManager instance. It starts as None and is populated by the
# first call to _get_chandra_manager(), so the model is loaded only once.
_chandra_manager: InferenceManager | None = None

def _get_chandra_manager() -> InferenceManager:
    """
    Return the shared InferenceManager, building it on first use.

    The instance is stored in the module-level ``_chandra_manager`` variable;
    later calls return that same object without constructing a new one.
    """
    global _chandra_manager

    # Fast path: a manager was already created by an earlier call.
    if _chandra_manager is not None:
        return _chandra_manager

    logger.info("[Chandra OCR] Loading model (method='hf')...")
    _chandra_manager = InferenceManager(method="hf")
    return _chandra_manager


def extract_text(file_path: str) -> str:
    """
    Extract text from a PDF or image file using Chandra OCR.

    PDFs are rasterized with ``convert_from_path`` (one image per page).
    Files ending in .png, .jpg, .jpeg, .tiff, .bmp or .webp are opened with
    PIL and converted to RGB. All page images are sent to the shared Chandra
    manager as one batch using the ``ocr_layout`` prompt type.

    Args:
        file_path: Path of the PDF or image file to read.

    Returns:
        The text of all pages (no confidence scores). Each page is prefixed
        with a ``--- Page N ---`` header and pages are separated by a blank
        line. The headers are emitted even for pages whose text is empty.
        An empty string is returned when ``file_path`` is empty/None, the
        file does not exist, the file type is unsupported, loading fails,
        or OCR raises. Each of those failures is logged rather than raised.
    """
    # Path(None) raises TypeError; treat a missing path like any other
    # failure and report it through the usual "" return value.
    if not file_path:
        logger.error("[Chandra OCR] No file path provided")
        return ""

    path = Path(file_path)
    if not path.exists():
        logger.error(f"[Chandra OCR] File not found: {file_path}")
        return ""

    # Load PDF pages or image
    images: List[Image.Image] = []
    suffix = path.suffix.lower()

    try:
        # PDF
        if suffix == ".pdf":
            try:
                images = convert_from_path(str(path))   # You can set dpi=300 if needed
            except Exception as e:
                logger.error(f"[Chandra OCR] PDF conversion error: {e}", exc_info=True)
                return ""

        # Image formats
        elif suffix in {".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp"}:
            try:
                # convert() returns an independent, fully loaded image, so the
                # source image (and its file handle) can be closed right away.
                with Image.open(str(path)) as source_image:
                    images = [source_image.convert("RGB")]
            except Exception as e:
                logger.error(f"[Chandra OCR] Image open error: {e}", exc_info=True)
                return ""

        else:
            logger.error(f"[Chandra OCR] Unsupported file type: {path.suffix}")
            return ""

        if not images:
            logger.error(f"[Chandra OCR] No images/pages loaded from file: {file_path}")
            return ""

        # Prepare OCR batch
        manager = _get_chandra_manager()
        batch = [
            BatchInputItem(
                image=img,
                prompt_type="ocr_layout"
            )
            for img in images
        ]

        # Run OCR
        logger.info(f"[Chandra OCR] Running OCR on {len(batch)} pages...")
        results = manager.generate(batch)

        # Prefer the markdown output of each result, falling back to its raw
        # output, and finally to an empty string.
        page_texts = [
            (getattr(result, "markdown", None) or getattr(result, "raw", "") or "").strip()
            for result in results
        ]

        # Join pages into final text
        final_text = "\n\n".join(
            f"--- Page {page_number} ---\n{page_text}"
            for page_number, page_text in enumerate(page_texts, start=1)
        ).strip()

        # Check the page bodies, not the joined string: the page headers make
        # the joined string non-empty even when OCR produced no text at all.
        if not any(page_texts):
            logger.error(f"[Chandra OCR] OCR returned empty text for {file_path}")

        return final_text

    except Exception as e:
        logger.error(f"[Chandra OCR] Unexpected error: {e}", exc_info=True)
        return ""