Vachudev committed on
Commit
678c3d1
·
verified ·
1 Parent(s): 88f995b

chandra ocr

Browse files
Files changed (1) hide show
  1. ocr_engine.py +113 -47
ocr_engine.py CHANGED
@@ -1,63 +1,129 @@
1
- import pytesseract
2
- from pytesseract import Output
3
- from pdf2image import convert_from_path
4
- from PIL import Image
5
  import os
 
6
  import logging
7
- import numpy as np
 
 
8
 
9
  logger = logging.getLogger("ocr_engine")
10
 
11
def extract_text_and_conf(file_path: str) -> tuple[str, float]:
    """
    Extract text AND a mean confidence score from a PDF or image via Tesseract.

    A path ending in ``.pdf`` is rasterized with ``convert_from_path``; a path
    ending in ``.png``/``.jpg``/``.jpeg``/``.tiff``/``.bmp`` is opened with
    ``Image.open``. Each resulting page is OCR'd twice: once with
    ``image_to_string`` for the text and once with ``image_to_data`` for the
    per-element confidences.

    Args:
        file_path: Path to the PDF or image file.

    Returns:
        (text_content, average_confidence_0_to_100). Every page is prefixed
        with a ``--- Page N ---`` separator. ``("", 0.0)`` is returned when the
        path does not exist, the extension is not supported, the file cannot be
        loaded, or OCR raises. This function never raises to the caller.
    """
    if not os.path.exists(file_path):
        return "", 0.0

    lowered = file_path.lower()
    page_texts = []
    confidences = []
    images = []

    try:
        # 1. Load the page images.
        if lowered.endswith('.pdf'):
            try:
                images = convert_from_path(file_path)
            except Exception as e:
                logger.error(f"PDF Convert Error: {e}")
                return "", 0.0
        elif lowered.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp')):
            try:
                images = [Image.open(file_path)]
            except Exception as e:
                logger.error(f"Image Open Error: {e}")
                return "", 0.0
        else:
            # Previously this fell through silently with an empty result.
            logger.warning(f"Unsupported file type for OCR: {file_path}")

        # 2. OCR every page.
        for i, image in enumerate(images):
            # A. Plain text output (fed to the downstream consumer as-is).
            page_text = pytesseract.image_to_string(image)
            page_texts.append(f"--- Page {i+1} ---\n{page_text}\n")

            # B. Per-element data; only the 'conf' column is used here.
            data = pytesseract.image_to_data(image, output_type=Output.DICT)

            for raw_conf in data['conf']:
                # NOTE(review): depending on the pytesseract/Tesseract version
                # 'conf' entries may be ints, floats, or strings such as "-1"
                # or "96.06" - confirm against the installed version. Coercing
                # to float keeps the "-1 means not a word" filter and the
                # average working in all of those cases.
                try:
                    conf = float(raw_conf)
                except (TypeError, ValueError):
                    continue
                if conf >= 0:
                    confidences.append(conf)

        # 3. Average the collected confidences (0.0 when nothing was recognized).
        avg_conf = sum(confidences) / len(confidences) if confidences else 0.0

        return "".join(page_texts).strip(), round(avg_conf, 2)

    except Exception as e:
        logger.error(f"OCR Critical Error: {e}")
        return "", 0.0

    finally:
        # Release the file handles / buffers held by the loaded images.
        for image in images:
            try:
                image.close()
            except Exception:
                # Best-effort cleanup; a failed close must not mask the result.
                pass
 
 
 
 
 
import os
import json
import logging
import math
import subprocess
import tempfile
from typing import Tuple, List, Optional

logger = logging.getLogger("ocr_engine")

# Name of the Chandra executable that must be resolvable through PATH.
_CHANDRA_BIN = "chandra"


def _to_confidence(value) -> Optional[float]:
    """Return *value* as a finite, non-negative float, or None if it is unusable."""
    if value is None or isinstance(value, bool):
        return None
    try:
        conf = float(value)
    except (TypeError, ValueError):
        return None
    if math.isnan(conf) or math.isinf(conf) or conf < 0:
        return None
    return conf


def _parse_chandra_jsonl(jsonl_path: str) -> Tuple[str, List[float]]:
    """Read the JSONL output file and return (joined_text, raw_confidences).

    Expected shape of each line (simplified; taken from the original author's
    notes - NOTE(review): confirm against the installed chandra-ocr version):

        {"page": 1,
         "blocks": [{"type": "text", "bbox": [...],
                     "lines": [{"text": "Some line", "conf": 0.97,
                                "tokens": [...]}]}]}

    Malformed lines and unexpected value types are skipped instead of aborting
    the whole document.
    """
    text_parts: List[str] = []
    confidences: List[float] = []
    page_counter = 0

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            raw_line = raw_line.strip()
            if not raw_line:
                continue

            try:
                page_data = json.loads(raw_line)
            except json.JSONDecodeError as e:
                logger.warning("Skipping malformed JSONL line %d: %s", line_no, e)
                continue
            if not isinstance(page_data, dict):
                logger.warning("Skipping non-object JSONL line %d", line_no)
                continue

            page_counter += 1
            page_num = page_data.get("page", page_counter)

            # Page separator, same format as the earlier Tesseract version.
            text_parts.append(f"--- Page {page_num} ---")

            # "or []" also covers an explicit null value for the key.
            for block in page_data.get("blocks") or []:
                if not isinstance(block, dict) or block.get("type") != "text":
                    continue

                for line in block.get("lines") or []:
                    if not isinstance(line, dict):
                        continue

                    line_text = line.get("text", "")
                    if line_text:
                        text_parts.append(str(line_text))

                    # Prefer line-level confidence; fall back to token-level.
                    line_conf = _to_confidence(line.get("conf"))
                    if line_conf is not None:
                        confidences.append(line_conf)
                        continue

                    for tok in line.get("tokens") or []:
                        if not isinstance(tok, dict):
                            continue
                        tok_conf = _to_confidence(tok.get("conf"))
                        if tok_conf is not None:
                            confidences.append(tok_conf)

    return "\n".join(text_parts).strip(), confidences


def _average_confidence_percent(confidences: List[float]) -> float:
    """Average *confidences* and express the result on a 0-100 scale."""
    if not confidences:
        return 0.0

    mean = sum(confidences) / len(confidences)
    # Heuristic: values are expected in [0, 1] and are scaled to percent. If
    # any value exceeds 1 the data is assumed to be on a 0-100 scale already,
    # so it is not multiplied again.
    if max(confidences) <= 1.0:
        mean *= 100.0
    # Clamp so the documented 0-100 contract holds whatever the tool emits.
    return round(min(max(mean, 0.0), 100.0), 2)


def extract_text_and_conf(
    file_path: str, *, timeout: Optional[float] = None
) -> Tuple[str, float]:
    """
    Extracts text AND confidence score from a PDF or Image using Chandra OCR.

    The 'chandra' CLI is run as a subprocess that writes JSONL into a temporary
    directory; that file is then parsed into page-separated text and an
    average confidence.

    Args:
        file_path: Path to the PDF or image to process.
        timeout: Optional limit in seconds for the OCR subprocess. The default
            (None) waits indefinitely, which matches the previous behavior.

    Returns:
        (text_content, average_confidence_0_to_100). ``("", 0.0)`` is returned
        when the path is not an existing file, the CLI is missing, times out,
        exits non-zero or produces no output file, or on any other error.
        This function never raises to the caller.

    Requirements:
        - pip install chandra-ocr
        - 'chandra' CLI must be on PATH
    """
    # isfile (not exists): a directory is not a valid OCR input.
    if not os.path.isfile(file_path):
        logger.error("File not found: %s", file_path)
        return "", 0.0

    # Decide which Chandra subcommand to use.
    # NOTE(review): the subcommands and flags below are kept as originally
    # written; verify them against the CLI of the installed chandra-ocr.
    ext = os.path.splitext(file_path)[1].lower()
    subcommand = "pdf" if ext == ".pdf" else "ocr"  # "ocr" for png/jpg/webp/tiff/etc.

    # An absolute path can never start with "-" and be mistaken for an option.
    abs_path = os.path.abspath(file_path)

    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            out_jsonl = os.path.join(tmpdir, "out.jsonl")

            # chandra pdf|ocr <file> --format jsonl --out out.jsonl
            # Optional flags noted by the original author: --lang, --dpi, --rotate.
            cmd: List[str] = [
                _CHANDRA_BIN,
                subcommand,
                abs_path,
                "--format", "jsonl",
                "--out", out_jsonl,
            ]

            logger.info("Running Chandra OCR: %s", " ".join(cmd))
            try:
                proc = subprocess.run(
                    cmd,
                    check=False,
                    capture_output=True,
                    text=True,
                    timeout=timeout,
                )
            except FileNotFoundError:
                logger.error(
                    "'%s' CLI not found on PATH (pip install chandra-ocr)",
                    _CHANDRA_BIN,
                )
                return "", 0.0
            except subprocess.TimeoutExpired:
                logger.error("Chandra OCR timed out after %s seconds", timeout)
                return "", 0.0

            if proc.returncode != 0:
                logger.error(
                    "Chandra OCR failed (exit=%s). stdout: %s\nstderr: %s",
                    proc.returncode,
                    proc.stdout,
                    proc.stderr,
                )
                return "", 0.0

            if not os.path.exists(out_jsonl):
                logger.error(
                    "Chandra OCR did not produce expected output file: %s",
                    out_jsonl,
                )
                return "", 0.0

            # Parse while the temporary directory still exists.
            full_text, confidences = _parse_chandra_jsonl(out_jsonl)

        return full_text, _average_confidence_percent(confidences)

    except Exception as e:
        # Top-level boundary: callers rely on an empty result instead of a raise.
        logger.error(f"OCR Critical Error (Chandra): {e}", exc_info=True)
        return "", 0.0