krishnachoudhary-hclguvi committed on
Commit
483f7ec
·
unverified ·
1 Parent(s): f4a6b1e

Improve OCR with multipass EasyOCR and confidence filtering

Browse files
Files changed (1) hide show
  1. extractors/ocr_extractor.py +70 -13
extractors/ocr_extractor.py CHANGED
@@ -81,6 +81,40 @@ def _preprocess_image(image: Image.Image) -> Image.Image:
81
  return image
82
 
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  def _reconstruct_from_boxes(results: list) -> str:
85
  """ Reconstruct text layout from bounding boxes.
86
  Sort by top, then group by 'lines' based on y-coordinate.
@@ -149,20 +183,43 @@ def extract_image(file_path: str) -> ExtractionResult:
149
  try:
150
  reader = get_easyocr_reader()
151
  if reader:
152
- # EasyOCR works well with both original and preprocessed images
153
- # We'll use a slightly preprocessed version for consistency
154
- # Perform OCR with layout awareness
155
- # Adjusting thresholds for better numeric and tabular capture
156
- results = reader.readtext(
157
- file_path,
158
- detail=1,
159
- paragraph=False, # We want individual boxes for layout reconstruction
160
- canvas_size=1200, # Shrunk to detect huge fonts (like certificate names) that CRAFT misses
161
- contrast_ths=0.1 # Reset to 0.1 so colored/light text isn't dropped
 
 
 
 
162
  )
163
-
164
- # Reconstruct full layout from bounding boxes
165
- text = _reconstruct_from_boxes(results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
  if text.strip():
168
  elapsed = (time.time() - start_time) * 1000
 
81
  return image
82
 
83
 
84
def _preprocess_color_text(image: Image.Image) -> Image.Image:
    """Return an RGB copy of *image* tuned to keep colored headline text legible.

    The image is converted to RGB, then passed through a color enhancement
    (factor 2.2), a contrast enhancement (factor 1.25) and a sharpen filter,
    in that order. The input image object is not modified.
    """
    # Color first, then contrast: each enhancer works on the previous result.
    boosted = ImageEnhance.Color(image.convert("RGB")).enhance(2.2)
    boosted = ImageEnhance.Contrast(boosted).enhance(1.25)
    return boosted.filter(ImageFilter.SHARPEN)
91
+
92
+
93
+ def _filter_easyocr_results(results: list, min_conf: float = 0.25) -> list:
94
+ """Drop very low-confidence and non-informative EasyOCR boxes."""
95
+ filtered = []
96
+ for item in results or []:
97
+ if len(item) < 3:
98
+ continue
99
+ text = str(item[1]).strip()
100
+ conf = float(item[2])
101
+ if conf < min_conf:
102
+ continue
103
+ if not any(ch.isalnum() for ch in text):
104
+ continue
105
+ filtered.append(item)
106
+ return filtered
107
+
108
+
109
+ def _score_extracted_text(text: str) -> int:
110
+ """Heuristic score to choose best OCR pass output."""
111
+ if not text:
112
+ return 0
113
+ alpha_num = sum(1 for c in text if c.isalnum())
114
+ penalties = sum(1 for c in text if c in "{}[]|~`")
115
+ return alpha_num - (penalties * 3)
116
+
117
+
118
  def _reconstruct_from_boxes(results: list) -> str:
119
  """ Reconstruct text layout from bounding boxes.
120
  Sort by top, then group by 'lines' based on y-coordinate.
 
183
  try:
184
  reader = get_easyocr_reader()
185
  if reader:
186
+ with Image.open(file_path) as src_img:
187
+ base_img = src_img.convert("RGB")
188
+
189
+ # Pass 1: standard detection with lower thresholds for certificate layouts.
190
+ results_default = reader.readtext(
191
+ np.array(base_img),
192
+ detail=1,
193
+ paragraph=False,
194
+ canvas_size=1200,
195
+ contrast_ths=0.1,
196
+ mag_ratio=1.2,
197
+ text_threshold=0.6,
198
+ low_text=0.25,
199
+ link_threshold=0.25,
200
  )
201
+
202
+ # Pass 2: boosted color/contrast to recover orange/blue headings.
203
+ color_img = _preprocess_color_text(base_img)
204
+ results_color = reader.readtext(
205
+ np.array(color_img),
206
+ detail=1,
207
+ paragraph=False,
208
+ canvas_size=1200,
209
+ contrast_ths=0.05,
210
+ mag_ratio=1.2,
211
+ text_threshold=0.55,
212
+ low_text=0.2,
213
+ link_threshold=0.2,
214
+ )
215
+
216
+ filtered_default = _filter_easyocr_results(results_default)
217
+ filtered_color = _filter_easyocr_results(results_color)
218
+
219
+ text_default = _reconstruct_from_boxes(filtered_default)
220
+ text_color = _reconstruct_from_boxes(filtered_color)
221
+
222
+ text = text_default if _score_extracted_text(text_default) >= _score_extracted_text(text_color) else text_color
223
 
224
  if text.strip():
225
  elapsed = (time.time() - start_time) * 1000