Spaces:
Sleeping
Sleeping
krishnachoudhary-hclguvi committed on
Improve OCR with multipass EasyOCR and confidence filtering
Browse files- extractors/ocr_extractor.py +70 -13
extractors/ocr_extractor.py
CHANGED
|
@@ -81,6 +81,40 @@ def _preprocess_image(image: Image.Image) -> Image.Image:
|
|
| 81 |
return image
|
| 82 |
|
| 83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
def _reconstruct_from_boxes(results: list) -> str:
|
| 85 |
""" Reconstruct text layout from bounding boxes.
|
| 86 |
Sort by top, then group by 'lines' based on y-coordinate.
|
|
@@ -149,20 +183,43 @@ def extract_image(file_path: str) -> ExtractionResult:
|
|
| 149 |
try:
|
| 150 |
reader = get_easyocr_reader()
|
| 151 |
if reader:
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
#
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
detail=1,
|
| 159 |
-
paragraph=False,
|
| 160 |
-
canvas_size=1200,
|
| 161 |
-
contrast_ths=0.1
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
)
|
| 163 |
-
|
| 164 |
-
#
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
if text.strip():
|
| 168 |
elapsed = (time.time() - start_time) * 1000
|
|
|
|
| 81 |
return image
|
| 82 |
|
| 83 |
|
| 84 |
+
def _preprocess_color_text(image: Image.Image) -> Image.Image:
    """Return an RGB copy of ``image`` tuned for OCR of colored headline text.

    The image is converted to RGB, its color saturation is enhanced by a
    factor of 2.2, its contrast by a factor of 1.25, and a sharpen filter is
    applied last. The input image object itself is not modified.
    """
    boosted = ImageEnhance.Color(image.convert("RGB")).enhance(2.2)
    boosted = ImageEnhance.Contrast(boosted).enhance(1.25)
    return boosted.filter(ImageFilter.SHARPEN)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def _filter_easyocr_results(results: list, min_conf: float = 0.25) -> list:
|
| 94 |
+
"""Drop very low-confidence and non-informative EasyOCR boxes."""
|
| 95 |
+
filtered = []
|
| 96 |
+
for item in results or []:
|
| 97 |
+
if len(item) < 3:
|
| 98 |
+
continue
|
| 99 |
+
text = str(item[1]).strip()
|
| 100 |
+
conf = float(item[2])
|
| 101 |
+
if conf < min_conf:
|
| 102 |
+
continue
|
| 103 |
+
if not any(ch.isalnum() for ch in text):
|
| 104 |
+
continue
|
| 105 |
+
filtered.append(item)
|
| 106 |
+
return filtered
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def _score_extracted_text(text: str) -> int:
|
| 110 |
+
"""Heuristic score to choose best OCR pass output."""
|
| 111 |
+
if not text:
|
| 112 |
+
return 0
|
| 113 |
+
alpha_num = sum(1 for c in text if c.isalnum())
|
| 114 |
+
penalties = sum(1 for c in text if c in "{}[]|~`")
|
| 115 |
+
return alpha_num - (penalties * 3)
|
| 116 |
+
|
| 117 |
+
|
| 118 |
def _reconstruct_from_boxes(results: list) -> str:
|
| 119 |
""" Reconstruct text layout from bounding boxes.
|
| 120 |
Sort by top, then group by 'lines' based on y-coordinate.
|
|
|
|
| 183 |
try:
|
| 184 |
reader = get_easyocr_reader()
|
| 185 |
if reader:
|
| 186 |
+
with Image.open(file_path) as src_img:
|
| 187 |
+
base_img = src_img.convert("RGB")
|
| 188 |
+
|
| 189 |
+
# Pass 1: standard detection with lower thresholds for certificate layouts.
|
| 190 |
+
results_default = reader.readtext(
|
| 191 |
+
np.array(base_img),
|
| 192 |
+
detail=1,
|
| 193 |
+
paragraph=False,
|
| 194 |
+
canvas_size=1200,
|
| 195 |
+
contrast_ths=0.1,
|
| 196 |
+
mag_ratio=1.2,
|
| 197 |
+
text_threshold=0.6,
|
| 198 |
+
low_text=0.25,
|
| 199 |
+
link_threshold=0.25,
|
| 200 |
)
|
| 201 |
+
|
| 202 |
+
# Pass 2: boosted color/contrast to recover orange/blue headings.
|
| 203 |
+
color_img = _preprocess_color_text(base_img)
|
| 204 |
+
results_color = reader.readtext(
|
| 205 |
+
np.array(color_img),
|
| 206 |
+
detail=1,
|
| 207 |
+
paragraph=False,
|
| 208 |
+
canvas_size=1200,
|
| 209 |
+
contrast_ths=0.05,
|
| 210 |
+
mag_ratio=1.2,
|
| 211 |
+
text_threshold=0.55,
|
| 212 |
+
low_text=0.2,
|
| 213 |
+
link_threshold=0.2,
|
| 214 |
+
)
|
| 215 |
+
|
| 216 |
+
filtered_default = _filter_easyocr_results(results_default)
|
| 217 |
+
filtered_color = _filter_easyocr_results(results_color)
|
| 218 |
+
|
| 219 |
+
text_default = _reconstruct_from_boxes(filtered_default)
|
| 220 |
+
text_color = _reconstruct_from_boxes(filtered_color)
|
| 221 |
+
|
| 222 |
+
text = text_default if _score_extracted_text(text_default) >= _score_extracted_text(text_color) else text_color
|
| 223 |
|
| 224 |
if text.strip():
|
| 225 |
elapsed = (time.time() - start_time) * 1000
|