Spaces:

Krish-05
/

text-extraction-api

Sleeping

App Files Files Community

krishnachoudhary-hclguvi committed on Apr 2

Commit

483f7ec

unverified ·

1 Parent(s): f4a6b1e

Improve OCR with multipass EasyOCR and confidence filtering

Browse files

Files changed (1) hide show

extractors/ocr_extractor.py +70 -13

extractors/ocr_extractor.py CHANGED Viewed

@@ -81,6 +81,40 @@ def _preprocess_image(image: Image.Image) -> Image.Image:
     return image
 def _reconstruct_from_boxes(results: list) -> str:
     """ Reconstruct text layout from bounding boxes.
         Sort by top, then group by 'lines' based on y-coordinate.
@@ -149,20 +183,43 @@ def extract_image(file_path: str) -> ExtractionResult:
         try:
             reader = get_easyocr_reader()
             if reader:
-                # EasyOCR works well with both original and preprocessed images
-                # We'll use a slightly preprocessed version for consistency
-                # Perform OCR with layout awareness
-                # Adjusting thresholds for better numeric and tabular capture
-                results = reader.readtext(
-                    file_path,
-                    detail=1,
-                    paragraph=False, # We want individual boxes for layout reconstruction
-                    canvas_size=1200, # Shrunk to detect huge fonts (like certificate names) that CRAFT misses
-                    contrast_ths=0.1  # Reset to 0.1 so colored/light text isn't dropped
                 )
-                # Reconstruct full layout from bounding boxes
-                text = _reconstruct_from_boxes(results)
                 if text.strip():
                     elapsed = (time.time() - start_time) * 1000

     return image
+def _preprocess_color_text(image: Image.Image) -> Image.Image:
+    """Return an RGB version of ``image`` tuned to keep colored headline text.
+
+    The image is converted to RGB, its color is enhanced by a factor of 2.2,
+    its contrast by a factor of 1.25, and a sharpen filter is applied last.
+    """
+    saturated = ImageEnhance.Color(image.convert("RGB")).enhance(2.2)
+    contrasted = ImageEnhance.Contrast(saturated).enhance(1.25)
+    return contrasted.filter(ImageFilter.SHARPEN)
+def _filter_easyocr_results(results: list, min_conf: float = 0.25) -> list:
+    """Keep only the OCR entries that are confident enough and carry real text.
+
+    Each entry is indexed positionally: index 1 is read as the recognized
+    text and index 2 as the confidence. Entries with fewer than three
+    elements are skipped, as are entries whose confidence is below
+    ``min_conf`` or whose stripped text has no alphanumeric character.
+    A falsy ``results`` (``None`` or empty) yields an empty list.
+    Surviving entries are returned unchanged and in their original order.
+    """
+    kept = []
+    for entry in results or []:
+        if len(entry) >= 3:
+            label = str(entry[1]).strip()
+            confidence = float(entry[2])
+            # Written as "not below" so the comparison matches the threshold
+            # check exactly, including for values that compare unordered.
+            if not (confidence < min_conf) and any(map(str.isalnum, label)):
+                kept.append(entry)
+    return kept
+def _score_extracted_text(text: str) -> int:
+    """Rate OCR output so the better of several passes can be picked.
+
+    Every alphanumeric character adds one point; every character from
+    the set ``{}[]|~``` subtracts three. Empty or falsy text scores 0.
+    """
+    if not text:
+        return 0
+    noise_chars = "{}[]|~`"
+    score = 0
+    for ch in text:
+        if ch.isalnum():
+            score += 1
+        if ch in noise_chars:
+            score -= 3
+    return score
 def _reconstruct_from_boxes(results: list) -> str:
     """ Reconstruct text layout from bounding boxes.
         Sort by top, then group by 'lines' based on y-coordinate.
         try:
             reader = get_easyocr_reader()
             if reader:
+                with Image.open(file_path) as src_img:
+                    base_img = src_img.convert("RGB")
+                # Pass 1: standard detection with lower thresholds for certificate layouts.
+                results_default = reader.readtext(
+                    np.array(base_img),
+                    detail=1,
+                    paragraph=False,
+                    canvas_size=1200,
+                    contrast_ths=0.1,
+                    mag_ratio=1.2,
+                    text_threshold=0.6,
+                    low_text=0.25,
+                    link_threshold=0.25,
                 )
+                # Pass 2: boosted color/contrast to recover orange/blue headings.
+                color_img = _preprocess_color_text(base_img)
+                results_color = reader.readtext(
+                    np.array(color_img),
+                    detail=1,
+                    paragraph=False,
+                    canvas_size=1200,
+                    contrast_ths=0.05,
+                    mag_ratio=1.2,
+                    text_threshold=0.55,
+                    low_text=0.2,
+                    link_threshold=0.2,
+                )
+                filtered_default = _filter_easyocr_results(results_default)
+                filtered_color = _filter_easyocr_results(results_color)
+                text_default = _reconstruct_from_boxes(filtered_default)
+                text_color = _reconstruct_from_boxes(filtered_color)
+                text = text_default if _score_extracted_text(text_default) >= _score_extracted_text(text_color) else text_color
                 if text.strip():
                     elapsed = (time.time() - start_time) * 1000