Spaces:

hkai20000
/

ocrAPP

Sleeping

App Files Community

hkai20000 committed on Jan 31

Commit

3f371b2

verified ·

1 Parent(s): 92a1e4b

Update main.py

Browse files

Files changed (1) hide show

main.py +37 -26

main.py CHANGED Viewed

@@ -99,9 +99,9 @@ def get_ocr_predictor(det_arch: str, reco_arch: str):
             det_arch=det_arch,
             reco_arch=reco_arch,
             pretrained=True,
-            assume_straight_pages=False,  # Better for complex layouts
-            straighten_pages=True,        # Auto-correct rotation
-            detect_orientation=True,      # Detect page orientation
             preserve_aspect_ratio=True    # Keep proportions
         )
         ocr_model_cache[cache_key] = predictor
@@ -197,55 +197,66 @@ def basic_cleanup(text: str) -> str:
 def extract_text_structured(result) -> str:
     """
     Extract text from docTR result preserving logical structure.
-    Groups text blocks by vertical position for better table handling.
-    Sorts words within each line by x-position (left to right).
     """
     all_lines = []
     for page in result.pages:
         for block in page.blocks:
             for line in block.lines:
-                # Collect all words with their positions
-                words_in_line = []
                 for word in line.words:
-                    words_in_line.append({
                         'text': word.value,
-                        'x': word.geometry[0][0],  # x position (left edge)
-                        'y': word.geometry[0][1]   # y position (top edge)
                     })
-                if not words_in_line:
                     continue
                 # Sort words by x position (left to right)
-                words_in_line.sort(key=lambda w: w['x'])
-                # Build line text from sorted words
-                line_text = " ".join([w['text'] for w in words_in_line])
                 if line_text.strip():
-                    min_y = min(w['y'] for w in words_in_line)
-                    min_x = min(w['x'] for w in words_in_line)
                     all_lines.append({
                         'text': line_text.strip(),
-                        'y': min_y,
                         'x': min_x
                     })
-    # Sort lines by y position (top to bottom), then x (left to right for same row)
     all_lines.sort(key=lambda l: (round(l['y'] * 20) / 20, l['x']))
-    # Build final text with line breaks between different y-groups
-    result_text = ""
-    prev_y = -1
     for line_info in all_lines:
         current_y_group = round(line_info['y'] * 20) / 20
-        if prev_y != -1 and current_y_group != prev_y:
-            result_text += "\n"
-        result_text += line_info['text'] + " "
-        prev_y = current_y_group
-    return result_text.strip()
 def extract_words_with_boxes(result) -> list:
     """

             det_arch=det_arch,
             reco_arch=reco_arch,
             pretrained=True,
+            assume_straight_pages=True,   # Assume pages are already straight
+            straighten_pages=False,       # Don't auto-rotate (was causing issues)
+            detect_orientation=False,     # Don't detect orientation (was inverting text)
             preserve_aspect_ratio=True    # Keep proportions
         )
         ocr_model_cache[cache_key] = predictor
 def extract_text_structured(result) -> str:
     """
     Extract text from docTR result preserving logical structure.
+    Explicitly sorts words by x-coordinate and lines by y-coordinate.
     """
     all_lines = []
     for page in result.pages:
         for block in page.blocks:
             for line in block.lines:
+                # Collect words with their x-positions
+                words_data = []
                 for word in line.words:
+                    # geometry is ((x_min, y_min), (x_max, y_max))
+                    x_pos = word.geometry[0][0]  # Left edge x-coordinate
+                    y_pos = word.geometry[0][1]  # Top edge y-coordinate
+                    words_data.append({
                         'text': word.value,
+                        'x': x_pos,
+                        'y': y_pos
                     })
+                if not words_data:
                     continue
                 # Sort words by x position (left to right)
+                words_data.sort(key=lambda w: w['x'])
+                line_text = " ".join([w['text'] for w in words_data])
+                avg_y = sum(w['y'] for w in words_data) / len(words_data)
+                min_x = min(w['x'] for w in words_data)
                 if line_text.strip():
                     all_lines.append({
                         'text': line_text.strip(),
+                        'y': avg_y,
                         'x': min_x
                     })
+    # Sort lines by y position (top to bottom)
     all_lines.sort(key=lambda l: (round(l['y'] * 20) / 20, l['x']))
+    # Group lines by y-position and build final text
+    result_lines = []
+    prev_y_group = -1
+    current_line_parts = []
     for line_info in all_lines:
         current_y_group = round(line_info['y'] * 20) / 20
+        if prev_y_group != -1 and current_y_group != prev_y_group:
+            if current_line_parts:
+                result_lines.append(" ".join(current_line_parts))
+            current_line_parts = [line_info['text']]
+        else:
+            current_line_parts.append(line_info['text'])
+        prev_y_group = current_y_group
+    if current_line_parts:
+        result_lines.append(" ".join(current_line_parts))
+    return "\n".join(result_lines)
 def extract_words_with_boxes(result) -> list:
     """