Spaces:

KarthiEz
/

Paddleocr

Sleeping

App Files Community

KarthiEz committed on Jan 2

Commit

a8a9f71

verified ·

1 Parent(s): 55774c7

Update app.py

Browse files

Files changed (1) hide show

app.py +201 -31

app.py CHANGED Viewed

@@ -26,7 +26,7 @@ from paddleocr import PaddleOCR
 # --------- Config knobs ----------
 LANG = os.getenv("OCR_LANG", "en")
-USE_GPU = os.getenv("OCR_USE_GPU", "false").lower() == "true"
 DET = os.getenv("OCR_DET_MODEL", "ch_PP-OCRv4_det")
 REC = os.getenv("OCR_REC_MODEL", "en_PP-OCRv4")
 CLS = True
@@ -46,6 +46,7 @@ def _build_ocr(use_cls: bool) -> PaddleOCR:
         show_log=False
     )
 _OCR = _build_ocr(CLS)
 class TextBlock:
@@ -53,10 +54,11 @@ class TextBlock:
     def __init__(self, text: str, confidence: float, bbox: List[List[int]], img_width: int, img_height: int):
         self.text = text
         self.confidence = confidence
-        self.bbox = bbox
         self.img_width = img_width
         self.img_height = img_height
         x_coords = [p[0] for p in bbox]
         y_coords = [p[1] for p in bbox]
         self.x_min = min(x_coords)
@@ -68,62 +70,211 @@ class TextBlock:
         self.width = self.x_max - self.x_min
         self.height = self.y_max - self.y_min
-def _sort_blocks_by_layout(blocks: List[TextBlock], tolerance: float = 0.05) -> List[TextBlock]:
-    """Sort text blocks to preserve reading order (top-to-bottom, left-to-right)."""
     if not blocks:
         return blocks
-    max_height = max(b.y_max for b in blocks) if blocks else 1
     def get_sort_key(block: TextBlock):
-        line_group = int(block.y_min / (max_height * tolerance))
-        return (line_group, block.x_min)
     return sorted(blocks, key=get_sort_key)
 def _reconstruct_text_with_layout(blocks: List[TextBlock], img_width: int, img_height: int) -> str:
-    """Reconstruct text preserving layout structure using bounding box positions."""
     if not blocks:
         return ""
     sorted_blocks = _sort_blocks_by_layout(blocks)
     lines = []
     current_line = []
     current_y = None
-    line_tolerance = img_height * 0.02  # 2% of image height for line grouping
     for block in sorted_blocks:
-        if current_y is None or abs(block.y_min - current_y) > line_tolerance:
-            if current_line:
-                lines.append(_format_line(current_line, img_width))
             current_line = [block]
-            current_y = block.y_min
         else:
-            current_line.append(block)
-            current_y = sum(b.y_min for b in current_line) / len(current_line)
     if current_line:
-        lines.append(_format_line(current_line, img_width))
-    return "\n".join(lines)
-def _format_line(blocks: List[TextBlock], img_width: int) -> str:
-    """Format a line of text blocks, preserving horizontal spacing."""
     if not blocks:
         return ""
     blocks = sorted(blocks, key=lambda b: b.x_min)
-    parts = []
-    prev_x_end = 0
     for block in blocks:
-        gap = block.x_min - prev_x_end
-        if gap > img_width * 0.05:
-            num_spaces = max(1, int(gap / (img_width * 0.01)))
-            parts.append(" " * min(num_spaces, 10))
         parts.append(block.text)
         prev_x_end = block.x_max
@@ -131,7 +282,12 @@ def _format_line(blocks: List[TextBlock], img_width: int) -> str:
     return "".join(parts)
 def ocr_image(pil_img: Image.Image, preserve_layout: bool = True) -> Tuple[List[TextBlock], str]:
-    """Perform OCR on image and return both structured blocks and formatted text."""
     img_cv = _pil_to_cv(pil_img)
     img_width, img_height = pil_img.size
@@ -143,6 +299,7 @@ def ocr_image(pil_img: Image.Image, preserve_layout: bool = True) -> Tuple[List[
     except RuntimeError as e:
         msg = str(e).lower()
         if "primitive" in msg or "mkldnn" in msg or "predictor.run" in msg:
             fallback_ocr = _build_ocr(False)
             result = _run(fallback_ocr, False)
         else:
@@ -153,7 +310,7 @@ def ocr_image(pil_img: Image.Image, preserve_layout: bool = True) -> Tuple[List[
         return blocks, ""
     for line in result[0]:
-        bbox = line[0]
         txt = line[1][0]
         conf = float(line[1][1])
@@ -161,9 +318,11 @@ def ocr_image(pil_img: Image.Image, preserve_layout: bool = True) -> Tuple[List[
             block = TextBlock(txt, conf, bbox, img_width, img_height)
             blocks.append(block)
     if preserve_layout:
         formatted_text = _reconstruct_text_with_layout(blocks, img_width, img_height)
     else:
         formatted_text = "\n".join([b.text for b in blocks])
     return blocks, formatted_text
@@ -183,7 +342,12 @@ def read_pdf_pages(filepath: str):
     return pages
 def extract_text_from_file(filepath: str, preserve_layout: bool = True) -> Tuple[str, Dict[str, Any]]:
-    """Extract text from file with layout preservation."""
     lower = filepath.lower()
     all_blocks = []
     all_texts = []
@@ -220,7 +384,12 @@ def extract_text_from_file(filepath: str, preserve_layout: bool = True) -> Tuple
     return final_text or "[No text detected]", metadata
 def infer(file_obj, preserve_layout: bool) -> Tuple[str, str]:
-    """Main inference function."""
     try:
         if file_obj is None:
             return "No file uploaded.", "{}"
@@ -274,4 +443,5 @@ if __name__ == "__main__":
         server_name=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"),
         server_port=int(os.getenv("PORT", "7860")),
         show_error=True
-    )

 # --------- Config knobs ----------
 LANG = os.getenv("OCR_LANG", "en")
+USE_GPU = os.getenv("OCR_USE_GPU", "false").lower() == "true"  # Spaces CPU → keep false
 DET = os.getenv("OCR_DET_MODEL", "ch_PP-OCRv4_det")
 REC = os.getenv("OCR_REC_MODEL", "en_PP-OCRv4")
 CLS = True
         show_log=False
     )
+# Primary OCR instance (CLS on). If CLS crashes, we'll rebuild w/o CLS just-in-time.
 _OCR = _build_ocr(CLS)
 class TextBlock:
     def __init__(self, text: str, confidence: float, bbox: List[List[int]], img_width: int, img_height: int):
         self.text = text
         self.confidence = confidence
+        self.bbox = bbox  # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
         self.img_width = img_width
         self.img_height = img_height
+        # Calculate bounding box properties
         x_coords = [p[0] for p in bbox]
         y_coords = [p[1] for p in bbox]
         self.x_min = min(x_coords)
         self.width = self.x_max - self.x_min
         self.height = self.y_max - self.y_min
+def _sort_blocks_by_layout(blocks: List[TextBlock]) -> List[TextBlock]:
+    """
+    Sort text blocks to preserve reading order (top-to-bottom, left-to-right).
+    """
     if not blocks:
         return blocks
     def get_sort_key(block: TextBlock):
+        # Primary sort by Y position (top to bottom)
+        # Secondary sort by X position (left to right)
+        return (block.y_min, block.x_min)
     return sorted(blocks, key=get_sort_key)
 def _reconstruct_text_with_layout(blocks: List[TextBlock], img_width: int, img_height: int) -> str:
+    """
+    Reconstruct text preserving layout structure using a character-grid approach.
+    This preserves exact positions and handles multi-column layouts better.
+    """
     if not blocks:
         return ""
     sorted_blocks = _sort_blocks_by_layout(blocks)
+    # Calculate adaptive line tolerance based on average block height
+    if blocks:
+        avg_height = sum(b.height for b in blocks) / len(blocks)
+        # Line tolerance should be based on block height, not image height
+        line_tolerance = max(avg_height * 0.5, img_height * 0.01, 8)  # At least 8 pixels
+    else:
+        line_tolerance = img_height * 0.015
+    # Group blocks into lines based on Y position
     lines = []
     current_line = []
     current_y = None
     for block in sorted_blocks:
+        # Check if this block is on a new line
+        if current_y is None:
+            # First block
             current_line = [block]
+            current_y = block.center_y
         else:
+            # Check vertical overlap or proximity
+            y_diff = abs(block.center_y - current_y)
+            # Also check if blocks overlap vertically
+            overlaps = any(
+                not (block.y_max < b.y_min or block.y_min > b.y_max)
+                for b in current_line
+            )
+            if y_diff <= line_tolerance or overlaps:
+                # Same line, add to current line
+                current_line.append(block)
+                # Update current_y to weighted average (by block height)
+                total_height = sum(b.height for b in current_line)
+                current_y = sum(b.center_y * b.height for b in current_line) / total_height
+            else:
+                # New line
+                if current_line:
+                    lines.append(current_line)
+                current_line = [block]
+                current_y = block.center_y
+    # Add last line
     if current_line:
+        lines.append(current_line)
+    # Format each line preserving exact positions
+    formatted_lines = []
+    for line_blocks in lines:
+        # Try character-grid approach first, fall back to gap-based if needed
+        formatted_line = _format_line_with_positions(line_blocks, img_width)
+        formatted_lines.append(formatted_line)
+    return "\n".join(formatted_lines)
+def _format_line_with_positions(blocks: List[TextBlock], img_width: int) -> str:
+    """
+    Format a line of text blocks using a character-grid approach to preserve exact positions.
+    Uses a fixed character width to map X positions to column positions.
+    """
     if not blocks:
         return ""
+    # Sort blocks left to right
     blocks = sorted(blocks, key=lambda b: b.x_min)
+    # Use a character grid approach: map X positions to character positions
+    # Estimate character width based on average block width
+    if blocks:
+        # Calculate average character width more accurately
+        char_widths = []
+        for b in blocks:
+            if len(b.text) > 0:
+                char_widths.append(b.width / len(b.text))
+        if char_widths:
+            avg_char_width = sum(char_widths) / len(char_widths)
+        else:
+            avg_char_width = img_width / 100
+        # Use a reasonable character width (pixels per character)
+        char_width = max(avg_char_width * 0.7, img_width / 150)  # Floor of img_width/150 px per cell, i.e. at most ~150 columns per line
+    else:
+        char_width = img_width / 100
+    # Build line using character positions
+    grid_size = int(img_width / char_width) + 1
+    line_chars = [' '] * grid_size
     for block in blocks:
+        # Calculate start and end positions in character grid
+        start_pos = int(block.x_min / char_width)
+        end_pos = int(block.x_max / char_width) + 1
+        # Ensure positions are within bounds
+        start_pos = max(0, min(start_pos, grid_size - 1))
+        end_pos = max(start_pos, min(end_pos, grid_size))
+        # Place text in the grid, character by character
+        text = block.text
+        text_len = len(text)
+        grid_len = end_pos - start_pos
+        if text_len > 0 and grid_len > 0:
+            # Distribute text characters across the grid positions
+            for i, char in enumerate(text):
+                pos = start_pos + int(i * grid_len / text_len)
+                if pos < grid_size:
+                    # Only fill an empty cell; an occupied cell is never overwritten
+                    if line_chars[pos] == ' ':
+                        line_chars[pos] = char
+                    elif line_chars[pos] != char and i == 0:
+                        # If first char conflicts, try next position
+                        if pos + 1 < grid_size and line_chars[pos + 1] == ' ':
+                            line_chars[pos + 1] = char
+    # Convert grid to string, removing trailing spaces
+    result = ''.join(line_chars).rstrip()
+    # If grid approach didn't work well (too sparse or too compressed), fall back to gap-based approach
+    text_length = sum(len(b.text) for b in blocks)
+    if len(result.strip()) < text_length * 0.6 or len(result) > text_length * 3:
+        return _format_line_with_gaps(blocks, img_width)
+    return result
+def _format_line_with_gaps(blocks: List[TextBlock], img_width: int) -> str:
+    """
+    Format a line of text blocks preserving gaps between blocks.
+    More accurate spacing calculation based on actual pixel positions.
+    """
+    if not blocks:
+        return ""
+    # Sort blocks left to right
+    blocks = sorted(blocks, key=lambda b: b.x_min)
+    # Estimate average character width for spacing - use median for better accuracy
+    if blocks:
+        char_widths = []
+        for b in blocks:
+            if len(b.text) > 0:
+                char_widths.append(b.width / len(b.text))
+        if char_widths:
+            # Use median to avoid outliers
+            char_widths.sort()
+            mid = len(char_widths) // 2
+            avg_char_width = char_widths[mid] if len(char_widths) % 2 == 1 else (char_widths[mid-1] + char_widths[mid]) / 2
+        else:
+            avg_char_width = img_width / 100
+    else:
+        avg_char_width = img_width / 100
+    parts = []
+    prev_x_end = None
+    for block in blocks:
+        if prev_x_end is not None:
+            # Calculate gap between previous block and current block
+            gap = block.x_min - prev_x_end
+            # Determine spacing based on gap
+            if gap < 0:
+                # Overlapping blocks - add minimal space
+                parts.append(" ")
+            elif gap < avg_char_width * 0.2:
+                # Very small gap - likely should be connected, but add space for safety
+                parts.append(" ")
+            elif gap < avg_char_width * 1.5:
+                # Small gap - single space
+                parts.append(" ")
+            else:
+                # Larger gap - calculate number of spaces
+                num_spaces = max(1, int(gap / avg_char_width))
+                # Cap at reasonable maximum to avoid excessive spacing
+                num_spaces = min(num_spaces, 30)
+                parts.append(" " * num_spaces)
+        else:
+            # First block - check if it starts far from left edge
+            if block.x_min > img_width * 0.02:  # More than 2% from left
+                # Add leading spaces
+                num_spaces = max(1, int(block.x_min / avg_char_width))
+                num_spaces = min(num_spaces, 20)
+                parts.append(" " * num_spaces)
         parts.append(block.text)
         prev_x_end = block.x_max
     return "".join(parts)
 def ocr_image(pil_img: Image.Image, preserve_layout: bool = True) -> Tuple[List[TextBlock], str]:
+    """
+    Perform OCR on image and return both structured blocks and formatted text.
+    Returns:
+        Tuple of (list of TextBlock objects, formatted text string)
+    """
     img_cv = _pil_to_cv(pil_img)
     img_width, img_height = pil_img.size
     except RuntimeError as e:
         msg = str(e).lower()
         if "primitive" in msg or "mkldnn" in msg or "predictor.run" in msg:
+            # One-time fallback without angle classifier
             fallback_ocr = _build_ocr(False)
             result = _run(fallback_ocr, False)
         else:
         return blocks, ""
     for line in result[0]:
+        bbox = line[0]  # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
         txt = line[1][0]
         conf = float(line[1][1])
             block = TextBlock(txt, conf, bbox, img_width, img_height)
             blocks.append(block)
+    # Generate formatted text
     if preserve_layout:
         formatted_text = _reconstruct_text_with_layout(blocks, img_width, img_height)
     else:
+        # Simple concatenation (original behavior)
         formatted_text = "\n".join([b.text for b in blocks])
     return blocks, formatted_text
     return pages
 def extract_text_from_file(filepath: str, preserve_layout: bool = True) -> Tuple[str, Dict[str, Any]]:
+    """
+    Extract text from file with layout preservation.
+    Returns:
+        Tuple of (formatted text, metadata dict with blocks info)
+    """
     lower = filepath.lower()
     all_blocks = []
     all_texts = []
     return final_text or "[No text detected]", metadata
 def infer(file_obj, preserve_layout: bool) -> Tuple[str, str]:
+    """
+    Main inference function.
+    Returns:
+        Tuple of (formatted text, metadata JSON string)
+    """
     try:
         if file_obj is None:
             return "No file uploaded.", "{}"
         server_name=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"),
         server_port=int(os.getenv("PORT", "7860")),
         show_error=True
+    )