Spaces:

heerjtdev
/

layout_latex

Running

App Files Community

heerjtdev committed about 1 month ago

Commit

7249be5

verified ·

1 Parent(s): bc2e64c

Update working_yolo_pipeline.py

Browse files

Files changed (1) hide show

working_yolo_pipeline.py +215 -72

working_yolo_pipeline.py CHANGED Viewed

@@ -17,6 +17,30 @@ torch.load = patched_torch_load
 import json
 import argparse
 import os
@@ -511,10 +535,71 @@ def calculate_x_gutters(word_data: list, params: Dict, page_height: float) -> Li
     return sorted(final_separators)
 def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
                                 top_margin_percent=0.10, bottom_margin_percent=0.10) -> list:
-    """Extract word data with OCR caching to avoid redundant Tesseract runs."""
     word_data = page.get_text("words")
     if len(word_data) > 0:
@@ -524,45 +609,40 @@ def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
             word_data = _ocr_cache.get_ocr(pdf_path, page_num)
         else:
             try:
-                # --- OPTIMIZATION START ---
-                # 1. Render at Higher Resolution (Zoom 4.0 = ~300 DPI)
                 zoom_level = 4.0
                 pix = page.get_pixmap(matrix=fitz.Matrix(zoom_level, zoom_level))
-                # 2. Convert directly to OpenCV format (Faster than PIL)
                 img_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
-                if pix.n == 3:
-                    img_np = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
-                elif pix.n == 4:
-                    img_np = cv2.cvtColor(img_np, cv2.COLOR_RGBA2BGR)
-                # 3. Apply Preprocessing (Thresholding)
-                processed_img = preprocess_image_for_ocr(img_np)
-                # 4. Optimized Tesseract Config
-                # --psm 6: Assume a single uniform block of text (Great for columns/questions)
-                # --oem 3: Default engine (LSTM)
-                custom_config = r'--oem 3 --psm 6'
-                data = pytesseract.image_to_data(processed_img, output_type=pytesseract.Output.DICT,
-                                                 config=custom_config)
                 full_word_data = []
-                for i in range(len(data['level'])):
-                    text = data['text'][i].strip()
-                    if text:
-                        # Scale coordinates back to PDF points
-                        x1 = data['left'][i] / zoom_level
-                        y1 = data['top'][i] / zoom_level
-                        x2 = (data['left'][i] + data['width'][i]) / zoom_level
-                        y2 = (data['top'][i] + data['height'][i]) / zoom_level
-                        full_word_data.append((text, x1, y1, x2, y2))
                 word_data = full_word_data
                 _ocr_cache.set_ocr(pdf_path, page_num, word_data)
-                # --- OPTIMIZATION END ---
             except Exception as e:
-                print(f"  ❌ OCR Error in detection phase: {e}")
                 return []
     # Apply margin filtering
@@ -572,6 +652,17 @@ def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
     return [d for d in word_data if d[2] >= y_min and d[4] <= y_max]
 def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
     img_data = pix.samples
     img = np.frombuffer(img_data, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
@@ -992,58 +1083,110 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
                 })
         else:
             # === START OF OPTIMIZED OCR BLOCK ===
             try:
-                # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
                 ocr_zoom = 4.0
                 pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
-                # Convert PyMuPDF Pixmap to OpenCV format
-                img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
-                                                                                    pix_ocr.n)
                 if pix_ocr.n == 3:
                     img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
                 elif pix_ocr.n == 4:
                     img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
-                # 2. Preprocess (Binarization)
-                processed_img = preprocess_image_for_ocr(img_ocr_np)
-                # 3. Run Tesseract with Optimized Configuration
-                custom_config = r'--oem 3 --psm 6'
-                hocr_data = pytesseract.image_to_data(
-                    processed_img,
-                    output_type=pytesseract.Output.DICT,
-                    config=custom_config
-                )
-                for i in range(len(hocr_data['level'])):
-                    text = hocr_data['text'][i] # Retrieve raw Tesseract text
-                    # --- FIX: SANITIZE TEXT AND THEN STRIP ---
-                    cleaned_text = sanitize_text(text).strip()
-                    if cleaned_text and hocr_data['conf'][i] > -1:
-                        # 4. Coordinate Mapping
-                        scale_adjustment = scale_factor / ocr_zoom
-                        x1 = int(hocr_data['left'][i] * scale_adjustment)
-                        y1 = int(hocr_data['top'][i] * scale_adjustment)
-                        w = int(hocr_data['width'][i] * scale_adjustment)
-                        h = int(hocr_data['height'][i] * scale_adjustment)
-                        x2 = x1 + w
-                        y2 = y1 + h
-                        raw_ocr_output.append({
-                            'type': 'text',
-                            'word': cleaned_text, # Use the sanitized word
-                            'confidence': float(hocr_data['conf'][i]),
-                            'bbox': [x1, y1, x2, y2],
-                            'y0': y1,
-                            'x0': x1
-                        })
             except Exception as e:
-                print(f"  ❌ Tesseract OCR Error: {e}")
             # === END OF OPTIMIZED OCR BLOCK ===
     # ====================================================================

+#==================================================================================
+#RAPID OCR
+#==================================================================================
+from rapidocr import RapidOCR, OCRVersion
+# Initialize RapidOCR (v5 is generally the most accurate current version)
+# We use return_word_box=True to get word-level precision similar to Tesseract's image_to_data
+ocr_engine = RapidOCR(params={
+    "Det.ocr_version": OCRVersion.PPOCRV5,
+    "Rec.ocr_version": OCRVersion.PPOCRV5,
+    "return_word_box": True
+})
+#==================================================================================
+#RAPID OCR
+#==================================================================================
 import json
 import argparse
 import os
     return sorted(final_separators)
+#======================================================================================================================================
+# def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
+#                                 top_margin_percent=0.10, bottom_margin_percent=0.10) -> list:
+#     """Extract word data with OCR caching to avoid redundant Tesseract runs."""
+#     word_data = page.get_text("words")
+#     if len(word_data) > 0:
+#         word_data = [(w[4], w[0], w[1], w[2], w[3]) for w in word_data]
+#     else:
+#         if _ocr_cache.has_ocr(pdf_path, page_num):
+#             word_data = _ocr_cache.get_ocr(pdf_path, page_num)
+#         else:
+#             try:
+#                 # --- OPTIMIZATION START ---
+#                 # 1. Render at Higher Resolution (Zoom 4.0 = ~300 DPI)
+#                 zoom_level = 4.0
+#                 pix = page.get_pixmap(matrix=fitz.Matrix(zoom_level, zoom_level))
+#                 # 2. Convert directly to OpenCV format (Faster than PIL)
+#                 img_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
+#                 if pix.n == 3:
+#                     img_np = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
+#                 elif pix.n == 4:
+#                     img_np = cv2.cvtColor(img_np, cv2.COLOR_RGBA2BGR)
+#                 # 3. Apply Preprocessing (Thresholding)
+#                 processed_img = preprocess_image_for_ocr(img_np)
+#                 # 4. Optimized Tesseract Config
+#                 # --psm 6: Assume a single uniform block of text (Great for columns/questions)
+#                 # --oem 3: Default engine (LSTM)
+#                 custom_config = r'--oem 3 --psm 6'
+#                 data = pytesseract.image_to_data(processed_img, output_type=pytesseract.Output.DICT,
+#                                                  config=custom_config)
+#                 full_word_data = []
+#                 for i in range(len(data['level'])):
+#                     text = data['text'][i].strip()
+#                     if text:
+#                         # Scale coordinates back to PDF points
+#                         x1 = data['left'][i] / zoom_level
+#                         y1 = data['top'][i] / zoom_level
+#                         x2 = (data['left'][i] + data['width'][i]) / zoom_level
+#                         y2 = (data['top'][i] + data['height'][i]) / zoom_level
+#                         full_word_data.append((text, x1, y1, x2, y2))
+#                 word_data = full_word_data
+#                 _ocr_cache.set_ocr(pdf_path, page_num, word_data)
+#                 # --- OPTIMIZATION END ---
+#             except Exception as e:
+#                 print(f"  ❌ OCR Error in detection phase: {e}")
+#                 return []
+#     # Apply margin filtering
+#     page_height = page.rect.height
+#     y_min = page_height * top_margin_percent
+#     y_max = page_height * (1 - bottom_margin_percent)
+#     return [d for d in word_data if d[2] >= y_min and d[4] <= y_max]
+#============================================================================================================
 def get_word_data_for_detection(page: fitz.Page, pdf_path: str, page_num: int,
                                 top_margin_percent=0.10, bottom_margin_percent=0.10) -> list:
     word_data = page.get_text("words")
     if len(word_data) > 0:
             word_data = _ocr_cache.get_ocr(pdf_path, page_num)
         else:
             try:
+                # 1. Render at Higher Resolution
                 zoom_level = 4.0
                 pix = page.get_pixmap(matrix=fitz.Matrix(zoom_level, zoom_level))
                 img_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
+                # Convert to BGR for RapidOCR
+                if pix.n == 3: img_np = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
+                elif pix.n == 4: img_np = cv2.cvtColor(img_np, cv2.COLOR_RGBA2BGR)
+                # 2. Run RapidOCR
+                # RapidOCR returns: [[box, text, score], ...]
+                # where box is [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
+                results, _ = ocr_engine(img_np)
                 full_word_data = []
+                if results:
+                    for box, text, score in results:
+                        text = text.strip()
+                        if text:
+                            # 3. Convert Polygon to BBox and Scale back to PDF points
+                            xs = [p[0] for p in box]
+                            ys = [p[1] for p in box]
+                            x1 = min(xs) / zoom_level
+                            y1 = min(ys) / zoom_level
+                            x2 = max(xs) / zoom_level
+                            y2 = max(ys) / zoom_level
+                            full_word_data.append((text, x1, y1, x2, y2))
                 word_data = full_word_data
                 _ocr_cache.set_ocr(pdf_path, page_num, word_data)
             except Exception as e:
+                print(f"  ❌ RapidOCR Error in detection phase: {e}")
                 return []
     # Apply margin filtering
     return [d for d in word_data if d[2] >= y_min and d[4] <= y_max]
+#=========================================================================================================================================
+#=============================================================================================================================================
 def pixmap_to_numpy(pix: fitz.Pixmap) -> np.ndarray:
     img_data = pix.samples
     img = np.frombuffer(img_data, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
                 })
         else:
             # === START OF OPTIMIZED OCR BLOCK ===
+            # try:
+            #     # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
+            #     ocr_zoom = 4.0
+            #     pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
+            #     # Convert PyMuPDF Pixmap to OpenCV format
+            #     img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width,
+            #                                                                         pix_ocr.n)
+            #     if pix_ocr.n == 3:
+            #         img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
+            #     elif pix_ocr.n == 4:
+            #         img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
+            #     # 2. Preprocess (Binarization)
+            #     processed_img = preprocess_image_for_ocr(img_ocr_np)
+            #     # 3. Run Tesseract with Optimized Configuration
+            #     custom_config = r'--oem 3 --psm 6'
+            #     hocr_data = pytesseract.image_to_data(
+            #         processed_img,
+            #         output_type=pytesseract.Output.DICT,
+            #         config=custom_config
+            #     )
+            #     for i in range(len(hocr_data['level'])):
+            #         text = hocr_data['text'][i] # Retrieve raw Tesseract text
+            #         # --- FIX: SANITIZE TEXT AND THEN STRIP ---
+            #         cleaned_text = sanitize_text(text).strip()
+            #         if cleaned_text and hocr_data['conf'][i] > -1:
+            #             # 4. Coordinate Mapping
+            #             scale_adjustment = scale_factor / ocr_zoom
+            #             x1 = int(hocr_data['left'][i] * scale_adjustment)
+            #             y1 = int(hocr_data['top'][i] * scale_adjustment)
+            #             w = int(hocr_data['width'][i] * scale_adjustment)
+            #             h = int(hocr_data['height'][i] * scale_adjustment)
+            #             x2 = x1 + w
+            #             y2 = y1 + h
+            #             raw_ocr_output.append({
+            #                 'type': 'text',
+            #                 'word': cleaned_text, # Use the sanitized word
+            #                 'confidence': float(hocr_data['conf'][i]),
+            #                 'bbox': [x1, y1, x2, y2],
+            #                 'y0': y1,
+            #                 'x0': x1
+            #             })
+            # except Exception as e:
+        #     print(f"  ❌ Tesseract OCR Error: {e}")
+#=============================================================================================================================================================
+#=============================================================================================================================================================
+            else:
+            # === START OF RAPIDOCR BLOCK ===
             try:
+                # 1. Re-render Page at High Resolution (Standardizing to Zoom 4.0)
                 ocr_zoom = 4.0
                 pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
+                # Convert PyMuPDF Pixmap to OpenCV format (BGR)
+                img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(
+                    pix_ocr.height, pix_ocr.width, pix_ocr.n
+                )
                 if pix_ocr.n == 3:
                     img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
                 elif pix_ocr.n == 4:
                     img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
+                # 2. Run RapidOCR (Models handle preprocessing internally)
+                results, _ = ocr_engine(img_ocr_np)
+                if results:
+                    # Calculate scaling from OCR image (4.0) to your pipeline standard (scale_factor=2.0)
+                    scale_adjustment = scale_factor / ocr_zoom
+                    for box, text, score in results:
+                        # Sanitize and clean text
+                        cleaned_text = sanitize_text(text).strip()
+                        if cleaned_text:
+                            # 3. Coordinate Mapping (Convert 4-point polygon to x1, y1, x2, y2)
+                            xs = [p[0] for p in box]
+                            ys = [p[1] for p in box]
+                            x1 = int(min(xs) * scale_adjustment)
+                            y1 = int(min(ys) * scale_adjustment)
+                            x2 = int(max(xs) * scale_adjustment)
+                            y2 = int(max(ys) * scale_adjustment)
+                            raw_ocr_output.append({
+                                'type': 'text',
+                                'word': cleaned_text,
+                                'confidence': float(score) * 100, # Converting 0-1.0 to 0-100 scale
+                                'bbox': [x1, y1, x2, y2],
+                                'y0': y1,
+                                'x0': x1
+                            })
             except Exception as e:
+                print(f"  ❌ RapidOCR Fallback Error: {e}")
+            # === END OF RAPIDOCR BLOCK ====================================================================================================================================
+#===========================================================================================================================================================================
             # === END OF OPTIMIZED OCR BLOCK ===
     # ====================================================================