Spaces:

heerjtdev
/

layout_latex

Running

App Files Files Community

heerjtdev committed on Nov 27, 2025

Commit

bb2061e

verified ·

1 Parent(s): b8d54ea

Update working_yolo_pipeline.py

Browse files

Files changed (1) hide show

working_yolo_pipeline.py +199 -103

working_yolo_pipeline.py CHANGED Viewed

@@ -139,42 +139,76 @@ from sklearn.metrics.pairwise import cosine_similarity
 #=============================================================================
 #-----EXPERIMENT LATEX
-#=============================================================================
-# --- NEW IMPORTS ---
-from pix2text import Pix2Text
 import logging
-# -------------------
-# ============================================================================
-# --- CONFIGURATION AND CONSTANTS ---
-# ... (Your existing constants like WEIGHTS_PATH, OCR_JSON_OUTPUT_DIR, etc.)
-# ============================================================================
 # ============================================================================
-# --- PIX2TEXT INITIALIZATION AND HELPER ---
 # ============================================================================
 # Set up logging to WARNING level to suppress excessive output from model libraries
 logging.basicConfig(level=logging.WARNING)
-logging.getLogger('pix2text').setLevel(logging.WARNING)
-# Initialize Pix2Text model globally (expensive operation, do it once)
-p2t = None
 try:
-    # Use 'yolox_tiny' for faster inference AND configure PyTorch backend
-    p2t = Pix2Text(
-        analyzer_config={'model_name': 'yolox_tiny'},
-        # ⬇️ ADD THESE LINES TO USE PYTORCH INSTEAD OF ONNX ⬇️
-        text_config={
-            'rec_model_backend': 'pytorch',
-            'det_model_backend': 'pytorch'
-        }
-    )
-    print("✅ Pix2Text model initialized successfully with PyTorch backend for equation conversion.")
 except Exception as e:
-    print(f"❌ Error initializing Pix2Text model. Equations will not be converted: {e}")
-    p2t = None
@@ -273,66 +307,11 @@ except Exception as e:
-def get_latex_from_base64(base64_string: str) -> str:
-    """
-    Decodes a Base64 image string, uses Pix2Text to recognize the formula,
-    and returns the LaTeX code, stripped of all whitespace, as requested,
-    and corrects unintended double backslashes.
-    """
-    if p2t is None:
-        return "[P2T_ERROR: Model not initialized]"
-    try:
-        # 1. Decode Base64 to Image
-        image_data = base64.b64decode(base64_string)
-        image = Image.open(io.BytesIO(image_data))
-        # 2. Recognize text and formulas
-        # Use keep_original_image=False to save memory
-        result = p2t.recognize(image, save_formula_images=False, use_analyzer=True, keep_original_image=False)
-        # 3. Parse the result for LaTeX
-        extracted_latex_parts = []
-        if isinstance(result, list):
-            for item in result:
-                # Use .text for structured output, item itself for string output
-                text = item.text if hasattr(item, 'text') else str(item)
-                extracted_latex_parts.append(text)
-        elif isinstance(result, str):
-             extracted_latex_parts = [result]
-        # Join with a space first, then clean all whitespace
-        extracted_latex = " ".join(extracted_latex_parts).strip()
-        # *** CORE CHANGE 1: Remove all spaces/line breaks ***
-        cleaned_latex = extracted_latex.replace('\\\\', '\\')
-        final_latex = re.sub(r'\s+', '', cleaned_latex)
-        if not cleaned_latex:
-             return "[P2T_WARNING: No formula found]"
-        # *** CORE CHANGE 2: Fix unintended double backslashes for LaTeX rendering ***
-        # This replaces every sequence of two literal backslashes ('\\') with one literal backslash ('\'),
-        # ensuring LaTeX commands like '\frac' are correctly formed.
-        # Return the clean and corrected LaTeX string.
-        return final_latex
-    except Exception as e:
-        # Catch any unexpected errors
-        print(f"  ❌ Pix2Text Recognition failed: {e}")
-        return f"[P2T_ERROR: Recognition failed: {e}]"
 # def get_latex_from_base64(base64_string: str) -> str:
 #     """
-#     Decodes a Base64 image string, uses Pix2Text to recognize the formula,
-#     returns the LaTeX code stripped of all whitespace, and collapses unintended
-#     repeated backslashes into a single backslash.
 #     """
 #     if p2t is None:
 #         return "[P2T_ERROR: Model not initialized]"
@@ -341,37 +320,41 @@ def get_latex_from_base64(base64_string: str) -> str:
 #         # 1. Decode Base64 to Image
 #         image_data = base64.b64decode(base64_string)
 #         image = Image.open(io.BytesIO(image_data))
 #         # 2. Recognize text and formulas
-#         result = p2t.recognize(
-#             image, save_formula_images=False, use_analyzer=True, keep_original_image=False
-#         )
 #         # 3. Parse the result for LaTeX
 #         extracted_latex_parts = []
 #         if isinstance(result, list):
 #             for item in result:
 #                 text = item.text if hasattr(item, 'text') else str(item)
 #                 extracted_latex_parts.append(text)
 #         elif isinstance(result, str):
-#             extracted_latex_parts = [result]
-#         # Join then strip
 #         extracted_latex = " ".join(extracted_latex_parts).strip()
-#         # Remove all whitespace/newlines/tabs as requested
-#         cleaned_latex = re.sub(r'\s+', '', extracted_latex)
 #         if not cleaned_latex:
-#             return "[P2T_WARNING: No formula found]"
-#         # COLLAPSE any run of 2 or more backslashes into a single backslash.
-#         # This handles inputs like '\\\\sqrt' or '\\\\\\frac' robustly.
-#         final_latex = re.sub(r'\\{2,}', r'\\', cleaned_latex)
 #         return final_latex
 #     except Exception as e:
 #         print(f"  ❌ Pix2Text Recognition failed: {e}")
 #         return f"[P2T_ERROR: Recognition failed: {e}]"
@@ -379,6 +362,58 @@ def get_latex_from_base64(base64_string: str) -> str:
 # # Initialize the YOLO model
@@ -2229,7 +2264,55 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, label
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Complete Pipeline")
     parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
     parser.add_argument("--layoutlmv3_model_path", type=str, default=DEFAULT_LAYOUTLMV3_MODEL_PATH, help="Model Path")
@@ -2258,10 +2341,23 @@ if __name__ == "__main__":
     )
     # -----------------------------
     if final_json_data:
         with open(final_output_path, 'w', encoding='utf-8') as f:
-            json.dump(final_json_data, f, indent=2, ensure_ascii=False)
         print(f"\n✅ Final Data Saved: {final_output_path}")
     else:
         print("\n❌ Pipeline Failed.")
-        sys.exit(1)

 #=============================================================================
 #-----EXPERIMENT LATEX
+# #=============================================================================
+# # --- NEW IMPORTS ---
+# from pix2text import Pix2Text
+# import logging
+# # -------------------
+# # ============================================================================
+# # --- CONFIGURATION AND CONSTANTS ---
+# # ... (Your existing constants like WEIGHTS_PATH, OCR_JSON_OUTPUT_DIR, etc.)
+# # ============================================================================
+# # ============================================================================
+# # --- PIX2TEXT INITIALIZATION AND HELPER ---
+# # ============================================================================
+# # Set up logging to WARNING level to suppress excessive output from model libraries
+# logging.basicConfig(level=logging.WARNING)
+# logging.getLogger('pix2text').setLevel(logging.WARNING)
+# # Initialize Pix2Text model globally (expensive operation, do it once)
+# p2t = None
+# try:
+#     # Use 'yolox_tiny' for faster inference AND configure PyTorch backend
+#     p2t = Pix2Text(
+#         analyzer_config={'model_name': 'yolox_tiny'},
+#         # ⬇️ ADD THESE LINES TO USE PYTORCH INSTEAD OF ONNX ⬇️
+#         text_config={
+#             'rec_model_backend': 'pytorch',
+#             'det_model_backend': 'pytorch'
+#         }
+#     )
+#     print("✅ Pix2Text model initialized successfully with PyTorch backend for equation conversion.")
+# except Exception as e:
+#     print(f"❌ Error initializing Pix2Text model. Equations will not be converted: {e}")
+#     p2t = None
 import logging
+from transformers import TrOCRProcessor
+# NOTE: Using optimum.onnxruntime for faster inference, as suggested by your sample script.
+# If you run into issues, you may need to fall back to the standard
+# 'transformers.VisionEncoderDecoderModel' if ORTModelForVision2Seq is not found/working.
+from optimum.onnxruntime import ORTModelForVision2Seq
 # ============================================================================
+# --- TR-OCR/ORT MODEL INITIALIZATION ---
 # ============================================================================
 # Set up logging to WARNING level to suppress excessive output from model libraries
 logging.basicConfig(level=logging.WARNING)
+processor = None
+ort_model = None
 try:
+    MODEL_NAME = 'breezedeus/pix2text-mfr-1.5'
+    processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
+    # Initialize the model for ONNX Runtime
+    # NOTE: use_cache=False disables reuse of past key/values in the decoder during
+    # generation (see the Optimum docs); it does not control download or reload caching.
+    ort_model = ORTModelForVision2Seq.from_pretrained(MODEL_NAME, use_cache=False)
+    print("✅ ORTModelForVision2Seq and TrOCRProcessor initialized successfully for equation conversion.")
 except Exception as e:
+    print(f"❌ Error initializing TrOCR/ORT model. Equations will not be converted: {e}")
+    processor = None
+    ort_model = None
 # def get_latex_from_base64(base64_string: str) -> str:
 #     """
+#     Decodes a Base64 image string, uses Pix2Text to recognize the formula,
+#     and returns the LaTeX code, stripped of all whitespace, as requested,
+#     and corrects unintended double backslashes.
 #     """
 #     if p2t is None:
 #         return "[P2T_ERROR: Model not initialized]"
 #         # 1. Decode Base64 to Image
 #         image_data = base64.b64decode(base64_string)
 #         image = Image.open(io.BytesIO(image_data))
 #         # 2. Recognize text and formulas
+#         # Use keep_original_image=False to save memory
+#         result = p2t.recognize(image, save_formula_images=False, use_analyzer=True, keep_original_image=False)
 #         # 3. Parse the result for LaTeX
 #         extracted_latex_parts = []
 #         if isinstance(result, list):
 #             for item in result:
+#                 # Use .text for structured output, item itself for string output
 #                 text = item.text if hasattr(item, 'text') else str(item)
 #                 extracted_latex_parts.append(text)
 #         elif isinstance(result, str):
+#              extracted_latex_parts = [result]
+#         # Join with a space first, then clean all whitespace
 #         extracted_latex = " ".join(extracted_latex_parts).strip()
+#         # *** CORE CHANGE 1: Remove all spaces/line breaks ***
+#         cleaned_latex = extracted_latex.replace('\\\\', '\\')
+#         final_latex = re.sub(r'\s+', '', cleaned_latex)
 #         if not cleaned_latex:
+#              return "[P2T_WARNING: No formula found]"
+#         # *** CORE CHANGE 2: Fix unintended double backslashes for LaTeX rendering ***
+#         # This replaces every sequence of two literal backslashes ('\\') with one literal backslash ('\'),
+#         # ensuring LaTeX commands like '\frac' are correctly formed.
+#         # Return the clean and corrected LaTeX string.
 #         return final_latex
 #     except Exception as e:
+#         # Catch any unexpected errors
 #         print(f"  ❌ Pix2Text Recognition failed: {e}")
 #         return f"[P2T_ERROR: Recognition failed: {e}]"
+def get_latex_from_base64(base64_string: str) -> str:
+    """
+    Decodes a Base64 image string and uses the pre-initialized TrOCR/ORT model
+    to recognize the formula. It cleans the output by removing spaces and
+    crucially, replacing double backslashes with single backslashes for correct LaTeX.
+    """
+    if ort_model is None or processor is None:
+        return "[MODEL_ERROR: Model not initialized]"
+    try:
+        # 1. Decode Base64 to Image
+        image_data = base64.b64decode(base64_string)
+        # We must ensure the image is RGB format for the model input
+        image = Image.open(io.BytesIO(image_data)).convert('RGB')
+        # 2. Preprocess the image
+        pixel_values = processor(images=image, return_tensors="pt").pixel_values
+        # 3. Text Generation (OCR)
+        generated_ids = ort_model.generate(pixel_values)
+        raw_generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+        if not raw_generated_text:
+            return "[OCR_WARNING: No formula found]"
+        latex_string = raw_generated_text[0]
+        # --- 4. Post-processing and Cleanup ---
+        # A. Remove all spaces/line breaks
+        cleaned_latex = re.sub(r'\s+', '', latex_string)
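+        # B. CRITICAL FIX: Replace double backslashes with single backslashes.
+        # This addresses the over-escaping issue.
+        # NOTE(review): this also collapses genuine LaTeX row breaks ('\\', e.g. in
+        # matrices or aligned equations) into a single backslash -- confirm that is acceptable.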
+        final_output = cleaned_latex.replace('\\\\', '\\')
+        # Return the clean LaTeX string (e.g., $$a=\frac{F}{2m}$$)
+        return final_output
+    except Exception as e:
+        # Catch any unexpected errors
+        print(f"  ❌ TR-OCR Recognition failed: {e}")
+        return f"[TR_OCR_ERROR: Recognition failed: {e}]"
 # # Initialize the YOLO model
+# if __name__ == "__main__":
+#     parser = argparse.ArgumentParser(description="Complete Pipeline")
+#     parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
+#     parser.add_argument("--layoutlmv3_model_path", type=str, default=DEFAULT_LAYOUTLMV3_MODEL_PATH, help="Model Path")
+#     parser.add_argument("--ls_output_path", type=str, default=None, help="Label Studio Output Path")
+#     # --- ADDED ARGUMENT FOR DEBUGGING ---
+#     parser.add_argument("--raw_preds_path", type=str, default='BIO_debug.json',
+#                         help="Debug path for raw BIO tag predictions (JSON).")
+#     # ------------------------------------
+#     args = parser.parse_args()
+#     pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
+#     final_output_path = os.path.abspath(f"{pdf_name}_final_output_embedded.json")
+#     ls_output_path = os.path.abspath(
+#         args.ls_output_path if args.ls_output_path else f"{pdf_name}_label_studio_tasks.json")
+#     # --- CALCULATE RAW PREDICTIONS OUTPUT PATH ---
+#     # raw_predictions_output_path = os.path.abspath(
+#     #     args.raw_preds_path if args.raw_preds_path else f"{pdf_name}_raw_predictions_debug.json")
+#     # ---------------------------------------------
+#     # --- UPDATED FUNCTION CALL ---
+#     final_json_data = run_document_pipeline(
+#         args.input_pdf,
+#         args.layoutlmv3_model_path,
+#         ls_output_path,
+#         # raw_predictions_output_path # Pass the new argument
+#     )
+#     # -----------------------------
+#     if final_json_data:
+#         with open(final_output_path, 'w', encoding='utf-8') as f:
+#             json.dump(final_json_data, f, indent=2, ensure_ascii=False)
+#         print(f"\n✅ Final Data Saved: {final_output_path}")
+#     else:
+#         print("\n❌ Pipeline Failed.")
+#         sys.exit(1)
 if __name__ == "__main__":
+    # Ensure 'json', 'argparse', 'os', and 'sys' are imported at the top of your script
+    # import json
+    # import argparse
+    # import os
+    # import sys
     parser = argparse.ArgumentParser(description="Complete Pipeline")
     parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
     parser.add_argument("--layoutlmv3_model_path", type=str, default=DEFAULT_LAYOUTLMV3_MODEL_PATH, help="Model Path")
     )
     # -----------------------------
+    # 🛑 CRITICAL FIX: CUSTOM JSON SAVING TO REMOVE DOUBLE BACKSLASHES 🛑
     if final_json_data:
+        # 1. Dump the Python object to a standard JSON string.
+        # This uses json.dumps which correctly escapes single backslashes ('\') to ('\\').
+        json_str = json.dumps(final_json_data, indent=2, ensure_ascii=False)
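+        # 2. **UNDO ESCAPING:** Replace every instance of the JSON-escaped backslash ('\\')
+        # with a single literal backslash ('\'), so the file shows raw LaTeX commands.
+        # WARNING: the result is no longer valid JSON -- a parser rejects '\s' (as in '\sqrt')
+        # and reads '\f' (as in '\frac') as a form feed -- so do not reload it with json.load.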
+        final_output_content = json_str.replace('\\\\', '\\')
+        # 3. Write the corrected string content to the file.
         with open(final_output_path, 'w', encoding='utf-8') as f:
+            f.write(final_output_content)
         print(f"\n✅ Final Data Saved: {final_output_path}")
     else:
         print("\n❌ Pipeline Failed.")
+        sys.exit(1)