Spaces:

heerjtdev
/

layout_latex

Running

App Files Community

heerjtdev committed on Nov 27, 2025

Commit

6a25d35

verified ·

1 Parent(s): b652b08

Update working_yolo_pipeline.py

Browse files

Files changed (1) hide show

working_yolo_pipeline.py +173 -171

working_yolo_pipeline.py CHANGED Viewed

@@ -178,77 +178,77 @@ from sklearn.metrics.pairwise import cosine_similarity
-# import logging
-# from transformers import TrOCRProcessor
-# # NOTE: Using optimum.onnxruntime for faster inference, as suggested by your sample script.
-# # If you run into issues, you may need to fall back to the standard
-# # 'transformers.VisionEncoderDecoderModel' if ORTModelForVision2Seq is not found/working.
-# from optimum.onnxruntime import ORTModelForVision2Seq
 import logging
-from transformers import TrOCRProcessor, VisionEncoderDecoderModel
-# NOTE: We are replacing the ORTModelForVision2Seq import due to the ModuleNotFoundError
-# from optimum.onnxruntime import ORTModelForVision2Seq  <-- REMOVE THIS
-# # ============================================================================
-# # --- TR-OCR/ORT MODEL INITIALIZATION ---
-# # ============================================================================
-# # Set up logging to WARNING level to suppress excessive output from model libraries
-# logging.basicConfig(level=logging.WARNING)
-# processor = None
-# ort_model = None
-# try:
-#     MODEL_NAME = 'breezedeus/pix2text-mfr-1.5'
-#     processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
-#     # Initialize the model for ONNX Runtime
-#     # NOTE: Set use_cache=False to avoid caching warnings/issues if reloading
-#     ort_model = ORTModelForVision2Seq.from_pretrained(MODEL_NAME, use_cache=False)
-#     print("✅ ORTModelForVision2Seq and TrOCRProcessor initialized successfully for equation conversion.")
-# except Exception as e:
-#     print(f"❌ Error initializing TrOCR/ORT model. Equations will not be converted: {e}")
-#     processor = None
-#     ort_model = None
 # ============================================================================
-# --- TR-OCR/PYTORCH MODEL INITIALIZATION ---
 # ============================================================================
 logging.basicConfig(level=logging.WARNING)
 processor = None
-pt_model = None # Renaming the variable from 'ort_model' to 'pt_model' for clarity
 try:
     MODEL_NAME = 'breezedeus/pix2text-mfr-1.5'
     processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
-    # Initialize the standard PyTorch model instead of the ORT model
-    pt_model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
-    # CRITICAL: Since you want CPU-ONLY, explicitly ensure the model is on CPU
-    if torch.cuda.is_available():
-        # Although you requested CPU-only, check if CUDA is available
-        # and ensure you take the necessary steps to force CPU or use the correct runtime environment.
-        # For simplicity, if torch is installed for CPU, it will default to CPU.
-        pass
-    print("✅ VisionEncoderDecoderModel (PyTorch) and TrOCRProcessor initialized successfully for equation conversion.")
 except Exception as e:
-    print(f"❌ Error initializing TrOCR/PyTorch model. Equations will not be converted: {e}")
     processor = None
-    pt_model = None
@@ -402,62 +402,13 @@ except Exception as e:
-# def get_latex_from_base64(base64_string: str) -> str:
-#     """
-#     Decodes a Base64 image string and uses the pre-initialized TrOCR/ORT model
-#     to recognize the formula. It cleans the output by removing spaces and
-#     crucially, replacing double backslashes with single backslashes for correct LaTeX.
-#     """
-#     if ort_model is None or processor is None:
-#         return "[MODEL_ERROR: Model not initialized]"
-#     try:
-#         # 1. Decode Base64 to Image
-#         image_data = base64.b64decode(base64_string)
-#         # We must ensure the image is RGB format for the model input
-#         image = Image.open(io.BytesIO(image_data)).convert('RGB')
-#         # 2. Preprocess the image
-#         pixel_values = processor(images=image, return_tensors="pt").pixel_values
-#         # 3. Text Generation (OCR)
-#         generated_ids = ort_model.generate(pixel_values)
-#         raw_generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
-#         if not raw_generated_text:
-#             return "[OCR_WARNING: No formula found]"
-#         latex_string = raw_generated_text[0]
-#         # --- 4. Post-processing and Cleanup ---
-#         # A. Remove all spaces/line breaks
-#         cleaned_latex = re.sub(r'\s+', '', latex_string)
-#         # B. CRITICAL FIX: Replace double backslashes with single backslashes.
-#         # This addresses the over-escaping issue.
-#         final_output = cleaned_latex.replace('\\\\', '\\')
-#         # Return the clean LaTeX string (e.g., $$a=\frac{F}{2m}$$)
-#         return final_output
-#     except Exception as e:
-#         # Catch any unexpected errors
-#         print(f"  ❌ TR-OCR Recognition failed: {e}")
-#         return f"[TR_OCR_ERROR: Recognition failed: {e}]"
 def get_latex_from_base64(base64_string: str) -> str:
     """
-    Decodes a Base64 image string and uses the pre-initialized TrOCR/PyTorch model
-    to recognize the formula. It cleans the output by removing spaces and
     crucially, replacing double backslashes with single backslashes for correct LaTeX.
     """
-    # Check the new model variable
-    if pt_model is None or processor is None:
         return "[MODEL_ERROR: Model not initialized]"
     try:
@@ -470,8 +421,7 @@ def get_latex_from_base64(base64_string: str) -> str:
         pixel_values = processor(images=image, return_tensors="pt").pixel_values
         # 3. Text Generation (OCR)
-        # Use the PyTorch model's generate method
-        generated_ids = pt_model.generate(pixel_values)
         raw_generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
         if not raw_generated_text:
@@ -485,17 +435,69 @@ def get_latex_from_base64(base64_string: str) -> str:
         cleaned_latex = re.sub(r'\s+', '', latex_string)
         # B. CRITICAL FIX: Replace double backslashes with single backslashes.
-        final_output = cleaned_latex.replace('\\\\', '\\')
-        return final_output
     except Exception as e:
         print(f"  ❌ TR-OCR Recognition failed: {e}")
         return f"[TR_OCR_ERROR: Recognition failed: {e}]"
@@ -2351,55 +2353,7 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, label
-# if __name__ == "__main__":
-#     parser = argparse.ArgumentParser(description="Complete Pipeline")
-#     parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
-#     parser.add_argument("--layoutlmv3_model_path", type=str, default=DEFAULT_LAYOUTLMV3_MODEL_PATH, help="Model Path")
-#     parser.add_argument("--ls_output_path", type=str, default=None, help="Label Studio Output Path")
-#     # --- ADDED ARGUMENT FOR DEBUGGING ---
-#     parser.add_argument("--raw_preds_path", type=str, default='BIO_debug.json',
-#                         help="Debug path for raw BIO tag predictions (JSON).")
-#     # ------------------------------------
-#     args = parser.parse_args()
-#     pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
-#     final_output_path = os.path.abspath(f"{pdf_name}_final_output_embedded.json")
-#     ls_output_path = os.path.abspath(
-#         args.ls_output_path if args.ls_output_path else f"{pdf_name}_label_studio_tasks.json")
-#     # --- CALCULATE RAW PREDICTIONS OUTPUT PATH ---
-#     # raw_predictions_output_path = os.path.abspath(
-#     #     args.raw_preds_path if args.raw_preds_path else f"{pdf_name}_raw_predictions_debug.json")
-#     # ---------------------------------------------
-#     # --- UPDATED FUNCTION CALL ---
-#     final_json_data = run_document_pipeline(
-#         args.input_pdf,
-#         args.layoutlmv3_model_path,
-#         ls_output_path,
-#         # raw_predictions_output_path # Pass the new argument
-#     )
-#     # -----------------------------
-#     if final_json_data:
-#         with open(final_output_path, 'w', encoding='utf-8') as f:
-#             json.dump(final_json_data, f, indent=2, ensure_ascii=False)
-#         print(f"\n✅ Final Data Saved: {final_output_path}")
-#     else:
-#         print("\n❌ Pipeline Failed.")
-#         sys.exit(1)
 if __name__ == "__main__":
-    # Ensure 'json', 'argparse', 'os', and 'sys' are imported at the top of your script
-    # import json
-    # import argparse
-    # import os
-    # import sys
     parser = argparse.ArgumentParser(description="Complete Pipeline")
     parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
     parser.add_argument("--layoutlmv3_model_path", type=str, default=DEFAULT_LAYOUTLMV3_MODEL_PATH, help="Model Path")
@@ -2421,30 +2375,78 @@ if __name__ == "__main__":
     # --- UPDATED FUNCTION CALL ---
     final_json_data = run_document_pipeline(
-        args.input_pdf,
-        args.layoutlmv3_model_path,
-        ls_output_path,
         # raw_predictions_output_path # Pass the new argument
     )
     # -----------------------------
-    # 🛑 CRITICAL FIX: CUSTOM JSON SAVING TO REMOVE DOUBLE BACKSLASHES 🛑
     if final_json_data:
-        # 1. Dump the Python object to a standard JSON string.
-        # This uses json.dumps which correctly escapes single backslashes ('\') to ('\\').
-        json_str = json.dumps(final_json_data, indent=2, ensure_ascii=False)
-        # 2. **UNDO ESCAPING:** Replace every instance of the JSON-escaped backslash ('\\')
-        # with a single literal backslash ('\'). This forces the file content to be correct for LaTeX.
-        final_output_content = json_str.replace('\\\\', '\\')
-        # 3. Write the corrected string content to the file.
         with open(final_output_path, 'w', encoding='utf-8') as f:
-            f.write(final_output_content)
         print(f"\n✅ Final Data Saved: {final_output_path}")
     else:
         print("\n❌ Pipeline Failed.")
         sys.exit(1)

 import logging
+from transformers import TrOCRProcessor
+# NOTE: Using optimum.onnxruntime for faster inference, as suggested by your sample script.
+# If you run into issues, you may need to fall back to the standard
+# 'transformers.VisionEncoderDecoderModel' if ORTModelForVision2Seq is not found/working.
+from optimum.onnxruntime import ORTModelForVision2Seq
+# import logging
+# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+# # NOTE: We are replacing the ORTModelForVision2Seq import due to the ModuleNotFoundError
+# # from optimum.onnxruntime import ORTModelForVision2Seq  <-- REMOVE THIS
 # ============================================================================
+# --- TR-OCR/ORT MODEL INITIALIZATION ---
 # ============================================================================
+# Set up logging to WARNING level to suppress excessive output from model libraries
 logging.basicConfig(level=logging.WARNING)
 processor = None
+ort_model = None
 try:
     MODEL_NAME = 'breezedeus/pix2text-mfr-1.5'
     processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
+    # Initialize the model for ONNX Runtime
+    # NOTE: Set use_cache=False to avoid caching warnings/issues if reloading
+    ort_model = ORTModelForVision2Seq.from_pretrained(MODEL_NAME, use_cache=False)
+    print("✅ ORTModelForVision2Seq and TrOCRProcessor initialized successfully for equation conversion.")
 except Exception as e:
+    print(f"❌ Error initializing TrOCR/ORT model. Equations will not be converted: {e}")
     processor = None
+    ort_model = None
+#
+# # ============================================================================
+# # --- TR-OCR/PYTORCH MODEL INITIALIZATION ---
+# # ============================================================================
+# logging.basicConfig(level=logging.WARNING)
+#
+# processor = None
+# pt_model = None # Renaming the variable from 'ort_model' to 'pt_model' for clarity
+#
+# try:
+#     MODEL_NAME = 'breezedeus/pix2text-mfr-1.5'
+#     processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
+#
+#     # Initialize the standard PyTorch model instead of the ORT model
+#     pt_model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
+#
+#     # CRITICAL: Since you want CPU-ONLY, explicitly ensure the model is on CPU
+#     if torch.cuda.is_available():
+#         # Although you requested CPU-only, check if CUDA is available
+#         # and ensure you take the necessary steps to force CPU or use the correct runtime environment.
+#         # For simplicity, if torch is installed for CPU, it will default to CPU.
+#         pass
+#
+#     print("✅ VisionEncoderDecoderModel (PyTorch) and TrOCRProcessor initialized successfully for equation conversion.")
+# except Exception as e:
+#     print(f"❌ Error initializing TrOCR/PyTorch model. Equations will not be converted: {e}")
+#     processor = None
+#     pt_model = None
 def get_latex_from_base64(base64_string: str) -> str:
     """
+    Decodes a Base64 image string and uses the pre-initialized TrOCR/ORT model
+    to recognize the formula. It cleans the output by removing spaces and
     crucially, replacing double backslashes with single backslashes for correct LaTeX.
     """
+    if ort_model is None or processor is None:
         return "[MODEL_ERROR: Model not initialized]"
     try:
         pixel_values = processor(images=image, return_tensors="pt").pixel_values
         # 3. Text Generation (OCR)
+        generated_ids = ort_model.generate(pixel_values)
         raw_generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
         if not raw_generated_text:
         cleaned_latex = re.sub(r'\s+', '', latex_string)
         # B. CRITICAL FIX: Replace double backslashes with single backslashes.
+        # This addresses the over-escaping issue.
+        # final_output = cleaned_latex.replace('\\\\', '\\')
+        # Return the clean LaTeX string (e.g., $$a=\frac{F}{2m}$$)
+        #return final_output
+        return cleaned_latex
     except Exception as e:
+        # Catch any unexpected errors
         print(f"  ❌ TR-OCR Recognition failed: {e}")
         return f"[TR_OCR_ERROR: Recognition failed: {e}]"
+#
+# def get_latex_from_base64(base64_string: str) -> str:
+#     """
+#     Decodes a Base64 image string and uses the pre-initialized TrOCR/PyTorch model
+#     to recognize the formula. It cleans the output by removing spaces and
+#     crucially, replacing double backslashes with single backslashes for correct LaTeX.
+#     """
+#     # Check the new model variable
+#     if pt_model is None or processor is None:
+#         return "[MODEL_ERROR: Model not initialized]"
+#
+#     try:
+#         # 1. Decode Base64 to Image
+#         image_data = base64.b64decode(base64_string)
+#         # We must ensure the image is RGB format for the model input
+#         image = Image.open(io.BytesIO(image_data)).convert('RGB')
+#
+#         # 2. Preprocess the image
+#         pixel_values = processor(images=image, return_tensors="pt").pixel_values
+#
+#         # 3. Text Generation (OCR)
+#         # Use the PyTorch model's generate method
+#         generated_ids = pt_model.generate(pixel_values)
+#         raw_generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+#
+#         if not raw_generated_text:
+#             return "[OCR_WARNING: No formula found]"
+#
+#         latex_string = raw_generated_text[0]
+#
+#         # --- 4. Post-processing and Cleanup ---
+#
+#         # A. Remove all spaces/line breaks
+#         cleaned_latex = re.sub(r'\s+', '', latex_string)
+#
+#         # B. CRITICAL FIX: Replace double backslashes with single backslashes.
+#         final_output = cleaned_latex.replace('\\\\', '\\')
+#
+#         return final_output
+#
+#     except Exception as e:
+#         print(f"  ❌ TR-OCR Recognition failed: {e}")
+#         return f"[TR_OCR_ERROR: Recognition failed: {e}]"
+#
+#
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Complete Pipeline")
     parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
     parser.add_argument("--layoutlmv3_model_path", type=str, default=DEFAULT_LAYOUTLMV3_MODEL_PATH, help="Model Path")
     # --- UPDATED FUNCTION CALL ---
     final_json_data = run_document_pipeline(
+        args.input_pdf,
+        args.layoutlmv3_model_path,
+        ls_output_path,
         # raw_predictions_output_path # Pass the new argument
     )
     # -----------------------------
     if final_json_data:
         with open(final_output_path, 'w', encoding='utf-8') as f:
+            json.dump(final_json_data, f, indent=2, ensure_ascii=False)
         print(f"\n✅ Final Data Saved: {final_output_path}")
     else:
         print("\n❌ Pipeline Failed.")
         sys.exit(1)
+# if __name__ == "__main__":
+#     # Ensure 'json', 'argparse', 'os', and 'sys' are imported at the top of your script
+#     # import json
+#     # import argparse
+#     # import os
+#     # import sys
+#
+#     parser = argparse.ArgumentParser(description="Complete Pipeline")
+#     parser.add_argument("--input_pdf", type=str, required=True, help="Input PDF")
+#     parser.add_argument("--layoutlmv3_model_path", type=str, default=DEFAULT_LAYOUTLMV3_MODEL_PATH, help="Model Path")
+#     parser.add_argument("--ls_output_path", type=str, default=None, help="Label Studio Output Path")
+#     # --- ADDED ARGUMENT FOR DEBUGGING ---
+#     parser.add_argument("--raw_preds_path", type=str, default='BIO_debug.json',
+#                         help="Debug path for raw BIO tag predictions (JSON).")
+#     # ------------------------------------
+#     args = parser.parse_args()
+#
+#     pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
+#     final_output_path = os.path.abspath(f"{pdf_name}_final_output_embedded.json")
+#     ls_output_path = os.path.abspath(
+#         args.ls_output_path if args.ls_output_path else f"{pdf_name}_label_studio_tasks.json")
+#     # --- CALCULATE RAW PREDICTIONS OUTPUT PATH ---
+#     # raw_predictions_output_path = os.path.abspath(
+#     #     args.raw_preds_path if args.raw_preds_path else f"{pdf_name}_raw_predictions_debug.json")
+#     # ---------------------------------------------
+#
+#     # --- UPDATED FUNCTION CALL ---
+#     final_json_data = run_document_pipeline(
+#         args.input_pdf,
+#         args.layoutlmv3_model_path,
+#         ls_output_path,
+#         # raw_predictions_output_path # Pass the new argument
+#     )
+#     # -----------------------------
+#
+#     # 🛑 CRITICAL FIX: CUSTOM JSON SAVING TO REMOVE DOUBLE BACKSLASHES 🛑
+#     if final_json_data:
+#         # 1. Dump the Python object to a standard JSON string.
+#         # This uses json.dumps which correctly escapes single backslashes ('\') to ('\\').
+#         json_str = json.dumps(final_json_data, indent=2, ensure_ascii=False)
+#
+#         # 2. **UNDO ESCAPING:** Replace every instance of the JSON-escaped backslash ('\\')
+#         # with a single literal backslash ('\'). This forces the file content to be correct for LaTeX.
+#         final_output_content = json_str.replace('\\\\', '\\')
+#
+#         # 3. Write the corrected string content to the file.
+#         with open(final_output_path, 'w', encoding='utf-8') as f:
+#             f.write(final_output_content)
+#
+#         print(f"\n✅ Final Data Saved: {final_output_path}")
+#     else:
+#         print("\n❌ Pipeline Failed.")
+#         sys.exit(1)