Spaces:

heerjtdev
/

layout_latex

Running

App Files Files Community

heerjtdev committed on Nov 27, 2025

Commit

b652b08

verified ·

1 Parent(s): 6288fb8

Update working_yolo_pipeline.py

Browse files

Files changed (1) hide show

working_yolo_pipeline.py +107 -20

working_yolo_pipeline.py CHANGED Viewed

@@ -178,37 +178,77 @@ from sklearn.metrics.pairwise import cosine_similarity
 import logging
-from transformers import TrOCRProcessor
-# NOTE: Using optimum.onnxruntime for faster inference, as suggested by your sample script.
-# If you run into issues, you may need to fall back to the standard
-# 'transformers.VisionEncoderDecoderModel' if ORTModelForVision2Seq is not found/working.
-from optimum.onnxruntime import ORTModelForVision2Seq
 # ============================================================================
-# --- TR-OCR/ORT MODEL INITIALIZATION ---
 # ============================================================================
-# Set up logging to WARNING level to suppress excessive output from model libraries
 logging.basicConfig(level=logging.WARNING)
 processor = None
-ort_model = None
 try:
     MODEL_NAME = 'breezedeus/pix2text-mfr-1.5'
     processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
-    # Initialize the model for ONNX Runtime
-    # NOTE: Set use_cache=False to avoid caching warnings/issues if reloading
-    ort_model = ORTModelForVision2Seq.from_pretrained(MODEL_NAME, use_cache=False)
-    print("✅ ORTModelForVision2Seq and TrOCRProcessor initialized successfully for equation conversion.")
 except Exception as e:
-    print(f"❌ Error initializing TrOCR/ORT model. Equations will not be converted: {e}")
     processor = None
-    ort_model = None
@@ -362,13 +402,62 @@ except Exception as e:
 def get_latex_from_base64(base64_string: str) -> str:
     """
-    Decodes a Base64 image string and uses the pre-initialized TrOCR/ORT model
     to recognize the formula. It cleans the output by removing spaces and
     crucially, replacing double backslashes with single backslashes for correct LaTeX.
     """
-    if ort_model is None or processor is None:
         return "[MODEL_ERROR: Model not initialized]"
     try:
@@ -381,7 +470,8 @@ def get_latex_from_base64(base64_string: str) -> str:
         pixel_values = processor(images=image, return_tensors="pt").pixel_values
         # 3. Text Generation (OCR)
-        generated_ids = ort_model.generate(pixel_values)
         raw_generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
         if not raw_generated_text:
@@ -395,14 +485,11 @@ def get_latex_from_base64(base64_string: str) -> str:
         cleaned_latex = re.sub(r'\s+', '', latex_string)
         # B. CRITICAL FIX: Replace double backslashes with single backslashes.
-        # This addresses the over-escaping issue.
         final_output = cleaned_latex.replace('\\\\', '\\')
-        # Return the clean LaTeX string (e.g., $$a=\frac{F}{2m}$$)
         return final_output
     except Exception as e:
-        # Catch any unexpected errors
         print(f"  ❌ TR-OCR Recognition failed: {e}")
         return f"[TR_OCR_ERROR: Recognition failed: {e}]"

+# import logging
+# from transformers import TrOCRProcessor
+# # NOTE: Using optimum.onnxruntime for faster inference, as suggested by your sample script.
+# # If you run into issues, you may need to fall back to the standard
+# # 'transformers.VisionEncoderDecoderModel' if ORTModelForVision2Seq is not found/working.
+# from optimum.onnxruntime import ORTModelForVision2Seq
 import logging
+from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+# NOTE: We are replacing the ORTModelForVision2Seq import due to the ModuleNotFoundError
+# from optimum.onnxruntime import ORTModelForVision2Seq  <-- REMOVE THIS
+# # ============================================================================
+# # --- TR-OCR/ORT MODEL INITIALIZATION ---
+# # ============================================================================
+# # Set up logging to WARNING level to suppress excessive output from model libraries
+# logging.basicConfig(level=logging.WARNING)
+# processor = None
+# ort_model = None
+# try:
+#     MODEL_NAME = 'breezedeus/pix2text-mfr-1.5'
+#     processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
+#     # Initialize the model for ONNX Runtime
+#     # NOTE: Set use_cache=False to avoid caching warnings/issues if reloading
+#     ort_model = ORTModelForVision2Seq.from_pretrained(MODEL_NAME, use_cache=False)
+#     print("✅ ORTModelForVision2Seq and TrOCRProcessor initialized successfully for equation conversion.")
+# except Exception as e:
+#     print(f"❌ Error initializing TrOCR/ORT model. Equations will not be converted: {e}")
+#     processor = None
+#     ort_model = None
 # ============================================================================
+# --- TR-OCR/PYTORCH MODEL INITIALIZATION ---
 # ============================================================================
 logging.basicConfig(level=logging.WARNING)
 processor = None
+pt_model = None # Renaming the variable from 'ort_model' to 'pt_model' for clarity
 try:
     MODEL_NAME = 'breezedeus/pix2text-mfr-1.5'
     processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
+    # Initialize the standard PyTorch model instead of the ORT model
+    pt_model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
+    # NOTE: Inference is intended to be CPU-only; the CUDA check below does not move the model.
+    if torch.cuda.is_available():
+        # Intentional no-op: nothing is moved to the GPU even when CUDA is available.
+        # The model is presumably left on CPU by from_pretrained() unless it is moved
+        # explicitly (e.g. pt_model.to("cuda")) — TODO confirm for this runtime.
+        pass
+    print("✅ VisionEncoderDecoderModel (PyTorch) and TrOCRProcessor initialized successfully for equation conversion.")
 except Exception as e:
+    print(f"❌ Error initializing TrOCR/PyTorch model. Equations will not be converted: {e}")
     processor = None
+    pt_model = None
+# def get_latex_from_base64(base64_string: str) -> str:
+#     """
+#     Decodes a Base64 image string and uses the pre-initialized TrOCR/ORT model
+#     to recognize the formula. It cleans the output by removing spaces and
+#     crucially, replacing double backslashes with single backslashes for correct LaTeX.
+#     """
+#     if ort_model is None or processor is None:
+#         return "[MODEL_ERROR: Model not initialized]"
+#     try:
+#         # 1. Decode Base64 to Image
+#         image_data = base64.b64decode(base64_string)
+#         # We must ensure the image is RGB format for the model input
+#         image = Image.open(io.BytesIO(image_data)).convert('RGB')
+#         # 2. Preprocess the image
+#         pixel_values = processor(images=image, return_tensors="pt").pixel_values
+#         # 3. Text Generation (OCR)
+#         generated_ids = ort_model.generate(pixel_values)
+#         raw_generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+#         if not raw_generated_text:
+#             return "[OCR_WARNING: No formula found]"
+#         latex_string = raw_generated_text[0]
+#         # --- 4. Post-processing and Cleanup ---
+#         # A. Remove all spaces/line breaks
+#         cleaned_latex = re.sub(r'\s+', '', latex_string)
+#         # B. CRITICAL FIX: Replace double backslashes with single backslashes.
+#         # This addresses the over-escaping issue.
+#         final_output = cleaned_latex.replace('\\\\', '\\')
+#         # Return the clean LaTeX string (e.g., $$a=\frac{F}{2m}$$)
+#         return final_output
+#     except Exception as e:
+#         # Catch any unexpected errors
+#         print(f"  ❌ TR-OCR Recognition failed: {e}")
+#         return f"[TR_OCR_ERROR: Recognition failed: {e}]"
 def get_latex_from_base64(base64_string: str) -> str:
     """
+    Decodes a Base64 image string and uses the pre-initialized TrOCR/PyTorch model
     to recognize the formula. It cleans the output by removing all whitespace and,
     crucially, replacing double backslashes with single backslashes for correct LaTeX.
     """
+    # Check the new model variable
+    if pt_model is None or processor is None:
         return "[MODEL_ERROR: Model not initialized]"
     try:
         pixel_values = processor(images=image, return_tensors="pt").pixel_values
         # 3. Text Generation (OCR)
+        # Use the PyTorch model's generate method
+        generated_ids = pt_model.generate(pixel_values)
         raw_generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
         if not raw_generated_text:
         cleaned_latex = re.sub(r'\s+', '', latex_string)
         # B. CRITICAL FIX: Replace double backslashes with single backslashes.
         final_output = cleaned_latex.replace('\\\\', '\\')
         return final_output
     except Exception as e:
         print(f"  ❌ TR-OCR Recognition failed: {e}")
         return f"[TR_OCR_ERROR: Recognition failed: {e}]"