Spaces:

heerjtdev
/

layout_latex

Running

App Files Files Community

heerjtdev committed on Nov 27, 2025

Commit

192b719

verified ·

1 Parent(s): b9e4ee0

Update working_yolo_pipeline.py

Browse files

Files changed (1) hide show

working_yolo_pipeline.py +216 -52

working_yolo_pipeline.py CHANGED Viewed

@@ -175,10 +175,59 @@ except Exception as e:
     print(f"❌ Error initializing Pix2Text model. Equations will not be converted: {e}")
     p2t = None
 def get_latex_from_base64(base64_string: str) -> str:
     """
     Decodes a Base64 image string, uses Pix2Text to recognize the formula,
-    and returns the LaTeX code, wrapped in $$.
     """
     if p2t is None:
         return "[P2T_ERROR: Model not initialized]"
@@ -189,26 +238,31 @@ def get_latex_from_base64(base64_string: str) -> str:
         image = Image.open(io.BytesIO(image_data))
         # 2. Recognize text and formulas
-        result = p2t.recognize(image, save_formula_images=False, use_analyzer=True)
         # 3. Parse the result for LaTeX
         extracted_latex_parts = []
         if isinstance(result, list):
             for item in result:
-                if hasattr(item, 'text'):
-                    extracted_latex_parts.append(item.text)
-                elif isinstance(item, str):
-                    extracted_latex_parts.append(item)
         elif isinstance(result, str):
              extracted_latex_parts = [result]
         extracted_latex = " ".join(extracted_latex_parts).strip()
-        if not extracted_latex:
              return "[P2T_WARNING: No formula found]"
-        # Wrap result in LaTeX delimiters
-        return f"$${extracted_latex}$$"
     except Exception as e:
         # Catch any unexpected errors
@@ -221,6 +275,8 @@ def get_latex_from_base64(base64_string: str) -> str:
 # # Initialize the YOLO model
 # model = YOLO(WEIGHTS_PATH)
@@ -1667,13 +1723,29 @@ def correct_misaligned_options(structured_data: List[Dict[str, Any]]) -> List[Di
 # --- PHASE 4: IMAGE EMBEDDING (Base64) ---
 # ============================================================================
-def get_base64_for_file(filepath: str) -> str:
     try:
-        with open(filepath, 'rb') as f:
-            return base64.b64encode(f.read()).decode('utf-8')
     except Exception as e:
-        print(f"  ❌ Error encoding file {filepath}: {e}")
-        return ""
 # def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figure_extraction_dir: str) -> List[
@@ -1716,76 +1788,168 @@ def get_base64_for_file(filepath: str) -> str:
-def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figure_extraction_dir: str) -> List[
-    Dict[str, Any]]:
     print("\n" + "=" * 80)
     print("--- 4. STARTING IMAGE EMBEDDING (Base64) / EQUATION TO LATEX CONVERSION ---")
     print("=" * 80)
-    if not structured_data: return []
     image_files = glob.glob(os.path.join(figure_extraction_dir, "*.png"))
     image_lookup = {}
     tag_regex = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)
     for filepath in image_files:
         filename = os.path.basename(filepath)
         match = re.search(r'_(figure|equation)(\d+)\.png$', filename, re.IGNORECASE)
         if match:
             key = f"{match.group(1).upper()}{match.group(2)}"
             image_lookup[key] = filepath
-    print(f"  -> Found {len(image_lookup)} image components.")
     final_structured_data = []
     for item in structured_data:
         text_fields = [item.get('question', ''), item.get('passage', '')]
         if 'options' in item:
-            for opt_val in item['options'].values(): text_fields.append(opt_val)
-        if 'new_passage' in item: text_fields.append(item['new_passage'])
         unique_tags_to_embed = set()
         for text in text_fields:
             if not text: continue
             for match in tag_regex.finditer(text):
                 tag = match.group(0).upper()
-                if tag in image_lookup: unique_tags_to_embed.add(tag)
         # List of tags that were successfully converted to LaTeX
         tags_converted_to_latex = set()
         for tag in sorted(list(unique_tags_to_embed)):
             filepath = image_lookup[tag]
-            # Get the base64 code for processing, whether we embed it or convert it to LaTeX
-            base64_code = get_base64_for_file(filepath)
-            # --- PIX2TEXT/EQUATION CONVERSION LOGIC START ---
-            if tag.startswith('EQUATION') and p2t is not None:
-                print(f"  -> Converting EQUATION {tag} to LaTeX...")
-                latex_code = get_latex_from_base64(base64_code)
-                # Replace the original tag (e.g., EQUATION1) in the item's text fields with LaTeX
-                for key in ['question', 'passage', 'new_passage']:
-                    if item.get(key) and tag in item[key]:
-                        item[key] = item[key].replace(tag, latex_code)
-                if 'options' in item:
-                    for opt_key, opt_val in item['options'].items():
-                        if tag in opt_val:
-                            item['options'][opt_key] = opt_val.replace(tag, latex_code)
-                tags_converted_to_latex.add(tag)
-                # Skip the embedding of the Base64 code for equations
-                continue
-            # --- PIX2TEXT/EQUATION CONVERSION LOGIC END ---
-            # Original logic (for figures): Embed the base64 code
-            base_key = tag.replace(' ', '').lower()
-            item[base_key] = base64_code
         final_structured_data.append(item)
-    print(f"✅ Image embedding complete. {len(tags_converted_to_latex)} equations converted to LaTeX.")
     return final_structured_data
 # ============================================================================
 # --- MAIN FUNCTION ---
 # ============================================================================

     print(f"❌ Error initializing Pix2Text model. Equations will not be converted: {e}")
     p2t = None
+# def get_latex_from_base64(base64_string: str) -> str:
+#     """
+#     Decodes a Base64 image string, uses Pix2Text to recognize the formula,
+#     and returns the LaTeX code, wrapped in $$.
+#     """
+#     if p2t is None:
+#         return "[P2T_ERROR: Model not initialized]"
+#     try:
+#         # 1. Decode Base64 to Image
+#         image_data = base64.b64decode(base64_string)
+#         image = Image.open(io.BytesIO(image_data))
+#         # 2. Recognize text and formulas
+#         result = p2t.recognize(image, save_formula_images=False, use_analyzer=True)
+#         # 3. Parse the result for LaTeX
+#         extracted_latex_parts = []
+#         if isinstance(result, list):
+#             for item in result:
+#                 if hasattr(item, 'text'):
+#                     extracted_latex_parts.append(item.text)
+#                 elif isinstance(item, str):
+#                     extracted_latex_parts.append(item)
+#         elif isinstance(result, str):
+#              extracted_latex_parts = [result]
+#         extracted_latex = " ".join(extracted_latex_parts).strip()
+#         if not extracted_latex:
+#              return "[P2T_WARNING: No formula found]"
+#         # Wrap result in LaTeX delimiters
+#         return f"$${extracted_latex}$$"
+#     except Exception as e:
+#         # Catch any unexpected errors
+#         print(f"  ❌ Pix2Text Recognition failed: {e}")
+#         return f"[P2T_ERROR: Recognition failed: {e}]"
 def get_latex_from_base64(base64_string: str) -> str:
     """
     Decodes a Base64 image string, uses Pix2Text to recognize the formula,
+    and returns the LaTeX code, stripped of all whitespace, as requested.
     """
     if p2t is None:
         return "[P2T_ERROR: Model not initialized]"
         image = Image.open(io.BytesIO(image_data))
         # 2. Recognize text and formulas
+        # Use keep_original_image=False to save memory
+        result = p2t.recognize(image, save_formula_images=False, use_analyzer=True, keep_original_image=False)
         # 3. Parse the result for LaTeX
         extracted_latex_parts = []
         if isinstance(result, list):
             for item in result:
+                # Use .text for structured output, item itself for string output
+                text = item.text if hasattr(item, 'text') else str(item)
+                extracted_latex_parts.append(text)
         elif isinstance(result, str):
              extracted_latex_parts = [result]
+        # Join with a space first, then clean all whitespace
         extracted_latex = " ".join(extracted_latex_parts).strip()
+        # *** CORE CHANGE: Remove all spaces/line breaks as requested by the user ***
+        # This uses regex to replace any sequence of whitespace characters (spaces, tabs, newlines) with an empty string.
+        cleaned_latex = re.sub(r'\s+', '', extracted_latex)
+        if not cleaned_latex:
              return "[P2T_WARNING: No formula found]"
+        # Return the clean LaTeX string without wrapping $$, as requested.
+        return cleaned_latex
     except Exception as e:
         # Catch any unexpected errors
 # # Initialize the YOLO model
 # model = YOLO(WEIGHTS_PATH)
 # --- PHASE 4: IMAGE EMBEDDING (Base64) ---
 # ============================================================================
+# def get_base64_for_file(filepath: str) -> str:
+#     try:
+#         with open(filepath, 'rb') as f:
+#             return base64.b64encode(f.read()).decode('utf-8')
+#     except Exception as e:
+#         print(f"  ❌ Error encoding file {filepath}: {e}")
+#         return ""
def get_base64_for_file(filepath: str) -> Optional[str]:
    """Read a file and return its contents as a Base64-encoded string.

    The result is the raw Base64 text, without any ``data:`` URI prefix.

    Args:
        filepath: Path of the file to read (opened in binary mode).

    Returns:
        The Base64 encoding of the file's bytes as a ``str``, or ``None``
        if the file could not be opened or read.
    """
    try:
        with open(filepath, "rb") as image_file:
            raw_bytes = image_file.read()
    except (OSError, TypeError, ValueError) as e:
        # OSError covers missing/unreadable files and directories;
        # TypeError/ValueError cover unusable path arguments (e.g. None or
        # an embedded NUL). Anything else is a programming error and is
        # allowed to propagate instead of being silently swallowed.
        print(f"Error reading and encoding file {filepath}: {e}")
        return None
    # Encoding in-memory bytes cannot fail, so it lives outside the try.
    return base64.b64encode(raw_bytes).decode('utf-8')
 # def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figure_extraction_dir: str) -> List[
+# def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figure_extraction_dir: str) -> List[
+#     Dict[str, Any]]:
+#     print("\n" + "=" * 80)
+#     print("--- 4. STARTING IMAGE EMBEDDING (Base64) / EQUATION TO LATEX CONVERSION ---")
+#     print("=" * 80)
+#     if not structured_data: return []
+#     image_files = glob.glob(os.path.join(figure_extraction_dir, "*.png"))
+#     image_lookup = {}
+#     tag_regex = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)
+#     for filepath in image_files:
+#         filename = os.path.basename(filepath)
+#         match = re.search(r'_(figure|equation)(\d+)\.png$', filename, re.IGNORECASE)
+#         if match:
+#             key = f"{match.group(1).upper()}{match.group(2)}"
+#             image_lookup[key] = filepath
+#     print(f"  -> Found {len(image_lookup)} image components.")
+#     final_structured_data = []
+#     for item in structured_data:
+#         text_fields = [item.get('question', ''), item.get('passage', '')]
+#         if 'options' in item:
+#             for opt_val in item['options'].values(): text_fields.append(opt_val)
+#         if 'new_passage' in item: text_fields.append(item['new_passage'])
+#         unique_tags_to_embed = set()
+#         for text in text_fields:
+#             if not text: continue
+#             for match in tag_regex.finditer(text):
+#                 tag = match.group(0).upper()
+#                 if tag in image_lookup: unique_tags_to_embed.add(tag)
+#         # List of tags that were successfully converted to LaTeX
+#         tags_converted_to_latex = set()
+#         for tag in sorted(list(unique_tags_to_embed)):
+#             filepath = image_lookup[tag]
+#             # Get the base64 code for processing, whether we embed it or convert it to LaTeX
+#             base64_code = get_base64_for_file(filepath)
+#             # --- PIX2TEXT/EQUATION CONVERSION LOGIC START ---
+#             if tag.startswith('EQUATION') and p2t is not None:
+#                 print(f"  -> Converting EQUATION {tag} to LaTeX...")
+#                 latex_code = get_latex_from_base64(base64_code)
+#                 # Replace the original tag (e.g., EQUATION1) in the item's text fields with LaTeX
+#                 for key in ['question', 'passage', 'new_passage']:
+#                     if item.get(key) and tag in item[key]:
+#                         item[key] = item[key].replace(tag, latex_code)
+#                 if 'options' in item:
+#                     for opt_key, opt_val in item['options'].items():
+#                         if tag in opt_val:
+#                             item['options'][opt_key] = opt_val.replace(tag, latex_code)
+#                 tags_converted_to_latex.add(tag)
+#                 # Skip the embedding of the Base64 code for equations
+#                 continue
+#             # --- PIX2TEXT/EQUATION CONVERSION LOGIC END ---
+#             # Original logic (for figures): Embed the base64 code
+#             base_key = tag.replace(' ', '').lower()
+#             item[base_key] = base64_code
+#         final_structured_data.append(item)
+#     print(f"✅ Image embedding complete. {len(tags_converted_to_latex)} equations converted to LaTeX.")
+#     return final_structured_data
def embed_images_as_base64_in_memory(
    structured_data: List[Dict[str, Any]],
    figure_extraction_dir: str,
) -> List[Dict[str, Any]]:
    """Attach figure images (Base64) and equation LaTeX to structured items.

    Every ``*.png`` in ``figure_extraction_dir`` whose name ends in
    ``_figure<N>.png`` or ``_equation<N>.png`` is indexed under the tag
    ``FIGURE<N>`` / ``EQUATION<N>``. The ``question``, ``passage``,
    ``new_passage`` and option texts of each item are scanned
    (case-insensitively) for such tags. For every tag that has a matching
    image, a lower-cased key is written onto the item:

    * ``figure<N>``: the Base64 string returned by ``get_base64_for_file``
      (``None`` if the image could not be read).
    * ``equation<N>``: the string returned by ``get_latex_from_base64``
      (which may itself be a ``[P2T_ERROR...`` / ``[P2T_WARNING...``
      message), or ``"[FILE_ERROR: Could not read image file]"`` when the
      image could not be read.

    Args:
        structured_data: Items to enrich. The dicts are modified in place.
        figure_extraction_dir: Directory containing the extracted PNG crops.

    Returns:
        A new list holding the same (now enriched) item dicts, in order.
        An empty list when ``structured_data`` is empty or ``None``.
    """
    print("\n" + "=" * 80)
    print("--- 4. STARTING IMAGE EMBEDDING (Base64) / EQUATION TO LATEX CONVERSION ---")
    print("=" * 80)
    if not structured_data:
        return []

    filename_regex = re.compile(r'_(figure|equation)(\d+)\.png$', re.IGNORECASE)
    tag_regex = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)

    # Map normalized tag (e.g. "FIGURE1") -> path of the extracted image.
    image_lookup = {}
    for filepath in glob.glob(os.path.join(figure_extraction_dir, "*.png")):
        match = filename_regex.search(os.path.basename(filepath))
        if match:
            image_lookup[f"{match.group(1).upper()}{match.group(2)}"] = filepath
    print(f" -> Found {len(image_lookup)} image components.")

    # Prefixes get_latex_from_base64 uses for its failure messages.
    failure_prefixes = ('[P2T_ERROR', '[P2T_WARNING')
    # Counted across all items so the final summary can report it.
    equations_converted = 0

    final_structured_data = []
    for item in structured_data:
        text_fields = [item.get('question', ''), item.get('passage', '')]
        options = item.get('options')
        # Tolerate a missing or null 'options' entry instead of crashing.
        if isinstance(options, dict):
            text_fields.extend(options.values())
        if 'new_passage' in item:
            text_fields.append(item['new_passage'])

        unique_tags_to_embed = set()
        for text in text_fields:
            # Only strings can carry tags; skip None, numbers, etc.
            if not isinstance(text, str):
                continue
            for match in tag_regex.finditer(text):
                tag = match.group(0).upper()
                if tag in image_lookup:
                    unique_tags_to_embed.add(tag)

        for tag in sorted(unique_tags_to_embed):
            # A tag is letters followed by digits, so lower-casing alone
            # yields the output key (e.g. "figure1", "equation1").
            base_key = tag.lower()
            base64_code = get_base64_for_file(image_lookup[tag])

            if tag.startswith('EQUATION'):
                if not base64_code:
                    item[base_key] = "[FILE_ERROR: Could not read image file]"
                    print(f"  ❌ File read error for {tag}.")
                    continue
                latex_output = get_latex_from_base64(base64_code)
                # Stored either way: on failure the message itself is embedded.
                item[base_key] = latex_output
                if latex_output.startswith(failure_prefixes):
                    print(f"  ⚠️ Failed to convert {tag} to LaTeX. Embedding error message.")
                else:
                    equations_converted += 1
                    print(f"  ✅ Embedded Clean LaTeX for {tag}")
            else:
                # The only other tag family the regex can produce is FIGURE<N>.
                item[base_key] = base64_code
                if base64_code is None:
                    # Do not report success when the image could not be read.
                    print(f"  ❌ File read error for {tag}.")
                else:
                    print(f"  ✅ Embedded Base64 for {tag}")

        final_structured_data.append(item)

    print(f"✅ Image embedding complete. {equations_converted} equations converted to LaTeX.")
    return final_structured_data
 # ============================================================================
 # --- MAIN FUNCTION ---
 # ============================================================================