heerjtdev committed on
Commit
192b719
·
verified ·
1 Parent(s): b9e4ee0

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +216 -52
working_yolo_pipeline.py CHANGED
@@ -175,10 +175,59 @@ except Exception as e:
175
  print(f"❌ Error initializing Pix2Text model. Equations will not be converted: {e}")
176
  p2t = None
177
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  def get_latex_from_base64(base64_string: str) -> str:
179
  """
180
  Decodes a Base64 image string, uses Pix2Text to recognize the formula,
181
- and returns the LaTeX code, wrapped in $$.
182
  """
183
  if p2t is None:
184
  return "[P2T_ERROR: Model not initialized]"
@@ -189,26 +238,31 @@ def get_latex_from_base64(base64_string: str) -> str:
189
  image = Image.open(io.BytesIO(image_data))
190
 
191
  # 2. Recognize text and formulas
192
- result = p2t.recognize(image, save_formula_images=False, use_analyzer=True)
 
193
 
194
  # 3. Parse the result for LaTeX
195
  extracted_latex_parts = []
196
  if isinstance(result, list):
197
  for item in result:
198
- if hasattr(item, 'text'):
199
- extracted_latex_parts.append(item.text)
200
- elif isinstance(item, str):
201
- extracted_latex_parts.append(item)
202
  elif isinstance(result, str):
203
  extracted_latex_parts = [result]
204
 
 
205
  extracted_latex = " ".join(extracted_latex_parts).strip()
206
 
207
- if not extracted_latex:
 
 
 
 
208
  return "[P2T_WARNING: No formula found]"
209
 
210
- # Wrap result in LaTeX delimiters
211
- return f"$${extracted_latex}$$"
212
 
213
  except Exception as e:
214
  # Catch any unexpected errors
@@ -221,6 +275,8 @@ def get_latex_from_base64(base64_string: str) -> str:
221
 
222
 
223
 
 
 
224
  # # Initialize the YOLO model
225
  # model = YOLO(WEIGHTS_PATH)
226
 
@@ -1667,13 +1723,29 @@ def correct_misaligned_options(structured_data: List[Dict[str, Any]]) -> List[Di
1667
  # --- PHASE 4: IMAGE EMBEDDING (Base64) ---
1668
  # ============================================================================
1669
 
1670
- def get_base64_for_file(filepath: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
1671
  try:
1672
- with open(filepath, 'rb') as f:
1673
- return base64.b64encode(f.read()).decode('utf-8')
 
1674
  except Exception as e:
1675
- print(f"Error encoding file {filepath}: {e}")
1676
- return ""
 
 
 
1677
 
1678
 
1679
  # def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figure_extraction_dir: str) -> List[
@@ -1716,76 +1788,168 @@ def get_base64_for_file(filepath: str) -> str:
1716
 
1717
 
1718
 
1719
- def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figure_extraction_dir: str) -> List[
1720
- Dict[str, Any]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1721
  print("\n" + "=" * 80)
1722
  print("--- 4. STARTING IMAGE EMBEDDING (Base64) / EQUATION TO LATEX CONVERSION ---")
1723
  print("=" * 80)
1724
- if not structured_data: return []
 
 
1725
  image_files = glob.glob(os.path.join(figure_extraction_dir, "*.png"))
1726
  image_lookup = {}
1727
  tag_regex = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)
 
1728
  for filepath in image_files:
1729
  filename = os.path.basename(filepath)
1730
  match = re.search(r'_(figure|equation)(\d+)\.png$', filename, re.IGNORECASE)
1731
  if match:
1732
  key = f"{match.group(1).upper()}{match.group(2)}"
1733
  image_lookup[key] = filepath
1734
- print(f" -> Found {len(image_lookup)} image components.")
 
1735
 
1736
  final_structured_data = []
1737
-
1738
  for item in structured_data:
1739
  text_fields = [item.get('question', ''), item.get('passage', '')]
1740
  if 'options' in item:
1741
- for opt_val in item['options'].values(): text_fields.append(opt_val)
1742
- if 'new_passage' in item: text_fields.append(item['new_passage'])
1743
-
 
 
1744
  unique_tags_to_embed = set()
1745
  for text in text_fields:
1746
  if not text: continue
1747
  for match in tag_regex.finditer(text):
1748
  tag = match.group(0).upper()
1749
- if tag in image_lookup: unique_tags_to_embed.add(tag)
1750
-
 
1751
  # List of tags that were successfully converted to LaTeX
1752
  tags_converted_to_latex = set()
1753
 
1754
  for tag in sorted(list(unique_tags_to_embed)):
1755
  filepath = image_lookup[tag]
1756
- # Get the base64 code for processing, whether we embed it or convert it to LaTeX
1757
- base64_code = get_base64_for_file(filepath)
1758
-
1759
- # --- PIX2TEXT/EQUATION CONVERSION LOGIC START ---
1760
- if tag.startswith('EQUATION') and p2t is not None:
1761
- print(f" -> Converting EQUATION {tag} to LaTeX...")
1762
- latex_code = get_latex_from_base64(base64_code)
1763
-
1764
- # Replace the original tag (e.g., EQUATION1) in the item's text fields with LaTeX
1765
- for key in ['question', 'passage', 'new_passage']:
1766
- if item.get(key) and tag in item[key]:
1767
- item[key] = item[key].replace(tag, latex_code)
1768
-
1769
- if 'options' in item:
1770
- for opt_key, opt_val in item['options'].items():
1771
- if tag in opt_val:
1772
- item['options'][opt_key] = opt_val.replace(tag, latex_code)
1773
-
1774
- tags_converted_to_latex.add(tag)
1775
- # Skip the embedding of the Base64 code for equations
1776
- continue
1777
- # --- PIX2TEXT/EQUATION CONVERSION LOGIC END ---
 
 
 
1778
 
1779
- # Original logic (for figures): Embed the base64 code
1780
- base_key = tag.replace(' ', '').lower()
1781
- item[base_key] = base64_code
1782
-
1783
  final_structured_data.append(item)
1784
-
1785
- print(f"✅ Image embedding complete. {len(tags_converted_to_latex)} equations converted to LaTeX.")
1786
  return final_structured_data
1787
 
1788
 
 
 
 
 
 
 
 
 
 
 
 
1789
  # ============================================================================
1790
  # --- MAIN FUNCTION ---
1791
  # ============================================================================
 
175
  print(f"❌ Error initializing Pix2Text model. Equations will not be converted: {e}")
176
  p2t = None
177
 
178
+
179
+
180
+
181
+
182
+ # def get_latex_from_base64(base64_string: str) -> str:
183
+ # """
184
+ # Decodes a Base64 image string, uses Pix2Text to recognize the formula,
185
+ # and returns the LaTeX code, wrapped in $$.
186
+ # """
187
+ # if p2t is None:
188
+ # return "[P2T_ERROR: Model not initialized]"
189
+
190
+ # try:
191
+ # # 1. Decode Base64 to Image
192
+ # image_data = base64.b64decode(base64_string)
193
+ # image = Image.open(io.BytesIO(image_data))
194
+
195
+ # # 2. Recognize text and formulas
196
+ # result = p2t.recognize(image, save_formula_images=False, use_analyzer=True)
197
+
198
+ # # 3. Parse the result for LaTeX
199
+ # extracted_latex_parts = []
200
+ # if isinstance(result, list):
201
+ # for item in result:
202
+ # if hasattr(item, 'text'):
203
+ # extracted_latex_parts.append(item.text)
204
+ # elif isinstance(item, str):
205
+ # extracted_latex_parts.append(item)
206
+ # elif isinstance(result, str):
207
+ # extracted_latex_parts = [result]
208
+
209
+ # extracted_latex = " ".join(extracted_latex_parts).strip()
210
+
211
+ # if not extracted_latex:
212
+ # return "[P2T_WARNING: No formula found]"
213
+
214
+ # # Wrap result in LaTeX delimiters
215
+ # return f"$${extracted_latex}$$"
216
+
217
+ # except Exception as e:
218
+ # # Catch any unexpected errors
219
+ # print(f" ❌ Pix2Text Recognition failed: {e}")
220
+ # return f"[P2T_ERROR: Recognition failed: {e}]"
221
+
222
+
223
+
224
+
225
+
226
+
227
  def get_latex_from_base64(base64_string: str) -> str:
228
  """
229
  Decodes a Base64 image string, uses Pix2Text to recognize the formula,
230
+ and returns the LaTeX code, stripped of all whitespace, as requested.
231
  """
232
  if p2t is None:
233
  return "[P2T_ERROR: Model not initialized]"
 
238
  image = Image.open(io.BytesIO(image_data))
239
 
240
  # 2. Recognize text and formulas
241
+ # Use keep_original_image=False to save memory
242
+ result = p2t.recognize(image, save_formula_images=False, use_analyzer=True, keep_original_image=False)
243
 
244
  # 3. Parse the result for LaTeX
245
  extracted_latex_parts = []
246
  if isinstance(result, list):
247
  for item in result:
248
+ # Use .text for structured output, item itself for string output
249
+ text = item.text if hasattr(item, 'text') else str(item)
250
+ extracted_latex_parts.append(text)
 
251
  elif isinstance(result, str):
252
  extracted_latex_parts = [result]
253
 
254
+ # Join with a space first, then clean all whitespace
255
  extracted_latex = " ".join(extracted_latex_parts).strip()
256
 
257
+ # *** CORE CHANGE: Remove all spaces/line breaks as requested by the user ***
258
+ # This uses regex to replace any sequence of whitespace characters (spaces, tabs, newlines) with an empty string.
259
+ cleaned_latex = re.sub(r'\s+', '', extracted_latex)
260
+
261
+ if not cleaned_latex:
262
  return "[P2T_WARNING: No formula found]"
263
 
264
+ # Return the clean LaTeX string without wrapping $$, as requested.
265
+ return cleaned_latex
266
 
267
  except Exception as e:
268
  # Catch any unexpected errors
 
275
 
276
 
277
 
278
+
279
+
280
  # # Initialize the YOLO model
281
  # model = YOLO(WEIGHTS_PATH)
282
 
 
1723
  # --- PHASE 4: IMAGE EMBEDDING (Base64) ---
1724
  # ============================================================================
1725
 
1726
+ # def get_base64_for_file(filepath: str) -> str:
1727
+ # try:
1728
+ # with open(filepath, 'rb') as f:
1729
+ # return base64.b64encode(f.read()).decode('utf-8')
1730
+ # except Exception as e:
1731
+ # print(f" ❌ Error encoding file {filepath}: {e}")
1732
+ # return ""
1733
+
1734
+
1735
+
1736
+
1737
def get_base64_for_file(filepath: str) -> Optional[str]:
    """Read a file and return its contents as a raw Base64 string.

    The returned string carries no ``data:`` URI prefix.

    Args:
        filepath: Path of the file to read (opened in binary mode).

    Returns:
        The Base64-encoded file contents, or ``None`` when the file could not
        be read. Failures are reported on stdout rather than raised, so callers
        must check for ``None``.
    """
    try:
        with open(filepath, "rb") as image_file:
            raw_bytes = image_file.read()
    except (OSError, TypeError, ValueError) as e:
        # OSError: missing file, permissions, path is a directory, I/O failure.
        # TypeError / ValueError: the path argument itself is unusable
        # (e.g. None, or a string with an embedded NUL byte).
        # Anything else is a genuine bug and is allowed to propagate.
        print(f"Error reading and encoding file {filepath}: {e}")
        return None

    # Base64 output is pure ASCII, so the decode step cannot fail.
    return base64.b64encode(raw_bytes).decode('ascii')
1746
+
1747
+
1748
+
1749
 
1750
 
1751
  # def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figure_extraction_dir: str) -> List[
 
1788
 
1789
 
1790
 
1791
+ # def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figure_extraction_dir: str) -> List[
1792
+ # Dict[str, Any]]:
1793
+ # print("\n" + "=" * 80)
1794
+ # print("--- 4. STARTING IMAGE EMBEDDING (Base64) / EQUATION TO LATEX CONVERSION ---")
1795
+ # print("=" * 80)
1796
+ # if not structured_data: return []
1797
+ # image_files = glob.glob(os.path.join(figure_extraction_dir, "*.png"))
1798
+ # image_lookup = {}
1799
+ # tag_regex = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)
1800
+ # for filepath in image_files:
1801
+ # filename = os.path.basename(filepath)
1802
+ # match = re.search(r'_(figure|equation)(\d+)\.png$', filename, re.IGNORECASE)
1803
+ # if match:
1804
+ # key = f"{match.group(1).upper()}{match.group(2)}"
1805
+ # image_lookup[key] = filepath
1806
+ # print(f" -> Found {len(image_lookup)} image components.")
1807
+
1808
+ # final_structured_data = []
1809
+
1810
+ # for item in structured_data:
1811
+ # text_fields = [item.get('question', ''), item.get('passage', '')]
1812
+ # if 'options' in item:
1813
+ # for opt_val in item['options'].values(): text_fields.append(opt_val)
1814
+ # if 'new_passage' in item: text_fields.append(item['new_passage'])
1815
+
1816
+ # unique_tags_to_embed = set()
1817
+ # for text in text_fields:
1818
+ # if not text: continue
1819
+ # for match in tag_regex.finditer(text):
1820
+ # tag = match.group(0).upper()
1821
+ # if tag in image_lookup: unique_tags_to_embed.add(tag)
1822
+
1823
+ # # List of tags that were successfully converted to LaTeX
1824
+ # tags_converted_to_latex = set()
1825
+
1826
+ # for tag in sorted(list(unique_tags_to_embed)):
1827
+ # filepath = image_lookup[tag]
1828
+ # # Get the base64 code for processing, whether we embed it or convert it to LaTeX
1829
+ # base64_code = get_base64_for_file(filepath)
1830
+
1831
+ # # --- PIX2TEXT/EQUATION CONVERSION LOGIC START ---
1832
+ # if tag.startswith('EQUATION') and p2t is not None:
1833
+ # print(f" -> Converting EQUATION {tag} to LaTeX...")
1834
+ # latex_code = get_latex_from_base64(base64_code)
1835
+
1836
+ # # Replace the original tag (e.g., EQUATION1) in the item's text fields with LaTeX
1837
+ # for key in ['question', 'passage', 'new_passage']:
1838
+ # if item.get(key) and tag in item[key]:
1839
+ # item[key] = item[key].replace(tag, latex_code)
1840
+
1841
+ # if 'options' in item:
1842
+ # for opt_key, opt_val in item['options'].items():
1843
+ # if tag in opt_val:
1844
+ # item['options'][opt_key] = opt_val.replace(tag, latex_code)
1845
+
1846
+ # tags_converted_to_latex.add(tag)
1847
+ # # Skip the embedding of the Base64 code for equations
1848
+ # continue
1849
+ # # --- PIX2TEXT/EQUATION CONVERSION LOGIC END ---
1850
+
1851
+ # # Original logic (for figures): Embed the base64 code
1852
+ # base_key = tag.replace(' ', '').lower()
1853
+ # item[base_key] = base64_code
1854
+
1855
+ # final_structured_data.append(item)
1856
+
1857
+ # print(f"✅ Image embedding complete. {len(tags_converted_to_latex)} equations converted to LaTeX.")
1858
+ # return final_structured_data
1859
+
1860
+
1861
+
1862
+
1863
+
1864
+
1865
+
1866
+
1867
def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figure_extraction_dir: str) -> List[Dict[str, Any]]:
    """Attach figure images (Base64) and equation LaTeX to structured items.

    PNG files in ``figure_extraction_dir`` whose names end in
    ``_figure<N>.png`` or ``_equation<N>.png`` are indexed under the tags
    ``FIGURE<N>`` / ``EQUATION<N>``. Each item's ``question``, ``passage``,
    ``new_passage`` and option values are scanned for those tags
    (case-insensitively). For every tag found that has a matching file, a
    lower-cased key (e.g. ``figure1``, ``equation2``) is written on the item:

    * ``figure<N>``   -> Base64 of the image (``None`` if the file is unreadable).
    * ``equation<N>`` -> output of ``get_latex_from_base64``; this is the
      ``[P2T_ERROR...]`` / ``[P2T_WARNING...]`` marker when recognition
      fails, or ``[FILE_ERROR: ...]`` when the file could not be read.

    Args:
        structured_data: Items to enrich. Items are mutated in place.
        figure_extraction_dir: Directory holding the extracted PNG crops.

    Returns:
        A new list containing the same (mutated) item objects, in order.
        An empty list when ``structured_data`` is empty.
    """
    print("\n" + "=" * 80)
    print("--- 4. STARTING IMAGE EMBEDDING (Base64) / EQUATION TO LATEX CONVERSION ---")
    print("=" * 80)
    if not structured_data:
        return []

    # Compile both patterns once instead of once per file / per text field.
    tag_regex = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)
    filename_regex = re.compile(r'_(figure|equation)(\d+)\.png$', re.IGNORECASE)

    # Map normalised tag (e.g. "FIGURE3") -> path of the extracted PNG.
    image_lookup = {}
    for filepath in glob.glob(os.path.join(figure_extraction_dir, "*.png")):
        match = filename_regex.search(os.path.basename(filepath))
        if match:
            image_lookup[f"{match.group(1).upper()}{match.group(2)}"] = filepath

    print(f" -> Found {len(image_lookup)} image components.")

    final_structured_data = []
    # Number of equation embeddings that produced real LaTeX, across all items.
    equations_converted = 0

    for item in structured_data:
        text_fields = [item.get('question', ''), item.get('passage', ''), item.get('new_passage', '')]
        options = item.get('options')
        if isinstance(options, dict):
            text_fields.extend(options.values())

        unique_tags_to_embed = set()
        for text in text_fields:
            # Option values / passages may be None or non-string (e.g. numeric
            # answers); only real text can contain tags.
            if not isinstance(text, str):
                continue
            for match in tag_regex.finditer(text):
                tag = match.group(0).upper()
                if tag in image_lookup:
                    unique_tags_to_embed.add(tag)

        # Sorted so that key insertion order and log output are deterministic.
        for tag in sorted(unique_tags_to_embed):
            filepath = image_lookup[tag]
            base_key = tag.lower()  # e.g., figure1 or equation1
            base64_code = get_base64_for_file(filepath)

            if tag.startswith('EQUATION'):
                if not base64_code:
                    item[base_key] = "[FILE_ERROR: Could not read image file]"
                    print(f" ❌ File read error for {tag}.")
                    continue

                latex_output = get_latex_from_base64(base64_code)
                # The result is stored either way: on failure it carries the
                # error marker so downstream consumers can see what went wrong.
                item[base_key] = latex_output
                if latex_output.startswith(('[P2T_ERROR', '[P2T_WARNING')):
                    print(f" ⚠️ Failed to convert {tag} to LaTeX. Embedding error message.")
                else:
                    equations_converted += 1
                    print(f" ✅ Embedded Clean LaTeX for {tag}")
            else:
                # FIGURE tag: embed the raw Base64. The stored value is left
                # exactly as returned by the reader, but success is only
                # reported when there is actually something to embed.
                item[base_key] = base64_code
                if base64_code:
                    print(f" ✅ Embedded Base64 for {tag}")
                else:
                    print(f" ❌ File read error for {tag}.")

        final_structured_data.append(item)

    print(f"✅ Image embedding complete. {equations_converted} equations converted to LaTeX.")
    return final_structured_data
1940
 
1941
 
1942
+
1943
+
1944
+
1945
+
1946
+
1947
+
1948
+
1949
+
1950
+
1951
+
1952
+
1953
  # ============================================================================
1954
  # --- MAIN FUNCTION ---
1955
  # ============================================================================