Spaces:
Running
Running
Update working_yolo_pipeline.py
Browse files- working_yolo_pipeline.py +216 -52
working_yolo_pipeline.py
CHANGED
|
@@ -175,10 +175,59 @@ except Exception as e:
|
|
| 175 |
print(f"❌ Error initializing Pix2Text model. Equations will not be converted: {e}")
|
| 176 |
p2t = None
|
| 177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
def get_latex_from_base64(base64_string: str) -> str:
|
| 179 |
"""
|
| 180 |
Decodes a Base64 image string, uses Pix2Text to recognize the formula,
|
| 181 |
-
and returns the LaTeX code,
|
| 182 |
"""
|
| 183 |
if p2t is None:
|
| 184 |
return "[P2T_ERROR: Model not initialized]"
|
|
@@ -189,26 +238,31 @@ def get_latex_from_base64(base64_string: str) -> str:
|
|
| 189 |
image = Image.open(io.BytesIO(image_data))
|
| 190 |
|
| 191 |
# 2. Recognize text and formulas
|
| 192 |
-
|
|
|
|
| 193 |
|
| 194 |
# 3. Parse the result for LaTeX
|
| 195 |
extracted_latex_parts = []
|
| 196 |
if isinstance(result, list):
|
| 197 |
for item in result:
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
extracted_latex_parts.append(item)
|
| 202 |
elif isinstance(result, str):
|
| 203 |
extracted_latex_parts = [result]
|
| 204 |
|
|
|
|
| 205 |
extracted_latex = " ".join(extracted_latex_parts).strip()
|
| 206 |
|
| 207 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
return "[P2T_WARNING: No formula found]"
|
| 209 |
|
| 210 |
-
#
|
| 211 |
-
return
|
| 212 |
|
| 213 |
except Exception as e:
|
| 214 |
# Catch any unexpected errors
|
|
@@ -221,6 +275,8 @@ def get_latex_from_base64(base64_string: str) -> str:
|
|
| 221 |
|
| 222 |
|
| 223 |
|
|
|
|
|
|
|
| 224 |
# # Initialize the YOLO model
|
| 225 |
# model = YOLO(WEIGHTS_PATH)
|
| 226 |
|
|
@@ -1667,13 +1723,29 @@ def correct_misaligned_options(structured_data: List[Dict[str, Any]]) -> List[Di
|
|
| 1667 |
# --- PHASE 4: IMAGE EMBEDDING (Base64) ---
|
| 1668 |
# ============================================================================
|
| 1669 |
|
| 1670 |
-
def get_base64_for_file(filepath: str) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1671 |
try:
|
| 1672 |
-
with open(filepath,
|
| 1673 |
-
|
|
|
|
| 1674 |
except Exception as e:
|
| 1675 |
-
print(f"
|
| 1676 |
-
return
|
|
|
|
|
|
|
|
|
|
| 1677 |
|
| 1678 |
|
| 1679 |
# def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figure_extraction_dir: str) -> List[
|
|
@@ -1716,76 +1788,168 @@ def get_base64_for_file(filepath: str) -> str:
|
|
| 1716 |
|
| 1717 |
|
| 1718 |
|
| 1719 |
-
def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figure_extraction_dir: str) -> List[
|
| 1720 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1721 |
print("\n" + "=" * 80)
|
| 1722 |
print("--- 4. STARTING IMAGE EMBEDDING (Base64) / EQUATION TO LATEX CONVERSION ---")
|
| 1723 |
print("=" * 80)
|
| 1724 |
-
if not structured_data:
|
|
|
|
|
|
|
| 1725 |
image_files = glob.glob(os.path.join(figure_extraction_dir, "*.png"))
|
| 1726 |
image_lookup = {}
|
| 1727 |
tag_regex = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)
|
|
|
|
| 1728 |
for filepath in image_files:
|
| 1729 |
filename = os.path.basename(filepath)
|
| 1730 |
match = re.search(r'_(figure|equation)(\d+)\.png$', filename, re.IGNORECASE)
|
| 1731 |
if match:
|
| 1732 |
key = f"{match.group(1).upper()}{match.group(2)}"
|
| 1733 |
image_lookup[key] = filepath
|
| 1734 |
-
|
|
|
|
| 1735 |
|
| 1736 |
final_structured_data = []
|
| 1737 |
-
|
| 1738 |
for item in structured_data:
|
| 1739 |
text_fields = [item.get('question', ''), item.get('passage', '')]
|
| 1740 |
if 'options' in item:
|
| 1741 |
-
for opt_val in item['options'].values():
|
| 1742 |
-
|
| 1743 |
-
|
|
|
|
|
|
|
| 1744 |
unique_tags_to_embed = set()
|
| 1745 |
for text in text_fields:
|
| 1746 |
if not text: continue
|
| 1747 |
for match in tag_regex.finditer(text):
|
| 1748 |
tag = match.group(0).upper()
|
| 1749 |
-
if tag in image_lookup:
|
| 1750 |
-
|
|
|
|
| 1751 |
# List of tags that were successfully converted to LaTeX
|
| 1752 |
tags_converted_to_latex = set()
|
| 1753 |
|
| 1754 |
for tag in sorted(list(unique_tags_to_embed)):
|
| 1755 |
filepath = image_lookup[tag]
|
| 1756 |
-
|
| 1757 |
-
|
| 1758 |
-
|
| 1759 |
-
|
| 1760 |
-
|
| 1761 |
-
|
| 1762 |
-
|
| 1763 |
-
|
| 1764 |
-
|
| 1765 |
-
|
| 1766 |
-
|
| 1767 |
-
|
| 1768 |
-
|
| 1769 |
-
|
| 1770 |
-
|
| 1771 |
-
|
| 1772 |
-
|
| 1773 |
-
|
| 1774 |
-
|
| 1775 |
-
|
| 1776 |
-
|
| 1777 |
-
|
|
|
|
|
|
|
|
|
|
| 1778 |
|
| 1779 |
-
# Original logic (for figures): Embed the base64 code
|
| 1780 |
-
base_key = tag.replace(' ', '').lower()
|
| 1781 |
-
item[base_key] = base64_code
|
| 1782 |
-
|
| 1783 |
final_structured_data.append(item)
|
| 1784 |
-
|
| 1785 |
-
print(f"✅ Image embedding complete.
|
| 1786 |
return final_structured_data
|
| 1787 |
|
| 1788 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1789 |
# ============================================================================
|
| 1790 |
# --- MAIN FUNCTION ---
|
| 1791 |
# ============================================================================
|
|
|
|
| 175 |
print(f"❌ Error initializing Pix2Text model. Equations will not be converted: {e}")
|
| 176 |
p2t = None
|
| 177 |
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
# def get_latex_from_base64(base64_string: str) -> str:
|
| 183 |
+
# """
|
| 184 |
+
# Decodes a Base64 image string, uses Pix2Text to recognize the formula,
|
| 185 |
+
# and returns the LaTeX code, wrapped in $$.
|
| 186 |
+
# """
|
| 187 |
+
# if p2t is None:
|
| 188 |
+
# return "[P2T_ERROR: Model not initialized]"
|
| 189 |
+
|
| 190 |
+
# try:
|
| 191 |
+
# # 1. Decode Base64 to Image
|
| 192 |
+
# image_data = base64.b64decode(base64_string)
|
| 193 |
+
# image = Image.open(io.BytesIO(image_data))
|
| 194 |
+
|
| 195 |
+
# # 2. Recognize text and formulas
|
| 196 |
+
# result = p2t.recognize(image, save_formula_images=False, use_analyzer=True)
|
| 197 |
+
|
| 198 |
+
# # 3. Parse the result for LaTeX
|
| 199 |
+
# extracted_latex_parts = []
|
| 200 |
+
# if isinstance(result, list):
|
| 201 |
+
# for item in result:
|
| 202 |
+
# if hasattr(item, 'text'):
|
| 203 |
+
# extracted_latex_parts.append(item.text)
|
| 204 |
+
# elif isinstance(item, str):
|
| 205 |
+
# extracted_latex_parts.append(item)
|
| 206 |
+
# elif isinstance(result, str):
|
| 207 |
+
# extracted_latex_parts = [result]
|
| 208 |
+
|
| 209 |
+
# extracted_latex = " ".join(extracted_latex_parts).strip()
|
| 210 |
+
|
| 211 |
+
# if not extracted_latex:
|
| 212 |
+
# return "[P2T_WARNING: No formula found]"
|
| 213 |
+
|
| 214 |
+
# # Wrap result in LaTeX delimiters
|
| 215 |
+
# return f"$${extracted_latex}$$"
|
| 216 |
+
|
| 217 |
+
# except Exception as e:
|
| 218 |
+
# # Catch any unexpected errors
|
| 219 |
+
# print(f" ❌ Pix2Text Recognition failed: {e}")
|
| 220 |
+
# return f"[P2T_ERROR: Recognition failed: {e}]"
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
|
| 227 |
def get_latex_from_base64(base64_string: str) -> str:
|
| 228 |
"""
|
| 229 |
Decodes a Base64 image string, uses Pix2Text to recognize the formula,
|
| 230 |
+
and returns the LaTeX code, stripped of all whitespace, as requested.
|
| 231 |
"""
|
| 232 |
if p2t is None:
|
| 233 |
return "[P2T_ERROR: Model not initialized]"
|
|
|
|
| 238 |
image = Image.open(io.BytesIO(image_data))
|
| 239 |
|
| 240 |
# 2. Recognize text and formulas
|
| 241 |
+
# Use keep_original_image=False to save memory
|
| 242 |
+
result = p2t.recognize(image, save_formula_images=False, use_analyzer=True, keep_original_image=False)
|
| 243 |
|
| 244 |
# 3. Parse the result for LaTeX
|
| 245 |
extracted_latex_parts = []
|
| 246 |
if isinstance(result, list):
|
| 247 |
for item in result:
|
| 248 |
+
# Use .text for structured output, item itself for string output
|
| 249 |
+
text = item.text if hasattr(item, 'text') else str(item)
|
| 250 |
+
extracted_latex_parts.append(text)
|
|
|
|
| 251 |
elif isinstance(result, str):
|
| 252 |
extracted_latex_parts = [result]
|
| 253 |
|
| 254 |
+
# Join with a space first, then clean all whitespace
|
| 255 |
extracted_latex = " ".join(extracted_latex_parts).strip()
|
| 256 |
|
| 257 |
+
# *** CORE CHANGE: Remove all spaces/line breaks as requested by the user ***
|
| 258 |
+
# This uses regex to replace any sequence of whitespace characters (spaces, tabs, newlines) with an empty string.
|
| 259 |
+
cleaned_latex = re.sub(r'\s+', '', extracted_latex)
|
| 260 |
+
|
| 261 |
+
if not cleaned_latex:
|
| 262 |
return "[P2T_WARNING: No formula found]"
|
| 263 |
|
| 264 |
+
# Return the clean LaTeX string without wrapping $$, as requested.
|
| 265 |
+
return cleaned_latex
|
| 266 |
|
| 267 |
except Exception as e:
|
| 268 |
# Catch any unexpected errors
|
|
|
|
| 275 |
|
| 276 |
|
| 277 |
|
| 278 |
+
|
| 279 |
+
|
| 280 |
# # Initialize the YOLO model
|
| 281 |
# model = YOLO(WEIGHTS_PATH)
|
| 282 |
|
|
|
|
| 1723 |
# --- PHASE 4: IMAGE EMBEDDING (Base64) ---
|
| 1724 |
# ============================================================================
|
| 1725 |
|
| 1726 |
+
# def get_base64_for_file(filepath: str) -> str:
|
| 1727 |
+
# try:
|
| 1728 |
+
# with open(filepath, 'rb') as f:
|
| 1729 |
+
# return base64.b64encode(f.read()).decode('utf-8')
|
| 1730 |
+
# except Exception as e:
|
| 1731 |
+
# print(f" ❌ Error encoding file {filepath}: {e}")
|
| 1732 |
+
# return ""
|
| 1733 |
+
|
| 1734 |
+
|
| 1735 |
+
|
| 1736 |
+
|
| 1737 |
+
def get_base64_for_file(filepath: str) -> Optional[str]:
    """Read a file and return its contents as a raw Base64 string.

    The returned text is the bare Base64 payload; no ``data:`` URI prefix is
    added.

    Args:
        filepath: Path of the file to read (opened in binary mode).

    Returns:
        The Base64-encoded file contents, or ``None`` if the file could not
        be opened or read. An empty file yields an empty string.
    """
    try:
        with open(filepath, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')
    except (OSError, TypeError, ValueError) as e:
        # OSError covers missing/unreadable paths and directories; TypeError
        # and ValueError cover malformed path arguments (e.g. None, embedded
        # NUL). Anything else is a programming error and should surface.
        print(f"Error reading and encoding file {filepath}: {e}")
        return None
|
| 1746 |
+
|
| 1747 |
+
|
| 1748 |
+
|
| 1749 |
|
| 1750 |
|
| 1751 |
# def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figure_extraction_dir: str) -> List[
|
|
|
|
| 1788 |
|
| 1789 |
|
| 1790 |
|
| 1791 |
+
# def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figure_extraction_dir: str) -> List[
|
| 1792 |
+
# Dict[str, Any]]:
|
| 1793 |
+
# print("\n" + "=" * 80)
|
| 1794 |
+
# print("--- 4. STARTING IMAGE EMBEDDING (Base64) / EQUATION TO LATEX CONVERSION ---")
|
| 1795 |
+
# print("=" * 80)
|
| 1796 |
+
# if not structured_data: return []
|
| 1797 |
+
# image_files = glob.glob(os.path.join(figure_extraction_dir, "*.png"))
|
| 1798 |
+
# image_lookup = {}
|
| 1799 |
+
# tag_regex = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)
|
| 1800 |
+
# for filepath in image_files:
|
| 1801 |
+
# filename = os.path.basename(filepath)
|
| 1802 |
+
# match = re.search(r'_(figure|equation)(\d+)\.png$', filename, re.IGNORECASE)
|
| 1803 |
+
# if match:
|
| 1804 |
+
# key = f"{match.group(1).upper()}{match.group(2)}"
|
| 1805 |
+
# image_lookup[key] = filepath
|
| 1806 |
+
# print(f" -> Found {len(image_lookup)} image components.")
|
| 1807 |
+
|
| 1808 |
+
# final_structured_data = []
|
| 1809 |
+
|
| 1810 |
+
# for item in structured_data:
|
| 1811 |
+
# text_fields = [item.get('question', ''), item.get('passage', '')]
|
| 1812 |
+
# if 'options' in item:
|
| 1813 |
+
# for opt_val in item['options'].values(): text_fields.append(opt_val)
|
| 1814 |
+
# if 'new_passage' in item: text_fields.append(item['new_passage'])
|
| 1815 |
+
|
| 1816 |
+
# unique_tags_to_embed = set()
|
| 1817 |
+
# for text in text_fields:
|
| 1818 |
+
# if not text: continue
|
| 1819 |
+
# for match in tag_regex.finditer(text):
|
| 1820 |
+
# tag = match.group(0).upper()
|
| 1821 |
+
# if tag in image_lookup: unique_tags_to_embed.add(tag)
|
| 1822 |
+
|
| 1823 |
+
# # List of tags that were successfully converted to LaTeX
|
| 1824 |
+
# tags_converted_to_latex = set()
|
| 1825 |
+
|
| 1826 |
+
# for tag in sorted(list(unique_tags_to_embed)):
|
| 1827 |
+
# filepath = image_lookup[tag]
|
| 1828 |
+
# # Get the base64 code for processing, whether we embed it or convert it to LaTeX
|
| 1829 |
+
# base64_code = get_base64_for_file(filepath)
|
| 1830 |
+
|
| 1831 |
+
# # --- PIX2TEXT/EQUATION CONVERSION LOGIC START ---
|
| 1832 |
+
# if tag.startswith('EQUATION') and p2t is not None:
|
| 1833 |
+
# print(f" -> Converting EQUATION {tag} to LaTeX...")
|
| 1834 |
+
# latex_code = get_latex_from_base64(base64_code)
|
| 1835 |
+
|
| 1836 |
+
# # Replace the original tag (e.g., EQUATION1) in the item's text fields with LaTeX
|
| 1837 |
+
# for key in ['question', 'passage', 'new_passage']:
|
| 1838 |
+
# if item.get(key) and tag in item[key]:
|
| 1839 |
+
# item[key] = item[key].replace(tag, latex_code)
|
| 1840 |
+
|
| 1841 |
+
# if 'options' in item:
|
| 1842 |
+
# for opt_key, opt_val in item['options'].items():
|
| 1843 |
+
# if tag in opt_val:
|
| 1844 |
+
# item['options'][opt_key] = opt_val.replace(tag, latex_code)
|
| 1845 |
+
|
| 1846 |
+
# tags_converted_to_latex.add(tag)
|
| 1847 |
+
# # Skip the embedding of the Base64 code for equations
|
| 1848 |
+
# continue
|
| 1849 |
+
# # --- PIX2TEXT/EQUATION CONVERSION LOGIC END ---
|
| 1850 |
+
|
| 1851 |
+
# # Original logic (for figures): Embed the base64 code
|
| 1852 |
+
# base_key = tag.replace(' ', '').lower()
|
| 1853 |
+
# item[base_key] = base64_code
|
| 1854 |
+
|
| 1855 |
+
# final_structured_data.append(item)
|
| 1856 |
+
|
| 1857 |
+
# print(f"✅ Image embedding complete. {len(tags_converted_to_latex)} equations converted to LaTeX.")
|
| 1858 |
+
# return final_structured_data
|
| 1859 |
+
|
| 1860 |
+
|
| 1861 |
+
|
| 1862 |
+
|
| 1863 |
+
|
| 1864 |
+
|
| 1865 |
+
|
| 1866 |
+
|
| 1867 |
+
def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figure_extraction_dir: str) -> List[Dict[str, Any]]:
    """Attach figure images and equation LaTeX to the items that reference them.

    PNG files in ``figure_extraction_dir`` whose names end in
    ``_figure<N>.png`` or ``_equation<N>.png`` are indexed by tag
    (``FIGURE<N>`` / ``EQUATION<N>``). Each item's ``question``, ``passage``,
    ``new_passage`` and option values are scanned for those tags, and for
    every tag that has a matching file a lower-cased key (``figure<N>`` /
    ``equation<N>``) is set on the item:

    * equations: the string returned by ``get_latex_from_base64``; on
      failure its ``[P2T_...]`` message, or a ``[FILE_ERROR: ...]`` message
      when the image could not be read;
    * figures: the Base64 string from ``get_base64_for_file`` (``None`` when
      the image could not be read).

    Items are modified in place; the returned list holds the same dict
    objects in their original order.

    Args:
        structured_data: Items to enrich.
        figure_extraction_dir: Directory searched (non-recursively) for PNGs.

    Returns:
        A new list containing the enriched items, or ``[]`` when
        ``structured_data`` is empty.
    """
    print("\n" + "=" * 80)
    print("--- 4. STARTING IMAGE EMBEDDING (Base64) / EQUATION TO LATEX CONVERSION ---")
    print("=" * 80)
    if not structured_data:
        return []

    image_lookup = _build_image_lookup(figure_extraction_dir)
    print(f" -> Found {len(image_lookup)} image components.")

    final_structured_data = []
    for item in structured_data:
        # Sorted so keys are added (and progress is printed) in a stable order.
        for tag in sorted(_collect_embeddable_tags(item, image_lookup)):
            filepath = image_lookup[tag]
            # Tags come from the tag regex, so they never contain spaces.
            base_key = tag.lower()  # e.g., figure1 or equation1

            if tag.startswith('EQUATION'):
                item[base_key] = _convert_equation_image(tag, filepath)
            else:
                base64_code = get_base64_for_file(filepath)
                # Key is always set (possibly to None) so the item's shape
                # does not depend on whether the read succeeded.
                item[base_key] = base64_code
                if base64_code:
                    print(f" ✅ Embedded Base64 for {tag}")
                else:
                    print(f" ❌ File read error for {tag}.")

        final_structured_data.append(item)

    print("✅ Image embedding complete.")
    return final_structured_data


# Compiled once at import time; both patterns are case-insensitive.
_EMBED_IMAGE_FILENAME_RE = re.compile(r'_(figure|equation)(\d+)\.png$', re.IGNORECASE)
_EMBED_TAG_RE = re.compile(r'(figure|equation)(\d+)', re.IGNORECASE)
# Prefixes get_latex_from_base64 uses to signal that no LaTeX was produced.
_EMBED_P2T_FAILURE_PREFIXES = ('[P2T_ERROR', '[P2T_WARNING')


def _build_image_lookup(figure_extraction_dir: str) -> Dict[str, str]:
    """Map upper-cased tags (e.g. ``FIGURE1``) to PNG paths in the directory."""
    image_lookup = {}
    for filepath in glob.glob(os.path.join(figure_extraction_dir, "*.png")):
        match = _EMBED_IMAGE_FILENAME_RE.search(os.path.basename(filepath))
        if match:
            image_lookup[f"{match.group(1).upper()}{match.group(2)}"] = filepath
    return image_lookup


def _collect_embeddable_tags(item: Dict[str, Any], image_lookup: Dict[str, str]) -> set:
    """Return the tags mentioned in the item's text fields that have an image."""
    text_fields = [item.get('question', ''), item.get('passage', '')]
    options = item.get('options')
    if isinstance(options, dict):
        text_fields.extend(options.values())
    if 'new_passage' in item:
        text_fields.append(item['new_passage'])

    tags = set()
    for text in text_fields:
        # Skip empty values and non-text values (e.g. numeric options), which
        # cannot be scanned by the regex.
        if not text or not isinstance(text, str):
            continue
        for match in _EMBED_TAG_RE.finditer(text):
            tag = match.group(0).upper()
            if tag in image_lookup:
                tags.add(tag)
    return tags


def _convert_equation_image(tag: str, filepath: str) -> str:
    """Return LaTeX for an equation image, or the error message to embed."""
    base64_code = get_base64_for_file(filepath)
    if not base64_code:
        print(f" ❌ File read error for {tag}.")
        return "[FILE_ERROR: Could not read image file]"

    latex_output = get_latex_from_base64(base64_code)
    if latex_output.startswith(_EMBED_P2T_FAILURE_PREFIXES):
        # On failure the recognizer's message is embedded in place of LaTeX.
        print(f" ⚠️ Failed to convert {tag} to LaTeX. Embedding error message.")
    else:
        print(f" ✅ Embedded Clean LaTeX for {tag}")
    return latex_output
|
| 1940 |
|
| 1941 |
|
| 1942 |
+
|
| 1943 |
+
|
| 1944 |
+
|
| 1945 |
+
|
| 1946 |
+
|
| 1947 |
+
|
| 1948 |
+
|
| 1949 |
+
|
| 1950 |
+
|
| 1951 |
+
|
| 1952 |
+
|
| 1953 |
# ============================================================================
|
| 1954 |
# --- MAIN FUNCTION ---
|
| 1955 |
# ============================================================================
|