""" Figure extraction and processing Uses PIL images directly from Docling extraction """ from typing import List, Dict, Any from PIL import Image def extract_figures(page_images: List[Image.Image], figures_info: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ Process extracted figures from Docling. Docling already extracts the figure images directly, so this function: - Uses the PIL images from Docling (figures_info contains 'image' field) - Creates thumbnails for gallery display - Preserves metadata Args: page_images: List of PIL Images (pages) - not used since Docling provides images figures_info: List of figure metadata with 'image', 'bbox', 'page', 'caption' Returns: List of dicts with 'image' (PIL), 'thumbnail', 'bbox', 'page', 'caption' """ results = [] try: for fig_info in figures_info: # Get PIL image directly from Docling extraction pil_image = fig_info.get("image") if pil_image is None: print(f"⚠️ Figure missing image: {fig_info}") continue # Ensure it's a PIL image if not isinstance(pil_image, Image.Image): print(f"⚠️ Figure image is not PIL Image: {type(pil_image)}") continue # Create thumbnail thumb = pil_image.copy() thumb.thumbnail((200, 200), Image.Resampling.LANCZOS) results.append({ "image": pil_image, "thumbnail": thumb, "bbox": fig_info.get("bbox"), "page": fig_info.get("page", 0), "caption": fig_info.get("caption", ""), }) print(f"✅ Processed figure {len(results)}") except Exception as e: print(f"⚠️ Error processing figures: {e}") import traceback traceback.print_exc() # If no figures found, return page previews as fallback if not results and page_images: print("⚠️ No figures extracted, using page previews as fallback") for i, page_img in enumerate(page_images[:3]): # First 3 pages thumb = page_img.copy() thumb.thumbnail((200, 200), Image.Resampling.LANCZOS) results.append({ "image": page_img, "thumbnail": thumb, "bbox": None, "page": i, "caption": f"Page {i + 1}", }) return results