import os
from typing import Any, Dict, List

import fitz  # PyMuPDF
import pymupdf4llm
from google import genai  # Gemini SDK; Gemini 2.5 Flash handles the vision tasks

# Module-level Gemini client, kept so existing importers of `client` keep working.
# NOTE(review): when GEMINI_API_KEY is unset this is built with a placeholder
# key, so requests made through it will presumably be rejected -- confirm intent.
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY", "dummy_key"))

# Model and prompt used to turn an embedded image into retrievable text.
_VISION_MODEL = "gemini-2.5-flash"
_VISION_PROMPT = (
    "Analyze this chart/image extracted from a document. "
    "Provide a highly detailed summary of its numbers, structural trends, "
    "or data contents so it can be effectively used for downstream text retrieval."
)

# Maps the "ext" value reported by Document.extract_image to a MIME type.
_MIME_BY_EXT = {
    "jpg": "image/jpeg",
    "jpeg": "image/jpeg",
    "png": "image/png",
    "webp": "image/webp",
    "gif": "image/gif",
    "bmp": "image/bmp",
    "tif": "image/tiff",
    "tiff": "image/tiff",
}


def _mime_type_for(ext: Any) -> str:
    """Return the MIME type for an image extension, defaulting to JPEG."""
    normalized = str(ext or "").lower().lstrip(".")
    if not normalized:
        return "image/jpeg"
    return _MIME_BY_EXT.get(normalized, f"image/{normalized}")


class AdvancedPDFParser:
    """Hybrid PDF ingester: layout-aware text plus Gemini image descriptions.

    The underlying PyMuPDF document stays open for the parser's lifetime.
    Call ``close()`` when done, or use the parser as a context manager.
    """

    def __init__(self, pdf_path: str):
        """Open the PDF at *pdf_path*.

        Raises:
            FileNotFoundError: if *pdf_path* does not exist.
        """
        self.pdf_path = pdf_path
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found at: {pdf_path}")
        self.doc = fitz.open(pdf_path)
        # Created lazily on first use so text-only ingestion needs no API key.
        self._vision_client = None

    def __enter__(self) -> "AdvancedPDFParser":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def close(self) -> None:
        """Close the underlying document; safe to call more than once."""
        doc = getattr(self, "doc", None)
        if doc is not None and not doc.is_closed:
            doc.close()

    def extract_structured_text(self) -> List[Dict[str, Any]]:
        """Return one entry per page with its text.

        Tries ``pymupdf4llm.to_markdown`` first (entries typed
        ``"text_layout"``); if that raises for any reason, falls back to
        plain ``page.get_text()`` (entries typed ``"fallback_text"``).
        """
        try:
            md_pages = pymupdf4llm.to_markdown(self.pdf_path, page_chunks=True)
            # Built in a single expression so a failure part-way through
            # cannot leak partial layout entries into the fallback result.
            return [
                {
                    "page_number": page["metadata"]["page"],
                    "text": page["text"],
                    "type": "text_layout",
                }
                for page in md_pages
            ]
        except Exception as e:  # deliberate best-effort boundary
            print(f"Layout parsing failed, falling back to standard text: {e}")

        return [
            {
                "page_number": page_number,
                "text": page.get_text(),
                "type": "fallback_text",
            }
            for page_number, page in enumerate(self.doc, start=1)
        ]

    def _get_vision_client(self):
        """Return the cached Gemini client, creating it on first use.

        Returns ``None`` (after printing the reason) when the client cannot
        be constructed, so callers can skip vision processing.
        """
        cached = getattr(self, "_vision_client", None)
        if cached is not None:
            return cached
        try:
            # No explicit key: the SDK resolves credentials from the environment.
            self._vision_client = genai.Client()
        except Exception as e:  # deliberate best-effort boundary
            print(f"Gemini client init failed, skipping vision: {e}")
            return None
        return self._vision_client

    def process_embedded_images(self, page_num: int, page_obj: fitz.Page) -> List[str]:
        """Describe each image embedded in *page_obj* using Gemini.

        Args:
            page_num: Zero-based page index; only used in log messages.
            page_obj: The page whose embedded images are processed.

        Returns:
            The non-empty model responses, in image order. Images that fail
            to extract or to be described are skipped.
        """
        image_descriptions: List[str] = []
        image_list = page_obj.get_images(full=True)
        if not image_list:
            return image_descriptions

        vision_client = self._get_vision_client()
        if vision_client is None:
            return image_descriptions

        for img in image_list:
            xref = img[0]
            try:
                base_image = self.doc.extract_image(xref)
                image_part = genai.types.Part.from_bytes(
                    data=base_image["image"],
                    # Use the real image format instead of assuming JPEG.
                    mime_type=_mime_type_for(base_image.get("ext")),
                )
                response = vision_client.models.generate_content(
                    model=_VISION_MODEL,
                    contents=[image_part, _VISION_PROMPT],
                )
                if response.text:
                    image_descriptions.append(response.text)
            except Exception as e:  # one bad image must not abort the page
                print(f"Vision processing skipped for page {page_num + 1}: {e}")
                continue
        return image_descriptions

    def ingest_document(self) -> List[Dict[str, Any]]:
        """Run the full pipeline.

        Returns:
            All text entries from ``extract_structured_text`` followed by one
            ``"visual_image_summary"`` entry per described image, in page
            order.
        """
        final_payload: List[Dict[str, Any]] = list(self.extract_structured_text())
        for page_num, page in enumerate(self.doc):
            final_payload.extend(
                {
                    "page_number": page_num + 1,
                    "text": f"[Visual Data Extraction Summary]: {summary}",
                    "type": "visual_image_summary",
                }
                for summary in self.process_embedded_images(page_num, page)
            )
        return final_payload