Spaces:
Running
Running
| import os | |
| from typing import Any, Dict, List | |
| import fitz # PyMuPDF | |
| import pymupdf4llm | |
| from google import ( | |
| genai, # Since the repo uses Gemini, we'll swap to Gemini 2.5 Flash for vision tasks! | |
| ) | |
# Shared module-level Gemini client. The API key is read from the
# GEMINI_API_KEY environment variable; when it is unset the literal
# "dummy_key" placeholder is passed instead.
# NOTE(review): presumably the placeholder exists so importing this module
# does not fail without credentials -- confirm; requests made with it are
# not expected to authenticate.
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY", "dummy_key"))
class AdvancedPDFParser:
    """Hybrid PDF ingester producing per-page text plus image summaries.

    Text is extracted with ``pymupdf4llm`` (markdown layout) and falls back
    to plain PyMuPDF text when that fails. Embedded images are described by
    a Gemini vision model so their content can be indexed as text.

    The parser owns an open PyMuPDF document. Call :meth:`close` when done,
    or use the instance as a context manager::

        with AdvancedPDFParser("report.pdf") as parser:
            records = parser.ingest_document()
    """

    # Vision model and prompt used for every embedded image.
    _VISION_MODEL = "gemini-2.5-flash"
    _VISION_PROMPT = (
        "Analyze this chart/image extracted from a document. Provide a highly "
        "detailed summary of its numbers, structural trends, or data contents "
        "so it can be effectively used for downstream text retrieval."
    )
    # Image extensions (as reported by PyMuPDF) whose raw bytes are uploaded
    # unchanged, mapped to the MIME type declared for them. Any other format
    # is re-encoded to PNG before upload.
    _DIRECT_MIME_TYPES = {
        "jpeg": "image/jpeg",
        "jpg": "image/jpeg",
        "png": "image/png",
        "webp": "image/webp",
    }

    def __init__(self, pdf_path: str):
        """Open ``pdf_path`` with PyMuPDF.

        Raises:
            FileNotFoundError: if ``pdf_path`` does not exist.
        """
        self.pdf_path = pdf_path
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found at: {pdf_path}")
        self.doc = fitz.open(pdf_path)
        # Created lazily on the first page that actually contains images.
        self._vision_client = None

    def __enter__(self) -> "AdvancedPDFParser":
        return self

    def __exit__(self, exc_type: Any, exc: Any, tb: Any) -> None:
        self.close()

    def close(self) -> None:
        """Release the underlying PyMuPDF document if it is still open."""
        doc = getattr(self, "doc", None)
        if doc is not None and not getattr(doc, "is_closed", False):
            doc.close()

    def extract_structured_text(self) -> List[Dict[str, Any]]:
        """Return one text record per page.

        Each record is a dict with ``page_number``, ``text`` and ``type``.
        ``type`` is ``"text_layout"`` when the markdown layout parser
        succeeded and ``"fallback_text"`` when plain text extraction was
        used instead. Fallback page numbers are 1-based.
        """
        try:
            md_pages = pymupdf4llm.to_markdown(self.pdf_path, page_chunks=True)
            # Built as a single expression so a failure midway never leaves
            # partial layout records mixed with the fallback records.
            return [
                {
                    "page_number": page["metadata"]["page"],
                    "text": page["text"],
                    "type": "text_layout",
                }
                for page in md_pages
            ]
        except Exception as e:
            # Deliberate best-effort boundary: any layout failure degrades
            # to plain text rather than aborting the ingestion.
            print(f"Layout parsing failed, falling back to standard text: {e}")
        return [
            {
                "page_number": page_number,
                "text": page.get_text(),
                "type": "fallback_text",
            }
            for page_number, page in enumerate(self.doc, start=1)
        ]

    def _get_vision_client(self) -> Any:
        """Return a cached Gemini client, or ``None`` if it cannot be built.

        The client is constructed without arguments, as the original code
        did, so credentials are resolved by the SDK itself.
        """
        cached = getattr(self, "_vision_client", None)
        if cached is not None:
            return cached
        try:
            self._vision_client = genai.Client()
        except Exception as e:
            print(f"Gemini client init failed, skipping vision: {e}")
            return None
        return self._vision_client

    def _load_image_payload(self, xref: int) -> Dict[str, Any]:
        """Return ``{"data": bytes, "mime_type": str}`` for an image xref.

        Returns an empty dict when the document yields no bytes for
        ``xref``. Formats outside ``_DIRECT_MIME_TYPES`` are re-encoded to
        PNG so the declared MIME type always matches the uploaded bytes.
        """
        base_image = self.doc.extract_image(xref) or {}
        image_bytes = base_image.get("image")
        if not image_bytes:
            return {}
        ext = str(base_image.get("ext", "")).lower()
        mime_type = self._DIRECT_MIME_TYPES.get(ext)
        if mime_type is None:
            pix = fitz.Pixmap(self.doc, xref)
            # PNG cannot store CMYK; convert such pixmaps to RGB first.
            if pix.n - pix.alpha > 3:
                pix = fitz.Pixmap(fitz.csRGB, pix)
            image_bytes = pix.tobytes("png")
            mime_type = "image/png"
        return {"data": image_bytes, "mime_type": mime_type}

    def process_embedded_images(
        self, page_num: int, page_obj: "fitz.Page"
    ) -> List[str]:
        """Describe every image embedded in ``page_obj`` with Gemini.

        Args:
            page_num: 0-based page index, used only in log messages.
            page_obj: the PyMuPDF page whose images are processed.

        Returns:
            One description per image that produced non-empty model text.
            Images that fail to load or to be described are skipped with a
            printed message; an empty list is returned when the page has no
            images or the client cannot be created.
        """
        image_descriptions: List[str] = []
        image_list = page_obj.get_images(full=True)
        if not image_list:
            # No images: avoid creating a client for nothing.
            return image_descriptions

        vision_client = self._get_vision_client()
        if vision_client is None:
            return image_descriptions

        for img in image_list:
            try:
                # img[0] is the image xref in the get_images() tuple.
                payload = self._load_image_payload(img[0])
                if not payload:
                    continue
                response = vision_client.models.generate_content(
                    model=self._VISION_MODEL,
                    contents=[
                        genai.types.Part.from_bytes(
                            data=payload["data"],
                            mime_type=payload["mime_type"],
                        ),
                        self._VISION_PROMPT,
                    ],
                )
                if response.text:
                    image_descriptions.append(response.text)
            except Exception as e:
                # One unreadable or rejected image must not stop the rest.
                print(f"Vision processing skipped for page {page_num + 1}: {e}")
        return image_descriptions

    def ingest_document(self) -> List[Dict[str, Any]]:
        """Run the full pipeline and return all text and image records.

        Text records for every page come first, followed by
        ``"visual_image_summary"`` records in page order (1-based
        ``page_number``).
        """
        final_payload: List[Dict[str, Any]] = list(self.extract_structured_text())
        for page_num, page in enumerate(self.doc):
            for summary in self.process_embedded_images(page_num, page):
                final_payload.append(
                    {
                        "page_number": page_num + 1,
                        "text": f"[Visual Data Extraction Summary]: {summary}",
                        "type": "visual_image_summary",
                    }
                )
        return final_payload