PDF-Assit_RAG / backend /app /services /layout_parser.py
Param20h's picture
deploy: pure backend API with keywords fix
7c46845 unverified
Raw
History Blame Contribute Delete
4 kB
import os
from typing import Any, Dict, List
import fitz # PyMuPDF
import pymupdf4llm
from google import (
genai, # Since the repo uses Gemini, we'll swap to Gemini 2.5 Flash for vision tasks!
)
# Shared module-level Gemini client. The API key is taken from the
# GEMINI_API_KEY environment variable; a placeholder value is used when unset.
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY", "dummy_key"))
class AdvancedPDFParser:
    """Hybrid PDF parser: layout-aware page text plus Gemini image summaries.

    The parser opens the PDF with PyMuPDF on construction and keeps the
    document handle in ``self.doc``. Call :meth:`close` (or use the instance
    as a context manager) to release it.

    Every record produced by this class is a dict with the keys
    ``"page_number"``, ``"text"`` and ``"type"``.
    """

    # Gemini model used to describe embedded images; override on a subclass
    # or instance to switch models.
    VISION_MODEL = "gemini-2.5-flash"

    # Prompt sent alongside every extracted image.
    _VISION_PROMPT = (
        "Analyze this chart/image extracted from a document. Provide a highly "
        "detailed summary of its numbers, structural trends, or data contents "
        "so it can be effectively used for downstream text retrieval."
    )

    # Image formats forwarded to Gemini unchanged, keyed by the ``ext`` value
    # reported by ``Document.extract_image``. Anything else is re-encoded as
    # PNG. NOTE(review): list reflects the formats Gemini is documented to
    # accept for inline images -- confirm against the current API docs.
    _PASSTHROUGH_MIME_TYPES = {
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "png": "image/png",
        "webp": "image/webp",
    }

    # Lazily created Gemini client, shared by all pages of this parser.
    _vision_client = None
    # Set once client creation has failed so the failure is reported once.
    _vision_unavailable = False

    def __init__(self, pdf_path: str):
        """Open *pdf_path* with PyMuPDF.

        Raises:
            FileNotFoundError: if *pdf_path* does not exist.
        """
        self.pdf_path = pdf_path
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found at: {pdf_path}")
        self.doc = fitz.open(pdf_path)

    def __enter__(self):
        """Return the parser itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the document on leaving the ``with`` block; never suppresses."""
        self.close()

    def close(self) -> None:
        """Release the underlying PyMuPDF document; safe to call repeatedly."""
        # getattr: __init__ may have raised before ``self.doc`` was assigned.
        doc = getattr(self, "doc", None)
        if doc is not None and not doc.is_closed:
            doc.close()

    @staticmethod
    def _guess_mime_type(ext: str):
        """Return the MIME type for an image extension, or ``None`` if the
        format is not in the pass-through table (case-insensitive)."""
        return AdvancedPDFParser._PASSTHROUGH_MIME_TYPES.get((ext or "").lower())

    def _get_vision_client(self):
        """Return the cached Gemini client, creating it on first use.

        Returns ``None`` (after printing the reason once) when the client
        cannot be created, so callers can skip vision processing.
        """
        if self._vision_unavailable:
            return None
        if self._vision_client is None:
            try:
                self._vision_client = genai.Client()
            except Exception as e:
                print(f"Gemini client init failed, skipping vision: {e}")
                self._vision_unavailable = True
                return None
        return self._vision_client

    def _load_image_for_vision(self, xref: int) -> tuple:
        """Return ``(image_bytes, mime_type)`` for the image object *xref*.

        Images already in a pass-through format are returned as stored in the
        PDF; any other format is rendered to PNG through a ``fitz.Pixmap``.

        Raises:
            ValueError: if PyMuPDF returns no image data for *xref*.
        """
        base_image = self.doc.extract_image(xref)
        if not base_image:
            raise ValueError(f"no image data for xref {xref}")

        mime_type = self._guess_mime_type(base_image.get("ext", ""))
        if mime_type is not None:
            return base_image["image"], mime_type

        pix = fitz.Pixmap(self.doc, xref)
        # PNG cannot store CMYK; convert to RGB first (PyMuPDF recipe).
        if pix.n - pix.alpha > 3:
            pix = fitz.Pixmap(fitz.csRGB, pix)
        return pix.tobytes("png"), "image/png"

    def extract_structured_text(self) -> List[Dict[str, Any]]:
        """Parse the PDF page by page, preserving markdown layouts and tables.

        Uses ``pymupdf4llm.to_markdown(..., page_chunks=True)``; records from
        this path have type ``"text_layout"``. If that fails for any reason,
        falls back to plain ``page.get_text()`` for every page, with type
        ``"fallback_text"``. The two paths are never mixed in one result.
        """
        try:
            md_pages = pymupdf4llm.to_markdown(self.pdf_path, page_chunks=True)
            # Built as one expression so a failure midway leaves no partial
            # layout records behind to be duplicated by the fallback.
            return [
                {
                    # NOTE(review): the metadata key appears to be "page" in
                    # some pymupdf4llm releases and "page_number" in others --
                    # confirm; the chunk position is the last resort.
                    "page_number": (chunk.get("metadata") or {}).get(
                        "page",
                        (chunk.get("metadata") or {}).get("page_number", position),
                    ),
                    "text": chunk["text"],
                    "type": "text_layout",
                }
                for position, chunk in enumerate(md_pages, start=1)
            ]
        except Exception as e:
            # Deliberate best-effort: any layout failure degrades to raw text.
            print(f"Layout parsing failed, falling back to standard text: {e}")

        return [
            {
                "page_number": page_index + 1,
                "text": page.get_text(),
                "type": "fallback_text",
            }
            for page_index, page in enumerate(self.doc)
        ]

    def process_embedded_images(self, page_num: int, page_obj: "fitz.Page") -> List[str]:
        """Describe every image embedded in *page_obj* using Gemini.

        Args:
            page_num: zero-based page index, used only in log messages.
            page_obj: the PyMuPDF page whose images are processed.

        Returns:
            One description string per image that Gemini returned text for.
            Images that fail to extract or to be described are skipped, and
            an empty list is returned when the page has no images or the
            Gemini client cannot be created.
        """
        image_list = page_obj.get_images(full=True)
        if not image_list:
            # Nothing to describe: avoid touching the Gemini client at all.
            return []

        vision_client = self._get_vision_client()
        if vision_client is None:
            return []

        image_descriptions = []
        for img in image_list:
            xref = img[0]  # first item of each get_images() entry is the xref
            try:
                image_bytes, mime_type = self._load_image_for_vision(xref)
                response = vision_client.models.generate_content(
                    model=self.VISION_MODEL,
                    contents=[
                        genai.types.Part.from_bytes(
                            data=image_bytes, mime_type=mime_type
                        ),
                        self._VISION_PROMPT,
                    ],
                )
                if response.text:
                    image_descriptions.append(response.text)
            except Exception as e:
                # Best-effort per image: one bad image must not abort the page.
                print(f"Vision processing skipped for page {page_num + 1}: {e}")
                continue
        return image_descriptions

    def ingest_document(self) -> List[Dict[str, Any]]:
        """Run the hybrid pipeline and return text and image records.

        The result contains all records from :meth:`extract_structured_text`
        first, followed by one ``"visual_image_summary"`` record per image
        description, in page order.
        """
        final_payload = list(self.extract_structured_text())
        for page_num, page in enumerate(self.doc):
            final_payload.extend(
                {
                    "page_number": page_num + 1,
                    "text": f"[Visual Data Extraction Summary]: {summary}",
                    "type": "visual_image_summary",
                }
                for summary in self.process_embedded_images(page_num, page)
            )
        return final_payload