import base64
import io
import os

from pdf2image import convert_from_path
from langchain_core.documents import Document
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI


class VisionProcessor:
    def __init__(self):
        self.vision_model = ChatOpenAI(model="gpt-4o", max_tokens=1024)
        # PRO FIX: Point to the local Poppler bin.
        # This assumes the 'poppler' folder is in the project root.
        self.poppler_path = os.path.join(os.getcwd(), "poppler", "Library", "bin")

    def process_visual_pdf(self, pdf_path):
        print(f" đŸ‘ī¸ Processing Visual PDF: {os.path.basename(pdf_path)}...")
        documents = []
        try:
            # Check that our local Poppler exists before attempting conversion.
            if not os.path.exists(self.poppler_path):
                print(f" ❌ Error: Poppler not found at {self.poppler_path}")
                return []

            # 1. Convert PDF pages to images (using local Poppler).
            images = convert_from_path(pdf_path, fmt="jpeg", poppler_path=self.poppler_path)
            print(f" -> Extracted {len(images)} images (pages) from PDF.")

            # 2. Analyze first 3 pages only (cost-saving mode).
            for i, img in enumerate(images[:3]):
                print(f" -> Analyzing Page {i + 1} with GPT-4o Vision...")

                # Base64-encode the page image for the data URL.
                buffered = io.BytesIO()
                img.save(buffered, format="JPEG")
                img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")

                # 3. Send to GPT-4o.
                response = self.vision_model.invoke(
                    [
                        HumanMessage(
                            content=[
                                {
                                    "type": "text",
                                    "text": (
                                        "Describe this image in detail. If it is a graph, "
                                        "extract the data points. If it is a table, transcribe it."
                                    ),
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {"url": f"data:image/jpeg;base64,{img_str}"},
                                },
                            ]
                        )
                    ]
                )
                description = response.content

                doc = Document(
                    page_content=f"IMAGE DESCRIPTION (Page {i + 1}): {description}",
                    metadata={
                        "source": os.path.basename(pdf_path),
                        "page": i + 1,
                        "category": "visual_data",
                    },
                )
                documents.append(doc)

            if len(images) > 3:
                print(" â„šī¸ Limited to first 3 pages for POC cost safety.")
        except Exception as e:
            print(f" ❌ Vision Error: {e}")
        return documents
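

# Minimal usage sketch. Assumptions not in the original: OPENAI_API_KEY is set in
# the environment, Poppler lives at ./poppler/Library/bin as the class expects,
# and "reports/sample_charts.pdf" is a hypothetical example file path.
if __name__ == "__main__":
    processor = VisionProcessor()
    docs = processor.process_visual_pdf("reports/sample_charts.pdf")
    for doc in docs:
        # Each Document carries the page number and a truncated description preview.
        print(doc.metadata["page"], doc.page_content[:120])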