# Pro-RAG-Level1 / src / vision_processor.py
# Provenance: uploaded by alihaiderscholar ("Upload 19 files", commit aabd1d8, verified).
import base64
import io
import os

from langchain_core.documents import Document
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI
from pdf2image import convert_from_path
class VisionProcessor:
    """Turn visual/scanned PDFs into searchable text ``Document`` objects.

    Each PDF page is rendered to a JPEG via pdf2image (Poppler), base64
    encoded, and sent to an OpenAI vision-capable chat model; the model's
    description is wrapped in a LangChain ``Document`` tagged with the
    source filename, page number, and a ``visual_data`` category.
    """

    # Prompt sent with every page image. Asks the model to extract data
    # points from graphs and transcribe tables rather than just summarize.
    # NOTE: text is kept byte-identical to the original prompt.
    _PAGE_PROMPT = (
        "Describe this image in detail. If it is a graph, extract the data "
        "points. If it is a table, transcribe it."
    )

    def __init__(self, poppler_path=None, max_pages=3, model="gpt-4o"):
        """
        Args:
            poppler_path: Directory containing the Poppler binaries. Defaults
                to the bundled Windows-layout folder ``./poppler/Library/bin``
                in the project root. If that directory does not exist at
                processing time, we fall back to the system PATH.
            max_pages: Maximum number of pages analyzed per PDF. Defaults to
                3 (cost-saving cap, previously hard-coded).
            model: Name of the OpenAI vision-capable chat model.
        """
        self.vision_model = ChatOpenAI(model=model, max_tokens=1024)
        self.max_pages = max_pages
        # PRO FIX: point to the local Poppler bin; assumes a 'poppler'
        # folder (Windows build layout) in the project root.
        self.poppler_path = poppler_path or os.path.join(
            os.getcwd(), "poppler", "Library", "bin"
        )

    def process_visual_pdf(self, pdf_path):
        """Render up to ``self.max_pages`` pages of *pdf_path* and describe
        each with the vision model.

        Args:
            pdf_path: Filesystem path to the PDF to analyze.

        Returns:
            list[Document]: one Document per analyzed page. Empty (or
            partial) on failure — errors are printed, never raised, so a
            bad PDF cannot abort a larger ingestion run (best-effort by
            design).
        """
        print(f" 👁️ Processing Visual PDF: {os.path.basename(pdf_path)}...")
        documents = []
        try:
            # BUG FIX: previously a missing bundled-Poppler folder was a hard
            # failure (returned []). Fall back to the system PATH instead
            # (poppler_path=None lets pdf2image search PATH), so Linux/macOS
            # installs with system Poppler still work.
            poppler_dir = self.poppler_path if os.path.exists(self.poppler_path) else None
            if poppler_dir is None:
                print(f" ℹ️ Local Poppler not found at {self.poppler_path}; trying system PATH.")

            # 1. Convert PDF pages to JPEG images.
            images = convert_from_path(pdf_path, fmt="jpeg", poppler_path=poppler_dir)
            print(f" -> Extracted {len(images)} images (pages) from PDF.")

            # 2. Analyze only the first max_pages pages (cost-saving cap).
            for i, img in enumerate(images[: self.max_pages]):
                page_no = i + 1
                print(f" -> Analyzing Page {page_no} with GPT-4o Vision...")

                # Base64-encode the rendered page. (io import hoisted to
                # file top — it was loop-invariant.)
                buffered = io.BytesIO()
                img.save(buffered, format="JPEG")
                img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")

                # 3. Send the page to the vision model as a data URL.
                response = self.vision_model.invoke(
                    [
                        HumanMessage(
                            content=[
                                {"type": "text", "text": self._PAGE_PROMPT},
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:image/jpeg;base64,{img_str}"
                                    },
                                },
                            ]
                        )
                    ]
                )

                documents.append(
                    Document(
                        page_content=f"IMAGE DESCRIPTION (Page {page_no}): {response.content}",
                        metadata={
                            "source": os.path.basename(pdf_path),
                            "page": page_no,
                            "category": "visual_data",
                        },
                    )
                )

            if len(images) > self.max_pages:
                print(f" ℹ️ Limited to first {self.max_pages} pages for POC cost safety.")
        except Exception as e:
            # Best-effort: log and return whatever pages were processed.
            print(f" ❌ Vision Error: {e}")
        return documents