final_project / src /image_summarizer.py
dnj0's picture
Upload 7 files
835ecb4 verified
import base64
import os
from typing import Optional
from openai import OpenAI
class ImageSummarizer:
    """Summarizes images using OpenAI's vision API.

    The image is sent inline as a base64 ``data:`` URL together with a fixed
    Russian-language prompt asking for a detailed description. Failures are
    reported on stdout and turned into a fallback string instead of being
    raised, so a single bad image does not abort a caller's loop.
    """

    # Class-level defaults; ``__init__`` overrides them per instance on request.
    model: str = "gpt-4o-mini"
    max_tokens: int = 500

    # Sent verbatim to the model: asks for a detailed description in Russian.
    PROMPT = "Пожалуйста, опишите детально содержание этого изображения на русском языке. Укажите все видимые объекты, текст, диаграммы, графики и их взаимосвязь."

    # File-extension style names that are not valid ``image/*`` subtypes.
    _MIME_SUBTYPE_ALIASES = {"jpg": "jpeg"}

    def __init__(self,
                 api_key: Optional[str] = None,
                 model: Optional[str] = None,
                 max_tokens: Optional[int] = None):
        """Initialize the OpenAI client.

        Args:
            api_key: API key; falls back to the ``OPENAI_API_KEY``
                environment variable when omitted or empty.
            model: Chat model name; the class default is used when omitted.
            max_tokens: Completion token limit; the class default is used
                when omitted.
        """
        self.client = OpenAI(api_key=api_key or os.getenv("OPENAI_API_KEY"))
        if model is not None:
            self.model = model
        if max_tokens is not None:
            self.max_tokens = max_tokens

    @staticmethod
    def _build_data_url(image_base64: str,
                        image_format: Optional[str] = "png") -> str:
        """Return a ``data:image/<subtype>;base64,...`` URL for the image.

        The format is lower-cased, stripped of whitespace and a leading dot,
        and mapped through the alias table (``jpg`` -> ``jpeg``). A missing
        or empty format falls back to ``png``.
        """
        subtype = str(image_format or "png").strip().lstrip(".").lower() or "png"
        subtype = ImageSummarizer._MIME_SUBTYPE_ALIASES.get(subtype, subtype)
        return f"data:image/{subtype};base64,{image_base64}"

    def summarize_image_base64(self,
                               image_base64: str,
                               image_format: str = "png") -> str:
        """Summarize an image using OpenAI vision.

        Args:
            image_base64: Base64 encoded image.
            image_format: Image format (png, jpg, etc.).

        Returns:
            The model's description of the image, or a fallback string that
            embeds the error text when the request fails or the response
            carries no text content. This method does not raise for
            ``Exception`` subclasses.
        """
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": self._build_data_url(
                                        image_base64, image_format
                                    ),
                                },
                            },
                            {"type": "text", "text": self.PROMPT},
                        ],
                    }
                ],
                max_tokens=self.max_tokens,
            )
            summary = response.choices[0].message.content
            if summary is None:
                # Keep the ``-> str`` contract: route a content-less reply
                # through the same fallback as any other failure.
                raise ValueError("model returned no text content")
            return summary
        except Exception as e:
            # Deliberate best-effort: report and degrade to a placeholder.
            print(f"Error summarizing image: {e}")
            return f"Изображение на странице (ошибка обработки: {e})"
def process_images_in_documents(documents_data: list,
                                image_summarizer: ImageSummarizer) -> list:
    """
    Attach a summary to every image that carries base64 data.

    Walks each document's "pages" and each page's "images"; for every image
    with a truthy "base64" value, stores the summarizer's result under the
    image's "summary" key. The input structures are modified in place.

    Args:
        documents_data: List of document content dictionaries
        image_summarizer: Object providing summarize_image_base64()

    Returns:
        The same documents_data list, with summaries added
    """
    for document in documents_data:
        for page_entry in document.get("pages", []):
            for img in page_entry.get("images", []):
                encoded = img.get("base64")
                if not encoded:
                    # Nothing to send for images without payload.
                    continue
                print(f"Summarizing image from page {page_entry.get('page_number')}")
                img_format = img.get("format", "png")
                img["summary"] = image_summarizer.summarize_image_base64(
                    encoded,
                    img_format,
                )
    return documents_data