import base64
import os
from typing import Optional

from openai import OpenAI

# Defaults for the vision request; both can be overridden per ImageSummarizer.
_DEFAULT_MODEL = "gpt-4o-mini"
_DEFAULT_MAX_TOKENS = 500

# Instruction sent alongside every image (asks for a detailed description in Russian).
_SUMMARY_PROMPT = (
    "Пожалуйста, опишите детально содержание этого изображения на русском языке. "
    "Укажите все видимые объекты, текст, диаграммы, графики и их взаимосвязь."
)

# File-extension style names that differ from the registered image/* subtype.
_MIME_SUBTYPE_ALIASES = {
    "jpg": "jpeg",
    "jpe": "jpeg",
    "tif": "tiff",
}


def _image_mime_subtype(image_format: Optional[str]) -> str:
    """Return the ``image/*`` subtype to use in a data URL for *image_format*.

    Accepts extension-like values such as ``"jpg"``, ``".PNG"`` or ``" jpeg "``
    and falls back to ``"png"`` when the value is empty or ``None``.
    """
    cleaned = (image_format or "").strip().lower().lstrip(".")
    if not cleaned:
        return "png"
    return _MIME_SUBTYPE_ALIASES.get(cleaned, cleaned)


class ImageSummarizer:
    """Summarizes images using OpenAI's vision-capable chat completions API."""

    def __init__(
        self,
        api_key: Optional[str] = None,
        *,
        model: str = _DEFAULT_MODEL,
        max_tokens: int = _DEFAULT_MAX_TOKENS,
    ):
        """Initialize the OpenAI client.

        Args:
            api_key: API key to use; when omitted, the ``OPENAI_API_KEY``
                environment variable is read instead.
            model: Chat model used for summarization.
            max_tokens: Upper bound on the length of the generated summary.
        """
        self.client = OpenAI(api_key=api_key or os.getenv("OPENAI_API_KEY"))
        self.model = model
        self.max_tokens = max_tokens

    def summarize_image_base64(self, image_base64: str, image_format: str = "png") -> str:
        """Summarize a base64-encoded image with the OpenAI vision model.

        Args:
            image_base64: Base64 encoded image data (without a data-URL prefix).
            image_format: Image format (png, jpg, etc.). Extension-style names
                are normalized to a valid MIME subtype (e.g. ``jpg`` -> ``jpeg``).

        Returns:
            The image description produced by the model. If the request fails,
            a placeholder string containing the error text is returned instead
            of raising, so one bad image does not abort a whole batch.
        """
        data_url = f"data:image/{_image_mime_subtype(image_format)};base64,{image_base64}"
        # getattr keeps instances created without __init__ working with the defaults.
        model = getattr(self, "model", _DEFAULT_MODEL)
        max_tokens = getattr(self, "max_tokens", _DEFAULT_MAX_TOKENS)

        try:
            response = self.client.chat.completions.create(
                model=model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "image_url", "image_url": {"url": data_url}},
                            {"type": "text", "text": _SUMMARY_PROMPT},
                        ],
                    }
                ],
                max_tokens=max_tokens,
            )
            # The API may return no text content; honour the declared ``str`` return.
            return response.choices[0].message.content or ""
        except Exception as e:  # deliberate best-effort boundary: report and continue
            print(f"Error summarizing image: {e}")
            return f"Изображение на странице (ошибка обработки: {e})"


def process_images_in_documents(documents_data: list, image_summarizer: ImageSummarizer) -> list:
    """Add a ``"summary"`` entry to every image found in extracted documents.

    Each document is a dict with an optional ``"pages"`` list; each page is a
    dict with an optional ``"images"`` list; each image is a dict that may hold
    ``"base64"`` data and a ``"format"``. Images without base64 data are left
    untouched. Missing or ``None`` ``"pages"``/``"images"`` values are treated
    as empty, and a missing or ``None`` ``"format"`` defaults to ``"png"``.

    Args:
        documents_data: List of document content dictionaries.
        image_summarizer: ImageSummarizer instance used to describe each image.

    Returns:
        The same ``documents_data`` list, with image dicts updated in place.
    """
    for doc in documents_data:
        for page in doc.get("pages") or []:
            for image in page.get("images") or []:
                encoded = image.get("base64")
                if not encoded:
                    continue
                print(f"Summarizing image from page {page.get('page_number')}")
                image["summary"] = image_summarizer.summarize_image_base64(
                    encoded,
                    image.get("format") or "png",
                )
    return documents_data