import base64
import io
import os

from PIL import Image

# API Imports
import openai
import google.generativeai as genai
from groq import Groq
from dotenv import load_dotenv

load_dotenv()

# API keys are read once, at import time, after load_dotenv() has run.
openai_key = os.getenv("OPENAI_API_KEY_IC")
gemini_key = os.getenv("GEMINI_API_KEY_IC")
groq_key = os.getenv("GROQ_API_KEY_IC")


class MultiModelCaptionGenerator:
    """Handles caption generation using multiple models.

    Supports OpenAI, Gemini and Groq. A provider can only be used after
    ``configure_apis`` has been called with a key for it; otherwise the
    matching ``generate_caption_*`` method raises ``ValueError``.
    """

    # Single source of truth for the instruction sent to every provider, so
    # the three backends cannot drift apart.
    CAPTION_PROMPT = (
        "Generate the caption for this image. IMPORTANT: Use 10 words or 50 "
        "characters maximum. Use only plain text - no emojis, special character "
        "but can use ASCII punctuations if you want. Be descriptive but concise."
    )

    def __init__(self):
        """Create a generator with no provider configured yet."""
        self.openai_client = None
        self.groq_client = None
        self.gemini_configured = False

    def configure_apis(
        self,
        openai_key: str | None = openai_key,
        groq_key: str | None = groq_key,
        gemini_key: str | None = gemini_key,
    ):
        """Set up the client for every provider whose key is provided.

        The defaults are the module-level keys read from the environment at
        import time. A falsy key leaves that provider's current state
        untouched.

        Args:
            openai_key: OpenAI API key, or None to skip OpenAI.
            groq_key: Groq API key, or None to skip Groq.
            gemini_key: Gemini API key, or None to skip Gemini.
        """
        if openai_key:
            self.openai_client = openai.OpenAI(api_key=openai_key)

        if groq_key:
            self.groq_client = Groq(api_key=groq_key)

        if gemini_key:
            genai.configure(api_key=gemini_key)
            self.gemini_configured = True

    def encode_image_base64(self, image: Image.Image) -> str:
        """Return *image* serialized as PNG and base64-encoded.

        Args:
            image: The image to encode.

        Returns:
            The base64 text of the PNG bytes (no ``data:`` prefix).
        """
        buffered = io.BytesIO()
        try:
            image.save(buffered, format="PNG")
        except OSError:
            # Pillow raises OSError for modes the PNG writer cannot store
            # (e.g. CMYK JPEGs). Retry once as RGB into a fresh buffer; any
            # other failure simply propagates from the retry.
            buffered = io.BytesIO()
            image.convert("RGB").save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode()

    def _build_vision_messages(self, image: Image.Image) -> list:
        """Build the chat ``messages`` payload shared by OpenAI and Groq.

        The payload is a single user message holding the caption prompt and
        the image as a base64 PNG data URL.
        """
        base64_image = self.encode_image_base64(image)
        return [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": self.CAPTION_PROMPT},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{base64_image}",
                        },
                    },
                ],
            }
        ]

    def generate_caption_openai(
        self, image: Image.Image, model: str = "gpt-4o-mini"
    ) -> str:
        """Generate a caption for *image* with an OpenAI chat model.

        Args:
            image: The image to caption.
            model: OpenAI model name; it must accept image input.

        Returns:
            The caption text, or an empty string if the response carried no
            text content.

        Raises:
            ValueError: If no OpenAI API key has been configured.
        """
        if not self.openai_client:
            raise ValueError("OpenAI API key not configured.")

        response = self.openai_client.chat.completions.create(
            model=model,
            messages=self._build_vision_messages(image),
            max_tokens=300,
        )
        # ``content`` may be None; honour the declared ``str`` return type.
        return response.choices[0].message.content or ""

    def generate_caption_gemini(
        self, image: Image.Image, model: str = "gemini-2.5-flash"
    ) -> str:
        """Generate a caption for *image* with a Gemini model.

        Args:
            image: The image to caption; passed to the SDK as-is.
            model: Gemini model name.

        Returns:
            The ``text`` of the Gemini response.

        Raises:
            ValueError: If no Gemini API key has been configured.
        """
        if not self.gemini_configured:
            raise ValueError("Gemini API key not configured!")

        model_instance = genai.GenerativeModel(model)
        response = model_instance.generate_content([self.CAPTION_PROMPT, image])
        return response.text

    def generate_caption_groq(
        self,
        image: Image.Image,
        model: str = "meta-llama/llama-4-scout-17b-16e-instruct",
    ) -> str:
        """Generate a caption for *image* with a Groq-hosted model.

        Args:
            image: The image to caption.
            model: Groq model name; it must accept image input.

        Returns:
            The caption text, or an empty string if the response carried no
            text content.

        Raises:
            ValueError: If no Groq API key has been configured.
        """
        if not self.groq_client:
            raise ValueError("GROQ API key is not configured!")

        completion = self.groq_client.chat.completions.create(
            model=model,
            messages=self._build_vision_messages(image),
            max_tokens=300,
            temperature=0.7,
        )
        # ``content`` may be None; honour the declared ``str`` return type.
        return completion.choices[0].message.content or ""