Spaces:
Sleeping
Sleeping
| import base64 | |
| import io | |
| import os | |
| from PIL import Image | |
| # API Imports | |
| import openai | |
| import google.generativeai as genai | |
| from groq import Groq | |
| from dotenv import load_dotenv | |
# Run python-dotenv's loader before reading the keys, so values it places in
# the process environment are visible to the lookups below.
load_dotenv()

# Each name is None when its environment variable is not set.
openai_key, gemini_key, groq_key = (
    os.environ.get(env_name)
    for env_name in ("OPENAI_API_KEY_IC", "GEMINI_API_KEY_IC", "GROQ_API_KEY_IC")
)
class MultiModelCaptionGenerator:
    """Generate short image captions with OpenAI, Gemini, or Groq models.

    No provider is usable right after construction. Call
    :meth:`configure_apis` first; every ``generate_caption_*`` method raises
    ``ValueError`` until its own provider has been configured.
    """

    # One shared instruction for all providers, so the three back-ends cannot
    # drift apart when the wording is edited.
    CAPTION_PROMPT = (
        "Generate the caption for this image. "
        "IMPORTANT: Use 10 words or 50 characters maximum. "
        "Use only plain text - no emojis, special character but can use ASCII "
        "punctuations if you want. "
        "Be descriptive but concise."
    )

    # Pillow modes the PNG writer accepts; anything else (CMYK, YCbCr, F, ...)
    # must be converted before saving.
    _PNG_SAVE_MODES = frozenset({"1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"})

    def __init__(self):
        """Start with every provider unconfigured."""
        self.openai_client = None
        self.groq_client = None
        self.gemini_configured = False

    def configure_apis(self, openai_key: str|None = openai_key, groq_key: str|None = groq_key,
                       gemini_key: str|None = gemini_key):
        """Set up each provider for which a non-empty key is supplied.

        The defaults are the module-level keys as they were when this class
        was defined. A falsy key leaves that provider's current state
        untouched, so calling this again never un-configures a provider.

        Args:
            openai_key: OpenAI API key, or None/empty to skip OpenAI.
            groq_key: Groq API key, or None/empty to skip Groq.
            gemini_key: Gemini API key, or None/empty to skip Gemini.
        """
        if openai_key:
            self.openai_client = openai.OpenAI(api_key=openai_key)
        if groq_key:
            self.groq_client = Groq(api_key=groq_key)
        if gemini_key:
            # The Gemini SDK is configured globally rather than via a client
            # object, so only a flag is kept on the instance.
            genai.configure(api_key=gemini_key)
            self.gemini_configured = True

    def encode_image_base64(self, image: Image.Image) -> str:
        """Return ``image`` serialized as PNG and base64-encoded to text.

        Images whose mode the PNG writer cannot store (for example CMYK
        JPEGs) are converted to RGB first instead of failing on save.
        """
        if image.mode not in self._PNG_SAVE_MODES:
            image = image.convert("RGB")
        buffered = io.BytesIO()
        image.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode()

    def _vision_messages(self, image: Image.Image) -> list:
        """Build the single user turn (prompt + inline PNG) for chat APIs."""
        base64_image = self.encode_image_base64(image)
        return [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": self.CAPTION_PROMPT},
                    {
                        "type": "image_url",
                        # Data URL: there must be no space after the comma.
                        "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                    },
                ],
            }
        ]

    def generate_caption_openai(self, image: Image.Image, model: str = "gpt-4o-mini") -> str:
        """Caption ``image`` through the OpenAI chat completions endpoint.

        Args:
            image: Picture to describe.
            model: Chat model name; it must accept image input.

        Returns:
            The message content of the first returned choice.

        Raises:
            ValueError: If no OpenAI key has been configured.
        """
        if not self.openai_client:
            raise ValueError("OpenAI API key not configured.")
        response = self.openai_client.chat.completions.create(
            model=model,
            messages=self._vision_messages(image),
            max_tokens=300,
        )
        return response.choices[0].message.content

    def generate_caption_gemini(self, image: Image.Image,
                                model: str = "gemini-2.5-flash") -> str:
        """Caption ``image`` with a Gemini generative model.

        Args:
            image: Picture to describe; passed to the SDK as a PIL image.
            model: Gemini model name.

        Returns:
            The ``text`` attribute of the generation response.

        Raises:
            ValueError: If no Gemini key has been configured.
        """
        if not self.gemini_configured:
            raise ValueError("Gemini API key not configured!")
        model_instance = genai.GenerativeModel(model)
        response = model_instance.generate_content([self.CAPTION_PROMPT, image])
        return response.text

    def generate_caption_groq(self, image: Image.Image,
                              model: str = "meta-llama/llama-4-scout-17b-16e-instruct") -> str:
        """Caption ``image`` through the Groq chat completions endpoint.

        Args:
            image: Picture to describe.
            model: Groq-hosted model name; it must accept image input.

        Returns:
            The message content of the first returned choice.

        Raises:
            ValueError: If no Groq key has been configured.
        """
        if not self.groq_client:
            raise ValueError("GROQ API key is not configured!")
        completion = self.groq_client.chat.completions.create(
            model=model,
            messages=self._vision_messages(image),
            max_tokens=300,
            temperature=0.7,
        )
        return completion.choices[0].message.content