# Multi_LLM_Image_Captioning / caption_generation.py
# HarshitX's picture
# Upload 9 files
# 8a8f3ed verified
import base64
import io
import os
from PIL import Image
# API Imports
import openai
import google.generativeai as genai
from groq import Groq
from dotenv import load_dotenv
# load_dotenv() must run before the lookups below so that any values it
# adds to the process environment are visible to them.
load_dotenv()

# Provider API keys, read once at import time. Each is None when the
# corresponding environment variable is not set.
openai_key, gemini_key, groq_key = map(
    os.environ.get,
    ("OPENAI_API_KEY_IC", "GEMINI_API_KEY_IC", "GROQ_API_KEY_IC"),
)
class MultiModelCaptionGenerator:
    """Generate short image captions with OpenAI, Gemini, or Groq models.

    Call :meth:`configure_apis` before any ``generate_caption_*`` method;
    each of those raises ``ValueError`` until its provider is configured.
    """

    # Single source of truth for the instruction sent to every provider, so
    # the three backends cannot drift apart.
    CAPTION_PROMPT = (
        "Generate the caption for this image. IMPORTANT: Use 10 words or 50 "
        "characters maximum. Use only plain text - no emojis, special character "
        "but can use ASCII punctuations if you want. Be descriptive but concise."
    )

    # Completion-token cap used for the OpenAI and Groq chat requests.
    MAX_CAPTION_TOKENS = 300

    # Pillow modes its PNG writer rejects but that convert cleanly to RGB.
    _PNG_UNSUPPORTED_MODES = frozenset({"CMYK", "YCbCr", "HSV"})

    def __init__(self):
        """Start with no provider configured."""
        self.openai_client = None
        self.groq_client = None
        self.gemini_configured = False

    def configure_apis(self, openai_key: str|None = openai_key, groq_key: str|None = groq_key,
                       gemini_key: str|None = gemini_key) -> None:
        """Create a client for every provider whose key is truthy.

        The defaults are the module-level keys as they were when the class
        was defined. A falsy key leaves that provider's current state
        untouched.

        Args:
            openai_key: API key passed to ``openai.OpenAI``.
            groq_key: API key passed to ``Groq``.
            gemini_key: API key passed to ``genai.configure``.
        """
        if openai_key:
            self.openai_client = openai.OpenAI(api_key=openai_key)
        if groq_key:
            self.groq_client = Groq(api_key=groq_key)
        if gemini_key:
            genai.configure(api_key=gemini_key)
            self.gemini_configured = True

    def encode_image_base64(self, image: Image.Image) -> str:
        """Return *image* serialized as PNG and base64-encoded.

        Images in a mode the PNG writer cannot store (for example CMYK) are
        converted to RGB first instead of failing inside ``save``.

        Args:
            image: The PIL image to encode.

        Returns:
            The base64 text of the PNG bytes, without a ``data:`` prefix.
        """
        if image.mode in self._PNG_UNSUPPORTED_MODES:
            image = image.convert("RGB")
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode()

    def _build_vision_messages(self, image: Image.Image) -> list:
        """Build the single-turn chat payload shared by OpenAI and Groq."""
        # No whitespace is allowed after the comma in a base64 data URL.
        data_url = f"data:image/png;base64,{self.encode_image_base64(image)}"
        return [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": self.CAPTION_PROMPT},
                    {"type": "image_url", "image_url": {"url": data_url}},
                ],
            }
        ]

    @staticmethod
    def _first_choice_text(completion) -> str:
        """Return the first choice's message text, or ``""`` if it is empty."""
        # ``content`` is optional in the SDK response types; normalise so the
        # public methods honour their ``-> str`` annotation.
        return completion.choices[0].message.content or ""

    def generate_caption_openai(self, image: Image.Image, model: str = "gpt-4o-mini") -> str:
        """Caption *image* through the OpenAI chat completions API.

        Args:
            image: The PIL image to describe.
            model: Model name; use a vision-capable one such as ``gpt-4o``
                or ``gpt-4o-mini``.

        Returns:
            The caption text of the first choice ("" if the model sent none).

        Raises:
            ValueError: If no OpenAI client has been configured.
        """
        if not self.openai_client:
            raise ValueError("OpenAI API key not configured.")

        response = self.openai_client.chat.completions.create(
            model=model,
            messages=self._build_vision_messages(image),
            max_tokens=self.MAX_CAPTION_TOKENS,
        )
        return self._first_choice_text(response)

    def generate_caption_gemini(self, image: Image.Image,
                                model: str = "gemini-2.5-flash") -> str:
        """Caption *image* through the Gemini ``generate_content`` API.

        Args:
            image: The PIL image to describe; it is passed to the SDK as-is.
            model: Name of the Gemini model to instantiate.

        Returns:
            The ``text`` attribute of the SDK response.

        Raises:
            ValueError: If Gemini has not been configured.
        """
        if not self.gemini_configured:
            raise ValueError("Gemini API key not configured!")

        model_instance = genai.GenerativeModel(model)
        response = model_instance.generate_content([self.CAPTION_PROMPT, image])
        # NOTE(review): ``response.text`` is returned unchanged; confirm how
        # the SDK behaves when the response carries no text part.
        return response.text

    def generate_caption_groq(self, image: Image.Image,
                              model: str = "meta-llama/llama-4-scout-17b-16e-instruct") -> str:
        """Caption *image* through the Groq chat completions API.

        Args:
            image: The PIL image to describe.
            model: Name of the Groq-hosted model to call.

        Returns:
            The caption text of the first choice ("" if the model sent none).

        Raises:
            ValueError: If no Groq client has been configured.
        """
        if not self.groq_client:
            raise ValueError("GROQ API key is not configured!")

        completion = self.groq_client.chat.completions.create(
            model=model,
            messages=self._build_vision_messages(image),
            max_tokens=self.MAX_CAPTION_TOKENS,
            temperature=0.7,
        )
        return self._first_choice_text(completion)