Spaces:
Sleeping
Sleeping
File size: 4,775 Bytes
8a8f3ed |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
import base64
import io
import os
from PIL import Image
# API Imports
import openai
import google.generativeai as genai
from groq import Groq
from dotenv import load_dotenv
# Let python-dotenv populate the process environment first, so the lookups
# below can see values it provides.
load_dotenv()

# Per-provider API keys; each one is None when its variable is not set.
# These become the default arguments of
# MultiModelCaptionGenerator.configure_apis.
openai_key = os.environ.get("OPENAI_API_KEY_IC")
gemini_key = os.environ.get("GEMINI_API_KEY_IC")
groq_key = os.environ.get("GROQ_API_KEY_IC")
class MultiModelCaptionGenerator:
    """Generate short image captions with OpenAI, Gemini, or Groq models.

    Call :meth:`configure_apis` once to create the provider clients, then use
    one of the ``generate_caption_*`` methods. Each of those raises
    ``ValueError`` when its provider has not been configured.
    """

    # Single source of truth for the instruction sent to every provider, so
    # the three backends cannot drift apart.
    CAPTION_PROMPT = (
        "Generate the caption for this image. IMPORTANT: Use 10 words or 50 "
        "characters maximum. Use only plain text - no emojis, special character "
        "but can use ASCII punctuations if you want. Be descriptive but concise."
    )

    # Upper bound on generated tokens for the chat-completions style backends.
    MAX_TOKENS = 300

    def __init__(self):
        """Start with no provider configured."""
        self.openai_client = None
        self.groq_client = None
        self.gemini_configured = False

    def configure_apis(self, openai_key: str|None = openai_key, groq_key: str|None = groq_key,
                       gemini_key: str|None = gemini_key):
        """Create a client for every provider whose API key is truthy.

        The defaults are the module-level keys read from the environment at
        import time. Providers whose key is ``None`` or empty are left
        untouched.

        Args:
            openai_key: API key used to build the OpenAI client.
            groq_key: API key used to build the Groq client.
            gemini_key: API key passed to ``genai.configure``.
        """
        if openai_key:
            self.openai_client = openai.OpenAI(api_key=openai_key)
        if groq_key:
            self.groq_client = Groq(api_key=groq_key)
        if gemini_key:
            genai.configure(api_key=gemini_key)
            self.gemini_configured = True

    def encode_image_base64(self, image: Image.Image) -> str:
        """Return *image* serialized as PNG and base64-encoded.

        If the image cannot be written as PNG in its current mode, it is
        converted to RGB and the save is retried once.

        Args:
            image: The PIL image to encode.

        Returns:
            The base64 text of the PNG bytes (no ``data:`` prefix).

        Raises:
            OSError: If the PNG save fails even after the RGB conversion.
        """
        buffer = io.BytesIO()
        try:
            image.save(buffer, format="PNG")
        except OSError:
            # The PNG writer rejects some modes (e.g. CMYK from JPEG input).
            # Retry on a fresh buffer so no partial output from the failed
            # attempt leaks into the result.
            buffer = io.BytesIO()
            image.convert("RGB").save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode()

    def _build_vision_messages(self, image: Image.Image) -> list:
        """Build the chat ``messages`` payload shared by OpenAI and Groq.

        The payload is a single user turn holding the caption prompt and the
        image embedded as a base64 PNG data URL.
        """
        base64_image = self.encode_image_base64(image)
        return [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": self.CAPTION_PROMPT,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            # No space after the comma in the data URL.
                            "url": f"data:image/png;base64,{base64_image}",
                        },
                    },
                ],
            }
        ]

    def generate_caption_openai(self, image: Image.Image, model: str = "gpt-4o-mini") -> str:
        """Caption *image* through the OpenAI chat-completions API.

        Args:
            image: The PIL image to caption.
            model: OpenAI model name; the image is sent as an ``image_url``
                content part, so the model must accept image input.

        Returns:
            The message content of the first returned choice.

        Raises:
            ValueError: If no OpenAI client has been configured.
        """
        if not self.openai_client:
            raise ValueError("OpenAI API key not configured.")
        response = self.openai_client.chat.completions.create(
            model=model,
            messages=self._build_vision_messages(image),
            max_tokens=self.MAX_TOKENS,
        )
        return response.choices[0].message.content

    def generate_caption_gemini(self, image: Image.Image,
                                model: str = "gemini-2.5-flash") -> str:
        """Caption *image* through the Gemini ``generate_content`` API.

        Args:
            image: The PIL image to caption; it is passed to the SDK as-is.
            model: Gemini model name.

        Returns:
            The ``text`` attribute of the response.

        Raises:
            ValueError: If Gemini has not been configured.
        """
        if not self.gemini_configured:
            raise ValueError("Gemini API key not configured!")
        model_instance = genai.GenerativeModel(model)
        response = model_instance.generate_content([self.CAPTION_PROMPT, image])
        return response.text

    def generate_caption_groq(self, image: Image.Image,
                              model: str = "meta-llama/llama-4-scout-17b-16e-instruct") -> str:
        """Caption *image* through the Groq chat-completions API.

        Args:
            image: The PIL image to caption.
            model: Groq model name; the image is sent as an ``image_url``
                content part, so the model must accept image input.

        Returns:
            The message content of the first returned choice.

        Raises:
            ValueError: If no Groq client has been configured.
        """
        if not self.groq_client:
            raise ValueError("GROQ API key is not configured!")
        completion = self.groq_client.chat.completions.create(
            model=model,
            messages=self._build_vision_messages(image),
            max_tokens=self.MAX_TOKENS,
            temperature=0.7,
        )
        return completion.choices[0].message.content