Spaces:

HarshitX
/

Multi_LLM_Image_Captioning

Sleeping

File size: 4,775 Bytes

8a8f3ed

import base64
import io
import os
from PIL import Image

# API Imports
import openai
import google.generativeai as genai
from groq import Groq

from dotenv import load_dotenv

# Let python-dotenv populate the process environment before the keys are read.
load_dotenv()

# Provider API keys; each one is None when its environment variable is unset.
openai_key = os.environ.get("OPENAI_API_KEY_IC")
gemini_key = os.environ.get("GEMINI_API_KEY_IC")
groq_key = os.environ.get("GROQ_API_KEY_IC")

class MultiModelCaptionGenerator:
    """Generate short image captions with OpenAI, Gemini, or Groq vision models.

    A fresh instance has no provider configured. Call :meth:`configure_apis`
    first; every ``generate_caption_*`` method raises ``ValueError`` when its
    provider has not been configured.
    """

    # Single source of truth for the instruction sent to every provider, so
    # the three backends cannot drift apart.
    _CAPTION_PROMPT = (
        "Generate the caption for this image. "
        "IMPORTANT: Use 10 words or 50 characters maximum. "
        "Use only plain text - no emojis, special character but can use "
        "ASCII punctuations if you want. Be descriptive but concise."
    )

    # Generation cap passed to the chat-completions style providers.
    _MAX_TOKENS = 300

    def __init__(self):
        self.openai_client = None
        self.groq_client = None
        self.gemini_configured = False

    def configure_apis(
        self,
        openai_key: str | None = openai_key,
        groq_key: str | None = groq_key,
        gemini_key: str | None = gemini_key,
    ):
        """Create the provider clients for every key that is supplied.

        Each default is the value of the same-named module-level variable,
        captured when this method was defined. A provider whose key is
        falsy is skipped and keeps whatever state it already had.

        Args:
            openai_key: API key used to build the ``openai.OpenAI`` client.
            groq_key: API key used to build the ``Groq`` client.
            gemini_key: API key passed to ``genai.configure``.
        """
        if openai_key:
            self.openai_client = openai.OpenAI(api_key=openai_key)

        if groq_key:
            self.groq_client = Groq(api_key=groq_key)

        if gemini_key:
            # genai.configure is called on the module itself, so the key is
            # not stored on this instance; only the flag is.
            genai.configure(api_key=gemini_key)
            self.gemini_configured = True

    def encode_image_base64(self, image: Image.Image) -> str:
        """Return *image* serialized as PNG and base64-encoded.

        Args:
            image: The PIL image to encode.

        Returns:
            The base64 text of the PNG bytes (no ``data:`` URL prefix).
        """
        buffered = io.BytesIO()
        try:
            image.save(buffered, format="PNG")
        except OSError:
            # Pillow raises OSError when the image mode cannot be written as
            # PNG (e.g. CMYK from a JPEG). Retry on a clean buffer with an
            # RGB copy instead of failing the whole caption request.
            buffered = io.BytesIO()
            image.convert("RGB").save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode("ascii")

    def _build_vision_messages(self, image: Image.Image) -> list:
        """Build the single-turn user message (prompt + inline PNG data URL)."""
        base64_image = self.encode_image_base64(image)
        return [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": self._CAPTION_PROMPT},
                    {
                        "type": "image_url",
                        "image_url": {
                            # No space after the comma in the data URL.
                            "url": f"data:image/png;base64,{base64_image}"
                        },
                    },
                ],
            }
        ]

    def generate_caption_openai(self, image: Image.Image, model: str = "gpt-4o-mini") -> str:
        """Caption *image* through the OpenAI chat-completions client.

        Args:
            image: The PIL image to caption.
            model: Name of the OpenAI model to call.

        Returns:
            The content of the first choice, or ``""`` if it is empty/None.

        Raises:
            ValueError: If no OpenAI client has been configured.
        """
        if not self.openai_client:
            raise ValueError("OpenAI API key not configured.")

        response = self.openai_client.chat.completions.create(
            model=model,
            messages=self._build_vision_messages(image),
            max_tokens=self._MAX_TOKENS,
        )
        # content can be None; honor the declared ``str`` return type.
        return response.choices[0].message.content or ""

    def generate_caption_gemini(
        self,
        image: Image.Image,
        model: str = "gemini-2.5-flash",
    ) -> str:
        """Caption *image* through ``google.generativeai``.

        Args:
            image: The PIL image to caption; passed to the SDK as-is.
            model: Name handed to ``genai.GenerativeModel``.

        Returns:
            The ``text`` attribute of the generation response.

        Raises:
            ValueError: If Gemini has not been configured.
        """
        if not self.gemini_configured:
            raise ValueError("Gemini API key not configured!")

        model_instance = genai.GenerativeModel(model)
        response = model_instance.generate_content([self._CAPTION_PROMPT, image])
        return response.text

    def generate_caption_groq(
        self,
        image: Image.Image,
        model: str = "meta-llama/llama-4-scout-17b-16e-instruct",
    ) -> str:
        """Caption *image* through the Groq chat-completions client.

        Args:
            image: The PIL image to caption.
            model: Name of the Groq-hosted model to call.

        Returns:
            The content of the first choice, or ``""`` if it is empty/None.

        Raises:
            ValueError: If no Groq client has been configured.
        """
        if not self.groq_client:
            raise ValueError("GROQ API key is not configured!")

        completion = self.groq_client.chat.completions.create(
            model=model,
            messages=self._build_vision_messages(image),
            max_tokens=self._MAX_TOKENS,
            temperature=0.7,
        )
        # content can be None; honor the declared ``str`` return type.
        return completion.choices[0].message.content or ""