# Hugging Face Space: raksama19/Alt_Text_Via_API (status when captured: Sleeping)
#
# File size: 7,321 Bytes

"""
Gemma 3n Image Description Test App
A simple Gradio app to test image description using Gemma 3n via Google Gemini API
"""

import gradio as gr
import os
import io
from PIL import Image
import google.generativeai as genai


def initialize_gemini():
    """Configure the Gemini client from the ``GEMINI_API_KEY`` environment variable.

    Returns:
        A ``(success, message)`` tuple. ``success`` is ``True`` once
        ``genai.configure`` has been called with the key. It is ``False`` when
        the variable is unset, empty or whitespace-only, or when
        ``genai.configure`` raises. ``message`` is a user-facing status string.
    """
    # Secrets pasted into a hosting dashboard or shell often carry a trailing
    # newline or spaces. Strip them so a blank value is reported as missing
    # and a real key is handed to the SDK without stray whitespace.
    api_key = (os.getenv('GEMINI_API_KEY') or '').strip()
    if not api_key:
        return False, "❌ GEMINI_API_KEY not found in environment variables"

    try:
        genai.configure(api_key=api_key)
    except Exception as e:
        # Boundary handler: callers display the message instead of crashing.
        return False, f"❌ Error configuring Gemini API: {str(e)}"
    return True, "✅ Gemini API configured successfully"


def generate_image_description(image):
    """Return a detailed description of *image* produced by Gemma 3n.

    Args:
        image: The uploaded picture, or ``None`` when nothing was uploaded.
            It must expose ``mode`` and ``convert`` (the UI passes a PIL
            image because the upload widget uses ``type="pil"``).

    Returns:
        The stripped model text on success. Otherwise a user-facing message:
        a prompt to upload an image, the API-configuration failure message,
        a "no description" notice when the model returned no usable text, or
        an error message for any other exception.
    """
    if image is None:
        return "Please upload an image first."

    try:
        # The API is (re)configured on every call; bail out with its message on failure.
        success, message = initialize_gemini()
        if not success:
            return message

        # Normalise the colour mode before sending the image to the model.
        if image.mode != 'RGB':
            image = image.convert('RGB')

        # Prompt asking for a comprehensive, structured description.
        prompt = """You are an expert at describing images in detail. Analyze this image and provide a comprehensive description that includes:

1. Main subjects and objects in the image
2. Colors, lighting, and composition
3. Setting and background details
4. Any text, numbers, or symbols visible
5. Mood, style, or artistic elements
6. Spatial relationships between elements

Provide a clear, detailed description that would help someone who cannot see the image understand what it contains."""

        model = genai.GenerativeModel('gemma-3n-e4b-it')
        response = model.generate_content([prompt, image])

        # Per the google-generativeai documentation, the ``text`` accessor
        # raises ValueError when the response carries no usable part (for
        # example a blocked reply). ``hasattr`` does not catch that, so read
        # the attribute defensively and treat such replies as "no text".
        try:
            raw_text = response.text
        except (AttributeError, ValueError):
            raw_text = None

        description = (raw_text or "").strip()
        if description:
            return description
        # Covers missing, empty and whitespace-only replies alike.
        return "❌ No description generated. Please try again."

    except Exception as e:
        # UI boundary: surface the failure as text in the output box.
        return f"❌ Error generating description: {str(e)}"


def create_alt_text(image):
    """Return concise, screen-reader-friendly alt text for *image*.

    Args:
        image: The uploaded picture, or ``None`` when nothing was uploaded.
            It must expose ``mode`` and ``convert`` (the UI passes a PIL
            image because the upload widget uses ``type="pil"``).

    Returns:
        The cleaned model text on success. Otherwise a user-facing message:
        a prompt to upload an image, the API-configuration failure message,
        a "no alt text" notice when nothing usable remains, or an error
        message for any other exception.
    """
    if image is None:
        return "Please upload an image first."

    try:
        # The API is (re)configured on every call; bail out with its message on failure.
        success, message = initialize_gemini()
        if not success:
            return message

        # Normalise the colour mode before sending the image to the model.
        if image.mode != 'RGB':
            image = image.convert('RGB')

        # Prompt asking for 1-2 sentence alt text without a lead-in phrase.
        prompt = """You are an accessibility expert creating alt text for images. Analyze this image and provide a clear, concise description suitable for screen readers.

Focus on:
- Main subject or content of the image
- Important details, text, or data shown
- Context that helps understand the image's purpose

Provide alt text in 1-2 sentences that is informative but concise. Start directly with the description without saying "This image shows" or similar phrases."""

        model = genai.GenerativeModel('gemma-3n-e4b-it')
        response = model.generate_content([prompt, image])

        # Per the google-generativeai documentation, the ``text`` accessor
        # raises ValueError when the response carries no usable part (for
        # example a blocked reply). ``hasattr`` does not catch that, so read
        # the attribute defensively and treat such replies as "no text".
        try:
            raw_text = response.text
        except (AttributeError, ValueError):
            raw_text = None

        alt_text = (raw_text or "").strip()

        # Drop a lead-in phrase if the model used one despite the prompt.
        prefixes_to_remove = ("This image shows", "The image shows", "This shows", "The figure shows")
        for prefix in prefixes_to_remove:
            head, remainder = alt_text[:len(prefix)], alt_text[len(prefix):]
            if head.casefold() != prefix.casefold():
                continue
            # Only strip at a word boundary so "This showstopper ..." is left intact.
            if remainder[:1].isalnum():
                continue
            # Remove the separator the lead-in left behind ("shows: a cat").
            remainder = remainder.lstrip(" \t:,-")
            # Removing the lead-in exposes a mid-sentence word; restore the capital.
            alt_text = remainder[:1].upper() + remainder[1:]
            break

        if alt_text:
            return alt_text
        # Covers missing, empty, whitespace-only and lead-in-only replies.
        return "❌ No alt text generated. Please try again."

    except Exception as e:
        # UI boundary: surface the failure as text in the output box.
        return f"❌ Error generating alt text: {str(e)}"


# Gradio UI: upload controls on the left, generated text on the right.
_CUSTOM_CSS = """
    .main-container { 
        max-width: 800px; 
        margin: 0 auto; 
    }
    .upload-container { 
        text-align: center; 
        padding: 20px;
        border: 2px dashed #e0e0e0;
        border-radius: 15px;
        margin: 20px 0;
    }
    """

with gr.Blocks(title="Gemma 3n Image Description Test", theme=gr.themes.Soft(), css=_CUSTOM_CSS) as demo:

    # Page header, including the API-key requirement.
    gr.Markdown(
        """
        # 🔍 Gemma 3n Image Description Test
        
        Upload an image and get AI-generated descriptions using **Gemma 3n** via Google Gemini API.
        
        **Requirements:** Set your `GEMINI_API_KEY` environment variable.
        """
    )

    with gr.Row():
        # Left column: the image picker and the two action buttons.
        with gr.Column(scale=1):
            with gr.Group(elem_classes="upload-container"):
                gr.Markdown("## 📷 Upload Image")
                image_input = gr.Image(label="Upload an image", type="pil", height=300)

                with gr.Row():
                    describe_btn = gr.Button("📝 Generate Detailed Description", variant="primary", size="lg")
                    alt_text_btn = gr.Button("♿ Generate Alt Text", variant="secondary", size="lg")

        # Right column: one textbox per kind of generated text.
        with gr.Column(scale=1):
            gr.Markdown("## 📋 Results")

            detailed_output = gr.Textbox(
                label="Detailed Description",
                placeholder="Detailed description will appear here...",
                lines=10, max_lines=15,
            )

            alt_text_output = gr.Textbox(
                label="Alt Text (Accessibility)",
                placeholder="Concise alt text will appear here...",
                lines=3, max_lines=5,
            )

    # The detailed description is generated only on request.
    describe_btn.click(fn=generate_image_description, inputs=[image_input], outputs=[detailed_output])

    # Alt text is generated on request and also whenever the image value changes.
    for register_listener in (alt_text_btn.click, image_input.change):
        register_listener(fn=create_alt_text, inputs=[image_input], outputs=[alt_text_output])

    # Footer with usage tips and setup instructions.
    gr.Markdown(
        """
        ---
        
        ### 💡 Tips:
        - **Detailed Description**: Comprehensive analysis perfect for content understanding
        - **Alt Text**: Concise description optimized for screen readers and accessibility
        - Images are automatically converted to JPEG format for processing
        - Both functions use the same Gemma 3n model with different prompts
        
        ### 🔧 Setup:
        ```bash
        export GEMINI_API_KEY="your-api-key-here"
        pip install -r requirements_gemma_test.txt
        python gradio_gemma_alt_text.py
        ```
        """
    )


if __name__ == "__main__":
    # Print whether the API key could be configured; the app launches either way.
    _configured, startup_message = initialize_gemini()
    print(f"Startup check: {startup_message}")

    # Listen on all interfaces on port 7860, without a public share link,
    # and show handler errors in the UI.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    demo.launch(**launch_options)