File size: 7,321 Bytes
2b5d46f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f63d98f
2b5d46f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f63d98f
2b5d46f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
"""
Gemma 3n Image Description Test App
A simple Gradio app to test image description using Gemma 3n via Google Gemini API
"""

import gradio as gr
import os
import io
from PIL import Image
import google.generativeai as genai


def initialize_gemini():
    """Configure the Gemini client from the ``GEMINI_API_KEY`` environment variable.

    Returns:
        tuple[bool, str]: ``(success, message)``. ``success`` is ``True`` only
        when a non-blank key was found and ``genai.configure`` accepted it;
        ``message`` is a human-readable status string suitable for display.
        Failures of ``genai.configure`` are reported through the message
        instead of being raised.
    """
    # Surrounding whitespace (e.g. a trailing newline copied from a secrets
    # file) is never part of a key, and a blank value is the same as no key.
    api_key = (os.getenv('GEMINI_API_KEY') or '').strip()
    if not api_key:
        return False, "❌ GEMINI_API_KEY not found in environment variables"

    # Only the client call can fail here, so keep the try body minimal.
    try:
        genai.configure(api_key=api_key)
    except Exception as e:
        return False, f"❌ Error configuring Gemini API: {e!s}"

    return True, "✅ Gemini API configured successfully"


def generate_image_description(image):
    """Return a detailed description of ``image`` produced by Gemma 3n.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.
            The code reads ``image.mode`` and may call ``image.convert``.

    Returns:
        str: The stripped model output on success; otherwise a user-facing
        message (missing upload, API configuration failure, empty response,
        or the text of any exception raised along the way).
    """
    if image is None:
        return "Please upload an image first."

    try:
        # The API is (re)configured on every request; bail out with its
        # status message if that fails.
        configured, status = initialize_gemini()
        if not configured:
            return status

        # The model is always handed an RGB image.
        rgb_image = image if image.mode == 'RGB' else image.convert('RGB')

        instructions = """You are an expert at describing images in detail. Analyze this image and provide a comprehensive description that includes:

1. Main subjects and objects in the image
2. Colors, lighting, and composition
3. Setting and background details
4. Any text, numbers, or symbols visible
5. Mood, style, or artistic elements
6. Spatial relationships between elements

Provide a clear, detailed description that would help someone who cannot see the image understand what it contains."""

        gemma = genai.GenerativeModel('gemma-3n-e4b-it')
        reply = gemma.generate_content([instructions, rgb_image])

        description = reply.text if hasattr(reply, 'text') else None
        if not description:
            return "❌ No description generated. Please try again."
        return description.strip()

    except Exception as err:
        return f"❌ Error generating description: {err!s}"


def create_alt_text(image):
    """Return short, screen-reader-friendly alt text for ``image``.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.
            The code reads ``image.mode`` and may call ``image.convert``.

    Returns:
        str: The model's alt text with one leading boilerplate phrase
        (e.g. "This image shows") removed, or a user-facing message when
        there is no upload, configuration fails, the response is empty,
        or an exception is raised.
    """
    if image is None:
        return "Please upload an image first."

    try:
        # The API is (re)configured on every request; bail out with its
        # status message if that fails.
        configured, status = initialize_gemini()
        if not configured:
            return status

        # The model is always handed an RGB image.
        rgb_image = image if image.mode == 'RGB' else image.convert('RGB')

        instructions = """You are an accessibility expert creating alt text for images. Analyze this image and provide a clear, concise description suitable for screen readers.

Focus on:
- Main subject or content of the image
- Important details, text, or data shown
- Context that helps understand the image's purpose

Provide alt text in 1-2 sentences that is informative but concise. Start directly with the description without saying "This image shows" or similar phrases."""

        gemma = genai.GenerativeModel('gemma-3n-e4b-it')
        reply = gemma.generate_content([instructions, rgb_image])

        raw_text = reply.text if hasattr(reply, 'text') else None
        if not raw_text:
            return "❌ No alt text generated. Please try again."

        cleaned = raw_text.strip()

        # The model sometimes ignores the prompt and opens with a stock
        # phrase; drop the first one that matches (at most one is removed).
        boilerplate = ("This image shows", "The image shows", "This shows", "The figure shows")
        matched = next(
            (phrase for phrase in boilerplate if cleaned.startswith(phrase)),
            None,
        )
        if matched is not None:
            cleaned = cleaned[len(matched):].strip()
        return cleaned

    except Exception as err:
        return f"❌ Error generating alt text: {err!s}"


# Build the Gradio interface: an upload column with two action buttons on the
# left and two result text boxes on the right.
with gr.Blocks(
    title="Gemma 3n Image Description Test",
    theme=gr.themes.Soft(),
    css="""
    .main-container { 
        max-width: 800px; 
        margin: 0 auto; 
    }
    .upload-container { 
        text-align: center; 
        padding: 20px;
        border: 2px dashed #e0e0e0;
        border-radius: 15px;
        margin: 20px 0;
    }
    """
) as demo:
    
    # Page header
    gr.Markdown(
        """
        # 🔍 Gemma 3n Image Description Test
        
        Upload an image and get AI-generated descriptions using **Gemma 3n** via Google Gemini API.
        
        **Requirements:** Set your `GEMINI_API_KEY` environment variable.
        """
    )
    
    with gr.Row():
        # Left column: image upload and the two trigger buttons
        with gr.Column(scale=1):
            with gr.Group(elem_classes="upload-container"):
                gr.Markdown("## 📷 Upload Image")
                # type="pil" so the handlers receive an object exposing
                # .mode / .convert(), which they rely on.
                image_input = gr.Image(
                    label="Upload an image",
                    type="pil",
                    height=300
                )
                
                with gr.Row():
                    describe_btn = gr.Button(
                        "📝 Generate Detailed Description", 
                        variant="primary",
                        size="lg"
                    )
                    alt_text_btn = gr.Button(
                        "♿ Generate Alt Text", 
                        variant="secondary",
                        size="lg"
                    )
        
        # Right column: output text boxes
        with gr.Column(scale=1):
            gr.Markdown("## 📋 Results")
            
            detailed_output = gr.Textbox(
                label="Detailed Description",
                placeholder="Detailed description will appear here...",
                lines=10,
                max_lines=15
            )
            
            alt_text_output = gr.Textbox(
                label="Alt Text (Accessibility)",
                placeholder="Concise alt text will appear here...",
                lines=3,
                max_lines=5
            )
    
    # Event handlers: each button fills its own output box
    describe_btn.click(
        fn=generate_image_description,
        inputs=[image_input],
        outputs=[detailed_output]
    )
    
    alt_text_btn.click(
        fn=create_alt_text,
        inputs=[image_input],
        outputs=[alt_text_output]
    )
    
    # Auto-generate alt text whenever the image input changes
    image_input.change(
        fn=create_alt_text,
        inputs=[image_input],
        outputs=[alt_text_output]
    )
    
    # Footer: usage tips and setup instructions
    gr.Markdown(
        """
        ---
        
        ### 💡 Tips:
        - **Detailed Description**: Comprehensive analysis perfect for content understanding
        - **Alt Text**: Concise description optimized for screen readers and accessibility
        - Images are automatically converted to RGB mode before processing
        - Both functions use the same Gemma 3n model with different prompts
        
        ### 🔧 Setup:
        ```bash
        export GEMINI_API_KEY="your-api-key-here"
        pip install -r requirements_gemma_test.txt
        python gradio_gemma_alt_text.py
        ```
        """
    )


if __name__ == "__main__":
    # Report the API-key status up front. The outcome is informational only:
    # the app is launched regardless of whether the check succeeded.
    _, startup_status = initialize_gemini()
    print(f"Startup check: {startup_status}")

    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    demo.launch(**launch_options)