raksama19 committed on
Commit
2b5d46f
·
verified ·
1 Parent(s): 2f49d2e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +252 -0
app.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gemma 3n Image Description Test App
3
+ A simple Gradio app to test image description using Gemma 3n via Google Gemini API
4
+ """
5
+
6
+ import gradio as gr
7
+ import os
8
+ import io
9
+ from PIL import Image
10
+ import google.generativeai as genai
11
+ from google.generativeai import types
12
+
13
+
14
def initialize_gemini():
    """Configure the Gemini client from the ``GEMINI_API_KEY`` environment variable.

    Returns:
        tuple[bool, str]: ``(ok, message)``. ``ok`` is True only when a
        non-blank key was found and ``genai.configure`` did not raise.
        ``message`` is a human-readable status string that callers show in
        the UI or print to the console.
    """
    # A value that is unset, empty, or only whitespace (e.g. a stray newline
    # from a pasted secret) is treated as "no key" instead of being handed to
    # the client.
    api_key = (os.getenv('GEMINI_API_KEY') or '').strip()
    if not api_key:
        return False, "❌ GEMINI_API_KEY not found in environment variables"

    # Only the configure call can fail here, so keep the try body minimal.
    try:
        genai.configure(api_key=api_key)
    except Exception as e:
        return False, f"❌ Error configuring Gemini API: {str(e)}"

    return True, "✅ Gemini API configured successfully"
25
+
26
+
27
def generate_image_description(image):
    """Generate a detailed description of an uploaded image using Gemma 3n.

    Args:
        image: The PIL image supplied by the Gradio image component, or
            ``None`` when nothing has been uploaded.

    Returns:
        str: The model's description with surrounding whitespace removed, or
        a user-facing message when no image was given, the API key is
        missing, the model returned no text, or any step raised.
    """
    if image is None:
        return "Please upload an image first."

    try:
        # Configure the client on every call so a missing key is reported
        # in the UI rather than only at startup.
        success, message = initialize_gemini()
        if not success:
            return message

        # The image is saved as JPEG below, so normalise other modes to RGB.
        if image.mode != 'RGB':
            image = image.convert('RGB')

        # Serialise the PIL image to in-memory JPEG bytes.
        buffered = io.BytesIO()
        image.save(buffered, format="JPEG")
        image_bytes = buffered.getvalue()

        prompt = (
            "You are an expert at describing images in detail. Analyze this "
            "image and provide a comprehensive description that includes:\n"
            "\n"
            "1. Main subjects and objects in the image\n"
            "2. Colors, lighting, and composition\n"
            "3. Setting and background details\n"
            "4. Any text, numbers, or symbols visible\n"
            "5. Mood, style, or artistic elements\n"
            "6. Spatial relationships between elements\n"
            "\n"
            "Provide a clear, detailed description that would help someone "
            "who cannot see the image understand what it contains."
        )

        # The google.generativeai SDK takes inline binary data as a
        # {"mime_type", "data"} mapping; `types.Part.from_bytes` belongs to
        # the separate google-genai SDK and is not available on this module.
        image_part = {'mime_type': 'image/jpeg', 'data': image_bytes}

        model = genai.GenerativeModel('gemma-3n-e4b-it')
        response = model.generate_content([image_part, prompt])

        if hasattr(response, 'text') and response.text:
            return response.text.strip()
        return "❌ No description generated. Please try again."

    except Exception as e:
        # Top-level UI boundary: surface the failure as text in the output box.
        return f"❌ Error generating description: {str(e)}"
76
+
77
+
78
# Lead-in phrases the prompt asks the model to avoid; removed if they appear anyway.
_ALT_TEXT_PREFIXES = (
    "This image shows",
    "The image shows",
    "This shows",
    "The figure shows",
)


def _strip_alt_text_prefix(text):
    """Remove one leading "This image shows"-style phrase and re-capitalise the rest."""
    for prefix in _ALT_TEXT_PREFIXES:
        # Compare case-insensitively so "this image shows ..." is caught too.
        if text[:len(prefix)].lower() == prefix.lower():
            remainder = text[len(prefix):].lstrip(" :,-").strip()
            if not remainder:
                # Nothing useful would be left; keep the model's text as-is.
                return text
            return remainder[0].upper() + remainder[1:]
    return text


def create_alt_text(image):
    """Generate concise alt text for an uploaded image using Gemma 3n.

    Args:
        image: The PIL image supplied by the Gradio image component, or
            ``None`` when nothing has been uploaded.

    Returns:
        str: One or two sentences of alt text with any "This image shows"
        style lead-in removed, or a user-facing message when no image was
        given, the API key is missing, the model returned no text, or any
        step raised.
    """
    if image is None:
        return "Please upload an image first."

    try:
        # Configure the client on every call so a missing key is reported
        # in the UI rather than only at startup.
        success, message = initialize_gemini()
        if not success:
            return message

        # The image is saved as JPEG below, so normalise other modes to RGB.
        if image.mode != 'RGB':
            image = image.convert('RGB')

        # Serialise the PIL image to in-memory JPEG bytes.
        buffered = io.BytesIO()
        image.save(buffered, format="JPEG")
        image_bytes = buffered.getvalue()

        prompt = (
            "You are an accessibility expert creating alt text for images. "
            "Analyze this image and provide a clear, concise description "
            "suitable for screen readers.\n"
            "\n"
            "Focus on:\n"
            "- Main subject or content of the image\n"
            "- Important details, text, or data shown\n"
            "- Context that helps understand the image's purpose\n"
            "\n"
            "Provide alt text in 1-2 sentences that is informative but "
            "concise. Start directly with the description without saying "
            "\"This image shows\" or similar phrases."
        )

        # The google.generativeai SDK takes inline binary data as a
        # {"mime_type", "data"} mapping; `types.Part.from_bytes` belongs to
        # the separate google-genai SDK and is not available on this module.
        image_part = {'mime_type': 'image/jpeg', 'data': image_bytes}

        model = genai.GenerativeModel('gemma-3n-e4b-it')
        response = model.generate_content([image_part, prompt])

        if hasattr(response, 'text') and response.text:
            return _strip_alt_text_prefix(response.text.strip())
        return "❌ No alt text generated. Please try again."

    except Exception as e:
        # Top-level UI boundary: surface the failure as text in the output box.
        return f"❌ Error generating alt text: {str(e)}"
132
+
133
+
134
# Build the Gradio interface: an upload column on the left and two result
# text boxes on the right, wired to the two generation functions.
with gr.Blocks(
    title="Gemma 3n Image Description Test",
    theme=gr.themes.Soft(),
    css="""
    .main-container {
        max-width: 800px;
        margin: 0 auto;
    }
    .upload-container {
        text-align: center;
        padding: 20px;
        border: 2px dashed #e0e0e0;
        border-radius: 15px;
        margin: 20px 0;
    }
    """
) as demo:

    gr.Markdown(
        """
        # 🔍 Gemma 3n Image Description Test

        Upload an image and get AI-generated descriptions using **Gemma 3n** via Google Gemini API.

        **Requirements:** Set your `GEMINI_API_KEY` environment variable.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            with gr.Group(elem_classes="upload-container"):
                gr.Markdown("## 📷 Upload Image")
                # type="pil" so the handlers receive a PIL image (they call
                # .mode / .convert / .save on it).
                image_input = gr.Image(
                    label="Upload an image",
                    type="pil",
                    height=300
                )

            with gr.Row():
                describe_btn = gr.Button(
                    "📝 Generate Detailed Description",
                    variant="primary",
                    size="lg"
                )
                alt_text_btn = gr.Button(
                    "♿ Generate Alt Text",
                    variant="secondary",
                    size="lg"
                )

        with gr.Column(scale=1):
            gr.Markdown("## 📋 Results")

            detailed_output = gr.Textbox(
                label="Detailed Description",
                placeholder="Detailed description will appear here...",
                lines=10,
                max_lines=15
            )

            alt_text_output = gr.Textbox(
                label="Alt Text (Accessibility)",
                placeholder="Concise alt text will appear here...",
                lines=3,
                max_lines=5
            )

    # Event handlers: each button fills its own output box.
    describe_btn.click(
        fn=generate_image_description,
        inputs=[image_input],
        outputs=[detailed_output]
    )

    alt_text_btn.click(
        fn=create_alt_text,
        inputs=[image_input],
        outputs=[alt_text_output]
    )

    # Auto-generate alt text whenever the image component's value changes.
    image_input.change(
        fn=create_alt_text,
        inputs=[image_input],
        outputs=[alt_text_output]
    )

    gr.Markdown(
        """
        ---

        ### 💡 Tips:
        - **Detailed Description**: Comprehensive analysis perfect for content understanding
        - **Alt Text**: Concise description optimized for screen readers and accessibility
        - Images are automatically converted to JPEG format for processing
        - Both functions use the same Gemma 3n model with different prompts

        ### 🔧 Setup:
        ```bash
        export GEMINI_API_KEY="your-api-key-here"
        pip install -r requirements_gemma_test.txt
        python app.py
        ```
        """
    )
240
+
241
+
242
if __name__ == "__main__":
    # Report the API-key status on the console before the server starts; the
    # app is launched either way, and handlers re-check the key per request.
    success, message = initialize_gemini()
    print(f"Startup check: {message}")

    # Launch settings collected in one place, then passed through unchanged.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    demo.launch(**launch_options)