kkkai123456 committed on
Commit
0263be8
·
verified ·
1 Parent(s): 8c4c5ca

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +300 -0
app.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import gradio as gr
import torch
from PIL import Image
from transformers import (
    BlipProcessor, BlipForConditionalGeneration,
    BlipForQuestionAnswering,
    CLIPProcessor, CLIPModel
)
import numpy as np

# ==================== Model Loading ====================
# The processors and models below are created once, at import time, and are
# shared as module-level globals by every handler function in this file.
print("🔄 Loading models...")

# BLIP Image Captioning Model
caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# BLIP Visual Question Answering Model
vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

# CLIP Image Classification Model
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

print("✅ Models loaded successfully!")
# ==================== Function Definitions ====================

def generate_caption(image):
    """Generate a text caption for an image with the BLIP captioning model.

    Args:
        image: The image to describe, or ``None`` when nothing was uploaded.

    Returns:
        A display string: the caption prefixed with a header line on
        success, otherwise a message starting with the cross-mark emoji.
    """
    if image is None:
        return "❌ Please upload an image first"

    try:
        # Process image
        inputs = caption_processor(image, return_tensors="pt")

        # Generate caption (capped at 50 tokens)
        out = caption_model.generate(**inputs, max_length=50)
        caption = caption_processor.decode(out[0], skip_special_tokens=True)

        return f"📝 Image Caption:\n{caption}"

    except Exception as e:
        # UI boundary: report the failure in the output box instead of
        # letting the exception propagate to the caller.
        return f"❌ Processing failed: {e}"
def answer_question(image, question):
    """Answer a free-form question about an image with the BLIP VQA model.

    Args:
        image: The image to inspect, or ``None`` when nothing was uploaded.
        question: The question text. ``None``, empty and whitespace-only
            values are all rejected with a prompt to enter a question.

    Returns:
        A display string echoing the question followed by the model's
        answer, or a message starting with the cross-mark emoji on invalid
        input or failure.
    """
    if image is None:
        return "❌ Please upload an image first"
    # Guard against None as well as blank text so .strip() cannot raise.
    if not question or not question.strip():
        return "❌ Please enter a question"

    try:
        # Process inputs
        inputs = vqa_processor(image, question, return_tensors="pt")

        # Generate answer (capped at 20 tokens)
        out = vqa_model.generate(**inputs, max_length=20)
        answer = vqa_processor.decode(out[0], skip_special_tokens=True)

        return f"❓ Question: {question}\n\n✅ Answer: {answer}"

    except Exception as e:
        # UI boundary: report the failure in the output box instead of
        # letting the exception propagate to the caller.
        return f"❌ Processing failed: {e}"
def classify_image(image, categories):
    """Score an image against user-supplied labels with CLIP (zero-shot).

    Args:
        image: The image to classify, or ``None`` when nothing was uploaded.
        categories: Comma-separated label names, e.g. ``"cat, dog, bird"``.
            Surrounding whitespace is stripped from each label and empty
            entries (from doubled or trailing commas) are ignored.

    Returns:
        A display string with one line per label showing its softmax
        probability as a percentage plus a bar of one block character per
        5 percentage points, or a message starting with the cross-mark
        emoji on invalid input or failure.
    """
    if image is None:
        return "❌ Please upload an image first"
    if not categories or not categories.strip():
        return "❌ Please enter categories"

    # Parse categories, dropping blanks so "cat,,dog," yields two labels
    # rather than sending empty strings to the model as classes.
    category_list = [cat.strip() for cat in categories.split(",") if cat.strip()]
    if not category_list:
        return "❌ Please enter categories"

    try:
        # Process image and text
        inputs = clip_processor(
            text=category_list,
            images=image,
            return_tensors="pt",
            padding=True
        )

        # Calculate similarity; gradients are not needed for inference.
        with torch.no_grad():
            outputs = clip_model(**inputs)
        logits_per_image = outputs.logits_per_image
        probs = logits_per_image.softmax(dim=1)[0]

        # Format results
        lines = []
        for category, prob in zip(category_list, probs):
            percentage = prob.item() * 100
            bar = "█" * int(percentage / 5)
            lines.append(f"{category}: {percentage:.2f}% {bar}\n")

        return "🎯 Classification Results:\n\n" + "".join(lines)

    except Exception as e:
        # UI boundary: report the failure in the output box instead of
        # letting the exception propagate to the caller.
        return f"❌ Processing failed: {e}"
def multimodal_chat(image, message, history):
    """Multimodal chat (simplified): answer each message with the VQA model.

    Every turn is answered independently from the image and the current
    message only; earlier turns are kept for display but are not passed to
    the model.

    Args:
        image: The image under discussion, or ``None`` when nothing was
            uploaded.
        message: The user's message for this turn.
        history: Previous turns as a list of ``[user_message, bot_reply]``
            pairs; ``None`` is treated as an empty history.

    Returns:
        A new history list with this turn appended. The list passed in is
        never modified. A blank message (with an image present) adds no
        turn and returns the history unchanged.
    """
    # Always work on a copy so both the success and the error paths behave
    # the same way and the caller's list is left untouched.
    history = list(history) if history else []

    if image is None:
        return history + [[message, "❌ Please upload an image first to start chatting"]]

    # Nothing to ask: skip the model call rather than answer an empty prompt.
    if not message or not message.strip():
        return history

    try:
        # Use VQA model to process question (answer capped at 30 tokens)
        inputs = vqa_processor(image, message, return_tensors="pt")
        out = vqa_model.generate(**inputs, max_length=30)
        response = vqa_processor.decode(out[0], skip_special_tokens=True)
    except Exception as e:
        # UI boundary: show the failure as the bot's reply for this turn.
        response = f"❌ Processing failed: {e}"

    history.append([message, response])
    return history
# ==================== Gradio Interface ====================

# Custom CSS: gradient page title, muted subtitle, and a bordered
# ".feature-box" helper class.
custom_css = """
#title {
    text-align: center;
    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-size: 3em;
    font-weight: bold;
    margin-bottom: 10px;
}
#subtitle {
    text-align: center;
    color: #666;
    font-size: 1.2em;
    margin-bottom: 30px;
}
.feature-box {
    border: 2px solid #667eea;
    border-radius: 10px;
    padding: 20px;
    margin: 10px 0;
}
"""
# Build the tabbed UI. Each tab wires its own inputs to one of the handler
# functions defined earlier in this file.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:

    # Title
    gr.HTML('<h1 id="title">🤖 Vision Language AI Demo</h1>')
    gr.HTML('<p id="subtitle">Interactive application showcasing multiple vision-language AI capabilities</p>')

    # Tabbed Interface
    with gr.Tabs():

        # Tab 1: Image Captioning
        with gr.Tab("🖼️ Image Captioning"):
            gr.Markdown("### Upload an image and AI will generate a description")
            with gr.Row():
                with gr.Column():
                    caption_image = gr.Image(type="pil", label="Upload Image")
                    caption_btn = gr.Button("🎨 Generate Caption", variant="primary")
                with gr.Column():
                    caption_output = gr.Textbox(
                        label="Generated Caption",
                        lines=5,
                        placeholder="Caption will appear here..."
                    )

            # Examples
            gr.Examples(
                examples=[
                    ["https://images.unsplash.com/photo-1514888286974-6c03e2ca1dba"],
                    ["https://images.unsplash.com/photo-1506748686214-e9df14d4d9d0"],
                ],
                inputs=caption_image,
                label="📸 Click on examples to try"
            )

            caption_btn.click(
                fn=generate_caption,
                inputs=caption_image,
                outputs=caption_output
            )
            # Also caption automatically whenever the image changes, so
            # picking an example or uploading a file needs no extra click.
            caption_image.change(
                fn=generate_caption,
                inputs=caption_image,
                outputs=caption_output
            )

        # Tab 2: Visual Question Answering
        with gr.Tab("🔍 Visual Question Answering"):
            gr.Markdown("### Upload an image and ask questions, AI will answer based on the image content")
            with gr.Row():
                with gr.Column():
                    vqa_image = gr.Image(type="pil", label="Upload Image")
                    vqa_question = gr.Textbox(
                        label="Enter Question",
                        placeholder="e.g., What color is the car? How many people are there?",
                        lines=2
                    )
                    vqa_btn = gr.Button("🤔 Get Answer", variant="primary")
                with gr.Column():
                    vqa_output = gr.Textbox(
                        label="AI Answer",
                        lines=6,
                        placeholder="Answer will appear here..."
                    )

            # Common question examples
            gr.Markdown("**💡 Common Question Examples:**")
            gr.Markdown("- What is in the image?\n- What color is...?\n- How many ... are there?\n- Is there a ... in the image?")

            vqa_btn.click(
                fn=answer_question,
                inputs=[vqa_image, vqa_question],
                outputs=vqa_output
            )

        # Tab 3: Image Classification
        with gr.Tab("🏷️ Zero-Shot Classification"):
            gr.Markdown("### Define custom categories and AI will classify the image")
            with gr.Row():
                with gr.Column():
                    classify_image_input = gr.Image(type="pil", label="Upload Image")
                    classify_categories = gr.Textbox(
                        label="Categories (comma-separated)",
                        placeholder="e.g., cat, dog, bird, car, building",
                        value="cat, dog, bird, car, building",
                        lines=2
                    )
                    classify_btn = gr.Button("🎯 Classify", variant="primary")
                with gr.Column():
                    classify_output = gr.Textbox(
                        label="Classification Results",
                        lines=8,
                        placeholder="Results will appear here..."
                    )

            gr.Markdown("**💡 Tip:** You can input any categories, the model will calculate similarity between the image and each category")

            classify_btn.click(
                fn=classify_image,
                inputs=[classify_image_input, classify_categories],
                outputs=classify_output
            )

        # Tab 4: Multimodal Chat
        with gr.Tab("💬 Multimodal Chat"):
            gr.Markdown("### Upload an image and have a conversation with AI about it")
            with gr.Row():
                with gr.Column(scale=1):
                    chat_image = gr.Image(type="pil", label="Upload Image")
                    gr.Markdown("**💡 Conversation Prompts:**")
                    gr.Markdown("- Describe this image\n- What's in the image?\n- Where is this?\n- What is the main color?")

                with gr.Column(scale=2):
                    chatbot = gr.Chatbot(label="Chat History", height=400)
                    chat_input = gr.Textbox(
                        label="Enter Message",
                        placeholder="Type your question...",
                        lines=2
                    )
                    with gr.Row():
                        chat_btn = gr.Button("📤 Send", variant="primary")
                        clear_btn = gr.Button("🗑️ Clear Chat")

            # The Send button and pressing Enter in the textbox do the same thing.
            chat_btn.click(
                fn=multimodal_chat,
                inputs=[chat_image, chat_input, chatbot],
                outputs=chatbot
            )
            chat_input.submit(
                fn=multimodal_chat,
                inputs=[chat_image, chat_input, chatbot],
                outputs=chatbot
            )
            # Reset the chat history to an empty list.
            clear_btn.click(lambda: [], outputs=chatbot)

    # Footer
    gr.Markdown("---")
    # Assembled from flush-left lines so that source indentation can never be
    # rendered as a Markdown code block.
    gr.Markdown(
        "### 📚 About This Application\n"
        "- **Models**: BLIP (Captioning & VQA) + CLIP (Classification)\n"
        "- **Framework**: Gradio + Transformers\n"
        "- **Deployment**: Can be deployed to Hugging Face Spaces\n"
        "- **Open Source**: All models are open source\n"
        "\n"
        "⚡ **Performance Tip**: Use Hugging Face Spaces Zero GPU for significantly faster processing\n"
    )
# Launch application
if __name__ == "__main__":
    # Only start the server when run as a script, not when imported.
    # NOTE(review): share=True asks Gradio for a public share link in addition
    # to the local server; confirm that is wanted wherever this is run.
    demo.launch(share=True)