adityaardak committed on
Commit
e596ac7
·
verified ·
1 Parent(s): 9c88c94

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +691 -117
app.py CHANGED
@@ -2,9 +2,9 @@ import gradio as gr
2
  import torch
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
 
5
- # -----------------------------
6
- # Model configuration
7
- # -----------------------------
8
  MID = "apple/FastVLM-0.5B"
9
  IMAGE_TOKEN_INDEX = -200
10
 
@@ -14,7 +14,7 @@ model = None
14
  def load_model():
15
  global tok, model
16
  if tok is None or model is None:
17
- print("Loading model on CPU...")
18
  tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
19
  model = AutoModelForCausalLM.from_pretrained(
20
  MID,
@@ -22,11 +22,11 @@ def load_model():
22
  device_map="cpu",
23
  trust_remote_code=True,
24
  )
25
- print("Model loaded successfully!")
26
  return tok, model
27
 
28
 
29
- def run_fastvlm(image, prompt):
30
  if image is None:
31
  return "Please upload an image first."
32
 
@@ -52,10 +52,12 @@ def run_fastvlm(image, prompt):
52
  model_dtype = next(model.parameters()).dtype
53
 
54
  img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype, device=model_device)
 
55
  input_ids = torch.cat(
56
  [pre_ids.to(model_device), img_tok, post_ids.to(model_device)],
57
  dim=1
58
  )
 
59
  attention_mask = torch.ones_like(input_ids, device=model_device)
60
 
61
  pixel_values = model.get_vision_tower().image_processor(
@@ -68,7 +70,7 @@ def run_fastvlm(image, prompt):
68
  inputs=input_ids,
69
  attention_mask=attention_mask,
70
  images=pixel_values,
71
- max_new_tokens=220,
72
  do_sample=False
73
  )
74
 
@@ -84,161 +86,733 @@ def run_fastvlm(image, prompt):
84
  return response
85
 
86
  except Exception as e:
87
- return f"Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- def build_prompt(mode, user_context):
91
- context_part = f"\nExtra user context: {user_context}" if user_context.strip() else ""
 
 
 
 
92
 
93
  prompts = {
94
- "Scene Description":
95
- f"""
96
- You are an AI assistant helping a visually impaired person.
97
- Describe the image in simple, human-friendly language.
98
-
99
- Return output in this format:
100
- 1. Quick Summary
101
- 2. Main Objects Seen
102
- 3. Relative Position of Important Objects
103
- 4. Helpful Note
104
-
105
- Keep the language simple and practical.{context_part}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  """,
107
 
108
- "Hazard Detection":
109
- f"""
110
- You are an AI safety assistant helping a visually impaired person.
111
- Analyze the image for possible hazards.
112
 
113
- Return output in this format:
114
- 1. Quick Summary
115
- 2. Possible Hazards
116
- 3. Risk Level (Low/Medium/High)
117
- 4. Safety Advice
118
 
119
- Be practical and avoid exaggeration.{context_part}
 
120
  """,
121
 
122
- "Important Object Summary":
123
- f"""
124
- You are an AI visual assistant.
125
- Identify the most important objects in the image that a visually impaired person should know about.
126
 
127
- Return output in this format:
128
- 1. Key Objects
129
- 2. What Looks Most Important
130
- 3. Why These Objects Matter
131
- 4. Short Spoken Summary
132
 
133
- Keep it easy to understand.{context_part}
 
134
  """,
135
 
136
- "Safe Action Suggestion":
137
- f"""
138
- You are an AI guidance assistant for a visually impaired person.
139
- Based on the image, suggest the next safest action.
140
 
141
- Return output in this format:
142
- 1. What the Scene Looks Like
143
- 2. What Needs Attention
144
- 3. Recommended Action
145
- 4. One-Line Safety Tip
146
 
147
- Do not assume too much. Give cautious guidance.{context_part}
 
148
  """
149
  }
150
 
151
- return prompts.get(mode, prompts["Scene Description"])
152
 
153
 
154
- def analyze_image(image, mode, user_context):
155
- if image is None:
156
- return "Please upload an image."
157
-
158
- prompt = build_prompt(mode, user_context)
159
- return run_fastvlm(image, prompt)
160
-
161
-
162
- def exhibition_pitch(mode):
163
- pitches = {
164
- "Scene Description":
165
- "This mode explains the surrounding environment in simple words so a visually impaired person can understand the scene.",
166
- "Hazard Detection":
167
- "This mode checks whether the image contains obstacles or risky elements such as vehicles, stairs, clutter, or unsafe walking areas.",
168
- "Important Object Summary":
169
- "This mode highlights the most useful objects in the scene so the user can focus on what matters most.",
170
- "Safe Action Suggestion":
171
- "This mode provides the next practical action the user should consider, based on the visual situation."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  }
173
- return pitches.get(mode, "")
174
 
 
175
 
176
- with gr.Blocks(title="VisionMate AI - Smart Visual Assistant") as demo:
177
- gr.Markdown("""
178
- # 👁️ VisionMate AI
179
- ## Smart Visual Assistant for Visually Impaired People
180
 
181
- Upload an image and let the AI explain the scene, identify hazards, summarize important objects, or suggest the safest next action.
 
 
 
182
 
183
- ### Exhibition Theme
184
- **AI for Social Good**
185
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
  with gr.Row():
188
  with gr.Column(scale=1):
189
- image_input = gr.Image(type="pil", label="Upload Scene Image")
190
-
191
- mode = gr.Radio(
192
- choices=[
193
- "Scene Description",
194
- "Hazard Detection",
195
- "Important Object Summary",
196
- "Safe Action Suggestion"
197
- ],
198
- value="Scene Description",
199
- label="Select Assistance Mode"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  )
201
 
202
- user_context = gr.Textbox(
203
- label="Optional Context",
204
- placeholder="Example: Person is walking alone on a road / indoor corridor / market area",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  lines=2
206
  )
 
207
 
208
  with gr.Row():
209
- analyze_btn = gr.Button("Analyze Scene", variant="primary")
210
- clear_btn = gr.ClearButton([image_input, user_context])
 
 
 
 
 
 
 
211
 
212
- with gr.Column(scale=1):
213
- mode_explanation = gr.Textbox(
214
- label="Mode Purpose",
215
- value=exhibition_pitch("Scene Description"),
216
- interactive=False,
217
- lines=4
 
218
  )
 
 
219
 
220
- output = gr.Textbox(
221
- label="AI Assistance Output",
222
- lines=16,
223
- max_lines=25,
224
- show_copy_button=True
225
  )
226
 
227
- mode.change(fn=exhibition_pitch, inputs=mode, outputs=mode_explanation)
228
- analyze_btn.click(fn=analyze_image, inputs=[image_input, mode, user_context], outputs=output)
229
 
230
  gr.Markdown("""
231
  ---
232
- ### Suggested Demo Images for Exhibition
233
- - A road with vehicles and pedestrians
234
- - A classroom or hallway
235
- - A kitchen or home environment
236
- - A supermarket shelf or crowded place
237
-
238
- ### Expected Impact
239
- This project shows how computer vision and multimodal AI can improve accessibility and independence for visually impaired users.
 
 
 
 
 
 
240
  """)
241
 
 
242
  if __name__ == "__main__":
243
  demo.launch(
244
  share=False,
 
2
  import torch
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
 
5
+ # =========================================================
6
+ # Model setup
7
+ # =========================================================
8
  MID = "apple/FastVLM-0.5B"
9
  IMAGE_TOKEN_INDEX = -200
10
 
 
14
  def load_model():
15
  global tok, model
16
  if tok is None or model is None:
17
+ print("Loading FastVLM on CPU...")
18
  tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
19
  model = AutoModelForCausalLM.from_pretrained(
20
  MID,
 
22
  device_map="cpu",
23
  trust_remote_code=True,
24
  )
25
+ print("Model loaded successfully on CPU.")
26
  return tok, model
27
 
28
 
29
+ def run_fastvlm(image, prompt, max_new_tokens=180):
30
  if image is None:
31
  return "Please upload an image first."
32
 
 
52
  model_dtype = next(model.parameters()).dtype
53
 
54
  img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype, device=model_device)
55
+
56
  input_ids = torch.cat(
57
  [pre_ids.to(model_device), img_tok, post_ids.to(model_device)],
58
  dim=1
59
  )
60
+
61
  attention_mask = torch.ones_like(input_ids, device=model_device)
62
 
63
  pixel_values = model.get_vision_tower().image_processor(
 
70
  inputs=input_ids,
71
  attention_mask=attention_mask,
72
  images=pixel_values,
73
+ max_new_tokens=max_new_tokens,
74
  do_sample=False
75
  )
76
 
 
86
  return response
87
 
88
  except Exception as e:
89
+ return f"Error generating response: {str(e)}"
90
+
91
+
92
+ # =========================================================
93
+ # Use case knowledge cards
94
+ # =========================================================
95
+ USE_CASE_INFO = {
96
+ "Accessibility Assistant": {
97
+ "problem": "A visually impaired user may need quick scene understanding and help identifying objects or obstacles.",
98
+ "beneficiaries": "Visually impaired users, caregivers, accessibility NGOs, smart assistive-tech teams.",
99
+ "proof": "The app describes the scene, highlights key items, and gives practical guidance.",
100
+ "judge_angle": "Shows AI for inclusion and social impact."
101
+ },
102
+ "Safety Checker": {
103
+ "problem": "People often miss visible risks in busy roads, stairways, cluttered spaces, or public areas.",
104
+ "beneficiaries": "Schools, public-space monitoring teams, safety awareness projects.",
105
+ "proof": "The app flags possible risks, risky zones, and next-safe-action ideas.",
106
+ "judge_angle": "Shows preventive AI and practical awareness."
107
+ },
108
+ "Museum / Exhibit Guide": {
109
+ "problem": "Visitors want engaging explanations, not just raw object names.",
110
+ "beneficiaries": "Museums, exhibitions, tourism projects, learning spaces.",
111
+ "proof": "The app turns the same image into a friendly guide-like explanation.",
112
+ "judge_angle": "Shows storytelling plus education."
113
+ },
114
+ "Retail Shelf Helper": {
115
+ "problem": "Customers and staff need quick item understanding, arrangement insight, and shelf-level interpretation.",
116
+ "beneficiaries": "Retail stores, FMCG demos, smart shopping assistants.",
117
+ "proof": "The app summarizes visible products, arrangement, and shopper-facing insights.",
118
+ "judge_angle": "Shows business and commercial use."
119
+ },
120
+ "Classroom Explainer": {
121
+ "problem": "Students often understand better when images are explained in simple, structured language.",
122
+ "beneficiaries": "Teachers, students, EdTech demos, smart classrooms.",
123
+ "proof": "The app explains the image like a teacher using easy language and teaching points.",
124
+ "judge_angle": "Shows educational value."
125
+ },
126
+ "Travel Interpreter": {
127
+ "problem": "Travelers want quick understanding of landmarks, scenes, crowd conditions, and surroundings.",
128
+ "beneficiaries": "Travel apps, tourism assistance, city experience projects.",
129
+ "proof": "The app explains what the place appears to be, what stands out, and what a visitor should notice.",
130
+ "judge_angle": "Shows lifestyle and tourism use."
131
+ }
132
+ }
133
+
134
+
135
def get_use_case_card(use_case):
    """Render the Markdown knowledge card for one entry of USE_CASE_INFO.

    Args:
        use_case: Key into the module-level USE_CASE_INFO dict (the label
            shown in the use-case dropdown).

    Returns:
        Markdown text with the problem, beneficiaries, proof and judge-angle
        fields of the selected use case. If ``use_case`` is not a known key
        (for example an empty selection), a short hint is returned instead of
        raising KeyError, so the UI callback never fails on a bad selection.
    """
    info = USE_CASE_INFO.get(use_case)
    if info is None:
        # This function is wired directly to a dropdown change event; an
        # unknown or empty value should produce a readable card, not a crash.
        return "Select a use case to see its details."

    # Content is kept at column 0 so the Markdown header and bold markers
    # are not indented inside the rendered string.
    return f"""
### {use_case}

**Problem Solved**
{info['problem']}

**Who Benefits**
{info['beneficiaries']}

**What This Demo Proves**
{info['proof']}

**Why Judges Usually Like It**
{info['judge_angle']}
"""
152
 
153
+
154
+ # =========================================================
155
+ # Prompt builders
156
+ # =========================================================
157
def build_use_case_prompt(use_case, user_context):
    """Build the model prompt for one of the predefined real-world use cases.

    Args:
        use_case: One of "Accessibility Assistant", "Safety Checker",
            "Museum / Exhibit Guide", "Retail Shelf Helper",
            "Classroom Explainer" or "Travel Interpreter".
        user_context: Optional free-text context typed by the user. ``None``,
            an empty string, or a whitespace-only string are all treated as
            "no context".

    Returns:
        The prompt text for the requested use case.

    Raises:
        KeyError: If ``use_case`` is not one of the known use-case names.
    """
    # Strip first, then fall back: a whitespace-only textbox value is truthy,
    # so testing truthiness before stripping would yield an empty "Context:".
    context = (user_context or "").strip() or "No extra context provided."

    prompts = {
        "Accessibility Assistant": f"""
You are an assistive AI helping a visually impaired user.

Analyze the uploaded image and return your answer in this format:
1. Quick Scene Summary
2. Main Objects and Their Positions
3. Anything Important to Notice
4. Helpful Guidance for the User

Use simple, natural, practical language.
Mention uncertainty when needed.

Context: {context}
""",

        "Safety Checker": f"""
You are an AI safety observer.

Analyze the uploaded image and return your answer in this format:
1. What the Scene Appears to Show
2. Possible Hazards or Risky Elements
3. Risk Level: Low / Medium / High
4. Best Next Safe Action

Be cautious, grounded, and practical.
Do not invent invisible hazards.
Mention uncertainty when needed.

Context: {context}
""",

        "Museum / Exhibit Guide": f"""
You are a smart museum guide.

Analyze the uploaded image and return:
1. What Visitors Are Looking At
2. Interesting Visual Details
3. Why It Could Matter / Be Memorable
4. A Friendly 2-line Visitor Guide

Make it warm, engaging, and exhibition-friendly.
Context: {context}
""",

        "Retail Shelf Helper": f"""
You are an AI retail assistant.

Analyze the uploaded image and return:
1. What Products / Objects Are Visible
2. Arrangement or Display Observations
3. Shopper-Friendly Insights
4. Staff / Store Improvement Suggestion

Be concise, business-relevant, and practical.
Context: {context}
""",

        "Classroom Explainer": f"""
You are a teacher explaining the image to students.

Return:
1. What We See
2. Main Concepts / Objects
3. Easy Explanation for Students
4. One Learning Question

Use clear, beginner-friendly language.
Context: {context}
""",

        "Travel Interpreter": f"""
You are an AI travel companion.

Analyze the uploaded image and return:
1. What This Place / Scene Looks Like
2. What a Visitor Would Notice First
3. Interesting or Useful Observations
4. One Practical Travel Tip

Stay grounded in the visible scene.
Context: {context}
""",
    }

    return prompts[use_case]
246
 
247
 
248
def build_persona_prompt(persona, tone, goal):
    """Build a prompt asking the model to describe the image in a given role.

    Args:
        persona: Role name interpolated into the prompt (e.g. "Teacher").
        tone: Tone label interpolated into the prompt (e.g. "Friendly").
        goal: Optional free-text goal. ``None``, an empty string, or a
            whitespace-only string fall back to a generic goal.

    Returns:
        The persona prompt text.
    """
    # Strip before applying the default so a goal consisting only of spaces
    # or newlines does not produce an empty "Goal:" line.
    goal_text = (goal or "").strip() or "Explain the image in your role."

    return f"""
You are analyzing the image as this role: {persona}
Tone: {tone}
Goal: {goal_text}

Return your answer in this format:
1. Role Introduction
2. What I Notice First
3. What Matters Most From My Perspective
4. My Advice / Commentary
5. One Memorable Closing Line

Stay grounded in the image.
Do not pretend to know hidden facts.
"""
265
+
266
+
267
def build_mission_prompt(mission, mission_context):
    """Build the model prompt for one of the Mission Lab interaction modes.

    Args:
        mission: One of "Hidden Detail Hunt", "Exhibit Quiz Maker",
            "Pitch From the Picture", "Evidence Board", "Story Spark" or
            "Accessibility Voiceover".
        mission_context: Optional free-text context. ``None``, an empty
            string, or a whitespace-only string are treated as no context.

    Returns:
        The prompt text for the requested mission.

    Raises:
        KeyError: If ``mission`` is not one of the known mission names.
    """
    # Strip first, then fall back, so whitespace-only input still gets the
    # default text instead of an empty "Context:" line.
    context = (mission_context or "").strip() or "No extra context."

    mission_prompts = {
        "Hidden Detail Hunt": f"""
Study the image carefully.

Return:
1. 5 specific details that are easy to miss
2. Why each detail matters
3. What those details suggest about the scene

Stay grounded in the visible image only.
Context: {context}
""",

        "Exhibit Quiz Maker": f"""
Create a mini exhibition quiz from the image.

Return:
1. Five quiz questions
2. Correct answer under each question
3. One final bonus question

Make the quiz engaging and image-based.
Context: {context}
""",

        "Pitch From the Picture": f"""
Look at the image and imagine a useful product, service, or startup idea inspired by it.

Return:
1. Problem Seen in the Image
2. Product / Service Idea
3. Target Users
4. One-line Pitch

Keep it smart, creative, but still linked to the image.
Context: {context}
""",

        "Evidence Board": f"""
Analyze the image critically.

Return:
1. Things that are clearly visible
2. Things that are likely but not certain
3. Things that should NOT be assumed
4. Why careful interpretation matters

This mission is for teaching responsible AI reasoning.
Context: {context}
""",

        "Story Spark": f"""
Create a short story inspired by the image.

Return:
1. Title
2. Story in under 120 words
3. What visual details inspired the story

Keep it imaginative but tied to the scene.
Context: {context}
""",

        "Accessibility Voiceover": f"""
Create a voiceover-style narration for a visually impaired user.

Return:
1. Calm spoken scene narration
2. Important objects
3. Immediate practical note
4. Final short reassurance

Make it audio-friendly and natural.
Context: {context}
""",
    }

    return mission_prompts[mission]
348
 
 
 
 
 
349
 
350
def build_question_prompt(question):
    """Build the prompt for a free-form user question about the image.

    Args:
        question: The user's question. ``None``, an empty string, or a
            whitespace-only string fall back to a generic question.

    Returns:
        The question-answering prompt text.
    """
    # Strip before applying the default: a textbox containing only spaces or
    # a newline is truthy and would otherwise send an empty question.
    user_q = (question or "").strip() or "What is happening in this image?"

    return f"""
Answer the user's question about the image.

Question: {user_q}

Return:
1. Direct Answer
2. Evidence From the Image
3. Uncertainty Note if Needed

Keep it short and reliable.
"""
364
+
365
+
366
+ # =========================================================
367
+ # App functions
368
+ # =========================================================
369
def analyze_use_case(image, use_case, user_context):
    """Run the selected use-case analysis on the uploaded image.

    Builds the prompt with build_use_case_prompt and hands it to run_fastvlm
    with a 200-token generation cap; returns whatever run_fastvlm returns.
    """
    return run_fastvlm(
        image,
        build_use_case_prompt(use_case, user_context),
        max_new_tokens=200,
    )
372
+
373
+
374
def persona_playground(image, persona, tone, goal):
    """Describe the uploaded image through a chosen persona and tone.

    Builds the prompt with build_persona_prompt and hands it to run_fastvlm
    with a 190-token generation cap; returns whatever run_fastvlm returns.
    """
    return run_fastvlm(
        image,
        build_persona_prompt(persona, tone, goal),
        max_new_tokens=190,
    )
377
+
378
+
379
def mission_lab(image, mission, mission_context):
    """Run one Mission Lab mode on the uploaded image.

    Builds the prompt with build_mission_prompt and hands it to run_fastvlm
    with a 220-token generation cap; returns whatever run_fastvlm returns.
    """
    return run_fastvlm(
        image,
        build_mission_prompt(mission, mission_context),
        max_new_tokens=220,
    )
382
+
383
+
384
def ask_image(image, question):
    """Answer a free-form user question about the uploaded image.

    Builds the prompt with build_question_prompt and hands it to run_fastvlm
    with a 160-token generation cap; returns whatever run_fastvlm returns.
    """
    return run_fastvlm(
        image,
        build_question_prompt(question),
        max_new_tokens=160,
    )
387
+
388
+
389
def compare_booth(image, compare_context):
    """Explain one image from three perspectives for side-by-side comparison.

    The image is sent to run_fastvlm three times, once each with an
    accessibility, a safety, and a classroom-teacher prompt, each capped at
    140 new tokens.

    Args:
        image: The uploaded image, passed through to run_fastvlm unchanged.
        compare_context: Optional free-text context. ``None``, an empty
            string, or a whitespace-only string are treated as no context.

    Returns:
        A 3-tuple of run_fastvlm results in the order
        (accessibility, safety, teaching).
    """
    # Strip first, then fall back, so whitespace-only input still gets the
    # default text instead of an empty "Context:" line.
    context = (compare_context or "").strip() or "No extra context."

    # (role phrase, numbered sections) for each lens, in output order.
    lenses = (
        (
            "an Accessibility Assistant",
            ("Scene Summary", "Important Objects", "Helpful Guidance"),
        ),
        (
            "a Safety Checker",
            ("Visible Risks", "Risk Level", "Safe Next Step"),
        ),
        (
            "a Classroom Teacher",
            ("What Students See", "Main Idea", "One Learning Question"),
        ),
    )

    results = []
    for role, sections in lenses:
        numbered = "\n".join(
            f"{idx}. {title}" for idx, title in enumerate(sections, start=1)
        )
        prompt = (
            f"\nExplain this image as {role}.\n"
            f"Return:\n"
            f"{numbered}\n"
            f"Context: {context}\n"
        )
        results.append(run_fastvlm(image, prompt, max_new_tokens=140))

    out1, out2, out3 = results
    return out1, out2, out3
422
+
423
+
424
def generate_exhibit_script(use_case):
    """Return a canned 30-second Markdown pitch script for a use case.

    Args:
        use_case: One of "Accessibility Assistant", "Safety Checker",
            "Museum / Exhibit Guide", "Retail Shelf Helper",
            "Classroom Explainer" or "Travel Interpreter".

    Returns:
        Markdown text of the pitch. If ``use_case`` is not a known name (for
        example an empty selection), a short hint is returned instead of
        raising KeyError, so the button callback never fails on a bad
        selection.
    """
    # Content is kept at column 0 so the Markdown headers are not indented
    # inside the rendered strings.
    scripts = {
        "Accessibility Assistant": """
### 30-Second Pitch

This project turns image understanding into an accessibility helper.
A user uploads a scene, and the system explains what is visible, what matters most, and what practical guidance may help.
This shows how multimodal AI can support inclusion, independence, and human-centered design.

**Best line for judges:**
"We are not just describing pictures. We are translating visual space into usable understanding."
""",

        "Safety Checker": """
### 30-Second Pitch

This project uses visual AI to inspect scenes for visible risk signals such as clutter, unsafe movement zones, or attention-worthy areas.
It is useful as an awareness tool for schools, public demonstrations, and smart safety education.
The value is not only detection, but guidance.

**Best line for judges:**
"This app turns passive vision into preventive awareness."
""",

        "Museum / Exhibit Guide": """
### 30-Second Pitch

This project acts like an AI guide that explains images in a visitor-friendly way.
Instead of only naming objects, it creates interpretation, context, and memorable observations.
It can be adapted for museums, campus exhibitions, tourism booths, and educational spaces.

**Best line for judges:**
"We changed image captioning into an interactive guide experience."
""",

        "Retail Shelf Helper": """
### 30-Second Pitch

This project interprets shelf images and converts them into shopper and business insights.
It can help summarize visible products, arrangement cues, and display observations.
This shows how the same AI model can serve a commercial use case without retraining.

**Best line for judges:**
"One image can become both a customer insight and an operational insight."
""",

        "Classroom Explainer": """
### 30-Second Pitch

This project uses image understanding to support teaching.
It explains the same visual in simple educational language and even creates learning prompts.
That makes it useful for smart classrooms, EdTech projects, and visual learning tools.

**Best line for judges:**
"This app helps students look at an image and actually learn from it."
""",

        "Travel Interpreter": """
### 30-Second Pitch

This project behaves like a visual travel companion.
It interprets scenes, highlights what visitors may notice, and gives useful context or practical tips.
That makes it relevant for tourism, smart city experiences, and visitor support.

**Best line for judges:**
"We turned one uploaded image into a mini travel briefing."
""",
    }

    script = scripts.get(use_case)
    if script is None:
        # Wired directly to a button click; an unknown or empty selection
        # should show a readable hint rather than surface a KeyError.
        return "Choose a showcase angle to generate a pitch script."
    return script
493
+
494
+
495
+ # =========================================================
496
+ # UI text
497
+ # =========================================================
498
+ HERO = """
499
+ # VisionVerse AI
500
+ ## Exhibition Studio for Real-World Image Intelligence
501
+
502
+ Upload one image and explore many use cases:
503
+ - accessibility
504
+ - safety
505
+ - teaching
506
+ - tourism
507
+ - retail
508
+ - storytelling
509
+ - evidence checking
510
+ - interactive Q&A
511
+
512
+ ### What makes this exhibition-ready?
513
+ This is not a one-button caption demo.
514
+ It is a **multi-use visual intelligence studio** designed to prove that a single AI vision engine can serve many real-world situations.
515
+ """
516
+
517
+ INFO_PAGE = """
518
+ # Project Info
519
+
520
+ ## 1) What this project is
521
+ VisionVerse AI is an exhibition-ready visual intelligence app built on top of a multimodal image-language model.
522
+ Instead of using the model for just one generic caption, the app wraps it in multiple roles, scenarios, and interaction modes.
523
+
524
+ ## 2) Core idea
525
+ One uploaded image can be interpreted in many ways:
526
+ - as an accessibility helper
527
+ - as a safety observer
528
+ - as a teacher
529
+ - as a museum guide
530
+ - as a retail assistant
531
+ - as a travel companion
532
+ - as a critical evidence checker
533
+
534
+ ## 3) Why this matters
535
+ In many student projects, the model is good but the demonstration feels narrow.
536
+ This app proves flexibility, purpose, and user-centered design.
537
+
538
+ ## 4) Architecture
539
+ - Gradio front-end
540
+ - FastVLM multimodal model
541
+ - CPU-only inference
542
+ - Prompt engineering for role adaptation
543
+ - Tab-based interaction design
544
+
545
+ ## 5) Strengths
546
+ - many real-world uses from one model
547
+ - strong exhibition storytelling
548
+ - easy demo with any uploaded image
549
+ - playful interaction modes
550
+ - educational and social impact angles
551
+
552
+ ## 6) Limitations
553
+ - runs on CPU, so response can be slower
554
+ - not a certified medical or safety device
555
+ - may miss fine details or make uncertain interpretations
556
+ - should be used as assistive AI, not final authority
557
+
558
+ ## 7) Responsible AI note
559
+ The Evidence Board mission is included to show that good AI systems should separate:
560
+ - what is clearly visible
561
+ - what is likely
562
+ - what should not be assumed
563
+
564
+ ## 8) Suggested evaluation ideas
565
+ - response usefulness
566
+ - clarity of explanation
567
+ - consistency across different scenes
568
+ - user satisfaction by use case
569
+ - educational / accessibility impact
570
+
571
+ ## 9) Best demo images
572
+ - road or traffic scene
573
+ - classroom or laboratory
574
+ - store shelf
575
+ - museum object
576
+ - crowded public place
577
+ - home kitchen or hallway
578
+
579
+ ## 10) Best exhibition closing line
580
+ "This project is not about generating text from images. It is about generating the right kind of help for the right kind of user."
581
+ """
582
+
583
+ CSS = """
584
+ .gradio-container {
585
+ max-width: 1400px !important;
586
+ }
587
+ .card-note {
588
+ border-radius: 16px;
589
+ padding: 14px;
590
+ background: #f6f8ff;
591
+ }
592
+ """
593
+
594
+
595
+ # =========================================================
596
+ # Gradio UI
597
+ # =========================================================
598
+ with gr.Blocks(title="VisionVerse AI", css=CSS, theme=gr.themes.Soft()) as demo:
599
+ gr.Markdown(HERO)
600
 
601
  with gr.Row():
602
  with gr.Column(scale=1):
603
+ shared_image = gr.Image(type="pil", label="Upload Image for All Tabs")
604
+ clear_all = gr.ClearButton([shared_image], value="Clear Image")
605
+ with gr.Column(scale=1):
606
+ gr.Markdown("""
607
+ ### Quick Demo Route
608
+ 1. Upload one image
609
+ 2. Open **Use Case Studio**
610
+ 3. Open **Persona Playground**
611
+ 4. Open **Mission Lab**
612
+ 5. Open **Compare Booth**
613
+ 6. End with **Live Exhibit Script**
614
+
615
+ This flow makes the demo feel layered, interactive, and purposeful.
616
+ """)
617
+
618
+ with gr.Tabs():
619
+ with gr.Tab("Use Case Studio"):
620
+ with gr.Row():
621
+ with gr.Column():
622
+ use_case = gr.Dropdown(
623
+ choices=list(USE_CASE_INFO.keys()),
624
+ value="Accessibility Assistant",
625
+ label="Choose Real-World Use Case"
626
+ )
627
+ use_case_context = gr.Textbox(
628
+ label="Optional Context",
629
+ placeholder="Example: school corridor / grocery shelf / street crossing / museum object",
630
+ lines=2
631
+ )
632
+ use_case_btn = gr.Button("Run Use Case Analysis", variant="primary")
633
+ with gr.Column():
634
+ use_case_card = gr.Markdown(get_use_case_card("Accessibility Assistant"))
635
+ use_case_output = gr.Textbox(
636
+ label="Use Case Output",
637
+ lines=16,
638
+ max_lines=24,
639
+ show_copy_button=True
640
+ )
641
+
642
+ use_case.change(fn=get_use_case_card, inputs=use_case, outputs=use_case_card)
643
+ use_case_btn.click(
644
+ fn=analyze_use_case,
645
+ inputs=[shared_image, use_case, use_case_context],
646
+ outputs=use_case_output
647
+ )
648
+
649
+ with gr.Tab("Persona Playground"):
650
+ gr.Markdown("Make the same image speak through different roles. This is great for grabbing attention at an exhibition.")
651
+
652
+ with gr.Row():
653
+ with gr.Column():
654
+ persona = gr.Dropdown(
655
+ choices=[
656
+ "Teacher",
657
+ "Tour Guide",
658
+ "Safety Officer",
659
+ "Journalist",
660
+ "Retail Manager",
661
+ "Emergency Responder",
662
+ "Storyteller",
663
+ "Accessibility Coach"
664
+ ],
665
+ value="Teacher",
666
+ label="Choose Persona"
667
+ )
668
+ tone = gr.Dropdown(
669
+ choices=["Friendly", "Professional", "Calm", "Excited", "Analytical", "Simple"],
670
+ value="Friendly",
671
+ label="Tone"
672
+ )
673
+ persona_goal = gr.Textbox(
674
+ label="Goal",
675
+ placeholder="Example: explain to children / brief judges / guide a visitor",
676
+ lines=2
677
+ )
678
+ persona_btn = gr.Button("Transform Through Persona", variant="primary")
679
+
680
+ with gr.Column():
681
+ persona_output = gr.Textbox(
682
+ label="Persona Response",
683
+ lines=18,
684
+ max_lines=26,
685
+ show_copy_button=True
686
+ )
687
+
688
+ persona_btn.click(
689
+ fn=persona_playground,
690
+ inputs=[shared_image, persona, tone, persona_goal],
691
+ outputs=persona_output
692
+ )
693
+
694
+ with gr.Tab("Mission Lab"):
695
+ gr.Markdown("This tab gives the app unusual interaction playgrounds. These are excellent for proving flexibility, creativity, and responsible reasoning.")
696
+
697
+ with gr.Row():
698
+ with gr.Column():
699
+ mission = gr.Radio(
700
+ choices=[
701
+ "Hidden Detail Hunt",
702
+ "Exhibit Quiz Maker",
703
+ "Pitch From the Picture",
704
+ "Evidence Board",
705
+ "Story Spark",
706
+ "Accessibility Voiceover"
707
+ ],
708
+ value="Hidden Detail Hunt",
709
+ label="Choose Mission"
710
+ )
711
+ mission_context = gr.Textbox(
712
+ label="Mission Context",
713
+ placeholder="Example: target audience is school students / judges / visually impaired users",
714
+ lines=2
715
+ )
716
+ mission_btn = gr.Button("Run Mission", variant="primary")
717
+ with gr.Column():
718
+ mission_output = gr.Textbox(
719
+ label="Mission Output",
720
+ lines=18,
721
+ max_lines=28,
722
+ show_copy_button=True
723
+ )
724
+
725
+ mission_btn.click(
726
+ fn=mission_lab,
727
+ inputs=[shared_image, mission, mission_context],
728
+ outputs=mission_output
729
  )
730
 
731
+ with gr.Tab("Ask the Image"):
732
+ gr.Markdown("Ask anything about the uploaded image. This makes the demo feel conversational rather than static.")
733
+
734
+ with gr.Row():
735
+ with gr.Column():
736
+ user_question = gr.Textbox(
737
+ label="Ask a Question About the Image",
738
+ placeholder="What is the most important object here? / Does this look crowded? / What should a student learn from this?",
739
+ lines=2
740
+ )
741
+ ask_btn = gr.Button("Ask", variant="primary")
742
+ with gr.Column():
743
+ ask_output = gr.Textbox(
744
+ label="Answer",
745
+ lines=12,
746
+ max_lines=20,
747
+ show_copy_button=True
748
+ )
749
+
750
+ ask_btn.click(
751
+ fn=ask_image,
752
+ inputs=[shared_image, user_question],
753
+ outputs=ask_output
754
+ )
755
+
756
+ with gr.Tab("Compare Booth"):
757
+ gr.Markdown("One image, three minds. This tab is strong for proving that the same model can support different goals.")
758
+
759
+ compare_context = gr.Textbox(
760
+ label="Optional Compare Context",
761
+ placeholder="Example: public road / classroom / tourist spot",
762
  lines=2
763
  )
764
+ compare_btn = gr.Button("Run 3-Way Compare", variant="primary")
765
 
766
  with gr.Row():
767
+ compare_out_1 = gr.Textbox(label="Accessibility Lens", lines=14, show_copy_button=True)
768
+ compare_out_2 = gr.Textbox(label="Safety Lens", lines=14, show_copy_button=True)
769
+ compare_out_3 = gr.Textbox(label="Teaching Lens", lines=14, show_copy_button=True)
770
+
771
+ compare_btn.click(
772
+ fn=compare_booth,
773
+ inputs=[shared_image, compare_context],
774
+ outputs=[compare_out_1, compare_out_2, compare_out_3]
775
+ )
776
 
777
+ with gr.Tab("Live Exhibit Script"):
778
+ gr.Markdown("Use this tab at the end of your demo. It gives you clean lines to say in front of judges.")
779
+
780
+ script_use_case = gr.Dropdown(
781
+ choices=list(USE_CASE_INFO.keys()),
782
+ value="Accessibility Assistant",
783
+ label="Choose Your Main Showcase Angle"
784
  )
785
+ script_btn = gr.Button("Generate Pitch Script", variant="primary")
786
+ script_output = gr.Markdown()
787
 
788
+ script_btn.click(
789
+ fn=generate_exhibit_script,
790
+ inputs=script_use_case,
791
+ outputs=script_output
 
792
  )
793
 
794
+ with gr.Tab("Project Info"):
795
+ gr.Markdown(INFO_PAGE)
796
 
797
  gr.Markdown("""
798
  ---
799
+ ### Extra Exhibition Tips
800
+
801
+ **Best live flow**
802
+ - start with Accessibility Assistant
803
+ - switch to Persona Playground
804
+ - show Evidence Board in Mission Lab
805
+ - finish with Compare Booth
806
+ - close using Live Exhibit Script
807
+
808
+ **Why that works**
809
+ You show usefulness, creativity, responsibility, and communication in one go.
810
+
811
+ **Note**
812
+ The Compare Booth runs the model three times, so it can be slower on CPU.
813
  """)
814
 
815
+
816
  if __name__ == "__main__":
817
  demo.launch(
818
  share=False,