Spaces:

adityaardak
/

Describe_Image_using_FastVLM

Running

App Files Files Community

adityaardak committed on Mar 30

Commit

e596ac7

verified ·

1 Parent(s): 9c88c94

Update app.py

Browse files

Files changed (1) hide show

app.py +691 -117

app.py CHANGED Viewed

@@ -2,9 +2,9 @@ import gradio as gr
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
-# -----------------------------
-# Model configuration
-# -----------------------------
 MID = "apple/FastVLM-0.5B"
 IMAGE_TOKEN_INDEX = -200
@@ -14,7 +14,7 @@ model = None
 def load_model():
     global tok, model
     if tok is None or model is None:
-        print("Loading model on CPU...")
         tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
         model = AutoModelForCausalLM.from_pretrained(
             MID,
@@ -22,11 +22,11 @@ def load_model():
             device_map="cpu",
             trust_remote_code=True,
         )
-        print("Model loaded successfully!")
     return tok, model
-def run_fastvlm(image, prompt):
     if image is None:
         return "Please upload an image first."
@@ -52,10 +52,12 @@ def run_fastvlm(image, prompt):
         model_dtype = next(model.parameters()).dtype
         img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype, device=model_device)
         input_ids = torch.cat(
             [pre_ids.to(model_device), img_tok, post_ids.to(model_device)],
             dim=1
         )
         attention_mask = torch.ones_like(input_ids, device=model_device)
         pixel_values = model.get_vision_tower().image_processor(
@@ -68,7 +70,7 @@ def run_fastvlm(image, prompt):
                 inputs=input_ids,
                 attention_mask=attention_mask,
                 images=pixel_values,
-                max_new_tokens=220,
                 do_sample=False
             )
@@ -84,161 +86,733 @@ def run_fastvlm(image, prompt):
         return response
     except Exception as e:
-        return f"Error: {str(e)}"
-def build_prompt(mode, user_context):
-    context_part = f"\nExtra user context: {user_context}" if user_context.strip() else ""
     prompts = {
-        "Scene Description":
-            f"""
-You are an AI assistant helping a visually impaired person.
-Describe the image in simple, human-friendly language.
-Return output in this format:
-1. Quick Summary
-2. Main Objects Seen
-3. Relative Position of Important Objects
-4. Helpful Note
-Keep the language simple and practical.{context_part}
 """,
-        "Hazard Detection":
-            f"""
-You are an AI safety assistant helping a visually impaired person.
-Analyze the image for possible hazards.
-Return output in this format:
-1. Quick Summary
-2. Possible Hazards
-3. Risk Level (Low/Medium/High)
-4. Safety Advice
-Be practical and avoid exaggeration.{context_part}
 """,
-        "Important Object Summary":
-            f"""
-You are an AI visual assistant.
-Identify the most important objects in the image that a visually impaired person should know about.
-Return output in this format:
-1. Key Objects
-2. What Looks Most Important
-3. Why These Objects Matter
-4. Short Spoken Summary
-Keep it easy to understand.{context_part}
 """,
-        "Safe Action Suggestion":
-            f"""
-You are an AI guidance assistant for a visually impaired person.
-Based on the image, suggest the next safest action.
-Return output in this format:
-1. What the Scene Looks Like
-2. What Needs Attention
-3. Recommended Action
-4. One-Line Safety Tip
-Do not assume too much. Give cautious guidance.{context_part}
 """
     }
-    return prompts.get(mode, prompts["Scene Description"])
-def analyze_image(image, mode, user_context):
-    if image is None:
-        return "Please upload an image."
-    prompt = build_prompt(mode, user_context)
-    return run_fastvlm(image, prompt)
-def exhibition_pitch(mode):
-    pitches = {
-        "Scene Description":
-            "This mode explains the surrounding environment in simple words so a visually impaired person can understand the scene.",
-        "Hazard Detection":
-            "This mode checks whether the image contains obstacles or risky elements such as vehicles, stairs, clutter, or unsafe walking areas.",
-        "Important Object Summary":
-            "This mode highlights the most useful objects in the scene so the user can focus on what matters most.",
-        "Safe Action Suggestion":
-            "This mode provides the next practical action the user should consider, based on the visual situation."
     }
-    return pitches.get(mode, "")
-with gr.Blocks(title="VisionMate AI - Smart Visual Assistant") as demo:
-    gr.Markdown("""
-# 👁️ VisionMate AI
-## Smart Visual Assistant for Visually Impaired People
-Upload an image and let the AI explain the scene, identify hazards, summarize important objects, or suggest the safest next action.
-### Exhibition Theme
-**AI for Social Good**
-""")
     with gr.Row():
         with gr.Column(scale=1):
-            image_input = gr.Image(type="pil", label="Upload Scene Image")
-            mode = gr.Radio(
-                choices=[
-                    "Scene Description",
-                    "Hazard Detection",
-                    "Important Object Summary",
-                    "Safe Action Suggestion"
-                ],
-                value="Scene Description",
-                label="Select Assistance Mode"
             )
-            user_context = gr.Textbox(
-                label="Optional Context",
-                placeholder="Example: Person is walking alone on a road / indoor corridor / market area",
                 lines=2
             )
             with gr.Row():
-                analyze_btn = gr.Button("Analyze Scene", variant="primary")
-                clear_btn = gr.ClearButton([image_input, user_context])
-        with gr.Column(scale=1):
-            mode_explanation = gr.Textbox(
-                label="Mode Purpose",
-                value=exhibition_pitch("Scene Description"),
-                interactive=False,
-                lines=4
             )
-            output = gr.Textbox(
-                label="AI Assistance Output",
-                lines=16,
-                max_lines=25,
-                show_copy_button=True
             )
-    mode.change(fn=exhibition_pitch, inputs=mode, outputs=mode_explanation)
-    analyze_btn.click(fn=analyze_image, inputs=[image_input, mode, user_context], outputs=output)
     gr.Markdown("""
 ---
-### Suggested Demo Images for Exhibition
-- A road with vehicles and pedestrians
-- A classroom or hallway
-- A kitchen or home environment
-- A supermarket shelf or crowded place
-### Expected Impact
-This project shows how computer vision and multimodal AI can improve accessibility and independence for visually impaired users.
 """)
 if __name__ == "__main__":
     demo.launch(
         share=False,

 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
+# =========================================================
+# Model setup
+# =========================================================
 MID = "apple/FastVLM-0.5B"
 IMAGE_TOKEN_INDEX = -200
 def load_model():
     global tok, model
     if tok is None or model is None:
+        print("Loading FastVLM on CPU...")
         tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
         model = AutoModelForCausalLM.from_pretrained(
             MID,
             device_map="cpu",
             trust_remote_code=True,
         )
+        print("Model loaded successfully on CPU.")
     return tok, model
+def run_fastvlm(image, prompt, max_new_tokens=180):
     if image is None:
         return "Please upload an image first."
         model_dtype = next(model.parameters()).dtype
         img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype, device=model_device)
         input_ids = torch.cat(
             [pre_ids.to(model_device), img_tok, post_ids.to(model_device)],
             dim=1
         )
         attention_mask = torch.ones_like(input_ids, device=model_device)
         pixel_values = model.get_vision_tower().image_processor(
                 inputs=input_ids,
                 attention_mask=attention_mask,
                 images=pixel_values,
+                max_new_tokens=max_new_tokens,
                 do_sample=False
             )
         return response
     except Exception as e:
+        return f"Error generating response: {str(e)}"
+# =========================================================
+# Use case knowledge cards
+# =========================================================
+# Descriptive "knowledge card" text for every supported use case.
+# The keys double as the choices of the use-case dropdowns in the UI, and each
+# entry carries the four fields ("problem", "beneficiaries", "proof",
+# "judge_angle") that get_use_case_card() renders as Markdown.
+USE_CASE_INFO = {
+    "Accessibility Assistant": {
+        "problem": "A visually impaired user may need quick scene understanding and help identifying objects or obstacles.",
+        "beneficiaries": "Visually impaired users, caregivers, accessibility NGOs, smart assistive-tech teams.",
+        "proof": "The app describes the scene, highlights key items, and gives practical guidance.",
+        "judge_angle": "Shows AI for inclusion and social impact."
+    },
+    "Safety Checker": {
+        "problem": "People often miss visible risks in busy roads, stairways, cluttered spaces, or public areas.",
+        "beneficiaries": "Schools, public-space monitoring teams, safety awareness projects.",
+        "proof": "The app flags possible risks, risky zones, and next-safe-action ideas.",
+        "judge_angle": "Shows preventive AI and practical awareness."
+    },
+    "Museum / Exhibit Guide": {
+        "problem": "Visitors want engaging explanations, not just raw object names.",
+        "beneficiaries": "Museums, exhibitions, tourism projects, learning spaces.",
+        "proof": "The app turns the same image into a friendly guide-like explanation.",
+        "judge_angle": "Shows storytelling plus education."
+    },
+    "Retail Shelf Helper": {
+        "problem": "Customers and staff need quick item understanding, arrangement insight, and shelf-level interpretation.",
+        "beneficiaries": "Retail stores, FMCG demos, smart shopping assistants.",
+        "proof": "The app summarizes visible products, arrangement, and shopper-facing insights.",
+        "judge_angle": "Shows business and commercial use."
+    },
+    "Classroom Explainer": {
+        "problem": "Students often understand better when images are explained in simple, structured language.",
+        "beneficiaries": "Teachers, students, EdTech demos, smart classrooms.",
+        "proof": "The app explains the image like a teacher using easy language and teaching points.",
+        "judge_angle": "Shows educational value."
+    },
+    "Travel Interpreter": {
+        "problem": "Travelers want quick understanding of landmarks, scenes, crowd conditions, and surroundings.",
+        "beneficiaries": "Travel apps, tourism assistance, city experience projects.",
+        "proof": "The app explains what the place appears to be, what stands out, and what a visitor should notice.",
+        "judge_angle": "Shows lifestyle and tourism use."
+    }
+}
+def get_use_case_card(use_case):
+    """Render the Markdown knowledge card for *use_case*.
+
+    Looks the use case up in USE_CASE_INFO (raising KeyError for an unknown
+    name) and returns a Markdown string with a heading followed by the
+    problem, beneficiaries, proof and judge-angle sections.
+    """
+    card = USE_CASE_INFO[use_case]
+    # The empty first and last items reproduce the leading and trailing
+    # newline of the card text.
+    card_lines = [
+        "",
+        f"### {use_case}",
+        "**Problem Solved**",
+        f"{card['problem']}",
+        "**Who Benefits**",
+        f"{card['beneficiaries']}",
+        "**What This Demo Proves**",
+        f"{card['proof']}",
+        "**Why Judges Usually Like It**",
+        f"{card['judge_angle']}",
+        "",
+    ]
+    return "\n".join(card_lines)
+# =========================================================
+# Prompt builders
+# =========================================================
+def build_use_case_prompt(use_case, user_context):
+    """Build the role-specific instruction prompt for a use case.
+
+    Args:
+        use_case: One of the keys of the prompt table below; an unknown
+            value raises KeyError.
+        user_context: Optional free-text context from the UI. None, an
+            empty string, or whitespace-only text all fall back to the
+            default "No extra context provided.".
+
+    Returns:
+        The prompt text for the selected use case, ending with a
+        "Context: ..." line.
+    """
+    # Strip first, then apply the default: whitespace-only input is truthy
+    # but must not produce an empty "Context:" line.
+    context = (user_context or "").strip() or "No extra context provided."
     prompts = {
+        "Accessibility Assistant": f"""
+You are an assistive AI helping a visually impaired user.
+Analyze the uploaded image and return your answer in this format:
+1. Quick Scene Summary
+2. Main Objects and Their Positions
+3. Anything Important to Notice
+4. Helpful Guidance for the User
+Use simple, natural, practical language.
+Mention uncertainty when needed.
+Context: {context}
+""",
+        "Safety Checker": f"""
+You are an AI safety observer.
+Analyze the uploaded image and return your answer in this format:
+1. What the Scene Appears to Show
+2. Possible Hazards or Risky Elements
+3. Risk Level: Low / Medium / High
+4. Best Next Safe Action
+Be cautious, grounded, and practical.
+Do not invent invisible hazards.
+Mention uncertainty when needed.
+Context: {context}
+""",
+        "Museum / Exhibit Guide": f"""
+You are a smart museum guide.
+Analyze the uploaded image and return:
+1. What Visitors Are Looking At
+2. Interesting Visual Details
+3. Why It Could Matter / Be Memorable
+4. A Friendly 2-line Visitor Guide
+Make it warm, engaging, and exhibition-friendly.
+Context: {context}
 """,
+        "Retail Shelf Helper": f"""
+You are an AI retail assistant.
+Analyze the uploaded image and return:
+1. What Products / Objects Are Visible
+2. Arrangement or Display Observations
+3. Shopper-Friendly Insights
+4. Staff / Store Improvement Suggestion
+Be concise, business-relevant, and practical.
+Context: {context}
 """,
+        "Classroom Explainer": f"""
+You are a teacher explaining the image to students.
+Return:
+1. What We See
+2. Main Concepts / Objects
+3. Easy Explanation for Students
+4. One Learning Question
+Use clear, beginner-friendly language.
+Context: {context}
 """,
+        "Travel Interpreter": f"""
+You are an AI travel companion.
+Analyze the uploaded image and return:
+1. What This Place / Scene Looks Like
+2. What a Visitor Would Notice First
+3. Interesting or Useful Observations
+4. One Practical Travel Tip
+Stay grounded in the visible scene.
+Context: {context}
 """
     }
+    return prompts[use_case]
+def build_persona_prompt(persona, tone, goal):
+    """Build the prompt that makes the model answer in a chosen persona.
+
+    Args:
+        persona: Role name inserted into the prompt.
+        tone: Tone name inserted into the prompt.
+        goal: Optional free-text goal. None, an empty string, or
+            whitespace-only text all fall back to the default
+            "Explain the image in your role.".
+
+    Returns:
+        The five-part persona prompt text.
+    """
+    # Strip first, then apply the default: whitespace-only input is truthy
+    # but must not produce an empty "Goal:" line.
+    goal_text = (goal or "").strip() or "Explain the image in your role."
+    return f"""
+You are analyzing the image as this role: {persona}
+Tone: {tone}
+Goal: {goal_text}
+Return your answer in this format:
+1. Role Introduction
+2. What I Notice First
+3. What Matters Most From My Perspective
+4. My Advice / Commentary
+5. One Memorable Closing Line
+Stay grounded in the image.
+Do not pretend to know hidden facts.
+"""
+def build_mission_prompt(mission, mission_context):
+    """Build the prompt for a Mission Lab task.
+
+    Args:
+        mission: One of the keys of the mission table below; an unknown
+            value raises KeyError.
+        mission_context: Optional free-text context. None, an empty string,
+            or whitespace-only text all fall back to "No extra context.".
+
+    Returns:
+        The prompt text for the selected mission, ending with a
+        "Context: ..." line.
+    """
+    # Strip first, then apply the default: whitespace-only input is truthy
+    # but must not produce an empty "Context:" line.
+    context = (mission_context or "").strip() or "No extra context."
+    mission_prompts = {
+        "Hidden Detail Hunt": f"""
+Study the image carefully.
+Return:
+1. 5 specific details that are easy to miss
+2. Why each detail matters
+3. What those details suggest about the scene
+Stay grounded in the visible image only.
+Context: {context}
+""",
+        "Exhibit Quiz Maker": f"""
+Create a mini exhibition quiz from the image.
+Return:
+1. Five quiz questions
+2. Correct answer under each question
+3. One final bonus question
+Make the quiz engaging and image-based.
+Context: {context}
+""",
+        "Pitch From the Picture": f"""
+Look at the image and imagine a useful product, service, or startup idea inspired by it.
+Return:
+1. Problem Seen in the Image
+2. Product / Service Idea
+3. Target Users
+4. One-line Pitch
+Keep it smart, creative, but still linked to the image.
+Context: {context}
+""",
+        "Evidence Board": f"""
+Analyze the image critically.
+Return:
+1. Things that are clearly visible
+2. Things that are likely but not certain
+3. Things that should NOT be assumed
+4. Why careful interpretation matters
+This mission is for teaching responsible AI reasoning.
+Context: {context}
+""",
+        "Story Spark": f"""
+Create a short story inspired by the image.
+Return:
+1. Title
+2. Story in under 120 words
+3. What visual details inspired the story
+Keep it imaginative but tied to the scene.
+Context: {context}
+""",
+        "Accessibility Voiceover": f"""
+Create a voiceover-style narration for a visually impaired user.
+Return:
+1. Calm spoken scene narration
+2. Important objects
+3. Immediate practical note
+4. Final short reassurance
+Make it audio-friendly and natural.
+Context: {context}
+"""
     }
+    return mission_prompts[mission]
+def build_question_prompt(question):
+    """Build the prompt for a free-form question about the image.
+
+    Args:
+        question: The user's question. None, an empty string, or
+            whitespace-only text all fall back to
+            "What is happening in this image?".
+
+    Returns:
+        The question-answering prompt text.
+    """
+    # Strip first, then apply the default: whitespace-only input is truthy
+    # but must not send the model an empty question.
+    user_q = (question or "").strip() or "What is happening in this image?"
+    return f"""
+Answer the user's question about the image.
+Question: {user_q}
+Return:
+1. Direct Answer
+2. Evidence From the Image
+3. Uncertainty Note if Needed
+Keep it short and reliable.
+"""
+# =========================================================
+# App functions
+# =========================================================
+def analyze_use_case(image, use_case, user_context):
+    """Run the use-case prompt for *use_case* on *image* (200 new tokens max)."""
+    return run_fastvlm(
+        image,
+        build_use_case_prompt(use_case, user_context),
+        max_new_tokens=200,
+    )
+def persona_playground(image, persona, tone, goal):
+    """Run the persona prompt on *image* (190 new tokens max)."""
+    return run_fastvlm(
+        image,
+        build_persona_prompt(persona, tone, goal),
+        max_new_tokens=190,
+    )
+def mission_lab(image, mission, mission_context):
+    """Run the selected mission prompt on *image* (220 new tokens max)."""
+    return run_fastvlm(
+        image,
+        build_mission_prompt(mission, mission_context),
+        max_new_tokens=220,
+    )
+def ask_image(image, question):
+    """Answer a free-form *question* about *image* (160 new tokens max)."""
+    return run_fastvlm(
+        image,
+        build_question_prompt(question),
+        max_new_tokens=160,
+    )
+def compare_booth(image, compare_context):
+    """Describe one image through three lenses: accessibility, safety, teaching.
+
+    Args:
+        image: The uploaded image, passed straight through to run_fastvlm.
+        compare_context: Optional free-text context. None, an empty string,
+            or whitespace-only text all fall back to "No extra context.".
+
+    Returns:
+        A 3-tuple of model outputs in the order
+        (accessibility, safety, teaching).
+    """
+    # Strip first, then apply the default: whitespace-only input is truthy
+    # but must not produce an empty "Context:" line.
+    context = (compare_context or "").strip() or "No extra context."
+    prompt_1 = f"""
+Explain this image as an Accessibility Assistant.
+Return:
+1. Scene Summary
+2. Important Objects
+3. Helpful Guidance
+Context: {context}
+"""
+    prompt_2 = f"""
+Explain this image as a Safety Checker.
+Return:
+1. Visible Risks
+2. Risk Level
+3. Safe Next Step
+Context: {context}
+"""
+    prompt_3 = f"""
+Explain this image as a Classroom Teacher.
+Return:
+1. What Students See
+2. Main Idea
+3. One Learning Question
+Context: {context}
+"""
+    # Three sequential model calls, each capped at 140 new tokens.
+    out1 = run_fastvlm(image, prompt_1, max_new_tokens=140)
+    out2 = run_fastvlm(image, prompt_2, max_new_tokens=140)
+    out3 = run_fastvlm(image, prompt_3, max_new_tokens=140)
+    return out1, out2, out3
+def generate_exhibit_script(use_case):
+    """Return the canned 30-second pitch (Markdown) for *use_case*.
+
+    The text is static; nothing is generated by the model. An unknown
+    use case raises KeyError.
+    """
+    pitch_by_use_case = {
+        "Accessibility Assistant": """
+### 30-Second Pitch
+This project turns image understanding into an accessibility helper.
+A user uploads a scene, and the system explains what is visible, what matters most, and what practical guidance may help.
+This shows how multimodal AI can support inclusion, independence, and human-centered design.
+**Best line for judges:**
+"We are not just describing pictures. We are translating visual space into usable understanding."
+""",
+        "Safety Checker": """
+### 30-Second Pitch
+This project uses visual AI to inspect scenes for visible risk signals such as clutter, unsafe movement zones, or attention-worthy areas.
+It is useful as an awareness tool for schools, public demonstrations, and smart safety education.
+The value is not only detection, but guidance.
+**Best line for judges:**
+"This app turns passive vision into preventive awareness."
+""",
+        "Museum / Exhibit Guide": """
+### 30-Second Pitch
+This project acts like an AI guide that explains images in a visitor-friendly way.
+Instead of only naming objects, it creates interpretation, context, and memorable observations.
+It can be adapted for museums, campus exhibitions, tourism booths, and educational spaces.
+**Best line for judges:**
+"We changed image captioning into an interactive guide experience."
+""",
+        "Retail Shelf Helper": """
+### 30-Second Pitch
+This project interprets shelf images and converts them into shopper and business insights.
+It can help summarize visible products, arrangement cues, and display observations.
+This shows how the same AI model can serve a commercial use case without retraining.
+**Best line for judges:**
+"One image can become both a customer insight and an operational insight."
+""",
+        "Classroom Explainer": """
+### 30-Second Pitch
+This project uses image understanding to support teaching.
+It explains the same visual in simple educational language and even creates learning prompts.
+That makes it useful for smart classrooms, EdTech projects, and visual learning tools.
+**Best line for judges:**
+"This app helps students look at an image and actually learn from it."
+""",
+        "Travel Interpreter": """
+### 30-Second Pitch
+This project behaves like a visual travel companion.
+It interprets scenes, highlights what visitors may notice, and gives useful context or practical tips.
+That makes it relevant for tourism, smart city experiences, and visitor support.
+**Best line for judges:**
+"We turned one uploaded image into a mini travel briefing."
+""",
+    }
+    selected_pitch = pitch_by_use_case[use_case]
+    return selected_pitch
+# =========================================================
+# UI text
+# =========================================================
+# Landing Markdown shown at the very top of the app via gr.Markdown(HERO).
+HERO = """
+# VisionVerse AI
+## Exhibition Studio for Real-World Image Intelligence
+Upload one image and explore many use cases:
+- accessibility
+- safety
+- teaching
+- tourism
+- retail
+- storytelling
+- evidence checking
+- interactive Q&A
+### What makes this exhibition-ready?
+This is not a one-button caption demo.
+It is a **multi-use visual intelligence studio** designed to prove that a single AI vision engine can serve many real-world situations.
+"""
+# Static Markdown body of the "Project Info" tab (rendered with gr.Markdown).
+INFO_PAGE = """
+# Project Info
+## 1) What this project is
+VisionVerse AI is an exhibition-ready visual intelligence app built on top of a multimodal image-language model.
+Instead of using the model for just one generic caption, the app wraps it in multiple roles, scenarios, and interaction modes.
+## 2) Core idea
+One uploaded image can be interpreted in many ways:
+- as an accessibility helper
+- as a safety observer
+- as a teacher
+- as a museum guide
+- as a retail assistant
+- as a travel companion
+- as a critical evidence checker
+## 3) Why this matters
+In many student projects, the model is good but the demonstration feels narrow.
+This app proves flexibility, purpose, and user-centered design.
+## 4) Architecture
+- Gradio front-end
+- FastVLM multimodal model
+- CPU-only inference
+- Prompt engineering for role adaptation
+- Tab-based interaction design
+## 5) Strengths
+- many real-world uses from one model
+- strong exhibition storytelling
+- easy demo with any uploaded image
+- playful interaction modes
+- educational and social impact angles
+## 6) Limitations
+- runs on CPU, so response can be slower
+- not a certified medical or safety device
+- may miss fine details or make uncertain interpretations
+- should be used as assistive AI, not final authority
+## 7) Responsible AI note
+The Evidence Board mission is included to show that good AI systems should separate:
+- what is clearly visible
+- what is likely
+- what should not be assumed
+## 8) Suggested evaluation ideas
+- response usefulness
+- clarity of explanation
+- consistency across different scenes
+- user satisfaction by use case
+- educational / accessibility impact
+## 9) Best demo images
+- road or traffic scene
+- classroom or laboratory
+- store shelf
+- museum object
+- crowded public place
+- home kitchen or hallway
+## 10) Best exhibition closing line
+"This project is not about generating text from images. It is about generating the right kind of help for the right kind of user."
+"""
+# Custom stylesheet handed to gr.Blocks(css=CSS).
+# NOTE(review): no component in the visible part of the file uses the
+# .card-note class - confirm it is still needed.
+CSS = """
+.gradio-container {
+    max-width: 1400px !important;
+}
+.card-note {
+    border-radius: 16px;
+    padding: 14px;
+    background: #f6f8ff;
+}
+"""
+# =========================================================
+# Gradio UI
+# =========================================================
+# Top-level Gradio layout. A single shared image input feeds every model-backed
+# tab; each tab wires its own button to one of the app functions defined above.
+with gr.Blocks(title="VisionVerse AI", css=CSS, theme=gr.themes.Soft()) as demo:
+    gr.Markdown(HERO)
     with gr.Row():
         with gr.Column(scale=1):
+            # The one image component that all tabs read from.
+            shared_image = gr.Image(type="pil", label="Upload Image for All Tabs")
+            clear_all = gr.ClearButton([shared_image], value="Clear Image")
+        with gr.Column(scale=1):
+            gr.Markdown("""
+### Quick Demo Route
+1. Upload one image
+2. Open **Use Case Studio**
+3. Open **Persona Playground**
+4. Open **Mission Lab**
+5. Open **Compare Booth**
+6. End with **Live Exhibit Script**
+This flow makes the demo feel layered, interactive, and purposeful.
+""")
+    with gr.Tabs():
+        # Tab 1: use-case analysis plus the matching knowledge card.
+        with gr.Tab("Use Case Studio"):
+            with gr.Row():
+                with gr.Column():
+                    use_case = gr.Dropdown(
+                        choices=list(USE_CASE_INFO.keys()),
+                        value="Accessibility Assistant",
+                        label="Choose Real-World Use Case"
+                    )
+                    use_case_context = gr.Textbox(
+                        label="Optional Context",
+                        placeholder="Example: school corridor / grocery shelf / street crossing / museum object",
+                        lines=2
+                    )
+                    use_case_btn = gr.Button("Run Use Case Analysis", variant="primary")
+                with gr.Column():
+                    use_case_card = gr.Markdown(get_use_case_card("Accessibility Assistant"))
+            use_case_output = gr.Textbox(
+                label="Use Case Output",
+                lines=16,
+                max_lines=24,
+                show_copy_button=True
+            )
+            # Changing the dropdown only refreshes the card; the model runs on click.
+            use_case.change(fn=get_use_case_card, inputs=use_case, outputs=use_case_card)
+            use_case_btn.click(
+                fn=analyze_use_case,
+                inputs=[shared_image, use_case, use_case_context],
+                outputs=use_case_output
+            )
+        # Tab 2: persona / tone / goal driven description.
+        with gr.Tab("Persona Playground"):
+            gr.Markdown("Make the same image speak through different roles. This is great for grabbing attention at an exhibition.")
+            with gr.Row():
+                with gr.Column():
+                    persona = gr.Dropdown(
+                        choices=[
+                            "Teacher",
+                            "Tour Guide",
+                            "Safety Officer",
+                            "Journalist",
+                            "Retail Manager",
+                            "Emergency Responder",
+                            "Storyteller",
+                            "Accessibility Coach"
+                        ],
+                        value="Teacher",
+                        label="Choose Persona"
+                    )
+                    tone = gr.Dropdown(
+                        choices=["Friendly", "Professional", "Calm", "Excited", "Analytical", "Simple"],
+                        value="Friendly",
+                        label="Tone"
+                    )
+                    persona_goal = gr.Textbox(
+                        label="Goal",
+                        placeholder="Example: explain to children / brief judges / guide a visitor",
+                        lines=2
+                    )
+                    persona_btn = gr.Button("Transform Through Persona", variant="primary")
+                with gr.Column():
+                    persona_output = gr.Textbox(
+                        label="Persona Response",
+                        lines=18,
+                        max_lines=26,
+                        show_copy_button=True
+                    )
+            persona_btn.click(
+                fn=persona_playground,
+                inputs=[shared_image, persona, tone, persona_goal],
+                outputs=persona_output
+            )
+        # Tab 3: fixed "missions"; the choices match the keys used in build_mission_prompt.
+        with gr.Tab("Mission Lab"):
+            gr.Markdown("This tab gives the app unusual interaction playgrounds. These are excellent for proving flexibility, creativity, and responsible reasoning.")
+            with gr.Row():
+                with gr.Column():
+                    mission = gr.Radio(
+                        choices=[
+                            "Hidden Detail Hunt",
+                            "Exhibit Quiz Maker",
+                            "Pitch From the Picture",
+                            "Evidence Board",
+                            "Story Spark",
+                            "Accessibility Voiceover"
+                        ],
+                        value="Hidden Detail Hunt",
+                        label="Choose Mission"
+                    )
+                    mission_context = gr.Textbox(
+                        label="Mission Context",
+                        placeholder="Example: target audience is school students / judges / visually impaired users",
+                        lines=2
+                    )
+                    mission_btn = gr.Button("Run Mission", variant="primary")
+                with gr.Column():
+                    mission_output = gr.Textbox(
+                        label="Mission Output",
+                        lines=18,
+                        max_lines=28,
+                        show_copy_button=True
+                    )
+            mission_btn.click(
+                fn=mission_lab,
+                inputs=[shared_image, mission, mission_context],
+                outputs=mission_output
             )
+        # Tab 4: free-form question answering about the shared image.
+        with gr.Tab("Ask the Image"):
+            gr.Markdown("Ask anything about the uploaded image. This makes the demo feel conversational rather than static.")
+            with gr.Row():
+                with gr.Column():
+                    user_question = gr.Textbox(
+                        label="Ask a Question About the Image",
+                        placeholder="What is the most important object here? / Does this look crowded? / What should a student learn from this?",
+                        lines=2
+                    )
+                    ask_btn = gr.Button("Ask", variant="primary")
+                with gr.Column():
+                    ask_output = gr.Textbox(
+                        label="Answer",
+                        lines=12,
+                        max_lines=20,
+                        show_copy_button=True
+                    )
+            ask_btn.click(
+                fn=ask_image,
+                inputs=[shared_image, user_question],
+                outputs=ask_output
+            )
+        # Tab 5: one click, three outputs; compare_booth returns them in this order.
+        with gr.Tab("Compare Booth"):
+            gr.Markdown("One image, three minds. This tab is strong for proving that the same model can support different goals.")
+            compare_context = gr.Textbox(
+                label="Optional Compare Context",
+                placeholder="Example: public road / classroom / tourist spot",
                 lines=2
             )
+            compare_btn = gr.Button("Run 3-Way Compare", variant="primary")
             with gr.Row():
+                compare_out_1 = gr.Textbox(label="Accessibility Lens", lines=14, show_copy_button=True)
+                compare_out_2 = gr.Textbox(label="Safety Lens", lines=14, show_copy_button=True)
+                compare_out_3 = gr.Textbox(label="Teaching Lens", lines=14, show_copy_button=True)
+            compare_btn.click(
+                fn=compare_booth,
+                inputs=[shared_image, compare_context],
+                outputs=[compare_out_1, compare_out_2, compare_out_3]
+            )
+        # Tab 6: static pitch text from generate_exhibit_script; no image input is wired in.
+        with gr.Tab("Live Exhibit Script"):
+            gr.Markdown("Use this tab at the end of your demo. It gives you clean lines to say in front of judges.")
+            script_use_case = gr.Dropdown(
+                choices=list(USE_CASE_INFO.keys()),
+                value="Accessibility Assistant",
+                label="Choose Your Main Showcase Angle"
             )
+            script_btn = gr.Button("Generate Pitch Script", variant="primary")
+            script_output = gr.Markdown()
+            script_btn.click(
+                fn=generate_exhibit_script,
+                inputs=script_use_case,
+                outputs=script_output
             )
+        # Tab 7: static project description.
+        with gr.Tab("Project Info"):
+            gr.Markdown(INFO_PAGE)
     gr.Markdown("""
 ---
+### Extra Exhibition Tips
+**Best live flow**
+- start with Accessibility Assistant
+- switch to Persona Playground
+- show Evidence Board in Mission Lab
+- finish with Compare Booth
+- close using Live Exhibit Script
+**Why that works**
+You show usefulness, creativity, responsibility, and communication in one go.
+**Note**
+The Compare Booth runs the model three times, so it can be slower on CPU.
 """)
 if __name__ == "__main__":
     demo.launch(
         share=False,