ganeshkumar383 commited on
Commit
6628dcd
·
verified ·
1 Parent(s): 0b5da16

Create app_advanced.py

Browse files
Files changed (1) hide show
  1. app_advanced.py +782 -0
app_advanced.py ADDED
@@ -0,0 +1,782 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VISUAL CONVERSATIONAL INTELLIGENCE ENGINE
3
+ ==========================================
4
+ A pluggable, image-grounded multi-turn conversational system.
5
+
6
+ Architecture:
7
+ - Session-based image memory (stored once, queried multiple times)
8
+ - Vision-Language Model (BLIP) for image-question answering
9
+ - REST-style core logic (pure functions)
10
+ - Gradio UI for demonstration
11
+
12
+ Academic Purpose:
13
+ Demonstrates AI system design for visual question answering with
14
+ conversational context, suitable for research evaluation.
15
+ """
16
+
17
+ import gradio as gr
18
+ from PIL import Image
19
+ from transformers import BlipProcessor, BlipForQuestionAnswering
20
+ import torch
21
+ from typing import Optional, Tuple, List
22
+ import uuid
23
+ import re
24
+
25
+
26
+ # ============================================================================
27
+ # SESSION MEMORY MANAGEMENT
28
+ # ============================================================================
29
+
30
class SessionMemory:
    """
    In-memory store for image-grounded conversation sessions.

    Each session record holds:
    - uploaded_image: the PIL image currently under discussion (or None)
    - conversation_history: ordered list of (question, answer) tuples

    Keys are UUID4 strings handed out by create_session(). Lookups for
    unknown session IDs are silent no-ops (writes) or empty results (reads).
    """

    def __init__(self):
        # session_id -> {'uploaded_image': ..., 'conversation_history': [...]}
        self.sessions = {}

    @staticmethod
    def _blank_record():
        """Return a fresh, empty session record."""
        return {
            'uploaded_image': None,
            'conversation_history': []
        }

    def create_session(self) -> str:
        """Create a new session and return its ID."""
        new_id = str(uuid.uuid4())
        self.sessions[new_id] = self._blank_record()
        return new_id

    def store_image(self, session_id: str, image: Image.Image) -> None:
        """Store an image in session memory (no-op for unknown sessions)."""
        record = self.sessions.get(session_id)
        if record is not None:
            record['uploaded_image'] = image

    def get_image(self, session_id: str) -> Optional[Image.Image]:
        """Retrieve the stored image, or None when absent or unknown session."""
        record = self.sessions.get(session_id)
        return record['uploaded_image'] if record is not None else None

    def add_to_history(self, session_id: str, question: str, answer: str) -> None:
        """Append a (question, answer) pair to the session's history."""
        record = self.sessions.get(session_id)
        if record is not None:
            record['conversation_history'].append((question, answer))

    def get_history(self, session_id: str) -> List[Tuple[str, str]]:
        """Return the conversation history ([] for unknown sessions)."""
        record = self.sessions.get(session_id)
        return record['conversation_history'] if record is not None else []

    def reset_session(self, session_id: str) -> None:
        """Drop the image and history for a session, keeping the session alive."""
        if session_id in self.sessions:
            self.sessions[session_id] = self._blank_record()
81
+
82
+
83
+ # ============================================================================
84
+ # VISION-LANGUAGE MODEL INITIALIZATION
85
+ # ============================================================================
86
+
87
class VisualQAEngine:
    """
    Thin wrapper around the pretrained BLIP VQA model
    (Bootstrapping Language-Image Pre-training).

    Exposes a single inference entry point, answer_question(), which
    grounds a free-text question in a supplied image. The pretrained
    checkpoint is used as-is, with no fine-tuning.
    """

    def __init__(self, model_name: str = "Salesforce/blip-vqa-base"):
        """
        Load the BLIP processor and model and move the model to the best
        available device.

        Args:
            model_name: HuggingFace model identifier
        """
        print(f"Loading model: {model_name}")
        self.processor = BlipProcessor.from_pretrained(model_name)
        self.model = BlipForQuestionAnswering.from_pretrained(model_name)

        # Prefer GPU when present; fall back to CPU so the app still runs
        # on CPU-only hosts such as HuggingFace Spaces.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        print(f"Model loaded on device: {self.device}")

    def answer_question(self, image: Image.Image, question: str) -> str:
        """
        Answer a question about an image.

        Stateless with respect to the conversation: each call depends only
        on the (image, question) pair, which keeps this REST-friendly.

        Args:
            image: PIL Image object
            question: User's question about the image

        Returns:
            The decoded model answer as plain text.
        """
        # Tokenize/encode both modalities and move tensors to the model device.
        model_inputs = self.processor(image, question, return_tensors="pt").to(self.device)

        # Inference only — no gradient bookkeeping needed.
        with torch.no_grad():
            generated = self.model.generate(**model_inputs, max_length=50)

        # First (only) sequence in the batch, special tokens stripped.
        return self.processor.decode(generated[0], skip_special_tokens=True)
138
+
139
+
140
+ # ============================================================================
141
+ # APPLICATION LOGIC (REST-STYLE PURE FUNCTIONS)
142
+ # ============================================================================
143
+
144
def validate_question(question: str, image: Optional[Image.Image]) -> Tuple[bool, str]:
    """
    Check the preconditions for answering a question.

    Rules (checked in order):
    1. An image must already be stored for the session.
    2. The question must contain at least one non-space character.

    Args:
        question: User's input question
        image: Stored image (or None)

    Returns:
        (is_valid, error_message) — error_message is "" when valid.
    """
    # Without an image, any answer would be ungrounded.
    if image is None:
        return False, "⚠️ Please upload an image first before asking questions."

    # Reject empty or whitespace-only questions.
    if not (question and question.strip()):
        return False, "⚠️ Please enter a question."

    return True, ""
166
+
167
+
168
def process_question(
    vqa_engine: VisualQAEngine,
    session_memory: SessionMemory,
    session_id: str,
    question: str
) -> Tuple[str, List[Tuple[str, str]]]:
    """
    Answer one user question against the session's stored image.

    Flow: fetch image -> validate inputs -> run the VQA model ->
    record the Q&A pair -> return the answer plus updated history.

    Args:
        vqa_engine: Visual QA inference engine
        session_memory: Session storage
        session_id: Current session identifier
        question: User's question

    Returns:
        (answer_or_error_message, conversation_history)
    """
    stored_image = session_memory.get_image(session_id)

    # Invalid input: surface the message and leave history untouched.
    ok, problem = validate_question(question, stored_image)
    if not ok:
        return problem, session_memory.get_history(session_id)

    # Image-grounded answer from the vision-language model.
    reply = vqa_engine.answer_question(stored_image, question)

    # Persist the turn, then hand back the full history for display.
    session_memory.add_to_history(session_id, question, reply)
    return reply, session_memory.get_history(session_id)
209
+
210
+
211
def handle_image_upload(
    session_memory: SessionMemory,
    session_id: str,
    image: Image.Image
) -> str:
    """
    Store a newly uploaded image in the session.

    Args:
        session_memory: Session storage
        session_id: Current session identifier
        image: Uploaded PIL Image (None when nothing was provided)

    Returns:
        Human-readable status message for the UI.
    """
    # Nothing to store — tell the user instead of silently succeeding.
    if image is None:
        return "⚠️ No image uploaded."

    session_memory.store_image(session_id, image)
    return "✅ Image uploaded successfully! You can now ask questions about this image."
234
+
235
+
236
def reset_conversation(
    session_memory: SessionMemory,
    session_id: str
) -> Tuple[str, List, None]:
    """
    Wipe the whole session: stored image AND conversation history.

    Args:
        session_memory: Session storage
        session_id: Current session identifier

    Returns:
        (status_message, empty_history_list, None to clear the image widget)
    """
    session_memory.reset_session(session_id)
    return "🔄 Conversation reset. Please upload a new image.", [], None
252
+
253
+
254
+ # ============================================================================
255
+ # GRADIO UI INTERFACE
256
+ # ============================================================================
257
+
258
def format_history_for_chatbot(history: List[Tuple[str, str]]) -> List[dict]:
    """
    Flatten internal (question, answer) tuples into the Gradio v4
    Chatbot "messages" format: alternating user/assistant dicts.
    """
    return [
        entry
        for question, answer in history
        for entry in (
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer},
        )
    ]
268
+
269
+
270
def generate_visual_topic_suggestions(
    vqa_engine: VisualQAEngine,
    image: Image.Image
) -> List[str]:
    """
    Generate guided visual topic suggestions using the SAME BLIP VQA model.

    IMPORTANT:
    - This is GUIDANCE ONLY, not object detection
    - Uses a small, fixed set of internal prompts
    - Extracts 1-4 single-word nouns only
    - Does NOT claim to list all objects

    Args:
        vqa_engine: Visual QA inference engine
        image: Uploaded PIL Image

    Returns:
        List of up to 4 unique single-word topic suggestions (possibly empty).
    """
    if image is None:
        return []

    # Fixed set of internal prompts for guidance
    internal_prompts = [
        "What is the main object in the image?",
        "Is there a furniture item?",
        "Is there an electronic device?",
        "Is there a plant?"
    ]

    # Loop-invariant filter data, hoisted out of the prompt loop.
    stop_words = {'yes', 'no', 'the', 'a', 'an', 'is', 'are', 'there', 'not'}

    suggestions = []

    for prompt in internal_prompts:
        try:
            answer = vqa_engine.answer_question(image, prompt)
        except Exception:
            # Best-effort guidance: a single failed prompt should not abort
            # the whole pass. (Was a bare `except:`, which would also have
            # swallowed KeyboardInterrupt/SystemExit.)
            continue
        # Keep alphabetic tokens only; answers are short free text.
        words = re.findall(r'\b[a-zA-Z]+\b', answer.lower())
        # Drop yes/no-style filler and very short words.
        meaningful_words = [w for w in words if w not in stop_words and len(w) > 2]
        if meaningful_words:
            # First meaningful word stands in as the topic for this prompt.
            suggestions.append(meaningful_words[0])

    # De-duplicate while preserving first-seen order, cap at 4.
    return list(dict.fromkeys(suggestions))[:4]
319
+
320
+
321
def clear_chat_only(
    session_memory: SessionMemory,
    session_id: str
) -> Tuple[str, List]:
    """
    Drop the conversation history but keep the stored image (Advanced Mode).

    Args:
        session_memory: Session storage
        session_id: Current session identifier

    Returns:
        (status_message, empty_history_for_the_chatbot)
    """
    # Reach into the raw session dict; unknown sessions are ignored.
    record = session_memory.sessions.get(session_id)
    if record is not None:
        record['conversation_history'] = []
    return "💬 Chat cleared. Image retained.", []
338
+
339
+
340
def remove_image_only(
    session_memory: SessionMemory,
    session_id: str
) -> Tuple[str, None]:
    """
    Drop the stored image but keep the conversation history (Advanced Mode).

    Args:
        session_memory: Session storage
        session_id: Current session identifier

    Returns:
        (status_message, None to clear the image widget)
    """
    # Reach into the raw session dict; unknown sessions are ignored.
    record = session_memory.sessions.get(session_id)
    if record is not None:
        record['uploaded_image'] = None
    return "🖼️ Image removed. Chat history retained.", None
357
+
358
+
359
def get_session_metadata(
    session_memory: SessionMemory,
    session_id: str
) -> str:
    """
    Build the read-only metadata summary shown in Advanced Mode.

    Args:
        session_memory: Session storage
        session_id: Current session identifier

    Returns:
        Markdown-formatted metadata string.
    """
    record = session_memory.sessions.get(session_id)
    if record is None:
        return "Session ID: Unknown\nImage Loaded: No\nConversation Turns: 0"

    has_image = "Yes" if record['uploaded_image'] is not None else "No"
    turns = len(record['conversation_history'])

    # Only the first 8 chars of the UUID are shown, to keep the panel compact.
    return f"""**Session ID:** `{session_id[:8]}...`
**Image Loaded:** {has_image}
**Conversation Turns:** {turns}"""
383
+
384
def create_gradio_interface(vqa_engine: VisualQAEngine, session_memory: SessionMemory) -> gr.Blocks:
    """
    Create the Gradio UI for the Visual Conversational Intelligence Engine.

    UI Components:
    - Mode selector (Basic / Advanced)
    - Image upload with guided topic suggestions
    - Question input with type selector (Advanced Mode)
    - Chat history display
    - Advanced controls and metadata (Advanced Mode only)

    Args:
        vqa_engine: Visual QA inference engine used by all event handlers.
        session_memory: Session storage shared across events.

    Returns:
        The assembled gr.Blocks application (caller is responsible for launch()).
    """

    # Custom CSS for visual polish and theming
    custom_css = """
    .mode-selector {font-size: 16px; font-weight: bold;}
    .topic-chip {margin: 4px; padding: 8px 16px; border-radius: 16px; background: #e3f2fd; cursor: pointer;}
    .capability-box {background: #f5f5f5; padding: 16px; border-radius: 8px; margin: 8px 0;}
    .metadata-box {background: #fafafa; padding: 12px; border-radius: 6px; font-family: monospace;}
    """

    with gr.Blocks(title="Visual Conversational Intelligence Engine", css=custom_css) as demo:
        # Session state (hidden)
        # NOTE(review): the session is created once at interface-build time,
        # so every browser tab appears to share one session ID — confirm
        # whether per-visitor isolation is required.
        session_id = gr.State(value=session_memory.create_session())

        # Mode state (Basic = default)
        # NOTE(review): mode_state is never read or written below; the
        # mode_selector Radio drives visibility directly. Possibly dead state.
        mode_state = gr.State(value="Basic")

        # Header
        gr.Markdown("""
        # 🔍 Visual Conversational Intelligence Engine

        **An image-grounded multi-turn conversational system for academic demonstration**
        """)

        # MODE SELECTOR (TOP OF UI)
        with gr.Row():
            mode_selector = gr.Radio(
                choices=["Basic Mode", "Advanced Mode"],
                value="Basic Mode",
                label="Interface Mode",
                info="Basic Mode: Student-friendly interface | Advanced Mode: Research/admin view with additional controls",
                elem_classes="mode-selector"
            )

        # BASIC MODE INSTRUCTIONS (shown only in Basic Mode)
        basic_instructions = gr.Markdown("""
        ### 🎓 How to use (Student View):
        1. **Upload an image** 📤
        2. **Ask questions** about the image 💬
        3. **Continue the conversation** - ask follow-up questions without re-uploading
        4. **Reset** to start over with a new image 🔄

        **Note:** All answers are strictly grounded in the uploaded image.
        """, visible=True)

        # MAIN LAYOUT (TWO COLUMNS)
        with gr.Row():
            # LEFT COLUMN: IMAGE UPLOAD SECTION
            with gr.Column(scale=1):
                with gr.Group():
                    gr.Markdown("### 📤 Upload Image")
                    image_input = gr.Image(
                        type="pil",
                        label="Drag and drop or click to upload",
                        height=400
                    )
                    upload_status = gr.Textbox(
                        label="Status",
                        interactive=False,
                        lines=1
                    )
                    upload_btn = gr.Button("📥 Upload Image", variant="primary", size="lg")

                # GUIDED VISUAL TOPIC SUGGESTIONS (shown after upload)
                gr.Markdown("#### 💡 Suggested Visual Topics (Guidance Only)")
                gr.Markdown("*Click a topic to prefill your question. These are suggestions, not exhaustive object lists.*")

                # Four hidden buttons reused as topic "chips"; upload_image_handler
                # fills in labels and toggles visibility per suggestion.
                with gr.Row():
                    topic_btn_1 = gr.Button("", visible=False, size="sm")
                    topic_btn_2 = gr.Button("", visible=False, size="sm")
                    topic_btn_3 = gr.Button("", visible=False, size="sm")
                    topic_btn_4 = gr.Button("", visible=False, size="sm")

            # RIGHT COLUMN: CHAT / CONVERSATION SECTION
            with gr.Column(scale=1):
                with gr.Group():
                    gr.Markdown("### 💬 Ask Questions")
                    # NOTE(review): handlers feed this component message-dicts
                    # (see format_history_for_chatbot), but no type="messages"
                    # is passed here; some Gradio 4.x versions default to the
                    # tuple format — confirm against the pinned Gradio version.
                    chatbot = gr.Chatbot(
                        label="Conversation History",
                        height=400
                    )
                    question_input = gr.Textbox(
                        label="Your Question",
                        placeholder="Ask a question about the uploaded image...",
                        lines=2
                    )

                    with gr.Row():
                        submit_btn = gr.Button("🚀 Ask Question", variant="primary", size="lg")
                        reset_btn_basic = gr.Button("🔄 Reset All", variant="secondary", size="lg")

        # ADVANCED MODE PANEL (shown only in Advanced Mode)
        with gr.Group(visible=False) as advanced_panel:
            gr.Markdown("## 🔬 Advanced Controls & Metadata")

            with gr.Row():
                # QUESTION TYPE SELECTOR (GUIDANCE ONLY)
                with gr.Column(scale=1):
                    gr.Markdown("### Question Type Selector (Guidance)")
                    question_type = gr.Dropdown(
                        choices=[
                            "Object Presence",
                            "Object Attribute (Color / Shape)",
                            "Scene Understanding",
                            "Yes / No Verification"
                        ],
                        label="Select Question Type",
                        info="This is for guidance only. It does not change AI logic.",
                        value="Object Presence"
                    )

                # SESSION METADATA PANEL (READ-ONLY)
                with gr.Column(scale=1):
                    gr.Markdown("### Session Metadata")
                    session_metadata = gr.Markdown(
                        "**Session ID:** Not initialized\n**Image Loaded:** No\n**Conversation Turns:** 0"
                    )
                    refresh_metadata_btn = gr.Button("🔄 Refresh Metadata", size="sm")

            # CAPABILITY / SCOPE INDICATOR (STATIC INFO BOX)
            with gr.Row():
                gr.Markdown("""
                ### ⚙️ System Capabilities & Limitations

                **What this system CAN do:**
                - ✅ Image-grounded Question Answering
                - ✅ Single-image Conversational Memory
                - ✅ Multi-turn dialogue about the same image

                **What this system CANNOT do:**
                - ❌ Exhaustive Object Listing (not object detection)
                - ❌ Multi-image Reasoning
                - ❌ Precise Counting (VQA models have known limitations)
                - ❌ Open-domain knowledge questions unrelated to the image

                *This is an academic prototype demonstrating AI system design, not a production object detection system.*
                """)

            # ADVANCED RESET CONTROLS
            with gr.Row():
                gr.Markdown("### Reset Controls")
            with gr.Row():
                clear_chat_btn = gr.Button("💬 Clear Chat Only", variant="secondary")
                remove_image_btn = gr.Button("🖼️ Remove Image Only", variant="secondary")
                full_reset_btn = gr.Button("🔄 Full Reset (Image + Chat)", variant="stop")

        # Footer
        gr.Markdown("""
        ---
        **Academic Prototype** | Demonstrates AI system design for visual question answering

        **Tech Stack:** Python • HuggingFace BLIP • Gradio • Session-based Memory
        """)

        # ====================================================================
        # EVENT HANDLERS
        # ====================================================================

        def toggle_mode(mode_choice):
            """
            Toggle between Basic and Advanced Mode.
            Mode toggle does NOT reset session or image.
            Returns a component->update dict (Gradio dict-style outputs).
            """
            is_advanced = (mode_choice == "Advanced Mode")
            return {
                advanced_panel: gr.update(visible=is_advanced),
                basic_instructions: gr.update(visible=not is_advanced),
                reset_btn_basic: gr.update(visible=not is_advanced)
            }

        def upload_image_handler(image, session_id):
            """
            Handle image upload event.
            Stores image and generates guided topic suggestions.
            Returns [status] + 4 button updates (order matches upload_btn outputs).
            """
            status = handle_image_upload(session_memory, session_id, image)

            # Generate guided topic suggestions
            # NOTE(review): this runs up to 4 extra model inferences per
            # upload, which can be slow on CPU-only hosts.
            suggestions = generate_visual_topic_suggestions(vqa_engine, image)

            # Update topic buttons: fill and show one per suggestion,
            # hide the remainder.
            updates = []
            for i in range(4):
                if i < len(suggestions):
                    updates.append(gr.update(value=suggestions[i], visible=True))
                else:
                    updates.append(gr.update(value="", visible=False))

            return [status] + updates

        def topic_click_handler(topic_text):
            """
            Handle topic chip click.
            Prefills question input with suggested topic.
            User can edit before submitting.
            """
            return f"What is the {topic_text} in the image?"

        def ask_question_handler(question, session_id):
            """
            Handle question submission.
            Uses existing process_question logic (unchanged).
            Second return value clears the question textbox.
            """
            answer, history = process_question(
                vqa_engine, session_memory, session_id, question
            )
            formatted_history = format_history_for_chatbot(history)
            return formatted_history, ""

        def question_type_change_handler(question_type):
            """
            Handle question type selector change.
            Optionally prefills question input with example.
            This is GUIDANCE ONLY.
            """
            examples = {
                "Object Presence": "Is there a [object] in the image?",
                "Object Attribute (Color / Shape)": "What color is the [object]?",
                "Scene Understanding": "What is happening in the image?",
                "Yes / No Verification": "Is the [object] [attribute]?"
            }
            return examples.get(question_type, "")

        def refresh_metadata_handler(session_id):
            """
            Refresh session metadata display.
            """
            return get_session_metadata(session_memory, session_id)

        def clear_chat_handler(session_id):
            """
            Clear chat only (Advanced Mode).
            """
            status, history = clear_chat_only(session_memory, session_id)
            return status, []

        def remove_image_handler(session_id):
            """
            Remove image only (Advanced Mode).
            """
            status, image = remove_image_only(session_memory, session_id)
            return status, image

        def full_reset_handler(session_id):
            """
            Full reset (Advanced Mode).
            Returns 7 values matching full_reset_btn's 7 outputs:
            status, chat history, image, and the 4 topic-chip values.
            """
            status, history, image = reset_conversation(session_memory, session_id)
            return status, [], image, "", "", "", ""

        def basic_reset_handler(session_id):
            """
            Basic mode reset.
            """
            status, history, image = reset_conversation(session_memory, session_id)
            return status, [], image

        # ====================================================================
        # WIRE UP EVENTS
        # ====================================================================

        # Mode toggle
        mode_selector.change(
            fn=toggle_mode,
            inputs=[mode_selector],
            outputs=[advanced_panel, basic_instructions, reset_btn_basic]
        )

        # Image upload
        upload_btn.click(
            fn=upload_image_handler,
            inputs=[image_input, session_id],
            outputs=[upload_status, topic_btn_1, topic_btn_2, topic_btn_3, topic_btn_4]
        )

        # Topic chip clicks: each button passes its own label (the topic word)
        # into the prefill handler.
        topic_btn_1.click(
            fn=topic_click_handler,
            inputs=[topic_btn_1],
            outputs=[question_input]
        )
        topic_btn_2.click(
            fn=topic_click_handler,
            inputs=[topic_btn_2],
            outputs=[question_input]
        )
        topic_btn_3.click(
            fn=topic_click_handler,
            inputs=[topic_btn_3],
            outputs=[question_input]
        )
        topic_btn_4.click(
            fn=topic_click_handler,
            inputs=[topic_btn_4],
            outputs=[question_input]
        )

        # Question submission (button click and Enter in the textbox share
        # the same handler).
        submit_btn.click(
            fn=ask_question_handler,
            inputs=[question_input, session_id],
            outputs=[chatbot, question_input]
        )

        question_input.submit(
            fn=ask_question_handler,
            inputs=[question_input, session_id],
            outputs=[chatbot, question_input]
        )

        # Question type selector (Advanced Mode)
        question_type.change(
            fn=question_type_change_handler,
            inputs=[question_type],
            outputs=[question_input]
        )

        # Metadata refresh (Advanced Mode)
        refresh_metadata_btn.click(
            fn=refresh_metadata_handler,
            inputs=[session_id],
            outputs=[session_metadata]
        )

        # Advanced reset controls
        clear_chat_btn.click(
            fn=clear_chat_handler,
            inputs=[session_id],
            outputs=[upload_status, chatbot]
        )

        remove_image_btn.click(
            fn=remove_image_handler,
            inputs=[session_id],
            outputs=[upload_status, image_input]
        )

        full_reset_btn.click(
            fn=full_reset_handler,
            inputs=[session_id],
            outputs=[upload_status, chatbot, image_input, topic_btn_1, topic_btn_2, topic_btn_3, topic_btn_4]
        )

        # Basic mode reset
        reset_btn_basic.click(
            fn=basic_reset_handler,
            inputs=[session_id],
            outputs=[upload_status, chatbot, image_input]
        )

    return demo
745
+
746
+
747
+ # ============================================================================
748
+ # MAIN APPLICATION ENTRY POINT
749
+ # ============================================================================
750
+
751
def main():
    """
    Initialize and launch the Visual Conversational Intelligence Engine.

    Side effects: loads the BLIP model (network download on first run),
    prints progress to stdout, and blocks on the Gradio server.
    """
    banner = "=" * 60
    print(banner)
    print("VISUAL CONVERSATIONAL INTELLIGENCE ENGINE")
    print(banner)

    # Build the three collaborating pieces in dependency order.
    print("\n[1/3] Initializing Vision-Language Model...")
    engine = VisualQAEngine(model_name="Salesforce/blip-vqa-base")

    print("\n[2/3] Setting up session memory...")
    memory = SessionMemory()

    print("\n[3/3] Creating Gradio interface...")
    app = create_gradio_interface(engine, memory)

    print("\n" + banner)
    print("🚀 Launching application...")
    print(banner)

    # Bind on all interfaces on the standard Gradio port;
    # flip share=True for a public tunnel.
    app.launch(
        share=False,
        server_name="0.0.0.0",
        server_port=7860
    )
779
+
780
+
781
# Script entry point: launch only when executed directly, not on import.
if __name__ == "__main__":
    main()