ganeshkumar383 commited on
Commit
6628dcd
·
verified ·
1 Parent(s): 0b5da16

Create app_advanced.py

Browse files
Files changed (1) hide show
  1. app_advanced.py +782 -0
app_advanced.py ADDED
@@ -0,0 +1,782 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VISUAL CONVERSATIONAL INTELLIGENCE ENGINE
3
+ ==========================================
4
+ A pluggable, image-grounded multi-turn conversational system.
5
+
6
+ Architecture:
7
+ - Session-based image memory (stored once, queried multiple times)
8
+ - Vision-Language Model (BLIP) for image-question answering
9
+ - REST-style core logic (pure functions)
10
+ - Gradio UI for demonstration
11
+
12
+ Academic Purpose:
13
+ Demonstrates AI system design for visual question answering with
14
+ conversational context, suitable for research evaluation.
15
+ """
16
+
17
+ import gradio as gr
18
+ from PIL import Image
19
+ from transformers import BlipProcessor, BlipForQuestionAnswering
20
+ import torch
21
+ from typing import Optional, Tuple, List
22
+ import uuid
23
+ import re
24
+
25
+
26
+ # ============================================================================
27
+ # SESSION MEMORY MANAGEMENT
28
+ # ============================================================================
29
+
30
class SessionMemory:
    """
    In-memory store for image-grounded conversation sessions.

    Each session record holds:
    - uploaded_image: the PIL image currently under discussion (or None)
    - conversation_history: ordered list of (question, answer) tuples

    Keys are UUID4 strings handed out by create_session(). Lookups for
    unknown session IDs are silent no-ops (writes) or empty results (reads).
    """

    def __init__(self):
        # session_id -> {'uploaded_image': ..., 'conversation_history': [...]}
        self.sessions = {}

    @staticmethod
    def _blank_record():
        """Return a fresh, empty session record."""
        return {
            'uploaded_image': None,
            'conversation_history': []
        }

    def create_session(self) -> str:
        """Create a new session and return its ID."""
        new_id = str(uuid.uuid4())
        self.sessions[new_id] = self._blank_record()
        return new_id

    def store_image(self, session_id: str, image: Image.Image) -> None:
        """Store an image in session memory (no-op for unknown sessions)."""
        record = self.sessions.get(session_id)
        if record is not None:
            record['uploaded_image'] = image

    def get_image(self, session_id: str) -> Optional[Image.Image]:
        """Retrieve the stored image, or None when absent or unknown session."""
        record = self.sessions.get(session_id)
        return record['uploaded_image'] if record is not None else None

    def add_to_history(self, session_id: str, question: str, answer: str) -> None:
        """Append a (question, answer) pair to the session's history."""
        record = self.sessions.get(session_id)
        if record is not None:
            record['conversation_history'].append((question, answer))

    def get_history(self, session_id: str) -> List[Tuple[str, str]]:
        """Return the conversation history ([] for unknown sessions)."""
        record = self.sessions.get(session_id)
        return record['conversation_history'] if record is not None else []

    def reset_session(self, session_id: str) -> None:
        """Drop the image and history for a session, keeping the session alive."""
        if session_id in self.sessions:
            self.sessions[session_id] = self._blank_record()
81
+
82
+
83
+ # ============================================================================
84
+ # VISION-LANGUAGE MODEL INITIALIZATION
85
+ # ============================================================================
86
+
87
class VisualQAEngine:
    """
    Thin wrapper around the pretrained BLIP VQA model
    (Bootstrapping Language-Image Pre-training).

    Exposes a single inference entry point, answer_question(), which
    grounds a free-text question in a supplied image. The pretrained
    checkpoint is used as-is, with no fine-tuning.
    """

    def __init__(self, model_name: str = "Salesforce/blip-vqa-base"):
        """
        Load the BLIP processor and model and move the model to the best
        available device.

        Args:
            model_name: HuggingFace model identifier
        """
        print(f"Loading model: {model_name}")
        self.processor = BlipProcessor.from_pretrained(model_name)
        self.model = BlipForQuestionAnswering.from_pretrained(model_name)

        # Prefer GPU when present; fall back to CPU so the app still runs
        # on CPU-only hosts such as HuggingFace Spaces.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        print(f"Model loaded on device: {self.device}")

    def answer_question(self, image: Image.Image, question: str) -> str:
        """
        Answer a question about an image.

        Stateless with respect to the conversation: each call depends only
        on the (image, question) pair, which keeps this REST-friendly.

        Args:
            image: PIL Image object
            question: User's question about the image

        Returns:
            The decoded model answer as plain text.
        """
        # Tokenize/encode both modalities and move tensors to the model device.
        model_inputs = self.processor(image, question, return_tensors="pt").to(self.device)

        # Inference only — no gradient bookkeeping needed.
        with torch.no_grad():
            generated = self.model.generate(**model_inputs, max_length=50)

        # First (only) sequence in the batch, special tokens stripped.
        return self.processor.decode(generated[0], skip_special_tokens=True)
138
+
139
+
140
+ # ============================================================================
141
+ # APPLICATION LOGIC (REST-STYLE PURE FUNCTIONS)
142
+ # ============================================================================
143
+
144
def validate_question(question: str, image: Optional[Image.Image]) -> Tuple[bool, str]:
    """
    Check the preconditions for answering a question.

    Rules (checked in order):
    1. An image must already be stored for the session.
    2. The question must contain at least one non-space character.

    Args:
        question: User's input question
        image: Stored image (or None)

    Returns:
        (is_valid, error_message) — error_message is "" when valid.
    """
    # Without an image, any answer would be ungrounded.
    if image is None:
        return False, "⚠️ Please upload an image first before asking questions."

    # Reject empty or whitespace-only questions.
    if not (question and question.strip()):
        return False, "⚠️ Please enter a question."

    return True, ""
166
+
167
+
168
def process_question(
    vqa_engine: VisualQAEngine,
    session_memory: SessionMemory,
    session_id: str,
    question: str
) -> Tuple[str, List[Tuple[str, str]]]:
    """
    Answer one user question against the session's stored image.

    Flow: fetch image -> validate inputs -> run the VQA model ->
    record the Q&A pair -> return the answer plus updated history.

    Args:
        vqa_engine: Visual QA inference engine
        session_memory: Session storage
        session_id: Current session identifier
        question: User's question

    Returns:
        (answer_or_error_message, conversation_history)
    """
    stored_image = session_memory.get_image(session_id)

    # Invalid input: surface the message and leave history untouched.
    ok, problem = validate_question(question, stored_image)
    if not ok:
        return problem, session_memory.get_history(session_id)

    # Image-grounded answer from the vision-language model.
    reply = vqa_engine.answer_question(stored_image, question)

    # Persist the turn, then hand back the full history for display.
    session_memory.add_to_history(session_id, question, reply)
    return reply, session_memory.get_history(session_id)
209
+
210
+
211
def handle_image_upload(
    session_memory: SessionMemory,
    session_id: str,
    image: Image.Image
) -> str:
    """
    Store a newly uploaded image in the session.

    Args:
        session_memory: Session storage
        session_id: Current session identifier
        image: Uploaded PIL Image (None when nothing was provided)

    Returns:
        Human-readable status message for the UI.
    """
    # Nothing to store — tell the user instead of silently succeeding.
    if image is None:
        return "⚠️ No image uploaded."

    session_memory.store_image(session_id, image)
    return "✅ Image uploaded successfully! You can now ask questions about this image."
234
+
235
+
236
def reset_conversation(
    session_memory: SessionMemory,
    session_id: str
) -> Tuple[str, List, None]:
    """
    Wipe the whole session: stored image AND conversation history.

    Args:
        session_memory: Session storage
        session_id: Current session identifier

    Returns:
        (status_message, empty_history_list, None to clear the image widget)
    """
    session_memory.reset_session(session_id)
    return "🔄 Conversation reset. Please upload a new image.", [], None
252
+
253
+
254
+ # ============================================================================
255
+ # GRADIO UI INTERFACE
256
+ # ============================================================================
257
+
258
def format_history_for_chatbot(history: List[Tuple[str, str]]) -> List[dict]:
    """
    Flatten internal (question, answer) tuples into the Gradio v4
    Chatbot "messages" format: alternating user/assistant dicts.
    """
    return [
        entry
        for question, answer in history
        for entry in (
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer},
        )
    ]
268
+
269
+
270
def generate_visual_topic_suggestions(
    vqa_engine: VisualQAEngine,
    image: Image.Image
) -> List[str]:
    """
    Generate guided visual topic suggestions using the SAME BLIP VQA model.

    IMPORTANT:
    - This is GUIDANCE ONLY, not object detection
    - Uses a small, fixed set of internal prompts
    - Extracts 1-4 single-word nouns only
    - Does NOT claim to list all objects

    Args:
        vqa_engine: Visual QA inference engine
        image: Uploaded PIL Image

    Returns:
        List of up to 4 unique single-word topic suggestions (possibly empty).
    """
    if image is None:
        return []

    # Fixed set of internal prompts for guidance
    internal_prompts = [
        "What is the main object in the image?",
        "Is there a furniture item?",
        "Is there an electronic device?",
        "Is there a plant?"
    ]

    # Loop-invariant filter data, hoisted out of the prompt loop.
    stop_words = {'yes', 'no', 'the', 'a', 'an', 'is', 'are', 'there', 'not'}

    suggestions = []

    for prompt in internal_prompts:
        try:
            answer = vqa_engine.answer_question(image, prompt)
        except Exception:
            # Best-effort guidance: a single failed prompt should not abort
            # the whole pass. (Was a bare `except:`, which would also have
            # swallowed KeyboardInterrupt/SystemExit.)
            continue
        # Keep alphabetic tokens only; answers are short free text.
        words = re.findall(r'\b[a-zA-Z]+\b', answer.lower())
        # Drop yes/no-style filler and very short words.
        meaningful_words = [w for w in words if w not in stop_words and len(w) > 2]
        if meaningful_words:
            # First meaningful word stands in as the topic for this prompt.
            suggestions.append(meaningful_words[0])

    # De-duplicate while preserving first-seen order, cap at 4.
    return list(dict.fromkeys(suggestions))[:4]
319
+
320
+
321
def clear_chat_only(
    session_memory: SessionMemory,
    session_id: str
) -> Tuple[str, List]:
    """
    Drop the conversation history but keep the stored image (Advanced Mode).

    Args:
        session_memory: Session storage
        session_id: Current session identifier

    Returns:
        (status_message, empty_history_for_the_chatbot)
    """
    # Reach into the raw session dict; unknown sessions are ignored.
    record = session_memory.sessions.get(session_id)
    if record is not None:
        record['conversation_history'] = []
    return "💬 Chat cleared. Image retained.", []
338
+
339
+
340
def remove_image_only(
    session_memory: SessionMemory,
    session_id: str
) -> Tuple[str, None]:
    """
    Drop the stored image but keep the conversation history (Advanced Mode).

    Args:
        session_memory: Session storage
        session_id: Current session identifier

    Returns:
        (status_message, None to clear the image widget)
    """
    # Reach into the raw session dict; unknown sessions are ignored.
    record = session_memory.sessions.get(session_id)
    if record is not None:
        record['uploaded_image'] = None
    return "🖼️ Image removed. Chat history retained.", None
357
+
358
+
359
def get_session_metadata(
    session_memory: SessionMemory,
    session_id: str
) -> str:
    """
    Build the read-only metadata summary shown in Advanced Mode.

    Args:
        session_memory: Session storage
        session_id: Current session identifier

    Returns:
        Markdown-formatted metadata string.
    """
    record = session_memory.sessions.get(session_id)
    if record is None:
        return "Session ID: Unknown\nImage Loaded: No\nConversation Turns: 0"

    has_image = "Yes" if record['uploaded_image'] is not None else "No"
    turns = len(record['conversation_history'])

    # Only the first 8 chars of the UUID are shown, to keep the panel compact.
    return f"""**Session ID:** `{session_id[:8]}...`
**Image Loaded:** {has_image}
**Conversation Turns:** {turns}"""
383
+
384
def create_gradio_interface(vqa_engine: VisualQAEngine, session_memory: SessionMemory) -> gr.Blocks:
    """
    Create the Gradio UI for the Visual Conversational Intelligence Engine.

    UI Components:
    - Mode selector (Basic / Advanced)
    - Image upload with guided topic suggestions
    - Question input with type selector (Advanced Mode)
    - Chat history display
    - Advanced controls and metadata (Advanced Mode only)

    Args:
        vqa_engine: Visual QA inference engine used by all event handlers.
        session_memory: Session storage shared across events.

    Returns:
        The assembled gr.Blocks application (caller is responsible for launch()).
    """

    # Custom CSS for visual polish and theming
    custom_css = """
    .mode-selector {font-size: 16px; font-weight: bold;}
    .topic-chip {margin: 4px; padding: 8px 16px; border-radius: 16px; background: #e3f2fd; cursor: pointer;}
    .capability-box {background: #f5f5f5; padding: 16px; border-radius: 8px; margin: 8px 0;}
    .metadata-box {background: #fafafa; padding: 12px; border-radius: 6px; font-family: monospace;}
    """

    with gr.Blocks(title="Visual Conversational Intelligence Engine", css=custom_css) as demo:
        # Session state (hidden)
        # NOTE(review): the session is created once at interface-build time,
        # so every browser tab appears to share one session ID — confirm
        # whether per-visitor isolation is required.
        session_id = gr.State(value=session_memory.create_session())

        # Mode state (Basic = default)
        # NOTE(review): mode_state is never read or written below; the
        # mode_selector Radio drives visibility directly. Possibly dead state.
        mode_state = gr.State(value="Basic")

        # Header
        gr.Markdown("""
        # 🔍 Visual Conversational Intelligence Engine

        **An image-grounded multi-turn conversational system for academic demonstration**
        """)

        # MODE SELECTOR (TOP OF UI)
        with gr.Row():
            mode_selector = gr.Radio(
                choices=["Basic Mode", "Advanced Mode"],
                value="Basic Mode",
                label="Interface Mode",
                info="Basic Mode: Student-friendly interface | Advanced Mode: Research/admin view with additional controls",
                elem_classes="mode-selector"
            )

        # BASIC MODE INSTRUCTIONS (shown only in Basic Mode)
        basic_instructions = gr.Markdown("""
        ### 🎓 How to use (Student View):
        1. **Upload an image** 📤
        2. **Ask questions** about the image 💬
        3. **Continue the conversation** - ask follow-up questions without re-uploading
        4. **Reset** to start over with a new image 🔄

        **Note:** All answers are strictly grounded in the uploaded image.
        """, visible=True)

        # MAIN LAYOUT (TWO COLUMNS)
        with gr.Row():
            # LEFT COLUMN: IMAGE UPLOAD SECTION
            with gr.Column(scale=1):
                with gr.Group():
                    gr.Markdown("### 📤 Upload Image")
                    image_input = gr.Image(
                        type="pil",
                        label="Drag and drop or click to upload",
                        height=400
                    )
                    upload_status = gr.Textbox(
                        label="Status",
                        interactive=False,
                        lines=1
                    )
                    upload_btn = gr.Button("📥 Upload Image", variant="primary", size="lg")

                # GUIDED VISUAL TOPIC SUGGESTIONS (shown after upload)
                gr.Markdown("#### 💡 Suggested Visual Topics (Guidance Only)")
                gr.Markdown("*Click a topic to prefill your question. These are suggestions, not exhaustive object lists.*")

                # Four hidden buttons reused as topic "chips"; upload_image_handler
                # fills in labels and toggles visibility per suggestion.
                with gr.Row():
                    topic_btn_1 = gr.Button("", visible=False, size="sm")
                    topic_btn_2 = gr.Button("", visible=False, size="sm")
                    topic_btn_3 = gr.Button("", visible=False, size="sm")
                    topic_btn_4 = gr.Button("", visible=False, size="sm")

            # RIGHT COLUMN: CHAT / CONVERSATION SECTION
            with gr.Column(scale=1):
                with gr.Group():
                    gr.Markdown("### 💬 Ask Questions")
                    # NOTE(review): handlers feed this component message-dicts
                    # (see format_history_for_chatbot), but no type="messages"
                    # is passed here; some Gradio 4.x versions default to the
                    # tuple format — confirm against the pinned Gradio version.
                    chatbot = gr.Chatbot(
                        label="Conversation History",
                        height=400
                    )
                    question_input = gr.Textbox(
                        label="Your Question",
                        placeholder="Ask a question about the uploaded image...",
                        lines=2
                    )

                    with gr.Row():
                        submit_btn = gr.Button("🚀 Ask Question", variant="primary", size="lg")
                        reset_btn_basic = gr.Button("🔄 Reset All", variant="secondary", size="lg")

        # ADVANCED MODE PANEL (shown only in Advanced Mode)
        with gr.Group(visible=False) as advanced_panel:
            gr.Markdown("## 🔬 Advanced Controls & Metadata")

            with gr.Row():
                # QUESTION TYPE SELECTOR (GUIDANCE ONLY)
                with gr.Column(scale=1):
                    gr.Markdown("### Question Type Selector (Guidance)")
                    question_type = gr.Dropdown(
                        choices=[
                            "Object Presence",
                            "Object Attribute (Color / Shape)",
                            "Scene Understanding",
                            "Yes / No Verification"
                        ],
                        label="Select Question Type",
                        info="This is for guidance only. It does not change AI logic.",
                        value="Object Presence"
                    )

                # SESSION METADATA PANEL (READ-ONLY)
                with gr.Column(scale=1):
                    gr.Markdown("### Session Metadata")
                    session_metadata = gr.Markdown(
                        "**Session ID:** Not initialized\n**Image Loaded:** No\n**Conversation Turns:** 0"
                    )
                    refresh_metadata_btn = gr.Button("🔄 Refresh Metadata", size="sm")

            # CAPABILITY / SCOPE INDICATOR (STATIC INFO BOX)
            with gr.Row():
                gr.Markdown("""
                ### ⚙️ System Capabilities & Limitations

                **What this system CAN do:**
                - ✅ Image-grounded Question Answering
                - ✅ Single-image Conversational Memory
                - ✅ Multi-turn dialogue about the same image

                **What this system CANNOT do:**
                - ❌ Exhaustive Object Listing (not object detection)
                - ❌ Multi-image Reasoning
                - ❌ Precise Counting (VQA models have known limitations)
                - ❌ Open-domain knowledge questions unrelated to the image

                *This is an academic prototype demonstrating AI system design, not a production object detection system.*
                """)

            # ADVANCED RESET CONTROLS
            with gr.Row():
                gr.Markdown("### Reset Controls")
            with gr.Row():
                clear_chat_btn = gr.Button("💬 Clear Chat Only", variant="secondary")
                remove_image_btn = gr.Button("🖼️ Remove Image Only", variant="secondary")
                full_reset_btn = gr.Button("🔄 Full Reset (Image + Chat)", variant="stop")

        # Footer
        gr.Markdown("""
        ---
        **Academic Prototype** | Demonstrates AI system design for visual question answering

        **Tech Stack:** Python • HuggingFace BLIP • Gradio • Session-based Memory
        """)

        # ====================================================================
        # EVENT HANDLERS
        # ====================================================================

        def toggle_mode(mode_choice):
            """
            Toggle between Basic and Advanced Mode.
            Mode toggle does NOT reset session or image.
            Returns a component->update dict (Gradio dict-style outputs).
            """
            is_advanced = (mode_choice == "Advanced Mode")
            return {
                advanced_panel: gr.update(visible=is_advanced),
                basic_instructions: gr.update(visible=not is_advanced),
                reset_btn_basic: gr.update(visible=not is_advanced)
            }

        def upload_image_handler(image, session_id):
            """
            Handle image upload event.
            Stores image and generates guided topic suggestions.
            Returns [status] + 4 button updates (order matches upload_btn outputs).
            """
            status = handle_image_upload(session_memory, session_id, image)

            # Generate guided topic suggestions
            # NOTE(review): this runs up to 4 extra model inferences per
            # upload, which can be slow on CPU-only hosts.
            suggestions = generate_visual_topic_suggestions(vqa_engine, image)

            # Update topic buttons: fill and show one per suggestion,
            # hide the remainder.
            updates = []
            for i in range(4):
                if i < len(suggestions):
                    updates.append(gr.update(value=suggestions[i], visible=True))
                else:
                    updates.append(gr.update(value="", visible=False))

            return [status] + updates

        def topic_click_handler(topic_text):
            """
            Handle topic chip click.
            Prefills question input with suggested topic.
            User can edit before submitting.
            """
            return f"What is the {topic_text} in the image?"

        def ask_question_handler(question, session_id):
            """
            Handle question submission.
            Uses existing process_question logic (unchanged).
            Second return value clears the question textbox.
            """
            answer, history = process_question(
                vqa_engine, session_memory, session_id, question
            )
            formatted_history = format_history_for_chatbot(history)
            return formatted_history, ""

        def question_type_change_handler(question_type):
            """
            Handle question type selector change.
            Optionally prefills question input with example.
            This is GUIDANCE ONLY.
            """
            examples = {
                "Object Presence": "Is there a [object] in the image?",
                "Object Attribute (Color / Shape)": "What color is the [object]?",
                "Scene Understanding": "What is happening in the image?",
                "Yes / No Verification": "Is the [object] [attribute]?"
            }
            return examples.get(question_type, "")

        def refresh_metadata_handler(session_id):
            """
            Refresh session metadata display.
            """
            return get_session_metadata(session_memory, session_id)

        def clear_chat_handler(session_id):
            """
            Clear chat only (Advanced Mode).
            """
            status, history = clear_chat_only(session_memory, session_id)
            return status, []

        def remove_image_handler(session_id):
            """
            Remove image only (Advanced Mode).
            """
            status, image = remove_image_only(session_memory, session_id)
            return status, image

        def full_reset_handler(session_id):
            """
            Full reset (Advanced Mode).
            Returns 7 values matching full_reset_btn's 7 outputs:
            status, chat history, image, and the 4 topic-chip values.
            """
            status, history, image = reset_conversation(session_memory, session_id)
            return status, [], image, "", "", "", ""

        def basic_reset_handler(session_id):
            """
            Basic mode reset.
            """
            status, history, image = reset_conversation(session_memory, session_id)
            return status, [], image

        # ====================================================================
        # WIRE UP EVENTS
        # ====================================================================

        # Mode toggle
        mode_selector.change(
            fn=toggle_mode,
            inputs=[mode_selector],
            outputs=[advanced_panel, basic_instructions, reset_btn_basic]
        )

        # Image upload
        upload_btn.click(
            fn=upload_image_handler,
            inputs=[image_input, session_id],
            outputs=[upload_status, topic_btn_1, topic_btn_2, topic_btn_3, topic_btn_4]
        )

        # Topic chip clicks: each button passes its own label (the topic word)
        # into the prefill handler.
        topic_btn_1.click(
            fn=topic_click_handler,
            inputs=[topic_btn_1],
            outputs=[question_input]
        )
        topic_btn_2.click(
            fn=topic_click_handler,
            inputs=[topic_btn_2],
            outputs=[question_input]
        )
        topic_btn_3.click(
            fn=topic_click_handler,
            inputs=[topic_btn_3],
            outputs=[question_input]
        )
        topic_btn_4.click(
            fn=topic_click_handler,
            inputs=[topic_btn_4],
            outputs=[question_input]
        )

        # Question submission (button click and Enter in the textbox share
        # the same handler).
        submit_btn.click(
            fn=ask_question_handler,
            inputs=[question_input, session_id],
            outputs=[chatbot, question_input]
        )

        question_input.submit(
            fn=ask_question_handler,
            inputs=[question_input, session_id],
            outputs=[chatbot, question_input]
        )

        # Question type selector (Advanced Mode)
        question_type.change(
            fn=question_type_change_handler,
            inputs=[question_type],
            outputs=[question_input]
        )

        # Metadata refresh (Advanced Mode)
        refresh_metadata_btn.click(
            fn=refresh_metadata_handler,
            inputs=[session_id],
            outputs=[session_metadata]
        )

        # Advanced reset controls
        clear_chat_btn.click(
            fn=clear_chat_handler,
            inputs=[session_id],
            outputs=[upload_status, chatbot]
        )

        remove_image_btn.click(
            fn=remove_image_handler,
            inputs=[session_id],
            outputs=[upload_status, image_input]
        )

        full_reset_btn.click(
            fn=full_reset_handler,
            inputs=[session_id],
            outputs=[upload_status, chatbot, image_input, topic_btn_1, topic_btn_2, topic_btn_3, topic_btn_4]
        )

        # Basic mode reset
        reset_btn_basic.click(
            fn=basic_reset_handler,
            inputs=[session_id],
            outputs=[upload_status, chatbot, image_input]
        )

    return demo
745
+
746
+
747
+ # ============================================================================
748
+ # MAIN APPLICATION ENTRY POINT
749
+ # ============================================================================
750
+
751
def main():
    """
    Initialize and launch the Visual Conversational Intelligence Engine.

    Side effects: loads the BLIP model (network download on first run),
    prints progress to stdout, and blocks on the Gradio server.
    """
    banner = "=" * 60
    print(banner)
    print("VISUAL CONVERSATIONAL INTELLIGENCE ENGINE")
    print(banner)

    # Build the three collaborating pieces in dependency order.
    print("\n[1/3] Initializing Vision-Language Model...")
    engine = VisualQAEngine(model_name="Salesforce/blip-vqa-base")

    print("\n[2/3] Setting up session memory...")
    memory = SessionMemory()

    print("\n[3/3] Creating Gradio interface...")
    app = create_gradio_interface(engine, memory)

    print("\n" + banner)
    print("🚀 Launching application...")
    print(banner)

    # Bind on all interfaces on the standard Gradio port;
    # flip share=True for a public tunnel.
    app.launch(
        share=False,
        server_name="0.0.0.0",
        server_port=7860
    )
779
+
780
+
781
# Script entry point: launch only when executed directly, not on import.
if __name__ == "__main__":
    main()