Spaces:

ganeshkumar383
/

VisualBot

Sleeping

App Files Files Community

ganeshkumar383 committed on Feb 8

Commit

928bc12

verified ·

1 Parent(s): 631dbaf

Create app.py

Browse files

Files changed (1) hide show

app.py +413 -0

app.py ADDED Viewed

	@@ -0,0 +1,413 @@

+"""
+VISUAL CONVERSATIONAL INTELLIGENCE ENGINE
+==========================================
+A pluggable, image-grounded multi-turn conversational system.
+Architecture:
+- Session-based image memory (stored once, queried multiple times)
+- Vision-Language Model (BLIP) for image-question answering
+- REST-style core logic (pure functions)
+- Gradio UI for demonstration
+Academic Purpose:
+Demonstrates AI system design for visual question answering with
+conversational context, suitable for research evaluation.
+"""
+import gradio as gr
+from PIL import Image
+from transformers import BlipProcessor, BlipForQuestionAnswering
+import torch
+from typing import Optional, Tuple, List
+import uuid
+# ============================================================================
+# SESSION MEMORY MANAGEMENT
+# ============================================================================
+class SessionMemory:
+    """
+    Manages session state for image-grounded conversations.
+    Each session stores:
+    - uploaded_image: PIL Image object
+    - conversation_history: List of (question, answer) tuples
+    - session_id: Unique identifier for the session
+    """
+    def __init__(self):
+        self.sessions = {}
+    def create_session(self) -> str:
+        """Create a new session and return its ID."""
+        session_id = str(uuid.uuid4())
+        self.sessions[session_id] = {
+            'uploaded_image': None,
+            'conversation_history': []
+        }
+        return session_id
+    def store_image(self, session_id: str, image: Image.Image) -> None:
+        """Store an image in session memory."""
+        if session_id in self.sessions:
+            self.sessions[session_id]['uploaded_image'] = image
+    def get_image(self, session_id: str) -> Optional[Image.Image]:
+        """Retrieve the stored image from session."""
+        if session_id in self.sessions:
+            return self.sessions[session_id]['uploaded_image']
+        return None
+    def add_to_history(self, session_id: str, question: str, answer: str) -> None:
+        """Add a Q&A pair to conversation history."""
+        if session_id in self.sessions:
+            self.sessions[session_id]['conversation_history'].append((question, answer))
+    def get_history(self, session_id: str) -> List[Tuple[str, str]]:
+        """Retrieve conversation history."""
+        if session_id in self.sessions:
+            return self.sessions[session_id]['conversation_history']
+        return []
+    def reset_session(self, session_id: str) -> None:
+        """Clear all session data (image + conversation history)."""
+        if session_id in self.sessions:
+            self.sessions[session_id] = {
+                'uploaded_image': None,
+                'conversation_history': []
+            }
+# ============================================================================
+# VISION-LANGUAGE MODEL INITIALIZATION
+# ============================================================================
+class VisualQAEngine:
+    """
+    Core inference engine using BLIP (Bootstrapping Language-Image Pre-training).
+    BLIP is a vision-language model that can answer questions about images.
+    We use the pretrained model without any fine-tuning.
+    """
+    def __init__(self, model_name: str = "Salesforce/blip-vqa-base"):
+        """
+        Initialize the BLIP model and processor.
+        Args:
+            model_name: HuggingFace model identifier
+        """
+        print(f"Loading model: {model_name}")
+        self.processor = BlipProcessor.from_pretrained(model_name)
+        self.model = BlipForQuestionAnswering.from_pretrained(model_name)
+        # Use GPU if available, otherwise CPU (for HuggingFace Spaces compatibility)
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.model.to(self.device)
+        print(f"Model loaded on device: {self.device}")
+    def answer_question(self, image: Image.Image, question: str) -> str:
+        """
+        Generate an answer to a question about the image.
+        This is a PURE FUNCTION suitable for REST APIs:
+        - Takes image + question as input
+        - Returns answer as output
+        - No side effects
+        Args:
+            image: PIL Image object
+            question: User's question about the image
+        Returns:
+            Generated answer grounded in the image
+        """
+        # Preprocess image and question
+        inputs = self.processor(image, question, return_tensors="pt").to(self.device)
+        # Generate answer using the vision-language model
+        with torch.no_grad():
+            outputs = self.model.generate(**inputs, max_length=50)
+        # Decode the generated answer
+        answer = self.processor.decode(outputs[0], skip_special_tokens=True)
+        return answer
+# ============================================================================
+# APPLICATION LOGIC (REST-STYLE PURE FUNCTIONS)
+# ============================================================================
+def validate_question(question: str, image: Optional[Image.Image]) -> Tuple[bool, str]:
+    """
+    Validate that conditions are met for answering a question.
+    Validation rules:
+    1. Image must be uploaded
+    2. Question must not be empty
+    Args:
+        question: User's input question
+        image: Stored image (or None)
+    Returns:
+        (is_valid, error_message)
+    """
+    if image is None:
+        return False, "⚠️ Please upload an image first before asking questions."
+    if not question or question.strip() == "":
+        return False, "⚠️ Please enter a question."
+    return True, ""
+def process_question(
+    vqa_engine: VisualQAEngine,
+    session_memory: SessionMemory,
+    session_id: str,
+    question: str
+) -> Tuple[str, List[Tuple[str, str]]]:
+    """
+    Process a user question and generate an image-grounded answer.
+    This function orchestrates the core conversational flow:
+    1. Validate inputs
+    2. Retrieve image from session
+    3. Generate answer using vision-language model
+    4. Update conversation history
+    5. Return answer + updated history
+    Args:
+        vqa_engine: Visual QA inference engine
+        session_memory: Session storage
+        session_id: Current session identifier
+        question: User's question
+    Returns:
+        (answer, updated_conversation_history)
+    """
+    # Retrieve stored image
+    image = session_memory.get_image(session_id)
+    # Validate inputs
+    is_valid, error_msg = validate_question(question, image)
+    if not is_valid:
+        return error_msg, session_memory.get_history(session_id)
+    # Generate image-grounded answer
+    answer = vqa_engine.answer_question(image, question)
+    # Update conversation history
+    session_memory.add_to_history(session_id, question, answer)
+    # Return answer and updated history
+    return answer, session_memory.get_history(session_id)
+def handle_image_upload(
+    session_memory: SessionMemory,
+    session_id: str,
+    image: Image.Image
+) -> str:
+    """
+    Handle image upload and store in session memory.
+    Args:
+        session_memory: Session storage
+        session_id: Current session identifier
+        image: Uploaded PIL Image
+    Returns:
+        Confirmation message
+    """
+    if image is None:
+        return "⚠️ No image uploaded."
+    # Store image in session
+    session_memory.store_image(session_id, image)
+    return "✅ Image uploaded successfully! You can now ask questions about this image."
+def reset_conversation(
+    session_memory: SessionMemory,
+    session_id: str
+) -> Tuple[str, List, None]:
+    """
+    Reset the conversation (clear image and history).
+    Args:
+        session_memory: Session storage
+        session_id: Current session identifier
+    Returns:
+        (status_message, empty_history, None_for_image)
+    """
+    session_memory.reset_session(session_id)
+    return "🔄 Conversation reset. Please upload a new image.", [], None
+# ============================================================================
+# GRADIO UI INTERFACE
+# ============================================================================
+def create_gradio_interface(vqa_engine: VisualQAEngine, session_memory: SessionMemory) -> gr.Blocks:
+    """
+    Create the Gradio UI for the Visual Conversational Intelligence Engine.
+    UI Components:
+    - Image upload
+    - Question input
+    - Chat history display
+    - Reset button
+    """
+    with gr.Blocks(title="Visual Conversational Intelligence Engine") as demo:
+        # Session state (hidden)
+        session_id = gr.State(value=session_memory.create_session())
+        # Header
+        gr.Markdown("""
+        # 🔍 Visual Conversational Intelligence Engine
+        **An image-grounded multi-turn conversational system**
+        ### How to use:
+        1. **Upload an image** (required)
+        2. **Ask questions** about the image
+        3. **Continue the conversation** - ask follow-up questions without re-uploading
+        4. **Reset** to start over with a new image
+        ### Important:
+        - All answers are strictly grounded in the uploaded image
+        - Questions unrelated to the image will be politely declined
+        - The system uses BLIP (Vision-Language Model) for inference
+        """)
+        with gr.Row():
+            with gr.Column(scale=1):
+                # Image upload section
+                gr.Markdown("### 📤 Step 1: Upload Image")
+                image_input = gr.Image(
+                    type="pil",
+                    label="Upload an image to analyze",
+                    height=300
+                )
+                upload_status = gr.Textbox(
+                    label="Upload Status",
+                    interactive=False,
+                    lines=1
+                )
+                # Upload button
+                upload_btn = gr.Button("📥 Upload Image", variant="primary")
+            with gr.Column(scale=1):
+                # Question and conversation section
+                gr.Markdown("### 💬 Step 2: Ask Questions")
+                chatbot = gr.Chatbot(
+                    label="Conversation History",
+                    height=300
+                )
+                question_input = gr.Textbox(
+                    label="Your Question",
+                    placeholder="Ask a question about the uploaded image...",
+                    lines=2
+                )
+                with gr.Row():
+                    submit_btn = gr.Button("🚀 Ask Question", variant="primary")
+                    reset_btn = gr.Button("🔄 Reset Conversation", variant="secondary")
+        # Event handlers
+        def upload_image_handler(image, session_id):
+            """Handle image upload event."""
+            status = handle_image_upload(session_memory, session_id, image)
+            return status
+        def ask_question_handler(question, session_id):
+            """Handle question submission event."""
+            answer, history = process_question(vqa_engine, session_memory, session_id, question)
+            return history, ""  # Return updated history and clear input
+        def reset_handler(session_id):
+            """Handle reset button event."""
+            status, history, image = reset_conversation(session_memory, session_id)
+            return status, history, image
+        # Wire up events
+        upload_btn.click(
+            fn=upload_image_handler,
+            inputs=[image_input, session_id],
+            outputs=[upload_status]
+        )
+        submit_btn.click(
+            fn=ask_question_handler,
+            inputs=[question_input, session_id],
+            outputs=[chatbot, question_input]
+        )
+        question_input.submit(
+            fn=ask_question_handler,
+            inputs=[question_input, session_id],
+            outputs=[chatbot, question_input]
+        )
+        reset_btn.click(
+            fn=reset_handler,
+            inputs=[session_id],
+            outputs=[upload_status, chatbot, image_input]
+        )
+        # Footer
+        gr.Markdown("""
+        ---
+        **Academic Prototype** | Demonstrates AI system design for visual question answering
+        **Tech Stack:** Python • HuggingFace BLIP • Gradio • Session-based Memory
+        """)
+    return demo
+# ============================================================================
+# MAIN APPLICATION ENTRY POINT
+# ============================================================================
+def main():
+    """
+    Initialize and launch the Visual Conversational Intelligence Engine.
+    """
+    print("=" * 60)
+    print("VISUAL CONVERSATIONAL INTELLIGENCE ENGINE")
+    print("=" * 60)
+    # Initialize core components
+    print("\n[1/3] Initializing Vision-Language Model...")
+    vqa_engine = VisualQAEngine(model_name="Salesforce/blip-vqa-base")
+    print("\n[2/3] Setting up session memory...")
+    session_memory = SessionMemory()
+    print("\n[3/3] Creating Gradio interface...")
+    demo = create_gradio_interface(vqa_engine, session_memory)
+    print("\n" + "=" * 60)
+    print("🚀 Launching application...")
+    print("=" * 60)
+    # Launch the application
+    demo.launch(
+        share=False,  # Set to True for public sharing
+        server_name="0.0.0.0",  # Allow external access
+        server_port=7860  # Standard Gradio port
+    )
+# Script entry point: start the app only when this file is executed directly,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()