Spaces:
Sleeping
Sleeping
| """ | |
| VISUAL CONVERSATIONAL INTELLIGENCE ENGINE | |
| ========================================== | |
| A pluggable, image-grounded multi-turn conversational system. | |
| Architecture: | |
| - Session-based image memory (stored once, queried multiple times) | |
| - Vision-Language Model (BLIP) for image-question answering | |
| - REST-style core logic (pure functions) | |
| - Gradio UI for demonstration | |
| Academic Purpose: | |
| Demonstrates AI system design for visual question answering with | |
| conversational context, suitable for research evaluation. | |
| """ | |
| import gradio as gr | |
| from PIL import Image | |
| from transformers import BlipProcessor, BlipForQuestionAnswering | |
| import torch | |
| from typing import Optional, Tuple, List | |
| import uuid | |
| # ============================================================================ | |
| # SESSION MEMORY MANAGEMENT | |
| # ============================================================================ | |
class SessionMemory:
    """
    In-memory store for image-grounded conversation sessions.

    Each session is a dict with two keys:
    - 'uploaded_image': the image stored for the session, or None
    - 'conversation_history': list of (question, answer) tuples

    Sessions are keyed by the UUID4 string returned from create_session().
    Writers are no-ops for an unknown session ID; readers return None / [].
    No method raises for an unknown session ID.
    """

    def __init__(self):
        # Maps session_id -> state dict (see _empty_state for the layout).
        self.sessions = {}

    @staticmethod
    def _empty_state() -> dict:
        """Return a fresh state dict; a new history list is built each call."""
        return {
            'uploaded_image': None,
            'conversation_history': []
        }

    def create_session(self) -> str:
        """Create a new, empty session and return its ID."""
        session_id = str(uuid.uuid4())
        self.sessions[session_id] = self._empty_state()
        return session_id

    def store_image(self, session_id: str, image: "Image.Image") -> None:
        """Store an image in the session; ignored if the session is unknown."""
        state = self.sessions.get(session_id)
        if state is not None:
            state['uploaded_image'] = image

    def get_image(self, session_id: str) -> "Optional[Image.Image]":
        """Return the session's stored image, or None if absent/unknown."""
        state = self.sessions.get(session_id)
        if state is None:
            return None
        return state['uploaded_image']

    def add_to_history(self, session_id: str, question: str, answer: str) -> None:
        """Append a (question, answer) pair; ignored if the session is unknown."""
        state = self.sessions.get(session_id)
        if state is not None:
            state['conversation_history'].append((question, answer))

    def get_history(self, session_id: str) -> List[Tuple[str, str]]:
        """
        Return the session's conversation history ([] if unknown).

        A shallow copy is returned so that callers mutating the result
        cannot alter the stored history behind the store's back.
        """
        state = self.sessions.get(session_id)
        if state is None:
            return []
        return list(state['conversation_history'])

    def reset_session(self, session_id: str) -> None:
        """Clear the session's image and history; the session ID stays valid."""
        if session_id in self.sessions:
            self.sessions[session_id] = self._empty_state()
| # ============================================================================ | |
| # VISION-LANGUAGE MODEL INITIALIZATION | |
| # ============================================================================ | |
class VisualQAEngine:
    """
    Inference engine wrapping a pretrained BLIP visual-question-answering model.

    The processor and model are loaded once in the constructor via
    `from_pretrained` and used as loaded; this class performs no training.
    """

    def __init__(self, model_name: str = "Salesforce/blip-vqa-base"):
        """
        Load the BLIP processor and model and move the model to a device.

        Args:
            model_name: HuggingFace model identifier
        """
        print(f"Loading model: {model_name}")
        self.processor = BlipProcessor.from_pretrained(model_name)
        self.model = BlipForQuestionAnswering.from_pretrained(model_name)
        # Use GPU if available, otherwise CPU (for HuggingFace Spaces compatibility)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        print(f"Model loaded on device: {self.device}")

    def answer_question(
        self,
        image: "Image.Image",
        question: str,
        max_length: int = 50
    ) -> str:
        """
        Generate an answer to a question about the image.

        The method only reads the engine's processor, model and device;
        it does not modify the engine or its arguments.

        Args:
            image: PIL Image object
            question: User's question about the image
            max_length: Value forwarded to `generate(max_length=...)`;
                defaults to 50, the cap that was previously hard-coded

        Returns:
            The decoded answer text with special tokens removed
        """
        # Preprocess image and question, then move tensors to the model's device
        inputs = self.processor(image, question, return_tensors="pt").to(self.device)
        # Inference only: skip gradient tracking
        with torch.no_grad():
            outputs = self.model.generate(**inputs, max_length=max_length)
        # Decode the first (only) generated sequence
        return self.processor.decode(outputs[0], skip_special_tokens=True)
| # ============================================================================ | |
| # APPLICATION LOGIC (REST-STYLE PURE FUNCTIONS) | |
| # ============================================================================ | |
def validate_question(question: str, image: "Optional[Image.Image]") -> Tuple[bool, str]:
    """
    Check that a question can be answered.

    Validation rules, checked in this order:
    1. An image must be present (not None)
    2. The question must contain something other than whitespace

    Args:
        question: User's input question
        image: Stored image (or None)

    Returns:
        (is_valid, error_message); error_message is "" when is_valid is True
    """
    if image is None:
        return False, "⚠️ Please upload an image first before asking questions."
    # Covers None, "" and whitespace-only input
    if not question or not question.strip():
        return False, "⚠️ Please enter a question."
    return True, ""
def process_question(
    vqa_engine: VisualQAEngine,
    session_memory: SessionMemory,
    session_id: str,
    question: str
) -> Tuple[str, List[Tuple[str, str]]]:
    """
    Answer one user question against the image stored for the session.

    Flow:
    1. Look up the session's image
    2. Validate the image/question pair
    3. On success, ask the engine for an answer and record the
       (question, answer) pair in the session history
    4. On failure, use the validation message as the reply and leave
       the history untouched

    Args:
        vqa_engine: Visual QA inference engine
        session_memory: Session storage
        session_id: Current session identifier
        question: User's question

    Returns:
        (answer_or_error_message, conversation_history_after_the_call)
    """
    stored_image = session_memory.get_image(session_id)
    ok, problem = validate_question(question, stored_image)
    if ok:
        reply = vqa_engine.answer_question(stored_image, question)
        session_memory.add_to_history(session_id, question, reply)
    else:
        reply = problem
    # History is read after any update so the new turn is included
    return reply, session_memory.get_history(session_id)
def handle_image_upload(
    session_memory: "SessionMemory",
    session_id: str,
    image: "Optional[Image.Image]"
) -> str:
    """
    Store an uploaded image in session memory and report the outcome.

    Args:
        session_memory: Session storage
        session_id: Current session identifier
        image: Uploaded PIL Image, or None when nothing was provided

    Returns:
        A status message: a warning when no image was given or the image
        could not be stored, otherwise a success confirmation
    """
    if image is None:
        return "⚠️ No image uploaded."
    session_memory.store_image(session_id, image)
    # store_image() returns nothing and ignores unknown session IDs, so read
    # the image back instead of assuming the store succeeded.
    if session_memory.get_image(session_id) is None:
        return "⚠️ Session not found. Please reload the page and upload the image again."
    return "✅ Image uploaded successfully! You can now ask questions about this image."
def reset_conversation(
    session_memory: SessionMemory,
    session_id: str
) -> Tuple[str, List, None]:
    """
    Reset the conversation (clear the session's image and history).

    Args:
        session_memory: Session storage
        session_id: Current session identifier

    Returns:
        (status_message, empty_history, None_for_image)
    """
    session_memory.reset_session(session_id)
    return "🔄 Conversation reset. Please upload a new image.", [], None
| # ============================================================================ | |
| # GRADIO UI INTERFACE | |
| # ============================================================================ | |
def format_history_for_chatbot(history: List[Tuple[str, str]]) -> List[dict]:
    """
    Flatten (question, answer) tuples into role/content message dicts.

    Each tuple yields two consecutive entries: a "user" message holding
    the question, then an "assistant" message holding the answer.
    """
    return [
        {"role": speaker, "content": text}
        for asked, replied in history
        for speaker, text in (("user", asked), ("assistant", replied))
    ]
def create_gradio_interface(vqa_engine: VisualQAEngine, session_memory: SessionMemory) -> gr.Blocks:
    """
    Create the Gradio UI for the Visual Conversational Intelligence Engine.

    UI Components:
    - Image upload
    - Question input
    - Chat history display
    - Reset button

    Args:
        vqa_engine: Visual QA inference engine used to answer questions
        session_memory: Session storage shared by all event handlers

    Returns:
        The assembled (not yet launched) gr.Blocks app
    """
    with gr.Blocks(title="Visual Conversational Intelligence Engine") as demo:
        # Session state (hidden). It starts empty and is filled by the load
        # event wired below, so each page load gets its own session ID.
        # Creating the session here instead would give every visitor the same
        # ID, and therefore the same stored image and history.
        session_id = gr.State(value=None)

        # Header
        gr.Markdown("""
        # 🔍 Visual Conversational Intelligence Engine
        **An image-grounded multi-turn conversational system**
        ### How to use:
        1. **Upload an image** (required)
        2. **Ask questions** about the image
        3. **Continue the conversation** - ask follow-up questions without re-uploading
        4. **Reset** to start over with a new image
        ### Important:
        - All answers are strictly grounded in the uploaded image
        - Questions unrelated to the image will be politely declined
        - The system uses BLIP (Vision-Language Model) for inference
        """)

        with gr.Row():
            with gr.Column(scale=1):
                # Image upload section
                gr.Markdown("### 📤 Step 1: Upload Image")
                image_input = gr.Image(
                    type="pil",
                    label="Upload an image to analyze",
                    height=300
                )
                upload_status = gr.Textbox(
                    label="Upload Status",
                    interactive=False,
                    lines=1
                )
                # Upload button
                upload_btn = gr.Button("📥 Upload Image", variant="primary")

            with gr.Column(scale=1):
                # Question and conversation section
                gr.Markdown("### 💬 Step 2: Ask Questions")
                chatbot = gr.Chatbot(
                    label="Conversation History",
                    height=300
                )
                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask a question about the uploaded image...",
                    lines=2
                )
                with gr.Row():
                    submit_btn = gr.Button("🔍 Ask Question", variant="primary")
                    reset_btn = gr.Button("🔄 Reset Conversation", variant="secondary")

        # Event handlers
        def upload_image_handler(image, session_id):
            """Store the uploaded image and return the status message."""
            return handle_image_upload(session_memory, session_id, image)

        def ask_question_handler(question, session_id):
            """Answer a question, or surface the validation error to the user."""
            stored_image = session_memory.get_image(session_id)
            is_valid, error_msg = validate_question(question, stored_image)
            if not is_valid:
                # Show the reason instead of silently dropping it, and keep
                # the typed question so the user does not have to retype it.
                gr.Warning(error_msg)
                unchanged = session_memory.get_history(session_id)
                return format_history_for_chatbot(unchanged), question
            _, history = process_question(
                vqa_engine, session_memory, session_id, question
            )
            # Return updated history and clear input
            return format_history_for_chatbot(history), ""

        def reset_handler(session_id):
            """Clear the session and blank the status, chat and image widgets."""
            status, _, image = reset_conversation(session_memory, session_id)
            return status, [], image

        # Wire up events
        demo.load(
            fn=session_memory.create_session,
            inputs=None,
            outputs=[session_id]
        )
        upload_btn.click(
            fn=upload_image_handler,
            inputs=[image_input, session_id],
            outputs=[upload_status]
        )
        submit_btn.click(
            fn=ask_question_handler,
            inputs=[question_input, session_id],
            outputs=[chatbot, question_input]
        )
        question_input.submit(
            fn=ask_question_handler,
            inputs=[question_input, session_id],
            outputs=[chatbot, question_input]
        )
        reset_btn.click(
            fn=reset_handler,
            inputs=[session_id],
            outputs=[upload_status, chatbot, image_input]
        )

        # Footer
        gr.Markdown("""
        ---
        **Academic Prototype** | Demonstrates AI system design for visual question answering
        **Tech Stack:** Python • HuggingFace BLIP • Gradio • Session-based Memory
        """)
    return demo
| # ============================================================================ | |
| # MAIN APPLICATION ENTRY POINT | |
| # ============================================================================ | |
def main():
    """
    Initialize and launch the Visual Conversational Intelligence Engine.

    Builds the VQA engine, the session store and the Gradio interface in
    that order, printing progress along the way, then starts the server
    on 0.0.0.0:7860.
    """
    print("=" * 60)
    print("VISUAL CONVERSATIONAL INTELLIGENCE ENGINE")
    print("=" * 60)

    # Initialize core components
    print("\n[1/3] Initializing Vision-Language Model...")
    vqa_engine = VisualQAEngine(model_name="Salesforce/blip-vqa-base")

    print("\n[2/3] Setting up session memory...")
    session_memory = SessionMemory()

    print("\n[3/3] Creating Gradio interface...")
    demo = create_gradio_interface(vqa_engine, session_memory)

    print("\n" + "=" * 60)
    print("🚀 Launching application...")
    print("=" * 60)

    # Launch the application
    demo.launch(
        share=False,  # Set to True for public sharing
        server_name="0.0.0.0",  # Allow external access
        server_port=7860  # Standard Gradio port
    )


# Run the app only when executed as a script, not when imported.
if __name__ == "__main__":
    main()