Update app.py
app.py CHANGED
@@ -11,6 +11,8 @@ import cv2
 import numpy as np
 from typing import List, Tuple, Optional
 import json
+import pydub
+from pydub import AudioSegment
 
 class MultimodalChatbot:
     def __init__(self, api_key: str):
@@ -23,15 +25,21 @@ class MultimodalChatbot:
 
     def encode_image_to_base64(self, image) -> str:
         """Convert PIL Image to base64 string"""
-        … (9 deleted lines)
+        try:
+            if isinstance(image, str):
+                # If it's a file path
+                with open(image, "rb") as img_file:
+                    return base64.b64encode(img_file.read()).decode('utf-8')
+            else:
+                # If it's a PIL Image
+                buffered = io.BytesIO()
+                # Convert to RGB if it's RGBA
+                if image.mode == 'RGBA':
+                    image = image.convert('RGB')
+                image.save(buffered, format="JPEG", quality=85)
+                return base64.b64encode(buffered.getvalue()).decode('utf-8')
+        except Exception as e:
+            return f"Error encoding image: {str(e)}"
 
     def extract_pdf_text(self, pdf_file) -> str:
         """Extract text from PDF file"""
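One detail worth flagging in the new encoder: failures come back as an error *string* rather than an exception, so callers have to check for the `"Error"` prefix (as `process_video` does below). A minimal round-trip sketch of the same base64 scheme, assuming only Pillow; the test image is made up:

```python
import base64
import io

from PIL import Image

img = Image.new("RGB", (64, 64), "red")  # hypothetical test image
buf = io.BytesIO()
img.save(buf, format="JPEG", quality=85)
b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

# Decoding the string back should yield the same-sized image
restored = Image.open(io.BytesIO(base64.b64decode(b64)))
assert restored.size == (64, 64)
```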
@@ -45,30 +53,70 @@ class MultimodalChatbot:
             text = ""
             with open(pdf_path, 'rb') as file:
                 pdf_reader = PyPDF2.PdfReader(file)
-                for page in pdf_reader.pages:
-
-
+                for page_num, page in enumerate(pdf_reader.pages):
+                    page_text = page.extract_text()
+                    if page_text.strip():
+                        text += f"Page {page_num + 1}:\n{page_text}\n\n"
+            return text.strip() if text.strip() else "No text could be extracted from this PDF."
         except Exception as e:
             return f"Error extracting PDF: {str(e)}"
 
-    def transcribe_audio(self, audio_file) -> str:
-        """
+    def convert_audio_to_wav(self, audio_file) -> str:
+        """Convert audio file to WAV format for speech recognition"""
         try:
-            recognizer = sr.Recognizer()
-
             if hasattr(audio_file, 'name'):
                 audio_path = audio_file.name
             else:
                 audio_path = audio_file
-
-
+
+            # Get file extension
+            file_ext = os.path.splitext(audio_path)[1].lower()
+
+            # If already WAV, return as is
+            if file_ext == '.wav':
+                return audio_path
+
+            # Convert to WAV using pydub
+            audio = AudioSegment.from_file(audio_path)
+            # Export as WAV with proper settings for speech recognition
+            wav_path = tempfile.mktemp(suffix='.wav')
+            audio.export(wav_path, format="wav", parameters=["-ac", "1", "-ar", "16000"])
+            return wav_path
+
+        except Exception as e:
+            raise Exception(f"Error converting audio: {str(e)}")
+
+    def transcribe_audio(self, audio_file) -> str:
+        """Transcribe audio file to text"""
+        try:
+            recognizer = sr.Recognizer()
+
+            # Convert audio to WAV format
+            wav_path = self.convert_audio_to_wav(audio_file)
+
+            with sr.AudioFile(wav_path) as source:
+                # Adjust for ambient noise
+                recognizer.adjust_for_ambient_noise(source, duration=0.2)
                 audio_data = recognizer.record(source)
-
-
+
+            # Try Google Speech Recognition
+            try:
+                text = recognizer.recognize_google(audio_data)
+                return text
+            except sr.UnknownValueError:
+                return "Could not understand the audio. Please try with clearer audio."
+            except sr.RequestError as e:
+                # Fallback to offline recognition if available
+                try:
+                    text = recognizer.recognize_sphinx(audio_data)
+                    return text
+                except:
+                    return f"Speech recognition service error: {str(e)}"
+
         except Exception as e:
             return f"Error transcribing audio: {str(e)}"
 
-    def process_video(self, video_file) -> List[str]:
+    def process_video(self, video_file) -> Tuple[List[str], str]:
         """Extract frames from video and convert to base64"""
         try:
             if hasattr(video_file, 'name'):
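Two caveats on the audio path worth noting: `tempfile.mktemp` is deprecated and race-prone (another process can claim the name between creation and use), and the `recognize_sphinx` fallback only works if the optional `pocketsphinx` package is installed. A safer variant of the conversion step, keeping the same pydub calls (assumes pydub plus an ffmpeg binary on PATH, as the diff does):

```python
import tempfile

from pydub import AudioSegment

def convert_audio_to_wav_safe(audio_path: str) -> str:
    """Same conversion as above, but with a securely created temp file."""
    audio = AudioSegment.from_file(audio_path)
    # delete=False keeps the file around for speech_recognition to reopen
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    # mono, 16 kHz: the settings the diff passes through to ffmpeg
    audio.export(wav_path, format="wav", parameters=["-ac", "1", "-ar", "16000"])
    return wav_path
```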
@@ -77,24 +125,43 @@ class MultimodalChatbot:
                 video_path = video_file
 
             cap = cv2.VideoCapture(video_path)
+            if not cap.isOpened():
+                return [], "Error: Could not open video file"
+
             frames = []
+            frame_descriptions = []
             frame_count = 0
+            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+            fps = cap.get(cv2.CAP_PROP_FPS)
+
+            # Extract frames (every 60 frames or every 2 seconds)
+            frame_interval = max(60, int(fps * 2)) if fps > 0 else 60
 
-            while cap.read()[0] and frame_count < 10: # Limit to 10 frames
+            while cap.read()[0] and len(frames) < 5: # Limit to 5 frames
                 ret, frame = cap.read()
-                if ret and frame_count %
+                if ret and frame_count % frame_interval == 0:
                     # Convert BGR to RGB
                     rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                     pil_image = Image.fromarray(rgb_frame)
+
+                    # Resize image to reduce size
+                    pil_image.thumbnail((800, 600), Image.Resampling.LANCZOS)
+
                     base64_frame = self.encode_image_to_base64(pil_image)
-
+                    if not base64_frame.startswith("Error"):
+                        frames.append(base64_frame)
+                        timestamp = frame_count / fps if fps > 0 else frame_count
+                        frame_descriptions.append(f"Frame at {timestamp:.1f}s")
+
                 frame_count += 1
 
             cap.release()
-
+
+            description = f"Video processed: {len(frames)} frames extracted from {total_frames} total frames"
+            return frames, description
+
         except Exception as e:
-            return [f"Error processing video: {str(e)}"]
+            return [], f"Error processing video: {str(e)}"
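One behavioral quirk in this loop: `cap.read()` is called both in the `while` condition and in the body, so every other frame is silently dropped before the `if` ever sees it. A sketch of the same sampling with a single read per iteration (same OpenCV calls, `frame_interval` computed as in the diff):

```python
import cv2

def sample_frames(video_path: str, frame_interval: int, max_frames: int = 5):
    """Grab every frame_interval-th frame, reading each frame exactly once."""
    cap = cv2.VideoCapture(video_path)
    frames, frame_count = [], 0
    while len(frames) < max_frames:
        ret, frame = cap.read()  # single read per loop iteration
        if not ret:
            break
        if frame_count % frame_interval == 0:
            frames.append(frame)
        frame_count += 1
    cap.release()
    return frames
```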
@@ -105,6 +172,7 @@ class MultimodalChatbot:
         """Create a multimodal message for the API"""
 
         content_parts = []
+        processing_info = []
 
         # Add text content
         if text_input:
@@ -117,6 +185,7 @@ class MultimodalChatbot:
                 "type": "text",
                 "text": f"PDF Content:\n{pdf_text}"
             })
+            processing_info.append("📄 PDF processed")
 
         # Process Audio
         if audio_file is not None:
@@ -125,30 +194,35 @@ class MultimodalChatbot:
                 "type": "text",
                 "text": f"Audio Transcription:\n{audio_text}"
             })
+            processing_info.append("🎤 Audio transcribed")
 
-        # Process Image
+        # Process Image - Use text-only approach since vision isn't supported
         if image_file is not None:
-
-            content_parts.append({
-                "type": "image_url",
-                "image_url": {
-                    "url": f"data:image/png;base64,{image_base64}"
-                }
-            })
+            # Since vision isn't supported, we'll describe what we can about the image
+            if hasattr(image_file, 'size'):
+                width, height = image_file.size
+                mode = image_file.mode
+                content_parts.append({
+                    "type": "text",
+                    "text": f"Image uploaded: {width}x{height} pixels, mode: {mode}. Note: Visual analysis not available with current model configuration."
+                })
+            else:
+                content_parts.append({
+                    "type": "text",
+                    "text": "Image uploaded. Note: Visual analysis not available with current model configuration."
+                })
+            processing_info.append("🖼️ Image received (metadata only)")
 
-        if video_file is not None:
-            video_frames = self.process_video(video_file)
-            for i, frame_base64 in enumerate(video_frames):
-                if not frame_base64.startswith("Error"):
-                    content_parts.append({
-                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image/png;base64,{frame_base64}"
-                        }
-                    })
-
-        return {"role": "user", "content": content_parts}
+        # Process Video - Use text-only approach since vision isn't supported
+        if video_file is not None:
+            frames, video_desc = self.process_video(video_file)
+            content_parts.append({
+                "type": "text",
+                "text": f"Video uploaded: {video_desc}. Note: Visual analysis not available with current model configuration."
+            })
+            processing_info.append("🎥 Video processed (metadata only)")
 
+        return {"role": "user", "content": content_parts}, processing_info
 
     def chat(self,
              text_input: str = "",
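Since every modality is now flattened into `"type": "text"` parts, the payload this method builds is plain chat-completions format with no image parts at all. For reference, the shape a combined call produces (values below are illustrative, not from the diff):

```python
# Illustrative shape only; the strings are made up.
message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Summarize the attachments."},
        {"type": "text", "text": "PDF Content:\nPage 1:\n..."},
        {"type": "text", "text": "Audio Transcription:\n..."},
    ],
}
processing_info = ["📄 PDF processed", "🎤 Audio transcribed"]
```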
@@ -179,10 +253,14 @@ class MultimodalChatbot:
             user_display = " | ".join(user_message_parts)
 
             # Create multimodal message
-            user_message = self.create_multimodal_message(
+            user_message, processing_info = self.create_multimodal_message(
                 text_input, pdf_file, audio_file, image_file, video_file
             )
 
+            # Add processing info to display
+            if processing_info:
+                user_display += f"\n{' | '.join(processing_info)}"
+
             # Add to conversation history
             messages = [user_message]
 
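Despite the `# Add to conversation history` comment, `messages = [user_message]` sends only the current turn, so the model never sees earlier exchanges. If multi-turn context is wanted, the Gradio history would need folding in first; a sketch, assuming `history` is the list of `(user, assistant)` string pairs that `gr.Chatbot` uses here:

```python
# Fold prior turns into the payload before appending the new message.
messages = []
for user_turn, bot_turn in (history or []):
    messages.append({"role": "user", "content": user_turn})
    messages.append({"role": "assistant", "content": bot_turn})
messages.append(user_message)
```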
@@ -194,7 +272,7 @@ class MultimodalChatbot:
                 },
                 model=self.model,
                 messages=messages,
-                max_tokens=
+                max_tokens=2048,
                 temperature=0.7
             )
 
@@ -213,9 +291,6 @@
 def create_interface():
     """Create the Gradio interface"""
 
-    # Chatbot will be initialized when API key is provided
-    chatbot = None
-
     with gr.Blocks(title="Multimodal Chatbot with Gemma 3n", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
         # 🤖 Multimodal Chatbot with Gemma 3n
@@ -223,9 +298,9 @@ def create_interface():
         This chatbot can process multiple types of input:
         - **Text**: Regular text messages
         - **PDF**: Extract and analyze document content
-        - **Audio**: Transcribe speech to text
-        - **Images**:
-        - **Video**:
+        - **Audio**: Transcribe speech to text (supports WAV, MP3, M4A, FLAC)
+        - **Images**: Upload images (metadata analysis only due to model limitations)
+        - **Video**: Upload videos (metadata analysis only due to model limitations)
 
         **Setup**: Enter your OpenRouter API key below to get started
         """)
@@ -245,53 +320,175 @@ def create_interface():
                     interactive=False
                 )
 
-        … (42 deleted lines)
+        # Tabbed Interface
+        with gr.Tabs():
+            # Text Chat Tab
+            with gr.TabItem("💬 Text Chat"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        text_input = gr.Textbox(
+                            label="💬 Text Input",
+                            placeholder="Type your message here...",
+                            lines=5
+                        )
+                        text_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
+                        text_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
+
+                    with gr.Column(scale=2):
+                        text_chatbot = gr.Chatbot(
+                            label="Text Chat History",
+                            height=600,
+                            bubble_full_width=False,
+                            show_copy_button=True
+                        )
+
+            # PDF Chat Tab
+            with gr.TabItem("📄 PDF Chat"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        pdf_input = gr.File(
+                            label="📄 PDF Upload",
+                            file_types=[".pdf"],
+                            type="filepath"
+                        )
+                        pdf_text_input = gr.Textbox(
+                            label="💬 Question about PDF",
+                            placeholder="Ask something about the PDF...",
+                            lines=3
+                        )
+                        pdf_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
+                        pdf_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
+
+                    with gr.Column(scale=2):
+                        pdf_chatbot = gr.Chatbot(
+                            label="PDF Chat History",
+                            height=600,
+                            bubble_full_width=False,
+                            show_copy_button=True
+                        )
+
+            # Audio Chat Tab
+            with gr.TabItem("🎤 Audio Chat"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        audio_input = gr.File(
+                            label="🎤 Audio Upload",
+                            file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
+                            type="filepath"
+                        )
+                        audio_text_input = gr.Textbox(
+                            label="💬 Question about Audio",
+                            placeholder="Ask something about the audio...",
+                            lines=3
+                        )
+                        audio_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
+                        audio_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
+
+                    with gr.Column(scale=2):
+                        audio_chatbot = gr.Chatbot(
+                            label="Audio Chat History",
+                            height=600,
+                            bubble_full_width=False,
+                            show_copy_button=True
+                        )
+
+            # Image Chat Tab
+            with gr.TabItem("🖼️ Image Chat"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        image_input = gr.Image(
+                            label="🖼️ Image Upload",
+                            type="pil"
+                        )
+                        image_text_input = gr.Textbox(
+                            label="💬 Question about Image",
+                            placeholder="Ask something about the image...",
+                            lines=3
+                        )
+                        image_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
+                        image_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
+
+                    with gr.Column(scale=2):
+                        image_chatbot = gr.Chatbot(
+                            label="Image Chat History",
+                            height=600,
+                            bubble_full_width=False,
+                            show_copy_button=True
+                        )
+
+            # Video Chat Tab
+            with gr.TabItem("🎥 Video Chat"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        video_input = gr.File(
+                            label="🎥 Video Upload",
+                            file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
+                            type="filepath"
+                        )
+                        video_text_input = gr.Textbox(
+                            label="💬 Question about Video",
+                            placeholder="Ask something about the video...",
+                            lines=3
+                        )
+                        video_submit_btn = gr.Button("🚀 Send", variant="primary", size="lg", interactive=False)
+                        video_clear_btn = gr.Button("🗑️ Clear", variant="secondary")
+
+                    with gr.Column(scale=2):
+                        video_chatbot = gr.Chatbot(
+                            label="Video Chat History",
+                            height=600,
+                            bubble_full_width=False,
+                            show_copy_button=True
+                        )
+
+            # Combined Chat Tab
+            with gr.TabItem("🔄 Combined Chat"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        combined_text_input = gr.Textbox(
+                            label="💬 Text Input",
+                            placeholder="Type your message here...",
+                            lines=3
+                        )
+
+                        combined_pdf_input = gr.File(
+                            label="📄 PDF Upload",
+                            file_types=[".pdf"],
+                            type="filepath"
+                        )
+
+                        combined_audio_input = gr.File(
+                            label="🎤 Audio Upload",
+                            file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
+                            type="filepath"
+                        )
+
+                        combined_image_input = gr.Image(
+                            label="🖼️ Image Upload",
+                            type="pil"
+                        )
+
+                        combined_video_input = gr.File(
+                            label="🎥 Video Upload",
+                            file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
+                            type="filepath"
+                        )
+
+                        combined_submit_btn = gr.Button("🚀 Send All", variant="primary", size="lg", interactive=False)
+                        combined_clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
+
+                    with gr.Column(scale=2):
+                        combined_chatbot = gr.Chatbot(
+                            label="Combined Chat History",
+                            height=600,
+                            bubble_full_width=False,
+                            show_copy_button=True
+                        )
 
         # Event handlers
         def validate_api_key(api_key):
             if not api_key or len(api_key.strip()) == 0:
-                return "❌ API Key not provided", gr.update(interactive=False)
+                return "❌ API Key not provided", *[gr.update(interactive=False) for _ in range(6)]
 
             try:
                 # Test the API key by creating a client
@@ -299,60 +496,157 @@ def create_interface():
                     base_url="https://openrouter.ai/api/v1",
                     api_key=api_key.strip(),
                 )
-                return "✅ API Key validated successfully", gr.update(interactive=True)
+                return "✅ API Key validated successfully", *[gr.update(interactive=True) for _ in range(6)]
             except Exception as e:
-                return f"❌ API Key validation failed: {str(e)}", gr.update(interactive=False)
+                return f"❌ API Key validation failed: {str(e)}", *[gr.update(interactive=False) for _ in range(6)]
 
-        def
+        def process_text_input(api_key, text, history):
+            if not api_key or len(api_key.strip()) == 0:
+                if history is None:
+                    history = []
+                history.append(("Error", "❌ Please provide a valid API key first"))
+                return history, ""
+
+            chatbot = MultimodalChatbot(api_key.strip())
+            return chatbot.chat(text_input=text, history=history)
+
+        def process_pdf_input(api_key, pdf, text, history):
+            if not api_key or len(api_key.strip()) == 0:
+                if history is None:
+                    history = []
+                history.append(("Error", "❌ Please provide a valid API key first"))
+                return history, ""
+
+            chatbot = MultimodalChatbot(api_key.strip())
+            return chatbot.chat(text_input=text, pdf_file=pdf, history=history)
+
+        def process_audio_input(api_key, audio, text, history):
+            if not api_key or len(api_key.strip()) == 0:
+                if history is None:
+                    history = []
+                history.append(("Error", "❌ Please provide a valid API key first"))
+                return history, ""
+
+            chatbot = MultimodalChatbot(api_key.strip())
+            return chatbot.chat(text_input=text, audio_file=audio, history=history)
+
+        def process_image_input(api_key, image, text, history):
+            if not api_key or len(api_key.strip()) == 0:
+                if history is None:
+                    history = []
+                history.append(("Error", "❌ Please provide a valid API key first"))
+                return history, ""
+
+            chatbot = MultimodalChatbot(api_key.strip())
+            return chatbot.chat(text_input=text, image_file=image, history=history)
+
+        def process_video_input(api_key, video, text, history):
+            if not api_key or len(api_key.strip()) == 0:
+                if history is None:
+                    history = []
+                history.append(("Error", "❌ Please provide a valid API key first"))
+                return history, ""
+
+            chatbot = MultimodalChatbot(api_key.strip())
+            return chatbot.chat(text_input=text, video_file=video, history=history)
+
+        def process_combined_input(api_key, text, pdf, audio, image, video, history):
             if not api_key or len(api_key.strip()) == 0:
                 if history is None:
                     history = []
                 history.append(("Error", "❌ Please provide a valid API key first"))
                 return history, ""
 
-            # Initialize chatbot with the provided API key
             chatbot = MultimodalChatbot(api_key.strip())
             return chatbot.chat(text, pdf, audio, image, video, history)
 
-        def
+        def clear_chat():
+            return [], ""
+
+        def clear_all_inputs():
             return [], "", None, None, None, None
 
         # API Key validation
         api_key_input.change(
             validate_api_key,
             inputs=[api_key_input],
-            outputs=[api_status,
+            outputs=[api_status, text_submit_btn, pdf_submit_btn, audio_submit_btn,
+                     image_submit_btn, video_submit_btn, combined_submit_btn]
         )
 
-        #
-            inputs=[api_key_input, text_input,
-            outputs=[
-        )
+        # Text chat events
+        text_submit_btn.click(
+            process_text_input,
+            inputs=[api_key_input, text_input, text_chatbot],
+            outputs=[text_chatbot, text_input]
+        )
+        text_input.submit(
+            process_text_input,
+            inputs=[api_key_input, text_input, text_chatbot],
+            outputs=[text_chatbot, text_input]
+        )
+        text_clear_btn.click(clear_chat, outputs=[text_chatbot, text_input])
 
-        )
+        # PDF chat events
+        pdf_submit_btn.click(
+            process_pdf_input,
+            inputs=[api_key_input, pdf_input, pdf_text_input, pdf_chatbot],
+            outputs=[pdf_chatbot, pdf_text_input]
+        )
+        pdf_clear_btn.click(lambda: ([], "", None), outputs=[pdf_chatbot, pdf_text_input, pdf_input])
 
-        #
-            inputs=[api_key_input,
-            outputs=[
-        )
+        # Audio chat events
+        audio_submit_btn.click(
+            process_audio_input,
+            inputs=[api_key_input, audio_input, audio_text_input, audio_chatbot],
+            outputs=[audio_chatbot, audio_text_input]
+        )
+        audio_clear_btn.click(lambda: ([], "", None), outputs=[audio_chatbot, audio_text_input, audio_input])
+
+        # Image chat events
+        image_submit_btn.click(
+            process_image_input,
+            inputs=[api_key_input, image_input, image_text_input, image_chatbot],
+            outputs=[image_chatbot, image_text_input]
+        )
+        image_clear_btn.click(lambda: ([], "", None), outputs=[image_chatbot, image_text_input, image_input])
+
+        # Video chat events
+        video_submit_btn.click(
+            process_video_input,
+            inputs=[api_key_input, video_input, video_text_input, video_chatbot],
+            outputs=[video_chatbot, video_text_input]
+        )
+        video_clear_btn.click(lambda: ([], "", None), outputs=[video_chatbot, video_text_input, video_input])
+
+        # Combined chat events
+        combined_submit_btn.click(
+            process_combined_input,
+            inputs=[api_key_input, combined_text_input, combined_pdf_input,
+                    combined_audio_input, combined_image_input, combined_video_input, combined_chatbot],
+            outputs=[combined_chatbot, combined_text_input]
+        )
+        combined_clear_btn.click(clear_all_inputs,
+                                 outputs=[combined_chatbot, combined_text_input, combined_pdf_input,
+                                          combined_audio_input, combined_image_input, combined_video_input])
+
+        # Examples and Instructions
         gr.Markdown("""
-        ### 🎯
-        … (7 deleted lines)
+        ### 🎯 How to Use Each Tab:
+
+        **💬 Text Chat**: Simple text conversations with the AI
+
+        **📄 PDF Chat**: Upload a PDF and ask questions about its content
+
+        **🎤 Audio Chat**: Upload audio files for transcription and analysis
+        - Supports: WAV, MP3, M4A, FLAC, OGG formats
+        - Best results with clear speech and minimal background noise
+
+        **🖼️ Image Chat**: Upload images (currently metadata only due to model limitations)
+
+        **🎥 Video Chat**: Upload videos (currently metadata only due to model limitations)
+
+        **🔄 Combined Chat**: Use multiple input types together for comprehensive analysis
 
         ### 🔑 Getting an API Key:
         1. Go to [OpenRouter.ai](https://openrouter.ai)
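Note that `validate_api_key` never actually contacts OpenRouter: the OpenAI SDK performs no network I/O when the client is constructed, so any non-empty string passes validation. A real check needs an authenticated request; a sketch, where the model id is an assumption (any model the key can access would do):

```python
from openai import OpenAI

def probe_key(api_key: str) -> bool:
    """Return True only if an authenticated request actually succeeds."""
    client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key=api_key)
    try:
        client.chat.completions.create(
            model="google/gemma-3n-e4b-it:free",  # hypothetical model id
            messages=[{"role": "user", "content": "ping"}],
            max_tokens=1,
        )
        return True
    except Exception:
        return False
```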
@@ -360,6 +654,11 @@ def create_interface():
         3. Navigate to the API Keys section
         4. Create a new API key
         5. Copy and paste it in the field above
+
+        ### ⚠️ Current Limitations:
+        - Image and video visual analysis not supported by the free Gemma 3n model
+        - Audio transcription requires internet connection for best results
+        - Large files may take longer to process
         """)
 
     return demo
@@ -373,20 +672,21 @@ if __name__ == "__main__":
         "Pillow",
         "SpeechRecognition",
         "opencv-python",
-        "numpy"
+        "numpy",
+        "pydub"
     ]
 
     print("🚀 Multimodal Chatbot with Gemma 3n")
     print("=" * 50)
     print("Required packages:", ", ".join(required_packages))
     print("\n📦 To install: pip install " + " ".join(required_packages))
+    print("\n🎤 For audio processing, you may also need:")
+    print("   - ffmpeg (for audio conversion)")
+    print("   - sudo apt-get install espeak espeak-data libespeak1 libespeak-dev (for offline speech recognition)")
     print("\n🔑 Get your API key from: https://openrouter.ai")
    print("💡 Enter your API key in the web interface when it loads")
 
     demo = create_interface()
     demo.launch(
-        share=True
-        server_name="0.0.0.0",
-        server_port=7860,
-        debug=True
+        share=True
     )
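The printed hints can also be enforced at startup: pydub's `AudioSegment.from_file` shells out to the ffmpeg binary, so its absence is worth failing fast on. A minimal check using only the standard library:

```python
import shutil

# pydub delegates decoding to the ffmpeg binary; warn before the UI launches
if shutil.which("ffmpeg") is None:
    print("⚠️ ffmpeg not found on PATH; audio conversion will fail")
```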