Spaces:

shukdevdattaEX
/

Gemma-3n-Multi-modal-chatbot

Sleeping

App Files Files Community

shukdevdattaEX committed on Jul 19

Commit

a660b93

verified ·

1 Parent(s): dcec17f

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -14

app.py CHANGED Viewed

@@ -80,10 +80,39 @@ class MultimodalChatbot:
         except Exception as e:
             return f"Error transcribing audio: {str(e)}"
     def create_multimodal_message(self,
                                 text_input: str = "",
                                 pdf_file=None,
-                                audio_file=None) -> dict:
         """Create a multimodal message for the API"""
         content_parts = []
         processing_info = []
@@ -107,12 +136,21 @@ class MultimodalChatbot:
             })
             processing_info.append("🎤 Audio transcribed")
         return {"role": "user", "content": content_parts}, processing_info
     def chat(self,
              text_input: str = "",
              pdf_file=None,
              audio_file=None,
              history: List[Tuple[str, str]] = None) -> Tuple[List[Tuple[str, str]], str]:
         """Main chat function"""
         if history is None:
@@ -126,11 +164,13 @@ class MultimodalChatbot:
                 user_message_parts.append("📄 PDF uploaded")
             if audio_file:
                 user_message_parts.append("🎤 Audio uploaded")
             user_display = " | ".join(user_message_parts)
             user_message, processing_info = self.create_multimodal_message(
-                text_input, pdf_file, audio_file
             )
             if processing_info:
@@ -168,7 +208,7 @@ def create_interface():
         This chatbot can process multiple types of input:
         - **Text**: Regular text messages
         - **PDF**: Extract and analyze document content
-        - **Audio**: Transcribe speech to text (supports WAV, MP3, M4A, FLAC)
         **Setup**: Enter your OpenRouter API key below to get started
         """)
@@ -239,6 +279,11 @@ def create_interface():
                             file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
                             type="filepath"
                         )
                         audio_text_input = gr.Textbox(
                             label="💬 Question about Audio",
                             placeholder="Ask something about the audio...",
@@ -273,6 +318,11 @@ def create_interface():
                             file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
                             type="filepath"
                         )
                         combined_submit_btn = gr.Button("🚀 Send All", variant="primary", size="lg", interactive=False)
                         combined_clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
@@ -317,7 +367,7 @@ def create_interface():
             chatbot = MultimodalChatbot(api_key.strip())
             return chatbot.chat(text_input=text, pdf_file=pdf, history=history)
-        def process_audio_input(api_key, audio, text, history):
             if not api_key or len(api_key.strip()) == 0:
                 if history is None:
                     history = []
@@ -325,9 +375,9 @@ def create_interface():
                 return history, ""
             chatbot = MultimodalChatbot(api_key.strip())
-            return chatbot.chat(text_input=text, audio_file=audio, history=history)
-        def process_combined_input(api_key, text, pdf, audio, history):
             if not api_key or len(api_key.strip()) == 0:
                 if history is None:
                     history = []
@@ -335,14 +385,17 @@ def create_interface():
                 return history, ""
             chatbot = MultimodalChatbot(api_key.strip())
-            return chatbot.chat(text, pdf, audio, history)
         def clear_chat():
             return [], ""
-        def clear_all_inputs():
             return [], "", None, None
         api_key_input.change(
             validate_api_key,
             inputs=[api_key_input],
@@ -370,20 +423,21 @@ def create_interface():
         audio_submit_btn.click(
             process_audio_input,
-            inputs=[api_key_input, audio_input, audio_text_input, audio_chatbot],
             outputs=[audio_chatbot, audio_text_input]
         )
-        audio_clear_btn.click(lambda: ([], "", None), outputs=[audio_chatbot, audio_text_input, audio_input])
         combined_submit_btn.click(
             process_combined_input,
             inputs=[api_key_input, combined_text_input, combined_pdf_input,
-                   combined_audio_input, combined_chatbot],
             outputs=[combined_chatbot, combined_text_input]
         )
         combined_clear_btn.click(clear_all_inputs,
                                outputs=[combined_chatbot, combined_text_input,
-                                      combined_pdf_input, combined_audio_input])
         gr.Markdown("""
         ### 🎯 How to Use Each Tab:
@@ -392,8 +446,9 @@ def create_interface():
         **📄 PDF Chat**: Upload a PDF and ask questions about its content
-        **🎤 Audio Chat**: Upload audio files for transcription and analysis
-        - Supports: WAV, MP3, M4A, FLAC, OGG formats
         - Best results with clear speech and minimal background noise
         **🌟 Combined Chat**: Use multiple input types together for comprehensive analysis
@@ -408,6 +463,7 @@ def create_interface():
         ### ⚠️ Current Limitations:
         - Audio transcription requires internet connection for best results
         - Large files may take longer to process
         """)
     return demo

         except Exception as e:
             return f"Error transcribing audio: {str(e)}"
+    def transcribe_recorded_audio(self, audio_data) -> str:
+        """Transcribe recorded audio to text"""
+        try:
+            recognizer = sr.Recognizer()
+            wav_path = tempfile.mktemp(suffix='.wav')
+            # Convert raw audio data to WAV
+            audio = AudioSegment.from_file(io.BytesIO(audio_data), format="wav")
+            audio.export(wav_path, format="wav", parameters=["-ac", "1", "-ar", "16000"])
+            with sr.AudioFile(wav_path) as source:
+                recognizer.adjust_for_ambient_noise(source, duration=0.2)
+                audio_data = recognizer.record(source)
+                try:
+                    text = recognizer.recognize_google(audio_data)
+                    return text
+                except sr.UnknownValueError:
+                    return "Could not understand the recorded audio. Please try with clearer audio."
+                except sr.RequestError as e:
+                    try:
+                        text = recognizer.recognize_sphinx(audio_data)
+                        return text
+                    except:
+                        return f"Speech recognition service error: {str(e)}"
+        except Exception as e:
+            return f"Error transcribing recorded audio: {str(e)}"
     def create_multimodal_message(self,
                                 text_input: str = "",
                                 pdf_file=None,
+                                audio_file=None,
+                                recorded_audio=None) -> dict:
         """Create a multimodal message for the API"""
         content_parts = []
         processing_info = []
             })
             processing_info.append("🎤 Audio transcribed")
+        if recorded_audio is not None:
+            audio_text = self.transcribe_recorded_audio(recorded_audio)
+            content_parts.append({
+                "type": "text",
+                "text": f"Recorded Audio Transcription:\n{audio_text}"
+            })
+            processing_info.append("🎙️ Recorded audio transcribed")
         return {"role": "user", "content": content_parts}, processing_info
     def chat(self,
              text_input: str = "",
              pdf_file=None,
              audio_file=None,
+             recorded_audio=None,
              history: List[Tuple[str, str]] = None) -> Tuple[List[Tuple[str, str]], str]:
         """Main chat function"""
         if history is None:
                 user_message_parts.append("📄 PDF uploaded")
             if audio_file:
                 user_message_parts.append("🎤 Audio uploaded")
+            if recorded_audio:
+                user_message_parts.append("🎙️ Recorded audio")
             user_display = " | ".join(user_message_parts)
             user_message, processing_info = self.create_multimodal_message(
+                text_input, pdf_file, audio_file, recorded_audio
             )
             if processing_info:
         This chatbot can process multiple types of input:
         - **Text**: Regular text messages
         - **PDF**: Extract and analyze document content
+        - **Audio**: Transcribe speech to text (supports WAV, MP3, M4A, FLAC, recorded audio)
         **Setup**: Enter your OpenRouter API key below to get started
         """)
                             file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
                             type="filepath"
                         )
+                        audio_recorder = gr.Audio(
+                            label="🎙️ Record Audio",
+                            source="microphone",
+                            type="numpy"
+                        )
                         audio_text_input = gr.Textbox(
                             label="💬 Question about Audio",
                             placeholder="Ask something about the audio...",
                             file_types=[".wav", ".mp3", ".m4a", ".flac", ".ogg"],
                             type="filepath"
                         )
+                        combined_audio_recorder = gr.Audio(
+                            label="🎙️ Record Audio",
+                            source="microphone",
+                            type="numpy"
+                        )
                         combined_submit_btn = gr.Button("🚀 Send All", variant="primary", size="lg", interactive=False)
                         combined_clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
             chatbot = MultimodalChatbot(api_key.strip())
             return chatbot.chat(text_input=text, pdf_file=pdf, history=history)
+        def process_audio_input(api_key, audio, recorded_audio, text, history):
             if not api_key or len(api_key.strip()) == 0:
                 if history is None:
                     history = []
                 return history, ""
             chatbot = MultimodalChatbot(api_key.strip())
+            return chatbot.chat(text_input=text, audio_file=audio, recorded_audio=recorded_audio, history=history)
+        def process_combined_input(api_key, text, pdf, audio, recorded_audio, history):
             if not api_key or len(api_key.strip()) == 0:
                 if history is None:
                     history = []
                 return history, ""
             chatbot = MultimodalChatbot(api_key.strip())
+            return chatbot.chat(text, pdf, audio, recorded_audio, history)
         def clear_chat():
             return [], ""
+        def clear_audio_inputs():
             return [], "", None, None
+        def clear_all_inputs():
+            return [], "", None, None, None
         api_key_input.change(
             validate_api_key,
             inputs=[api_key_input],
         audio_submit_btn.click(
             process_audio_input,
+            inputs=[api_key_input, audio_input, audio_recorder, audio_text_input, audio_chatbot],
             outputs=[audio_chatbot, audio_text_input]
         )
+        audio_clear_btn.click(clear_audio_inputs, outputs=[audio_chatbot, audio_text_input, audio_input, audio_recorder])
         combined_submit_btn.click(
             process_combined_input,
             inputs=[api_key_input, combined_text_input, combined_pdf_input,
+                   combined_audio_input, combined_audio_recorder, combined_chatbot],
             outputs=[combined_chatbot, combined_text_input]
         )
         combined_clear_btn.click(clear_all_inputs,
                                outputs=[combined_chatbot, combined_text_input,
+                                      combined_pdf_input, combined_audio_input,
+                                      combined_audio_recorder])
         gr.Markdown("""
         ### 🎯 How to Use Each Tab:
         **📄 PDF Chat**: Upload a PDF and ask questions about its content
+        **🎤 Audio Chat**: Upload or record audio files for transcription and analysis
+        - Supports: WAV, MP3, M4A, FLAC, OGG formats for uploads
+        - Recorded audio is processed directly from your microphone
         - Best results with clear speech and minimal background noise
         **🌟 Combined Chat**: Use multiple input types together for comprehensive analysis
         ### ⚠️ Current Limitations:
         - Audio transcription requires internet connection for best results
         - Large files may take longer to process
+        - Recorded audio quality depends on your microphone
         """)
     return demo