Spaces:

gzsol
/

lab2

Sleeping

App Files Files Community

zsolnai committed on Dec 1, 2025

Commit

102e36f

1 Parent(s): 276657d

Fix claude mistake v5

Browse files

Files changed (1) hide show

app.py +268 -118

app.py CHANGED Viewed

@@ -2,90 +2,59 @@ import os
 import tempfile
 import gradio as gr
-# Note: Added numpy/soundfile import which might be needed by TTS/Whisper internally
 import numpy as np
 import soundfile as sf
 import torch
 # --- Device Setup (Explicitly set to CPU) ---
 device = "cpu"
-# --- STT Setup (using Hugging Face's transformers pipeline for Whisper) ---
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 STT_MODEL_NAME = "openai/whisper-tiny.en"
 stt_pipe = pipeline("automatic-speech-recognition", model=STT_MODEL_NAME, device=device)
-# --- LLM Setup (using Hugging Face's transformers for text generation) ---
 LLM_MODEL_NAME = "microsoft/DialoGPT-medium"
 chatbot_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
 chatbot_model = AutoModelForCausalLM.from_pretrained(LLM_MODEL_NAME)
 chatbot_model.to(device)
-# --- TTS Setup (using coqui-ai/TTS) ---
-from TTS.api import TTS
 TTS_MODEL_NAME = "tts_models/en/ljspeech/tacotron2-DDC"
 tts_model = TTS(model_name=TTS_MODEL_NAME, progress_bar=False)
-def speech_to_text(audio_file_path):
-    """Performs Speech-to-Text using the Whisper model."""
-    if audio_file_path is None:
-        return "Please upload an audio file or record your voice."
-    try:
-        result = stt_pipe(audio_file_path)
-        return result["text"]
-    except Exception as e:
-        return f"Error during STT: {e}"
-def text_to_speech(text):
-    """Performs Text-to-Speech using the Coqui TTS model."""
-    if not text:
-        return None, "Please enter text for synthesis."
-    try:
-        # Create a temporary file for each request
-        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-        output_path = temp_file.name
-        temp_file.close()
-        # Generate the speech (slow on CPU)
-        tts_model.tts_to_file(
-            text=text,
-            file_path=output_path,
-        )
-        return output_path, "Speech synthesis complete. (Completed slowly on CPU)"
-    except Exception as e:
-        # Clean up temp file on failure
-        if os.path.exists(output_path):
-            os.remove(output_path)
-        return None, f"Error during TTS: {e}"
 def chat_with_bot(message, history, chat_history_ids=None):
-    """Chat with the conversational AI model using DialoGPT."""
     if not message or not message.strip():
-        # If message is empty, return current history and state
-        return history, chat_history_ids
     try:
-        # Move inputs to CPU (required for consistent CPU-only operation)
         new_input_ids = chatbot_tokenizer.encode(
             message + chatbot_tokenizer.eos_token, return_tensors="pt"
         ).to(device)
-        # Append the new user input tokens to the chat history
         if chat_history_ids is not None:
-            # Ensure history is on the correct device before concatenation
-            bot_input_ids = torch.cat(
-                [chat_history_ids.to(device), new_input_ids], dim=-1
-            )
         else:
             bot_input_ids = new_input_ids
-        # Generate a response
         chat_history_ids = chatbot_model.generate(
             bot_input_ids,
             max_length=1000,
@@ -96,23 +65,92 @@ def chat_with_bot(message, history, chat_history_ids=None):
             top_p=0.95,
         )
-        # Decode the response
         response = chatbot_tokenizer.decode(
-            # Select the response part only
-            chat_history_ids[:, bot_input_ids.shape[-1] :][0],
-            skip_special_tokens=True,
         )
-        # CRITICAL FIX: Append to history in the Gradio Chatbot (list of lists/tuples) format
         history.append((message, response))
-        # Return the updated history for display and the new state for the next turn
-        return history, chat_history_ids
     except Exception as e:
-        # Append error message to history using the correct format
         history.append((message, f"Error: {e}"))
-        return history, chat_history_ids
 # --- Gradio Interface ---
@@ -125,26 +163,153 @@ custom_css = """
     height: 400px;
 }
 """
-# CRITICAL FIX: The 'css' argument must be passed to launch() (it's correct here)
 with gr.Blocks() as demo:
-    gr.Markdown("# 🗣️ STT, TTS & Chat App (CPU Only)")
     gr.Markdown(
-        "**NOTE:** This app is running on CPU-only hardware. Speech-to-Text (Whisper) is fast, but **Text-to-Speech (Coqui TTS) and Chat will be slow**."
     )
-    # Hidden state to store chat history IDs (PyTorch Tensor)
-    chat_state = gr.State(value=None)
-    # Create tabs for different features
     with gr.Tabs():
-        # Tab 1: Chat Interface
-        with gr.TabItem("💬 Chat"):
-            gr.Markdown("## Chat with AI Assistant")
             gr.Markdown(
-                "Have a conversation with the DialoGPT model. It remembers context from your conversation!"
             )
-            # Initialized to an empty list, which Gradio's Chatbot expects
             chatbot = gr.Chatbot(
                 label="Conversation", elem_classes=["chatbot"], value=[]
             )
@@ -153,63 +318,48 @@ with gr.Blocks() as demo:
                 placeholder="Type your message here and press Enter...",
                 lines=2,
             )
             with gr.Row():
                 submit_btn = gr.Button("Send", variant="primary")
                 clear_btn = gr.Button("Clear Chat")
-            # Functionality to handle chat submission
-            # The 'chatbot' component provides the 'history' (list of tuples)
-            fn_call = submit_btn.click(
-                chat_with_bot,
-                inputs=[msg, chatbot, chat_state],
-                outputs=[chatbot, chat_state],
-                # Clear the message box after the main function runs
             ).then(lambda: "", None, msg)
-            # Ensure msg.submit does the same thing as the submit button
-            msg.submit(
-                chat_with_bot,
-                inputs=[msg, chatbot, chat_state],
-                outputs=[chatbot, chat_state],
             ).then(lambda: "", None, msg)
-            # Clear button resets both the displayed history and the token history state
-            clear_btn.click(lambda: ([], None), None, [chatbot, chat_state])
-        # Tab 2: STT
-        with gr.TabItem("🎤 Speech-to-Text"):
-            with gr.Row():
-                with gr.Column():
-                    gr.Markdown("## 🎤 Speech-to-Text (STT)")
-                    audio_input = gr.Audio(
-                        sources=["microphone", "upload"],
-                        type="filepath",
-                        label="Input Audio (Mic or Upload)",
-                    )
-                    stt_button = gr.Button("Convert Speech to Text")
-                with gr.Column():
-                    stt_output = gr.Textbox(label="Transcribed Text", lines=3)
-            stt_button.click(fn=speech_to_text, inputs=audio_input, outputs=stt_output)
-        # Tab 3: TTS
-        with gr.TabItem("🔊 Text-to-Speech"):
-            with gr.Row():
-                with gr.Column():
-                    gr.Markdown("## 🔊 Text-to-Speech (TTS)")
-                    text_input = gr.Textbox(
-                        label="Text to Synthesize",
-                        lines=3,
-                        value="Hello there, this is a demonstration of the text to speech model.",
-                    )
-                    tts_button = gr.Button("Synthesize Speech (Will be slow)")
-                with gr.Column():
-                    audio_output = gr.Audio(label="Synthesized Audio")
-                    tts_status = gr.Textbox(elem_id="status", label="Status")
-            tts_button.click(
-                fn=text_to_speech, inputs=text_input, outputs=[audio_output, tts_status]
             )
-# Pass the 'css' argument to launch()
 demo.launch(css=custom_css)

 import tempfile
 import gradio as gr
 import numpy as np
 import soundfile as sf
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+from TTS.api import TTS
 # --- Device Setup (Explicitly set to CPU) ---
 device = "cpu"
+# --- Model Initialization ---
+# STT
 STT_MODEL_NAME = "openai/whisper-tiny.en"
 stt_pipe = pipeline("automatic-speech-recognition", model=STT_MODEL_NAME, device=device)
+# LLM (Chatbot)
 LLM_MODEL_NAME = "microsoft/DialoGPT-medium"
 chatbot_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
 chatbot_model = AutoModelForCausalLM.from_pretrained(LLM_MODEL_NAME)
 chatbot_model.to(device)
+# TTS
 TTS_MODEL_NAME = "tts_models/en/ljspeech/tacotron2-DDC"
 tts_model = TTS(model_name=TTS_MODEL_NAME, progress_bar=False)
+# --- Core Functions ---
 def chat_with_bot(message, history, chat_history_ids=None):
+    """
+    Chat with the conversational AI model using DialoGPT.
+    Returns: (updated_history, updated_chat_ids, response_text)
+    """
     if not message or not message.strip():
+        # Add an empty entry to history to maintain the structure expected by Gradio
+        history.append(("", ""))
+        return history, chat_history_ids, ""
     try:
+        # 1. Encode user message and move to CPU
         new_input_ids = chatbot_tokenizer.encode(
             message + chatbot_tokenizer.eos_token, return_tensors="pt"
         ).to(device)
+        # 2. Prepare full input IDs (previous history + new message)
         if chat_history_ids is not None:
+            # Ensure history tensor is on CPU before concatenation
+            chat_history_ids = chat_history_ids.to(device)
+            bot_input_ids = torch.cat([chat_history_ids, new_input_ids], dim=-1)
         else:
             bot_input_ids = new_input_ids
+        # 3. Generate response
         chat_history_ids = chatbot_model.generate(
             bot_input_ids,
             max_length=1000,
             top_p=0.95,
         )
+        # 4. Decode response
         response = chatbot_tokenizer.decode(
+            chat_history_ids[:, bot_input_ids.shape[-1] :][0], skip_special_tokens=True
         )
+        # CRITICAL FIX: Append to history in the Gradio Chatbot (list of tuples) format
         history.append((message, response))
+        return history, chat_history_ids, response
     except Exception as e:
+        # CRITICAL FIX: Append error to history in the Gradio Chatbot (list of tuples) format
         history.append((message, f"Error: {e}"))
+        return history, chat_history_ids, f"Error: {e}"
+def text_to_speech_from_chat(chat_response):
+    """Takes the chat response and converts it to speech."""
+    if not chat_response or chat_response.startswith("Error"):
+        return None, "No valid response to synthesize."
+    output_path = None
+    try:
+        # Create a temporary file
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+        output_path = temp_file.name
+        temp_file.close()
+        # Generate the speech (slow on CPU)
+        tts_model.tts_to_file(
+            text=chat_response,
+            file_path=output_path,
+        )
+        return output_path, "Speech synthesis complete. (Completed slowly on CPU)"
+    except Exception as e:
+        # Clean up temp file on failure
+        if output_path and os.path.exists(output_path):
+            os.remove(output_path)
+        return None, f"Error during TTS: {e}"
+def speech_to_text_and_chat(audio_file_path, history, chat_history_ids):
+    """
+    Performs STT, then chatbot generation, then TTS on the chatbot's response.
+    Returns: (transcription, history, chat_ids, response_text, audio_path, status)
+    """
+    if audio_file_path is None:
+        return (
+            "Please upload an audio file or record your voice.",
+            history,
+            chat_history_ids,
+            "",
+            None,
+            "Awaiting audio input.",
+        )
+    # 1. STT
+    try:
+        result = stt_pipe(audio_file_path)
+        transcribed_text = result["text"]
+    except Exception as e:
+        return (
+            f"Error during STT: {e}",
+            history,
+            chat_history_ids,
+            "",
+            None,
+            f"Error during STT: {e}",
+        )
+    # 2. Chatbot
+    # The third returned value, last_response_text, is the pure text response.
+    updated_history, updated_chat_ids, last_response_text = chat_with_bot(
+        transcribed_text, history, chat_history_ids
+    )
+    # 3. TTS
+    audio_path, status_text = text_to_speech_from_chat(last_response_text)
+    # Returns: transcription, history, chat_ids, response_text, audio_path, status
+    return (
+        transcribed_text,
+        updated_history,
+        updated_chat_ids,
+        last_response_text,
+        audio_path,
+        status_text,
+    )
 # --- Gradio Interface ---
     height: 400px;
 }
 """
+# CRITICAL FIX: Removed css argument from gr.Blocks()
 with gr.Blocks() as demo:
+    gr.Markdown("# 🗣️ Integrated Voice Assistant (CPU Only)")
     gr.Markdown(
+        "**NOTE:** This app is running on CPU-only hardware. The full voice flow will be slow due to **Text-to-Speech**."
     )
+    # The global chat state can be used if tabs share history, or use local states per tab
+    global_chat_state = gr.State(value=None)
     with gr.Tabs():
+        # --- NEW FULL VOICE CHAT TAB (STT -> CHAT -> TTS) ---
+        with gr.TabItem("🗣️ Voice Assistant"):
+            gr.Markdown("## Talk to the AI Assistant")
             gr.Markdown(
+                "Speak into the microphone. Your speech will be transcribed, sent to the chatbot, and the chatbot's text response will be converted to audio."
+            )
+            # States specific to this tab
+            voice_chat_history = gr.Chatbot(
+                label="Conversation Log", elem_classes=["chatbot"], value=[]
             )
+            voice_chat_state = gr.State(value=None)  # Chat state IDs for this tab
+            with gr.Row():
+                audio_in = gr.Audio(
+                    sources=["microphone", "upload"],
+                    type="filepath",
+                    label="Input Audio (Mic or Upload)",
+                )
+                voice_audio_out = gr.Audio(label="AI Voice Response", autoplay=True)
+            voice_transcription = gr.Textbox(label="User Transcription", lines=2)
+            voice_response_text = gr.Textbox(label="AI Response (Text)", lines=2)
+            with gr.Row():
+                run_btn = gr.Button("Transcribe, Chat & Speak", variant="primary")
+                clear_voice_btn = gr.Button("Clear Conversation")
+            voice_status = gr.Textbox(elem_id="status", label="Status")
+            # Chain the functions together
+            run_btn.click(
+                fn=speech_to_text_and_chat,
+                inputs=[audio_in, voice_chat_history, voice_chat_state],
+                outputs=[
+                    voice_transcription,
+                    voice_chat_history,
+                    voice_chat_state,
+                    voice_response_text,
+                    voice_audio_out,
+                    voice_status,
+                ],
+            )
+            clear_voice_btn.click(
+                lambda: (None, [], None, "", None, ""),
+                None,
+                [
+                    audio_in,
+                    voice_chat_history,
+                    voice_chat_state,
+                    voice_response_text,
+                    voice_audio_out,
+                    voice_status,
+                ],
+            )
+        # --- EXISTING CHAT -> TTS TAB ---
+        with gr.TabItem("💬 Chat → Voice Output"):
+            gr.Markdown("## 💬 Chat with Voice Output")
+            tts_chatbot = gr.Chatbot(
+                label="Conversation", elem_classes=["chatbot"], value=[]
+            )
+            tts_msg = gr.Textbox(
+                label="Your Message",
+                placeholder="Type your message here and press Enter...",
+                lines=2,
+            )
+            tts_chat_state = gr.State(value=None)
+            with gr.Row():
+                tts_submit_btn = gr.Button("Send & Speak", variant="primary")
+                tts_clear_btn = gr.Button("Clear Chat")
+            with gr.Row():
+                with gr.Column():
+                    tts_response_text = gr.Textbox(label="AI Response (Text)", lines=3)
+                with gr.Column():
+                    tts_audio_output = gr.Audio(label="AI Response (Audio)")
+                    tts_status = gr.Textbox(elem_id="status", label="Status")
+            def chat_and_speak(message, history, chat_ids):
+                """Send message to chat and convert response to speech."""
+                # 1. Chatbot
+                updated_history, updated_ids, last_response = chat_with_bot(
+                    message, history, chat_ids
+                )
+                # 2. TTS
+                audio_path, status = text_to_speech_from_chat(last_response)
+                return updated_history, updated_ids, last_response, audio_path, status
+            tts_submit_btn.click(
+                fn=chat_and_speak,
+                inputs=[tts_msg, tts_chatbot, tts_chat_state],
+                outputs=[
+                    tts_chatbot,
+                    tts_chat_state,
+                    tts_response_text,
+                    tts_audio_output,
+                    tts_status,
+                ],
+            ).then(lambda: "", None, tts_msg)
+            tts_msg.submit(
+                fn=chat_and_speak,
+                inputs=[tts_msg, tts_chatbot, tts_chat_state],
+                outputs=[
+                    tts_chatbot,
+                    tts_chat_state,
+                    tts_response_text,
+                    tts_audio_output,
+                    tts_status,
+                ],
+            ).then(lambda: "", None, tts_msg)
+            tts_clear_btn.click(
+                lambda: ([], None, "", None, "Awaiting input."),
+                None,
+                [
+                    tts_chatbot,
+                    tts_chat_state,
+                    tts_response_text,
+                    tts_audio_output,
+                    tts_status,
+                ],
+            )
+        # --- EXISTING TEXT CHAT ONLY TAB ---
+        with gr.TabItem("💬 Text Chat Only"):
+            gr.Markdown("## Chat with AI Assistant")
             chatbot = gr.Chatbot(
                 label="Conversation", elem_classes=["chatbot"], value=[]
             )
                 placeholder="Type your message here and press Enter...",
                 lines=2,
             )
             with gr.Row():
                 submit_btn = gr.Button("Send", variant="primary")
                 clear_btn = gr.Button("Clear Chat")
+            # Use the global state for the text-only chat
+            fn_call = msg.submit(
+                lambda message, history, chat_state: chat_with_bot(
+                    message, history, chat_state
+                )[:2],
+                inputs=[msg, chatbot, global_chat_state],
+                outputs=[chatbot, global_chat_state],
             ).then(lambda: "", None, msg)
+            submit_btn.click(
+                lambda message, history, chat_state: chat_with_bot(
+                    message, history, chat_state
+                )[:2],
+                inputs=[msg, chatbot, global_chat_state],
+                outputs=[chatbot, global_chat_state],
             ).then(lambda: "", None, msg)
+            clear_btn.click(lambda: ([], None), None, [chatbot, global_chat_state])
+        # --- EXISTING STANDALONE TTS TAB ---
+        with gr.TabItem("🔊 Text-to-Speech Only"):
+            gr.Markdown("## 🔊 Text-to-Speech (TTS)")
+            standalone_text_input = gr.Textbox(
+                label="Text to Synthesize",
+                lines=3,
+                value="Hello there, this is a demonstration of the text to speech model.",
+            )
+            standalone_tts_button = gr.Button("Synthesize Speech (Will be slow)")
+            standalone_audio_output = gr.Audio(label="Synthesized Audio")
+            standalone_tts_status = gr.Textbox(elem_id="status", label="Status")
+            standalone_tts_button.click(
+                fn=text_to_speech_from_chat,
+                inputs=standalone_text_input,
+                outputs=[standalone_audio_output, standalone_tts_status],
             )
+# CRITICAL FIX: Passed css argument to demo.launch()
 demo.launch(css=custom_css)