zsolnai committed
Commit · 60cffca
Parent(s): 3944a6c

Add tab for texting the LLM
app.py
CHANGED
@@ -1,27 +1,26 @@
 import os
+import tempfile
 
 import gradio as gr
-import numpy as np
-import soundfile as sf
 import torch
 
 # --- Device Setup (Explicitly set to CPU) ---
 device = "cpu"
 
 # --- STT Setup (using Hugging Face's transformers pipeline for Whisper) ---
-from transformers import pipeline
+from transformers import Conversation, pipeline
 
 STT_MODEL_NAME = "openai/whisper-tiny.en"
-# Pass device="cpu" to the pipeline
 stt_pipe = pipeline("automatic-speech-recognition", model=STT_MODEL_NAME, device=device)
 
+# --- LLM Setup (using Hugging Face's transformers for text generation) ---
+LLM_MODEL_NAME = "microsoft/DialoGPT-medium"
+chatbot_pipe = pipeline("conversational", model=LLM_MODEL_NAME, device=device)
+
 # --- TTS Setup (using coqui-ai/TTS) ---
 from TTS.api import TTS
 
 TTS_MODEL_NAME = "tts_models/en/ljspeech/tacotron2-DDC"
-OUTPUT_WAV_FILE = "output.wav"
-
-# Initialize the TTS model on CPU
 tts_model = TTS(model_name=TTS_MODEL_NAME, progress_bar=False)
 
 
@@ -29,7 +28,6 @@ def speech_to_text(audio_file_path):
     """Performs Speech-to-Text using the Whisper model."""
     if audio_file_path is None:
         return "Please upload an audio file or record your voice."
-
     try:
         result = stt_pipe(audio_file_path)
         return result["text"]
@@ -41,72 +39,128 @@ def text_to_speech(text):
     """Performs Text-to-Speech using the Coqui TTS model."""
     if not text:
         return None, "Please enter text for synthesis."
-
     try:
+        # Create a temporary file for each request
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+        output_path = temp_file.name
+        temp_file.close()
+
         # Generate the speech (slow on CPU)
         tts_model.tts_to_file(
             text=text,
-            file_path=OUTPUT_WAV_FILE,
+            file_path=output_path,
         )
-        return OUTPUT_WAV_FILE, …
+        return output_path, "Speech synthesis complete. (Completed slowly on CPU)"
     except Exception as e:
         return None, f"Error during TTS: {e}"
 
 
-
+def chat_with_bot(message, history):
+    """Chat with the conversational AI model."""
+    if not message:
+        return history
 
-
-
+    try:
+        # Create a new conversation with the full history
+        conversation = Conversation()
+        for user_msg, bot_msg in history:
+            conversation.add_user_input(user_msg)
+            if bot_msg:
+                conversation.append_response(bot_msg)
+
+        # Add the new user message
+        conversation.add_user_input(message)
+
+        # Get response from the model
+        result = chatbot_pipe(conversation)
+        response = result.generated_responses[-1]
+
+        # Append to history
+        history.append((message, response))
+        return history
+    except Exception as e:
+        history.append((message, f"Error: {e}"))
+        return history
 
-
-
-
+
+# --- Gradio Interface ---
+custom_css = """
+#status {
+    font-weight: bold;
+    color: #2563eb;
+}
+.chatbot {
+    height: 400px;
+}
+"""
+
+with gr.Blocks(css=custom_css) as demo:
+    gr.Markdown("# 🗣️ STT, TTS & Chat App (CPU Only)")
     gr.Markdown(
-        "**NOTE:** This app is running on CPU-only hardware. Speech-to-Text (Whisper) is fast, but **Text-to-Speech (Coqui TTS) will be slow**."
+        "**NOTE:** This app is running on CPU-only hardware. Speech-to-Text (Whisper) is fast, but **Text-to-Speech (Coqui TTS) and Chat will be slow**."
    )
 
-
-
-
-
-
-            gr.Markdown(
-
-                sources=["microphone", "upload"],
-                type="filepath",
-                label="Input Audio (Mic or Upload)",
+    # Create tabs for different features
+    with gr.Tabs():
+        # Tab 1: Chat Interface
+        with gr.TabItem("💬 Chat"):
+            gr.Markdown("## Chat with AI Assistant")
+            gr.Markdown(
+                "Have a conversation with the DialoGPT model. It remembers context from your conversation!"
             )
-            stt_button = gr.Button("Convert Speech to Text")
-
-        with gr.Column():
-            stt_output = gr.Textbox(label="Transcribed Text", lines=3)
-
-    stt_button.click(fn=speech_to_text, inputs=audio_input, outputs=stt_output)
 
-
-
-
-
-
-            gr.Markdown("## 🔊 Text-to-Speech (TTS)")
-            text_input = gr.Textbox(
-                label="Text to Synthesize",
-                lines=3,
-                value="Hello there, this is a demonstration of the text to speech model.",
+            chatbot = gr.Chatbot(label="Conversation", elem_classes=["chatbot"])
+            msg = gr.Textbox(
+                label="Your Message",
+                placeholder="Type your message here and press Enter...",
+                lines=2,
             )
-
-
-
-            audio_output = gr.Audio(label="Synthesized Audio")
-            # The id="status" is still correct for applying CSS later
-            tts_status = gr.Textbox(elem_id="status", label="Status")
+            with gr.Row():
+                submit_btn = gr.Button("Send", variant="primary")
+                clear_btn = gr.Button("Clear Chat")
 
-
-
-
-
-
-
+            # Chat functionality
+            msg.submit(chat_with_bot, inputs=[msg, chatbot], outputs=chatbot).then(
+                lambda: "", None, msg
+            )
+            submit_btn.click(
+                chat_with_bot, inputs=[msg, chatbot], outputs=chatbot
+            ).then(lambda: "", None, msg)
+            clear_btn.click(lambda: [], None, chatbot)
+
+        # Tab 2: STT
+        with gr.TabItem("🎤 Speech-to-Text"):
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("## 🎤 Speech-to-Text (STT)")
+                    audio_input = gr.Audio(
+                        sources=["microphone", "upload"],
+                        type="filepath",
+                        label="Input Audio (Mic or Upload)",
+                    )
+                    stt_button = gr.Button("Convert Speech to Text")
+                with gr.Column():
+                    stt_output = gr.Textbox(label="Transcribed Text", lines=3)
+
+            stt_button.click(fn=speech_to_text, inputs=audio_input, outputs=stt_output)
+
+        # Tab 3: TTS
+        with gr.TabItem("🔊 Text-to-Speech"):
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("## 🔊 Text-to-Speech (TTS)")
+                    text_input = gr.Textbox(
+                        label="Text to Synthesize",
+                        lines=3,
+                        value="Hello there, this is a demonstration of the text to speech model.",
+                    )
+                    tts_button = gr.Button("Synthesize Speech (Will be slow)")
+                with gr.Column():
+                    audio_output = gr.Audio(label="Synthesized Audio")
+                    tts_status = gr.Textbox(elem_id="status", label="Status")
+
+            tts_button.click(
+                fn=text_to_speech, inputs=text_input, outputs=[audio_output, tts_status]
+            )
 
-
-    os.remove(OUTPUT_WAV_FILE)
+demo.launch()
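The new Chat tab is built on the Conversation class and the "conversational" pipeline task from transformers. Both were deprecated and later removed in newer transformers releases, so this code assumes an older version of the library that still ships them. A minimal sketch of the round-trip that chat_with_bot performs (the messages are illustrative, not from the commit):

    from transformers import Conversation, pipeline

    # Assumes a transformers release that still provides the "conversational"
    # task; it has been removed from current versions.
    chat = pipeline("conversational", model="microsoft/DialoGPT-medium", device="cpu")

    conv = Conversation("Hello, how are you?")  # first user turn
    conv = chat(conv)                           # generates a reply into the same object
    print(conv.generated_responses[-1])         # latest bot reply

    conv.add_user_input("Where should we go for lunch?")  # queue the next turn
    conv = chat(conv)
    print(conv.generated_responses[-1])

chat_with_bot uses the same mechanism: it rebuilds a Conversation from the Gradio history with add_user_input and append_response, then reads the newest entry of generated_responses.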
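The Chat tab clears the input box after each send by chaining .then() onto the event returned by msg.submit(...) and submit_btn.click(...): the first callback updates the Chatbot, and the chained callback resets the Textbox. A minimal sketch of that pattern, with hypothetical component names:

    import gradio as gr

    def echo(message):
        # Stand-in for chat_with_bot; it just returns the input.
        return message

    with gr.Blocks() as demo:
        box = gr.Textbox(label="Message")  # illustrative names, not from app.py
        out = gr.Textbox(label="Reply")

        # .submit() fires when Enter is pressed; .then() runs a second
        # callback once the first finishes, here clearing the input box.
        box.submit(echo, inputs=box, outputs=out).then(lambda: "", None, box)

    demo.launch()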
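On the TTS side, the commit replaces the single shared output.wav, which the old version deleted with os.remove(OUTPUT_WAV_FILE), with one NamedTemporaryFile(delete=False, suffix=".wav") per request, so concurrent users can no longer overwrite each other's audio. The trade-off is that the generated files now stay on disk. One possible cleanup strategy, sketched with a hypothetical registry that is not part of this commit:

    import atexit
    import os
    import tempfile

    _generated_wavs = []  # hypothetical registry of files created during this run

    def new_output_path():
        # Per-request output path, mirroring the new text_to_speech().
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        tmp.close()
        _generated_wavs.append(tmp.name)
        return tmp.name

    @atexit.register
    def _cleanup():
        # Best-effort removal of everything synthesized during this run.
        for path in _generated_wavs:
            try:
                os.remove(path)
            except OSError:
                pass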
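The STT path is unchanged: speech_to_text hands the file path it receives from gr.Audio(type="filepath") straight to the Whisper pipeline, which returns the transcription as a dict with a "text" key. A quick way to exercise it outside Gradio, where sample.wav is a placeholder for any short English recording:

    from transformers import pipeline

    stt = pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en", device="cpu")

    # "sample.wav" is a placeholder; any short English recording works.
    print(stt("sample.wav")["text"])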