Spaces:

gzsol
/

lab2

Sleeping

App Files Files Community

zsolnai committed on Dec 1, 2025

Commit

367abb6

1 Parent(s): 4036a2f

Remove GPU stuff as fallback

Browse files

Files changed (1) hide show

app.py +18 -19

app.py CHANGED Viewed

@@ -5,18 +5,16 @@ import numpy as np
 import soundfile as sf
 import torch
-# --- Device Setup (Only for the Whisper Pipeline) ---
-# Use the device index (0) if CUDA is available, otherwise use CPU (-1)
-device_for_stt = 0 if torch.cuda.is_available() else -1
 # --- STT Setup (using Hugging Face's transformers pipeline for Whisper) ---
 from transformers import pipeline
 STT_MODEL_NAME = "openai/whisper-tiny.en"
-# Pass the device index to the pipeline initialization
-stt_pipe = pipeline(
-    "automatic-speech-recognition", model=STT_MODEL_NAME, device=device_for_stt
-)
 # --- TTS Setup (using coqui-ai/TTS) ---
 from TTS.api import TTS
@@ -24,20 +22,20 @@ from TTS.api import TTS
 TTS_MODEL_NAME = "tts_models/en/ljspeech/tacotron2-DDC"
 OUTPUT_WAV_FILE = "output.wav"
-# *** CRITICAL FIX HERE ***
-# Initialize the TTS object without explicit device movement.
-# For this specific model, the internal library logic often automatically
-# detects and uses the CUDA device if it's available in the environment.
-# Relying on the internal device management is often safer than forcing 'to(device)'.
 tts_model = TTS(model_name=TTS_MODEL_NAME, progress_bar=False)
 def speech_to_text(audio_file_path):
-    # ... (function body remains the same)
     if audio_file_path is None:
         return "Please upload an audio file or record your voice."
     try:
         result = stt_pipe(audio_file_path)
         return result["text"]
     except Exception as e:
@@ -45,28 +43,29 @@ def speech_to_text(audio_file_path):
 def text_to_speech(text):
-    # ... (function body remains the same)
     if not text:
         return None, "Please enter text for synthesis."
     try:
         # Generate the speech and save to a temporary file
         tts_model.tts_to_file(
             text=text,
             file_path=OUTPUT_WAV_FILE,
         )
-        return OUTPUT_WAV_FILE, "Speech synthesis complete."
     except Exception as e:
         return None, f"Error during TTS: {e}"
 # --- Gradio Interface ---
-# (The Gradio interface block remains the same)
 with gr.Blocks(css="#status {font-weight: bold;}") as demo:
-    gr.Markdown("# 🗣️ STT & TTS App (Whisper & TTS Libraries)")
     gr.Markdown(
-        "A demo for **Speech-to-Text (STT)** using **Whisper** and **Text-to-Speech (TTS)** using **Coqui TTS**."
     )
     gr.HTML("<hr>")
@@ -98,7 +97,7 @@ with gr.Blocks(css="#status {font-weight: bold;}") as demo:
                 lines=3,
                 value="Hello there, this is a demonstration of the text to speech model.",
             )
-            tts_button = gr.Button("Synthesize Speech")
         with gr.Column():
             audio_output = gr.Audio(label="Synthesized Audio")

 import soundfile as sf
 import torch
+# --- Device Setup (Explicitly set to CPU) ---
+# This value is passed to the Whisper pipeline below so that it runs on the CPU only.
+device = "cpu"
 # --- STT Setup (using Hugging Face's transformers pipeline for Whisper) ---
 from transformers import pipeline
 STT_MODEL_NAME = "openai/whisper-tiny.en"
+# CRITICAL: Pass device="cpu" to the pipeline to ensure it doesn't try to use CUDA
+stt_pipe = pipeline("automatic-speech-recognition", model=STT_MODEL_NAME, device=device)
 # --- TTS Setup (using coqui-ai/TTS) ---
 from TTS.api import TTS
 TTS_MODEL_NAME = "tts_models/en/ljspeech/tacotron2-DDC"
 OUTPUT_WAV_FILE = "output.wav"
+# CRITICAL: Initialize the TTS model. It will use the CPU by default
+# since we are not passing any 'gpu=True' or using .to(device).
+# For older versions, this is the safest way to ensure CPU usage.
+# The `TTS` constructor should handle device placement to CPU when no GPU is found.
 tts_model = TTS(model_name=TTS_MODEL_NAME, progress_bar=False)
 def speech_to_text(audio_file_path):
+    """Performs Speech-to-Text using the Whisper model."""
     if audio_file_path is None:
         return "Please upload an audio file or record your voice."
     try:
+        # The pipeline can typically handle the file path directly
         result = stt_pipe(audio_file_path)
         return result["text"]
     except Exception as e:
 def text_to_speech(text):
+    """Performs Text-to-Speech using the Coqui TTS model."""
     if not text:
         return None, "Please enter text for synthesis."
     try:
         # Generate the speech and save to a temporary file
+        # This operation will be slow on the CPU, as expected.
         tts_model.tts_to_file(
             text=text,
             file_path=OUTPUT_WAV_FILE,
         )
+        # Return the file path and a success message
+        return OUTPUT_WAV_FILE, "Speech synthesis complete. (Completed slowly on CPU)"
     except Exception as e:
         return None, f"Error during TTS: {e}"
 # --- Gradio Interface ---
 with gr.Blocks(css="#status {font-weight: bold;}") as demo:
+    gr.Markdown("# 🗣️ STT & TTS App (CPU Only)")
     gr.Markdown(
+        "**NOTE:** This app is running on CPU-only hardware. Speech-to-Text (Whisper) is fast, but **Text-to-Speech (Coqui TTS) will be very slow**."
     )
     gr.HTML("<hr>")
                 lines=3,
                 value="Hello there, this is a demonstration of the text to speech model.",
             )
+            tts_button = gr.Button("Synthesize Speech (Will be slow)")
         with gr.Column():
             audio_output = gr.Audio(label="Synthesized Audio")