zsolnai commited on
Commit
367abb6
·
1 Parent(s): 4036a2f

Remove gpu stuff as fallback

Browse files
Files changed (1) hide show
  1. app.py +18 -19
app.py CHANGED
@@ -5,18 +5,16 @@ import numpy as np
5
  import soundfile as sf
6
  import torch
7
 
8
- # --- Device Setup (Only for the Whisper Pipeline) ---
9
- # Use the device index (0) if CUDA is available, otherwise use CPU (-1)
10
- device_for_stt = 0 if torch.cuda.is_available() else -1
11
 
12
  # --- STT Setup (using Hugging Face's transformers pipeline for Whisper) ---
13
  from transformers import pipeline
14
 
15
  STT_MODEL_NAME = "openai/whisper-tiny.en"
16
- # Pass the device index to the pipeline initialization
17
- stt_pipe = pipeline(
18
- "automatic-speech-recognition", model=STT_MODEL_NAME, device=device_for_stt
19
- )
20
 
21
  # --- TTS Setup (using coqui-ai/TTS) ---
22
  from TTS.api import TTS
@@ -24,20 +22,20 @@ from TTS.api import TTS
24
  TTS_MODEL_NAME = "tts_models/en/ljspeech/tacotron2-DDC"
25
  OUTPUT_WAV_FILE = "output.wav"
26
 
27
- # *** CRITICAL FIX HERE ***
28
- # Initialize the TTS object without explicit device movement.
29
- # For this specific model, the internal library logic often automatically
30
- # detects and uses the CUDA device if it's available in the environment.
31
- # Relying on the internal device management is often safer than forcing 'to(device)'.
32
  tts_model = TTS(model_name=TTS_MODEL_NAME, progress_bar=False)
33
 
34
 
35
  def speech_to_text(audio_file_path):
36
- # ... (function body remains the same)
37
  if audio_file_path is None:
38
  return "Please upload an audio file or record your voice."
39
 
40
  try:
 
41
  result = stt_pipe(audio_file_path)
42
  return result["text"]
43
  except Exception as e:
@@ -45,28 +43,29 @@ def speech_to_text(audio_file_path):
45
 
46
 
47
  def text_to_speech(text):
48
- # ... (function body remains the same)
49
  if not text:
50
  return None, "Please enter text for synthesis."
51
 
52
  try:
53
  # Generate the speech and save to a temporary file
 
54
  tts_model.tts_to_file(
55
  text=text,
56
  file_path=OUTPUT_WAV_FILE,
57
  )
58
- return OUTPUT_WAV_FILE, "Speech synthesis complete."
 
59
  except Exception as e:
60
  return None, f"Error during TTS: {e}"
61
 
62
 
63
  # --- Gradio Interface ---
64
- # (The Gradio interface block remains the same)
65
 
66
  with gr.Blocks(css="#status {font-weight: bold;}") as demo:
67
- gr.Markdown("# 🗣️ STT & TTS App (Whisper & TTS Libraries)")
68
  gr.Markdown(
69
- "A demo for **Speech-to-Text (STT)** using **Whisper** and **Text-to-Speech (TTS)** using **Coqui TTS**."
70
  )
71
 
72
  gr.HTML("<hr>")
@@ -98,7 +97,7 @@ with gr.Blocks(css="#status {font-weight: bold;}") as demo:
98
  lines=3,
99
  value="Hello there, this is a demonstration of the text to speech model.",
100
  )
101
- tts_button = gr.Button("Synthesize Speech")
102
 
103
  with gr.Column():
104
  audio_output = gr.Audio(label="Synthesized Audio")
 
5
  import soundfile as sf
6
  import torch
7
 
8
+ # --- Device Setup (Explicitly set to CPU) ---
9
+ # This ensures PyTorch operations (used by both models) use the CPU only.
10
+ device = "cpu"
11
 
12
  # --- STT Setup (using Hugging Face's transformers pipeline for Whisper) ---
13
  from transformers import pipeline
14
 
15
  STT_MODEL_NAME = "openai/whisper-tiny.en"
16
+ # CRITICAL: Pass device="cpu" to the pipeline to ensure it doesn't try to use CUDA
17
+ stt_pipe = pipeline("automatic-speech-recognition", model=STT_MODEL_NAME, device=device)
 
 
18
 
19
  # --- TTS Setup (using coqui-ai/TTS) ---
20
  from TTS.api import TTS
 
22
  TTS_MODEL_NAME = "tts_models/en/ljspeech/tacotron2-DDC"
23
  OUTPUT_WAV_FILE = "output.wav"
24
 
25
+ # CRITICAL: Initialize the TTS model. It will use the CPU by default
26
+ # since we are not passing any 'gpu=True' or using .to(device).
27
+ # For older versions, this is the safest way to ensure CPU usage.
28
+ # The `TTS` constructor should handle device placement to CPU when no GPU is found.
 
29
  tts_model = TTS(model_name=TTS_MODEL_NAME, progress_bar=False)
30
 
31
 
32
  def speech_to_text(audio_file_path):
33
+ """Performs Speech-to-Text using the Whisper model."""
34
  if audio_file_path is None:
35
  return "Please upload an audio file or record your voice."
36
 
37
  try:
38
+ # The pipeline can typically handle the file path directly
39
  result = stt_pipe(audio_file_path)
40
  return result["text"]
41
  except Exception as e:
 
43
 
44
 
45
  def text_to_speech(text):
46
+ """Performs Text-to-Speech using the Coqui TTS model."""
47
  if not text:
48
  return None, "Please enter text for synthesis."
49
 
50
  try:
51
  # Generate the speech and save to a temporary file
52
+ # This operation will be slow on the CPU, as expected.
53
  tts_model.tts_to_file(
54
  text=text,
55
  file_path=OUTPUT_WAV_FILE,
56
  )
57
+ # Return the file path and a success message
58
+ return OUTPUT_WAV_FILE, "Speech synthesis complete. (Completed slowly on CPU)"
59
  except Exception as e:
60
  return None, f"Error during TTS: {e}"
61
 
62
 
63
  # --- Gradio Interface ---
 
64
 
65
  with gr.Blocks(css="#status {font-weight: bold;}") as demo:
66
+ gr.Markdown("# 🗣️ STT & TTS App (CPU Only)")
67
  gr.Markdown(
68
+ "**NOTE:** This app is running on CPU-only hardware. Speech-to-Text (Whisper) is fast, but **Text-to-Speech (Coqui TTS) will be very slow**."
69
  )
70
 
71
  gr.HTML("<hr>")
 
97
  lines=3,
98
  value="Hello there, this is a demonstration of the text to speech model.",
99
  )
100
+ tts_button = gr.Button("Synthesize Speech (Will be slow)")
101
 
102
  with gr.Column():
103
  audio_output = gr.Audio(label="Synthesized Audio")