zsolnai committed on
Commit
4036a2f
·
1 Parent(s): d6fb39f

Redo logic for error

Browse files
Files changed (1) hide show
  1. app.py +16 -11
app.py CHANGED
@@ -5,34 +5,39 @@ import numpy as np
5
  import soundfile as sf
6
  import torch
7
 
 
 
 
 
8
  # --- STT Setup (using Hugging Face's transformers pipeline for Whisper) ---
9
  from transformers import pipeline
10
 
11
- # Using a small, English-only Whisper model for fast inference on Spaces
12
  STT_MODEL_NAME = "openai/whisper-tiny.en"
13
- device = 0 if torch.cuda.is_available() else -1
14
- stt_pipe = pipeline("automatic-speech-recognition", model=STT_MODEL_NAME, device=device)
 
 
15
 
16
  # --- TTS Setup (using coqui-ai/TTS) ---
17
  from TTS.api import TTS
18
 
19
- # Using a standard, high-quality English Tacotron2 model
20
  TTS_MODEL_NAME = "tts_models/en/ljspeech/tacotron2-DDC"
21
  OUTPUT_WAV_FILE = "output.wav"
22
 
 
 
 
 
 
23
  tts_model = TTS(model_name=TTS_MODEL_NAME, progress_bar=False)
24
 
25
- tts_model.to(device)
26
-
27
 
28
  def speech_to_text(audio_file_path):
29
- """Performs Speech-to-Text using the Whisper model."""
30
  if audio_file_path is None:
31
  return "Please upload an audio file or record your voice."
32
 
33
  try:
34
- # The pipeline can typically handle the file path directly
35
- # and manages necessary preprocessing (resampling, loading, etc.)
36
  result = stt_pipe(audio_file_path)
37
  return result["text"]
38
  except Exception as e:
@@ -40,7 +45,7 @@ def speech_to_text(audio_file_path):
40
 
41
 
42
  def text_to_speech(text):
43
- """Performs Text-to-Speech using the Coqui TTS model."""
44
  if not text:
45
  return None, "Please enter text for synthesis."
46
 
@@ -50,13 +55,13 @@ def text_to_speech(text):
50
  text=text,
51
  file_path=OUTPUT_WAV_FILE,
52
  )
53
- # Return the file path and a success message
54
  return OUTPUT_WAV_FILE, "Speech synthesis complete."
55
  except Exception as e:
56
  return None, f"Error during TTS: {e}"
57
 
58
 
59
  # --- Gradio Interface ---
 
60
 
61
  with gr.Blocks(css="#status {font-weight: bold;}") as demo:
62
  gr.Markdown("# 🗣️ STT & TTS App (Whisper & TTS Libraries)")
 
5
  import soundfile as sf
6
  import torch
7
 
8
+ # --- Device Setup (Only for the Whisper Pipeline) ---
9
+ # Use the device index (0) if CUDA is available, otherwise use CPU (-1)
10
+ device_for_stt = 0 if torch.cuda.is_available() else -1
11
+
12
  # --- STT Setup (using Hugging Face's transformers pipeline for Whisper) ---
13
  from transformers import pipeline
14
 
 
15
  STT_MODEL_NAME = "openai/whisper-tiny.en"
16
+ # Pass the device index to the pipeline initialization
17
+ stt_pipe = pipeline(
18
+ "automatic-speech-recognition", model=STT_MODEL_NAME, device=device_for_stt
19
+ )
20
 
21
  # --- TTS Setup (using coqui-ai/TTS) ---
22
  from TTS.api import TTS
23
 
 
24
  TTS_MODEL_NAME = "tts_models/en/ljspeech/tacotron2-DDC"
25
  OUTPUT_WAV_FILE = "output.wav"
26
 
27
+ # *** CRITICAL FIX HERE ***
28
+ # Initialize the TTS object without explicit device movement.
29
+ # For this specific model, the internal library logic often automatically
30
+ # detects and uses the CUDA device if it's available in the environment.
31
+ # Relying on the internal device management is often safer than forcing 'to(device)'.
32
  tts_model = TTS(model_name=TTS_MODEL_NAME, progress_bar=False)
33
 
 
 
34
 
35
  def speech_to_text(audio_file_path):
36
+ # ... (function body remains the same)
37
  if audio_file_path is None:
38
  return "Please upload an audio file or record your voice."
39
 
40
  try:
 
 
41
  result = stt_pipe(audio_file_path)
42
  return result["text"]
43
  except Exception as e:
 
45
 
46
 
47
  def text_to_speech(text):
48
+ # ... (function body remains the same)
49
  if not text:
50
  return None, "Please enter text for synthesis."
51
 
 
55
  text=text,
56
  file_path=OUTPUT_WAV_FILE,
57
  )
 
58
  return OUTPUT_WAV_FILE, "Speech synthesis complete."
59
  except Exception as e:
60
  return None, f"Error during TTS: {e}"
61
 
62
 
63
  # --- Gradio Interface ---
64
+ # (The Gradio interface block remains the same)
65
 
66
  with gr.Blocks(css="#status {font-weight: bold;}") as demo:
67
  gr.Markdown("# 🗣️ STT & TTS App (Whisper & TTS Libraries)")