FresherDifference committed on
Commit
909f184
·
verified ·
1 Parent(s): 0513ad4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -51
app.py CHANGED
@@ -1,101 +1,120 @@
1
  import gradio as gr
2
  import numpy as np
 
 
3
  from pocket_tts import TTSModel
4
 
5
# Load the model once at import time so the weights are not reloaded on
# every button click.
print("Loading Pocket-TTS model...")
tts = TTSModel.load_model()
print("Model loaded successfully.")

# Dropdown label -> reference clip for each preset voice. Every preset uses
# the "casual.wav" clip from its speaker folder in the kyutai/tts-voices repo.
PRESET_VOICES = {
    label: f"hf://kyutai/tts-voices/{speaker_dir}/casual.wav"
    for label, speaker_dir in (
        ("Alba (American English)", "alba-mackenna"),
        ("Marius (French Accent)", "marius-reynaud"),
        ("Jean (Narrator)", "jean-dormeuil"),
        ("Fantine", "fantine-chevallier"),
    )
}
19
 
20
def generate_speech(text, voice_choice, custom_voice_file):
    """Synthesize ``text`` with a preset or an uploaded reference voice.

    Args:
        text: Text to speak; must contain non-whitespace characters.
        voice_choice: Key into ``PRESET_VOICES``; only consulted when no
            custom file is supplied.
        custom_voice_file: Path of an uploaded reference clip, or ``None``.
            When present it takes priority over ``voice_choice``.

    Returns:
        ``(tts.sample_rate, samples)`` where ``samples`` is the result of
        calling ``.numpy()`` on the model output.

    Raises:
        gr.Error: On blank text, when no voice can be resolved, or when
            loading the voice or generating audio fails.
    """
    if not text.strip():
        raise gr.Error("Please enter some text to generate speech.")

    # An uploaded reference clip always wins over the dropdown selection.
    if custom_voice_file is None:
        print(f"Using preset voice: {voice_choice}")
        prompt_source = PRESET_VOICES.get(voice_choice)
    else:
        print(f"Using custom voice cloning from: {custom_voice_file}")
        prompt_source = custom_voice_file

    if not prompt_source:
        raise gr.Error("Please select a voice or upload a reference audio file.")

    # Turn the reference clip (local file or hf:// path) into the state the
    # model is conditioned on.
    try:
        conditioning = tts.get_state_for_audio_prompt(prompt_source)
    except Exception as exc:
        raise gr.Error(f"Error loading voice: {exc}")

    try:
        waveform = tts.generate_audio(conditioning, text)
    except Exception as exc:
        raise gr.Error(f"Generation failed: {exc}")

    # Gradio's numpy audio output takes a (sample_rate, data) pair.
    samples = waveform.numpy()
    return (tts.sample_rate, samples)
61
 
62
# 4. Build the Gradio Interface
# NOTE(review): the nesting below is reconstructed from a diff view that
# dropped indentation -- confirm it against the real app.py.
with gr.Blocks(title="Pocket-TTS Demo") as demo:
    gr.Markdown("# 🗣️ Pocket-TTS on CPU")
    gr.Markdown("A lightweight, 100M parameter text-to-speech model that runs purely on CPU.")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Type something here...",
                lines=4,
                value="Pocket TTS is amazing because it runs efficiently on consumer hardware!"
            )

            with gr.Accordion("Voice Settings", open=True):
                # Choices are the PRESET_VOICES labels; the default must be
                # one of those keys.
                voice_dropdown = gr.Dropdown(
                    choices=list(PRESET_VOICES.keys()),
                    value="Alba (American English)",
                    label="Choose a Preset Voice"
                )
                gr.Markdown("**OR**")
                # generate_speech treats this value as a file path and gives
                # it priority over the dropdown when it is not None.
                voice_upload = gr.Audio(
                    label="Clone a Custom Voice (Upload .wav)",
                    type="filepath"
                )

            submit_btn = gr.Button("Generate Audio", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech", type="numpy")

    # Connect the button: inputs are listed in the same order as
    # generate_speech's parameters (text, voice_choice, custom_voice_file).
    submit_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, voice_upload],
        outputs=audio_output
    )

# Launch the app
demo.launch()
 
1
  import gradio as gr
2
  import numpy as np
3
+ import os
4
+ from pydub import AudioSegment
5
  from pocket_tts import TTSModel
6
 
7
# Load the model once at import time.
# NOTE (from the original author): voice cloning is said to need an HF_TOKEN
# entry in the Space secrets.
print("Loading Pocket-TTS model...")
tts = TTSModel.load_model()
print("Model loaded successfully.")

# Dropdown label -> short voice ID. The values are plain ID strings, not
# URLs; the original author chose this to avoid 404 errors.
PRESET_VOICES = dict(
    (
        ("Alba (American English)", "alba"),
        ("Marius (French)", "marius"),
        ("Jean (Narrator)", "jean"),
        ("Fantine", "fantine"),
        ("Javert", "javert"),
        ("Cosette", "cosette"),
        ("Eponine", "eponine"),
        ("Azelma", "azelma"),
    )
)
26
 
27
def preprocess_audio(filepath):
    """Convert an uploaded audio file into a mono, 16-bit PCM WAV file.

    The input is decoded with ``AudioSegment.from_file``, reduced to one
    channel with a two-byte sample width, and exported as WAV next to the
    source file. The sample rate is left exactly as decoded; nothing here
    resamples the audio.

    Args:
        filepath: Path (``str`` or path-like) of the audio file to convert.

    Returns:
        Path of the converted file: ``filepath`` with ``"_fixed.wav"``
        appended.

    Raises:
        gr.Error: If no file exists at ``filepath`` or the conversion fails.
    """
    # Validate up front so a missing upload is not reported as an ffmpeg issue.
    if not filepath:
        raise gr.Error("No audio file was provided for voice cloning.")

    source_path = os.fspath(filepath)
    if not os.path.isfile(source_path):
        raise gr.Error(f"Audio file not found: {source_path}")

    output_path = source_path + "_fixed.wav"
    try:
        print(f"Converting file: {source_path}")
        audio = AudioSegment.from_file(source_path)

        # Force mono / 16-bit so the result is a plain PCM WAV
        # (the original author added this to prevent "RIFF id" errors).
        audio = audio.set_channels(1).set_sample_width(2)

        # export() hands back the file object it wrote to; close it so the
        # handle is not leaked on every request.
        audio.export(output_path, format="wav").close()
    except Exception as e:
        # Do not leave a truncated WAV behind for a later step to pick up.
        try:
            os.remove(output_path)
        except OSError:
            pass
        raise gr.Error(
            f"Audio conversion failed. Make sure ffmpeg is installed in packages.txt. Error: {e}"
        ) from e

    print(f"Converted to: {output_path}")
    return output_path
46
+
47
def generate_speech(text, voice_choice, custom_voice_file):
    """Synthesize ``text`` with a cloned or preset voice.

    Args:
        text: Text to speak; must contain non-whitespace characters.
        voice_choice: Key into ``PRESET_VOICES``; used only when no custom
            file is supplied.
        custom_voice_file: Path of an uploaded reference clip, or ``None``.
            When present it takes priority over ``voice_choice``.

    Returns:
        ``(tts.sample_rate, audio_tensor.numpy())`` for Gradio's numpy
        audio output.

    Raises:
        gr.Error: On blank text, an unknown preset, a failed audio
            conversion, or any failure while generating audio.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text.")

    converted_path = None
    try:
        if custom_voice_file is not None:
            # Voice cloning: normalise the upload to a PCM WAV first.
            print("--- Mode: Voice Cloning ---")
            converted_path = preprocess_audio(custom_voice_file)
            voice_state = tts.get_state_for_audio_prompt(converted_path)
        else:
            print("--- Mode: Preset Voice ---")
            voice_id = PRESET_VOICES.get(voice_choice)
            if voice_id is None:
                raise gr.Error(f"Unknown preset voice: {voice_choice!r}")
            print(f"Using Internal ID: {voice_id}")
            # Build a voice state the same way as the cloning branch, so
            # generate_audio always receives a state rather than a bare string.
            # NOTE(review): relies on get_state_for_audio_prompt accepting
            # preset names such as "alba" -- confirm against the installed
            # pocket_tts version.
            voice_state = tts.get_state_for_audio_prompt(voice_id)

        audio_tensor = tts.generate_audio(voice_state, text)
        return (tts.sample_rate, audio_tensor.numpy())
    except gr.Error:
        # Already a user-facing message; do not re-wrap it as a generation error.
        raise
    except Exception as e:
        # Print the full traceback to the logs for debugging.
        import traceback
        traceback.print_exc()
        raise gr.Error(f"Generation Error: {e}") from e
    finally:
        # The converted WAV is only needed for this request.
        if converted_path is not None:
            try:
                os.remove(converted_path)
            except OSError:
                pass
 
 
 
 
83
 
84
# 3. Build Interface
# NOTE(review): the nesting below is reconstructed from a diff view that
# dropped indentation -- confirm it against the real app.py.
with gr.Blocks(title="Pocket-TTS Demo") as demo:
    gr.Markdown("# 🗣️ Pocket-TTS (Fixed)")
    gr.Markdown("Supports Voice Cloning (MP3/WAV) & Built-in Voices.")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to Speak",
                lines=4,
                value="This is a test of the pocket text to speech system."
            )

            with gr.Accordion("Voice Settings", open=True):
                # Choices are the PRESET_VOICES labels; the default must be
                # one of those keys.
                voice_dropdown = gr.Dropdown(
                    choices=list(PRESET_VOICES.keys()),
                    value="Alba (American English)",
                    label="Use a Preset Voice"
                )
                gr.Markdown("**OR**")
                # generate_speech treats this value as a file path and takes
                # the cloning branch whenever it is not None.
                voice_upload = gr.Audio(
                    label="Clone a Voice (Upload any audio)",
                    type="filepath"
                )

            submit_btn = gr.Button("Generate Audio", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="Result", type="numpy")

    # Inputs are listed in the same order as generate_speech's parameters
    # (text, voice_choice, custom_voice_file).
    submit_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, voice_upload],
        outputs=audio_output
    )

demo.launch()