jblast94 committed on
Commit
6fd45d2
·
verified ·
1 Parent(s): 18d613d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -146
app.py CHANGED
@@ -1,179 +1,108 @@
1
- from create_env import setup_dependencies
2
-
3
- setup_dependencies()
4
-
5
- import spaces
6
  import gradio as gr
7
- from util import NemoAudioPlayer, InitModels, load_config, Examples
8
- import numpy as np
9
  import torch
10
  import os
11
 
12
- # Get HuggingFace token
13
- token_ = os.getenv('HF_TOKEN')
14
-
15
- config = load_config("./model_config.yaml")
16
- models_configs = config.models
17
- nemo_player_cfg = config.nemo_player
18
-
19
- examples_cfg = load_config("./examples.yaml")
20
- examples_maker = Examples(examples_cfg)
21
- examples = examples_maker()
22
-
23
- player = NemoAudioPlayer(nemo_player_cfg)
24
- init_models = InitModels(models_configs, player, token_)
25
- models = init_models()
26
-
27
 
28
  @spaces.GPU
29
- def generate_speech_gpu(text, model_choice, speaker_display: str, t, top_p, rp, max_tok):
30
- """
31
- Generate speech from text using the selected model on GPU
32
- """
33
-
34
  if not text.strip():
35
- return None, "Please enter text for speech generation."
36
-
37
- if not model_choice:
38
- return None, "Please select a model."
39
 
40
  try:
41
  device = "cuda" if torch.cuda.is_available() else "cpu"
42
  print(f"Using device: {device}")
43
 
44
- selected_model = models[model_choice]
45
- cfg = models_configs.get(model_choice)
 
 
 
 
 
 
46
  speaker_map = cfg.get('speaker_id', {}) if cfg is not None else {}
47
  if speaker_display and speaker_map:
48
  speaker_id = speaker_map.get(speaker_display)
49
  else:
50
  speaker_id = None
51
-
52
  print(f"Generating speech with {model_choice}...")
53
- audio, _, time_report = selected_model.run_model(text, speaker_id, t, top_p, rp, max_tok)
54
 
55
- sample_rate = 22050
 
 
 
 
 
 
 
 
 
 
56
  print("Speech generation completed!")
57
 
58
- return (sample_rate, audio), time_report #, f"✅ Audio generated successfully using {model_choice} on {device}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
- except Exception as e:
61
- print(f"Error during generation: {str(e)}")
62
- return None, f"❌ Error during generation: {str(e)}"
63
 
64
- # Create Gradio interface
65
- with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Ocean()) as demo:
 
 
66
  gr.Markdown("# 😻 KaniTTS: Fast and Expressive Speech Generation Model")
67
- gr.Markdown("Select a model and enter text to generate emotional speech")
68
 
69
- with gr.Row():
70
- with gr.Column(scale=1):
71
- model_dropdown = gr.Dropdown(
72
- choices=list(models_configs.keys()),
73
- value=list(models_configs.keys())[0],
74
- label="Selected Model",
75
- info="Base generates random voices"
76
- )
77
- # Speaker selector (shown only if model has speakers)
78
- # Pre-populate all available speakers for example table rendering
79
- all_speakers = []
80
- for _cfg in models_configs.values():
81
- if _cfg and _cfg.get('speaker_id'):
82
- all_speakers.extend(list(_cfg.speaker_id.keys()))
83
- all_speakers = sorted(list(set(all_speakers)))
84
- speaker_dropdown = gr.Dropdown(
85
- choices=all_speakers,
86
- value=None,
87
- label="Speaker",
88
- visible=False,
89
- allow_custom_value=True
90
- )
91
-
92
- text_input = gr.Textbox(
93
- label="Text",
94
- placeholder="Enter your text ...",
95
- lines=3,
96
- max_lines=10
97
- )
98
-
99
- with gr.Accordion("Settings", open=False):
100
- temp = gr.Slider(
101
- minimum=0.1, maximum=1.5, value=0.6, step=0.05,
102
- label="Temp",
103
- )
104
- top_p = gr.Slider(
105
- minimum=0.1, maximum=1.0, value=0.95, step=0.05,
106
- label="Top P",
107
- )
108
- rp = gr.Slider(
109
- minimum=1.0, maximum=2.0, value=1.1, step=0.05,
110
- label="Repetition Penalty",
111
- )
112
- max_tok = gr.Slider(
113
- minimum=100, maximum=2000, value=1000, step=100,
114
- label="Max Tokens",
115
- )
116
-
117
- generate_btn = gr.Button("Run", variant="primary", size="lg")
118
-
119
-
120
- with gr.Column(scale=1):
121
- audio_output = gr.Audio(
122
- label="Generated Audio",
123
- type="numpy"
124
- )
125
-
126
- time_report_output = gr.Textbox(
127
- label="Time Report",
128
- interactive=False,
129
- value="Ready to generate speech",
130
- lines=3
131
- )
132
 
133
- # Update speakers when model changes
134
- def update_speakers(model_choice):
135
- cfg = models_configs.get(model_choice)
136
- speakers = list(cfg.speaker_id.keys()) if (cfg and cfg.get('speaker_id')) else []
137
- if speakers:
138
- return gr.update(choices=speakers, value=speakers[0], visible=True)
139
- else:
140
- return gr.update(choices=[], value=None, visible=False)
141
-
142
- model_dropdown.change(
143
- fn=update_speakers,
144
- inputs=[model_dropdown],
145
- outputs=[speaker_dropdown]
146
  )
147
-
148
- # Populate speakers on initial page load based on default model
149
- demo.load(
150
- fn=update_speakers,
 
 
 
 
 
 
151
  inputs=[model_dropdown],
152
  outputs=[speaker_dropdown]
153
  )
154
-
155
- # GPU generation event
156
  generate_btn.click(
157
- fn=generate_speech_gpu,
158
- inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
159
- outputs=[audio_output, time_report_output]
160
  )
161
 
162
- with gr.Row():
163
-
164
- examples = examples
165
-
166
- gr.Examples(
167
- examples=examples,
168
- inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
169
- fn=generate_speech_gpu,
170
- outputs=[audio_output, time_report_output],
171
- cache_examples=True,
172
- )
173
-
174
- if __name__ == "__main__":
175
- demo.launch(
176
- server_name="0.0.0.0",
177
- server_port=7860,
178
- show_error=True
179
- )
 
 
 
 
 
 
1
  import gradio as gr
 
 
2
  import torch
3
  import os
4
 
5
+ # You must use the exact same model name as your repo
6
+ MODEL_ID = "nineninesix/Kani-TTS-370m"
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  @spaces.GPU
9
+ def generate_speech(text: str, model_choice: str, speaker_display: str):
 
 
 
 
10
  if not text.strip():
11
+ return "Please enter text for speech generation.", None
 
 
 
12
 
13
  try:
14
  device = "cuda" if torch.cuda.is_available() else "cpu"
15
  print(f"Using device: {device}")
16
 
17
+ # --- This is the key part to load a specific model ---
18
+ if model_choice not in MODELS:
19
+ return f"Model '{model_choice}' not found.", None
20
+
21
+ selected_model = MODELS[model_choice]
22
+
23
+ # --- This part handles speakers ---
24
+ cfg = selected_model[1] # Model config
25
  speaker_map = cfg.get('speaker_id', {}) if cfg is not None else {}
26
  if speaker_display and speaker_map:
27
  speaker_id = speaker_map.get(speaker_display)
28
  else:
29
  speaker_id = None
30
+
31
  print(f"Generating speech with {model_choice}...")
 
32
 
33
+ # --- Use the specific part of the model for generation ---
34
+ model_to_generate = selected_model[0]
35
+ audio, _, time_report = model_to_generate.run_model(
36
+ text=text,
37
+ speaker_id=speaker_id,
38
+ temperature=0.7,
39
+ repetition_penalty=1.2,
40
+ max_tokens=1024
41
+ )
42
+
43
+ sample_rate = 22050
44
  print("Speech generation completed!")
45
 
46
+ return (sample_rate, audio), time_report
47
+
48
+ def load_models():
49
+ global MODELS
50
+ if not MODELS:
51
+ print("Loading models into GPU memory...")
52
+ from transformers import AutoModel
53
+ model_path = MODEL_ID
54
+
55
+ # Load both the main model and its config
56
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
57
+ config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
58
+
59
+ MODELS = {
60
+ "Kani TTS 370M": (model, config)
61
+ }
62
 
63
+ print(f"Models loaded. Available speakers: {list(config.speaker_id.keys()) if config.speaker_id else []}")
64
+ return MODELS
 
65
 
66
+ # --- Gradio interface setup ---
67
+ MODELS = load_models()
68
+
69
+ with gr.Blocks(title="😻 KaniTTS - Text to Speech") as demo:
70
  gr.Markdown("# 😻 KaniTTS: Fast and Expressive Speech Generation Model")
 
71
 
72
+ model_dropdown = gr.Dropdown(
73
+ choices=list(MODELS.keys()),
74
+ value=list(MODELS.keys())[0],
75
+ label="Selected Model"
76
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
+ # --- Speaker selector (populated on model load) ---
79
+ all_speakers = list(MODELS[list(MODELS.keys())[0]][1].speaker_id.keys()) if MODELS and MODELS[list(MODELS.keys())[0]][1] and MODELS[list(MODELS.keys())[0]][1].speaker_id else []
80
+ speaker_dropdown = gr.Dropdown(
81
+ choices=all_speakers,
82
+ value=None,
83
+ label="Speaker",
84
+ visible=True,
85
+ allow_custom_value=True
 
 
 
 
 
86
  )
87
+
88
+ text_input = gr.Textbox(label="Text", lines=5)
89
+
90
+ generate_btn = gr.Button("Generate Speech", variant="primary")
91
+
92
+ audio_output = gr.Audio(label="Generated Audio", type="numpy")
93
+
94
+ # --- Event handlers ---
95
+ model_dropdown.change(
96
+ fn=lambda choice: gr.update(choices=list(MODELS[choice][1].speaker_id.keys()), value=None, visible=True) if MODELS and MODELS[choice][1].speaker_id else gr.update(visible=False),
97
  inputs=[model_dropdown],
98
  outputs=[speaker_dropdown]
99
  )
100
+
 
101
  generate_btn.click(
102
+ fn=generate_speech,
103
+ inputs=[text_input, model_dropdown, speaker_dropdown],
104
+ outputs=[audio_output]
105
  )
106
 
107
+ # --- This is the API enabling line ---
108
+ demo.queue().launch(show_api=True)