ylankgz committed on
Commit
088ca61
·
1 Parent(s): ad693da

enable settings

Browse files
Files changed (2) hide show
  1. app.py +4 -9
  2. util.py +14 -7
app.py CHANGED
@@ -50,7 +50,6 @@ models_configs = {
50
 
51
  # Global variables for models (loaded once)
52
  player = NemoAudioPlayer(Config())
53
- demo_examples = Demo()()
54
  models = {}
55
  for model_name, config in models_configs.items():
56
  print(f"Loading {model_name}...")
@@ -60,7 +59,7 @@ print("All models loaded!")
60
 
61
 
62
  @spaces.GPU
63
- def generate_speech_gpu(text, model_choice, temperature, top_p, repetition_penalty, max_new_tokens):
64
  """
65
  Generate speech from text using the selected model on GPU
66
  """
@@ -81,7 +80,7 @@ def generate_speech_gpu(text, model_choice, temperature, top_p, repetition_penal
81
 
82
  # Generate audio
83
  print(f"Generating speech with {model_choice}...")
84
- audio, _, time_report = selected_model.run_model(text)
85
 
86
  sample_rate = 22050
87
  print("Speech generation completed!")
@@ -94,8 +93,8 @@ def generate_speech_gpu(text, model_choice, temperature, top_p, repetition_penal
94
 
95
  # Create Gradio interface
96
  with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
97
- gr.Markdown("# KaniTTS: Fast and Expressive Speech Generation Model")
98
- gr.Markdown("Select a model and enter text to generate high-quality speech")
99
 
100
  with gr.Row():
101
  with gr.Column(scale=1):
@@ -154,10 +153,6 @@ with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default())
154
  outputs=[audio_output, time_report_output]
155
  )
156
 
157
-
158
- def play_demo(text):
159
- return (22050, demo_examples[text]), 'DEMO'
160
-
161
  with gr.Row():
162
 
163
  examples = [
 
50
 
51
  # Global variables for models (loaded once)
52
  player = NemoAudioPlayer(Config())
 
53
  models = {}
54
  for model_name, config in models_configs.items():
55
  print(f"Loading {model_name}...")
 
59
 
60
 
61
  @spaces.GPU
62
+ def generate_speech_gpu(text, model_choice, t, top_p, rp, max_tok):
63
  """
64
  Generate speech from text using the selected model on GPU
65
  """
 
80
 
81
  # Generate audio
82
  print(f"Generating speech with {model_choice}...")
83
+ audio, _, time_report = selected_model.run_model(text, t, top_p, rp, max_tok)
84
 
85
  sample_rate = 22050
86
  print("Speech generation completed!")
 
93
 
94
  # Create Gradio interface
95
  with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
96
+ gr.Markdown("# 😻 KaniTTS: Fast and Expressive Speech Generation Model")
97
+ gr.Markdown("Select a model and enter text to generate emotional speech")
98
 
99
  with gr.Row():
100
  with gr.Column(scale=1):
 
153
  outputs=[audio_output, time_report_output]
154
  )
155
 
 
 
 
 
156
  with gr.Row():
157
 
158
  examples = [
util.py CHANGED
@@ -173,7 +173,14 @@ class KaniModel:
173
  attention_mask = torch.ones(1, modified_input_ids.shape[1], dtype=torch.int64)
174
  return modified_input_ids, attention_mask
175
 
176
- def model_request(self, input_ids: torch.tensor, attention_mask: torch.tensor) -> torch.tensor:
 
 
 
 
 
 
 
177
  """Generate tokens using the model"""
178
  input_ids = input_ids.to(self.device)
179
  attention_mask = attention_mask.to(self.device)
@@ -182,11 +189,11 @@ class KaniModel:
182
  generated_ids = self.model.generate(
183
  input_ids=input_ids,
184
  attention_mask=attention_mask,
185
- max_new_tokens=self.conf.max_new_tokens,
186
  do_sample=True,
187
- temperature=self.conf.temperature,
188
- top_p=self.conf.top_p,
189
- repetition_penalty=self.conf.repetition_penalty,
190
  num_return_sequences=1,
191
  eos_token_id=self.player.end_of_speech,
192
  pad_token_id=self.tokenizer.pad_token_id if self.tokenizer.pad_token_id else self.tokenizer.eos_token_id
@@ -200,14 +207,14 @@ class KaniModel:
200
  report = f"SPEECH TOKENS: {model_request:.2f}\nCODEC: {player_time:.2f}\nTOTAL: {total_time:.2f}"
201
  return report
202
 
203
- def run_model(self, text: str):
204
  """Complete pipeline: text -> tokens -> generation -> audio"""
205
  # Prepare input
206
  input_ids, attention_mask = self.get_input_ids(text)
207
 
208
  # Generate tokens
209
  point_1 = time.time()
210
- model_output = self.model_request(input_ids, attention_mask)
211
 
212
  # Convert to audio
213
  point_2 = time.time()
 
173
  attention_mask = torch.ones(1, modified_input_ids.shape[1], dtype=torch.int64)
174
  return modified_input_ids, attention_mask
175
 
176
+ def model_request(
177
+ self,
178
+ input_ids: torch.tensor,
179
+ attention_mask: torch.tensor,
180
+ t:float,
181
+ top_p:float,
182
+ rp: float,
183
+ max_tok: int) -> torch.tensor:
184
  """Generate tokens using the model"""
185
  input_ids = input_ids.to(self.device)
186
  attention_mask = attention_mask.to(self.device)
 
189
  generated_ids = self.model.generate(
190
  input_ids=input_ids,
191
  attention_mask=attention_mask,
192
+ max_new_tokens=max_tok,
193
  do_sample=True,
194
+ temperature=t,
195
+ top_p=top_p,
196
+ repetition_penalty=rp,
197
  num_return_sequences=1,
198
  eos_token_id=self.player.end_of_speech,
199
  pad_token_id=self.tokenizer.pad_token_id if self.tokenizer.pad_token_id else self.tokenizer.eos_token_id
 
207
  report = f"SPEECH TOKENS: {model_request:.2f}\nCODEC: {player_time:.2f}\nTOTAL: {total_time:.2f}"
208
  return report
209
 
210
+ def run_model(self, text: str, t: float, top_p: float, rp: float, max_tok: int):
211
  """Complete pipeline: text -> tokens -> generation -> audio"""
212
  # Prepare input
213
  input_ids, attention_mask = self.get_input_ids(text)
214
 
215
  # Generate tokens
216
  point_1 = time.time()
217
+ model_output = self.model_request(input_ids, attention_mask, t, top_p, rp, max_tok)
218
 
219
  # Convert to audio
220
  point_2 = time.time()