enable settings
Browse files
app.py
CHANGED
|
@@ -50,7 +50,6 @@ models_configs = {
|
|
| 50 |
|
| 51 |
# Global variables for models (loaded once)
|
| 52 |
player = NemoAudioPlayer(Config())
|
| 53 |
-
demo_examples = Demo()()
|
| 54 |
models = {}
|
| 55 |
for model_name, config in models_configs.items():
|
| 56 |
print(f"Loading {model_name}...")
|
|
@@ -60,7 +59,7 @@ print("All models loaded!")
|
|
| 60 |
|
| 61 |
|
| 62 |
@spaces.GPU
|
| 63 |
-
def generate_speech_gpu(text, model_choice,
|
| 64 |
"""
|
| 65 |
Generate speech from text using the selected model on GPU
|
| 66 |
"""
|
|
@@ -81,7 +80,7 @@ def generate_speech_gpu(text, model_choice, temperature, top_p, repetition_penal
|
|
| 81 |
|
| 82 |
# Generate audio
|
| 83 |
print(f"Generating speech with {model_choice}...")
|
| 84 |
-
audio, _, time_report = selected_model.run_model(text)
|
| 85 |
|
| 86 |
sample_rate = 22050
|
| 87 |
print("Speech generation completed!")
|
|
@@ -94,8 +93,8 @@ def generate_speech_gpu(text, model_choice, temperature, top_p, repetition_penal
|
|
| 94 |
|
| 95 |
# Create Gradio interface
|
| 96 |
with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
|
| 97 |
-
gr.Markdown("# KaniTTS: Fast and Expressive Speech Generation Model")
|
| 98 |
-
gr.Markdown("Select a model and enter text to generate
|
| 99 |
|
| 100 |
with gr.Row():
|
| 101 |
with gr.Column(scale=1):
|
|
@@ -154,10 +153,6 @@ with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default())
|
|
| 154 |
outputs=[audio_output, time_report_output]
|
| 155 |
)
|
| 156 |
|
| 157 |
-
|
| 158 |
-
def play_demo(text):
|
| 159 |
-
return (22050, demo_examples[text]), 'DEMO'
|
| 160 |
-
|
| 161 |
with gr.Row():
|
| 162 |
|
| 163 |
examples = [
|
|
|
|
| 50 |
|
| 51 |
# Global variables for models (loaded once)
|
| 52 |
player = NemoAudioPlayer(Config())
|
|
|
|
| 53 |
models = {}
|
| 54 |
for model_name, config in models_configs.items():
|
| 55 |
print(f"Loading {model_name}...")
|
|
|
|
| 59 |
|
| 60 |
|
| 61 |
@spaces.GPU
|
| 62 |
+
def generate_speech_gpu(text, model_choice, t, top_p, rp, max_tok):
|
| 63 |
"""
|
| 64 |
Generate speech from text using the selected model on GPU
|
| 65 |
"""
|
|
|
|
| 80 |
|
| 81 |
# Generate audio
|
| 82 |
print(f"Generating speech with {model_choice}...")
|
| 83 |
+
audio, _, time_report = selected_model.run_model(text, t, top_p, rp, max_tok)
|
| 84 |
|
| 85 |
sample_rate = 22050
|
| 86 |
print("Speech generation completed!")
|
|
|
|
| 93 |
|
| 94 |
# Create Gradio interface
|
| 95 |
with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
|
| 96 |
+
gr.Markdown("# 😻 KaniTTS: Fast and Expressive Speech Generation Model")
|
| 97 |
+
gr.Markdown("Select a model and enter text to generate emotional speech")
|
| 98 |
|
| 99 |
with gr.Row():
|
| 100 |
with gr.Column(scale=1):
|
|
|
|
| 153 |
outputs=[audio_output, time_report_output]
|
| 154 |
)
|
| 155 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
with gr.Row():
|
| 157 |
|
| 158 |
examples = [
|
util.py
CHANGED
|
@@ -173,7 +173,14 @@ class KaniModel:
|
|
| 173 |
attention_mask = torch.ones(1, modified_input_ids.shape[1], dtype=torch.int64)
|
| 174 |
return modified_input_ids, attention_mask
|
| 175 |
|
| 176 |
-
def model_request(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
"""Generate tokens using the model"""
|
| 178 |
input_ids = input_ids.to(self.device)
|
| 179 |
attention_mask = attention_mask.to(self.device)
|
|
@@ -182,11 +189,11 @@ class KaniModel:
|
|
| 182 |
generated_ids = self.model.generate(
|
| 183 |
input_ids=input_ids,
|
| 184 |
attention_mask=attention_mask,
|
| 185 |
-
max_new_tokens=
|
| 186 |
do_sample=True,
|
| 187 |
-
temperature=
|
| 188 |
-
top_p=
|
| 189 |
-
repetition_penalty=
|
| 190 |
num_return_sequences=1,
|
| 191 |
eos_token_id=self.player.end_of_speech,
|
| 192 |
pad_token_id=self.tokenizer.pad_token_id if self.tokenizer.pad_token_id else self.tokenizer.eos_token_id
|
|
@@ -200,14 +207,14 @@ class KaniModel:
|
|
| 200 |
report = f"SPEECH TOKENS: {model_request:.2f}\nCODEC: {player_time:.2f}\nTOTAL: {total_time:.2f}"
|
| 201 |
return report
|
| 202 |
|
| 203 |
-
def run_model(self, text: str):
|
| 204 |
"""Complete pipeline: text -> tokens -> generation -> audio"""
|
| 205 |
# Prepare input
|
| 206 |
input_ids, attention_mask = self.get_input_ids(text)
|
| 207 |
|
| 208 |
# Generate tokens
|
| 209 |
point_1 = time.time()
|
| 210 |
-
model_output = self.model_request(input_ids, attention_mask)
|
| 211 |
|
| 212 |
# Convert to audio
|
| 213 |
point_2 = time.time()
|
|
|
|
| 173 |
attention_mask = torch.ones(1, modified_input_ids.shape[1], dtype=torch.int64)
|
| 174 |
return modified_input_ids, attention_mask
|
| 175 |
|
| 176 |
+
def model_request(
|
| 177 |
+
self,
|
| 178 |
+
input_ids: torch.tensor,
|
| 179 |
+
attention_mask: torch.tensor,
|
| 180 |
+
t:float,
|
| 181 |
+
top_p:float,
|
| 182 |
+
rp: float,
|
| 183 |
+
max_tok: int) -> torch.tensor:
|
| 184 |
"""Generate tokens using the model"""
|
| 185 |
input_ids = input_ids.to(self.device)
|
| 186 |
attention_mask = attention_mask.to(self.device)
|
|
|
|
| 189 |
generated_ids = self.model.generate(
|
| 190 |
input_ids=input_ids,
|
| 191 |
attention_mask=attention_mask,
|
| 192 |
+
max_new_tokens=max_tok,
|
| 193 |
do_sample=True,
|
| 194 |
+
temperature=t,
|
| 195 |
+
top_p=top_p,
|
| 196 |
+
repetition_penalty=rp,
|
| 197 |
num_return_sequences=1,
|
| 198 |
eos_token_id=self.player.end_of_speech,
|
| 199 |
pad_token_id=self.tokenizer.pad_token_id if self.tokenizer.pad_token_id else self.tokenizer.eos_token_id
|
|
|
|
| 207 |
report = f"SPEECH TOKENS: {model_request:.2f}\nCODEC: {player_time:.2f}\nTOTAL: {total_time:.2f}"
|
| 208 |
return report
|
| 209 |
|
| 210 |
+
def run_model(self, text: str, t: float, top_p: float, rp: float, max_tok: int):
|
| 211 |
"""Complete pipeline: text -> tokens -> generation -> audio"""
|
| 212 |
# Prepare input
|
| 213 |
input_ids, attention_mask = self.get_input_ids(text)
|
| 214 |
|
| 215 |
# Generate tokens
|
| 216 |
point_1 = time.time()
|
| 217 |
+
model_output = self.model_request(input_ids, attention_mask, t, top_p, rp, max_tok)
|
| 218 |
|
| 219 |
# Convert to audio
|
| 220 |
point_2 = time.time()
|