KaniTTS

Sleeping

App Files Community

ylankgz committed on Sep 18

Commit

088ca61

1 Parent(s): ad693da

enable settings

Browse files

Files changed (2) hide show

app.py +4 -9
util.py +14 -7

app.py CHANGED Viewed

@@ -50,7 +50,6 @@ models_configs = {
 # Global variables for models (loaded once)
 player = NemoAudioPlayer(Config())
-demo_examples = Demo()()
 models = {}
 for model_name, config in models_configs.items():
     print(f"Loading {model_name}...")
@@ -60,7 +59,7 @@ print("All models loaded!")
 @spaces.GPU
-def generate_speech_gpu(text, model_choice, temperature, top_p, repetition_penalty, max_new_tokens):
     """
     Generate speech from text using the selected model on GPU
     """
@@ -81,7 +80,7 @@ def generate_speech_gpu(text, model_choice, temperature, top_p, repetition_penal
         # Generate audio
         print(f"Generating speech with {model_choice}...")
-        audio, _, time_report = selected_model.run_model(text)
         sample_rate = 22050
         print("Speech generation completed!")
@@ -94,8 +93,8 @@ def generate_speech_gpu(text, model_choice, temperature, top_p, repetition_penal
 # Create Gradio interface
 with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
-    gr.Markdown("# KaniTTS: Fast and Expressive Speech Generation Model")
-    gr.Markdown("Select a model and enter text to generate high-quality speech")
     with gr.Row():
         with gr.Column(scale=1):
@@ -154,10 +153,6 @@ with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default())
         outputs=[audio_output, time_report_output]
     )
-    def play_demo(text):
-        return (22050, demo_examples[text]), 'DEMO'
     with gr.Row():
         examples = [

 # Global variables for models (loaded once)
 player = NemoAudioPlayer(Config())
 models = {}
 for model_name, config in models_configs.items():
     print(f"Loading {model_name}...")
 @spaces.GPU
+def generate_speech_gpu(text, model_choice, t, top_p, rp, max_tok):
     """
     Generate speech from text using the selected model on GPU
     """
         # Generate audio
         print(f"Generating speech with {model_choice}...")
+        audio, _, time_report = selected_model.run_model(text, t, top_p, rp, max_tok)
         sample_rate = 22050
         print("Speech generation completed!")
 # Create Gradio interface
 with gr.Blocks(title="😻 KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
+    gr.Markdown("# 😻 KaniTTS: Fast and Expressive Speech Generation Model")
+    gr.Markdown("Select a model and enter text to generate emotional speech")
     with gr.Row():
         with gr.Column(scale=1):
         outputs=[audio_output, time_report_output]
     )
     with gr.Row():
         examples = [

util.py CHANGED Viewed

@@ -173,7 +173,14 @@ class KaniModel:
         attention_mask = torch.ones(1, modified_input_ids.shape[1], dtype=torch.int64)
         return modified_input_ids, attention_mask
-    def model_request(self, input_ids: torch.tensor, attention_mask: torch.tensor) -> torch.tensor:
         """Generate tokens using the model"""
         input_ids = input_ids.to(self.device)
         attention_mask = attention_mask.to(self.device)
@@ -182,11 +189,11 @@ class KaniModel:
             generated_ids = self.model.generate(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
-                max_new_tokens=self.conf.max_new_tokens,
                 do_sample=True,
-                temperature=self.conf.temperature,
-                top_p=self.conf.top_p,
-                repetition_penalty=self.conf.repetition_penalty,
                 num_return_sequences=1,
                 eos_token_id=self.player.end_of_speech,
                 pad_token_id=self.tokenizer.pad_token_id if self.tokenizer.pad_token_id else self.tokenizer.eos_token_id
@@ -200,14 +207,14 @@ class KaniModel:
         report = f"SPEECH TOKENS: {model_request:.2f}\nCODEC: {player_time:.2f}\nTOTAL: {total_time:.2f}"
         return report
-    def run_model(self, text: str):
         """Complete pipeline: text -> tokens -> generation -> audio"""
         # Prepare input
         input_ids, attention_mask = self.get_input_ids(text)
         # Generate tokens
         point_1 = time.time()
-        model_output = self.model_request(input_ids, attention_mask)
         # Convert to audio
         point_2 = time.time()

         attention_mask = torch.ones(1, modified_input_ids.shape[1], dtype=torch.int64)
         return modified_input_ids, attention_mask
+    def model_request(
+            self,
+            input_ids: torch.tensor,
+            attention_mask: torch.tensor,
+            t:float,
+            top_p:float,
+            rp: float,
+            max_tok: int) -> torch.tensor:
         """Generate tokens using the model"""
         input_ids = input_ids.to(self.device)
         attention_mask = attention_mask.to(self.device)
             generated_ids = self.model.generate(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
+                max_new_tokens=max_tok,
                 do_sample=True,
+                temperature=t,
+                top_p=top_p,
+                repetition_penalty=rp,
                 num_return_sequences=1,
                 eos_token_id=self.player.end_of_speech,
                 pad_token_id=self.tokenizer.pad_token_id if self.tokenizer.pad_token_id else self.tokenizer.eos_token_id
         report = f"SPEECH TOKENS: {model_request:.2f}\nCODEC: {player_time:.2f}\nTOTAL: {total_time:.2f}"
         return report
+    def run_model(self, text: str, t: float, top_p: float, rp: float, max_tok: int):
         """Complete pipeline: text -> tokens -> generation -> audio"""
         # Prepare input
         input_ids, attention_mask = self.get_input_ids(text)
         # Generate tokens
         point_1 = time.time()
+        model_output = self.model_request(input_ids, attention_mask, t, top_p, rp, max_tok)
         # Convert to audio
         point_2 = time.time()