alexue4 committed on
Commit
7e241f9
·
verified ·
1 Parent(s): ae44b9c

Update src/chatterbox/models/t3/t3.py

Browse files
Files changed (1) hide show
  1. src/chatterbox/models/t3/t3.py +21 -17
src/chatterbox/models/t3/t3.py CHANGED
@@ -286,21 +286,21 @@ class T3(nn.Module):
286
  logger.warning(f"t3.inference: patch/compile backend took {time.perf_counter() - compile_start:.4f}s (compiled={self.compiled})")
287
 
288
  # # Run normal generate method, which calls our custom extended methods
289
- return self.patched_model.generate(
290
- inputs=initial_speech_tokens,
291
- decoder_cond=embeds,
292
- bos_token_id=self.hp.start_speech_token,
293
- eos_token_id=(self.hp.stop_speech_token if stop_on_eos else -1),
294
- pad_token_id=self.hp.stop_speech_token,
295
- max_new_tokens=max_new_tokens or self.hp.max_speech_tokens,
296
- num_return_sequences=num_return_sequences,
297
- temperature=temperature,
298
- min_p=min_p,
299
- length_penalty=length_penalty,
300
- repetition_penalty=repetition_penalty,
301
- do_sample=do_sample,
302
- # cache_implementation=None if not self.compiled else "static",
303
- )
304
 
305
  device = embeds.device
306
 
@@ -371,8 +371,12 @@ class T3(nn.Module):
371
  # # Convert logits to probabilities and sample the next token.
372
  # probs = torch.softmax(logits, dim=-1)
373
  # next_token = torch.multinomial(probs, num_samples=1) # shape: (B, 1)
374
- gumbel = -torch.log(-torch.log(torch.rand_like(logits)))
375
- next_token = (logits / temperature + gumbel).argmax(dim=-1)
 
 
 
 
376
  next_token = next_token.unsqueeze(-1)
377
  step_sampling_total += (time.perf_counter() - step_t0)
378
 
 
286
  logger.warning(f"t3.inference: patch/compile backend took {time.perf_counter() - compile_start:.4f}s (compiled={self.compiled})")
287
 
288
  # # Run normal generate method, which calls our custom extended methods
289
+ # return self.patched_model.generate(
290
+ # inputs=initial_speech_tokens,
291
+ # decoder_cond=embeds,
292
+ # bos_token_id=self.hp.start_speech_token,
293
+ # eos_token_id=(self.hp.stop_speech_token if stop_on_eos else -1),
294
+ # pad_token_id=self.hp.stop_speech_token,
295
+ # max_new_tokens=max_new_tokens or self.hp.max_speech_tokens,
296
+ # num_return_sequences=num_return_sequences,
297
+ # temperature=temperature,
298
+ # min_p=min_p,
299
+ # length_penalty=length_penalty,
300
+ # repetition_penalty=repetition_penalty,
301
+ # do_sample=do_sample,
302
+ # # cache_implementation=None if not self.compiled else "static",
303
+ # )
304
 
305
  device = embeds.device
306
 
 
371
  # # Convert logits to probabilities and sample the next token.
372
  # probs = torch.softmax(logits, dim=-1)
373
  # next_token = torch.multinomial(probs, num_samples=1) # shape: (B, 1)
374
+ top_k = 50
375
+ vals, idx = logits.topk(top_k, dim=-1)
376
+ masked = torch.full_like(logits, float('-inf'))
377
+ masked.scatter_(1, idx, vals)
378
+ g = -torch.log(-torch.log(torch.rand_like(masked)))
379
+ next_token = ((masked / temperature) + g).argmax(dim=-1)
380
  next_token = next_token.unsqueeze(-1)
381
  step_sampling_total += (time.perf_counter() - step_t0)
382