Update app.py
Browse files
app.py
CHANGED
|
@@ -14,6 +14,9 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
| 14 |
generator = load_miso_8b(device=device, model_path_or_repo_id="MisoLabs/MisoTTS")
|
| 15 |
SAMPLE_RATE = generator.sample_rate
|
| 16 |
|
|
|
|
|
|
|
|
|
|
| 17 |
MAX_INPUT_CHARS = 1000
|
| 18 |
|
| 19 |
DESCRIPTION = """
|
|
@@ -22,9 +25,6 @@ DESCRIPTION = """
|
|
| 22 |
Text-to-speech with the [MisoLabs/MisoTTS](https://huggingface.co/MisoLabs/MisoTTS) model — an
|
| 23 |
8B [Sesame CSM](https://github.com/SesameAILabs/csm)-style model that generates Mimi audio codes
|
| 24 |
from text, with optional voice continuation from a reference clip.
|
| 25 |
-
|
| 26 |
-
Provide a reference audio + its transcript to clone a voice, or leave them empty for a default voice.
|
| 27 |
-
Outputs carry an imperceptible watermark identifying the audio as AI-generated.
|
| 28 |
"""
|
| 29 |
|
| 30 |
|
|
@@ -43,12 +43,6 @@ def synthesize(text, ref_audio_path, ref_text, speaker_id, max_length_ms, temper
|
|
| 43 |
if len(text) > MAX_INPUT_CHARS:
|
| 44 |
raise gr.Error(f"Text too long (>{MAX_INPUT_CHARS} characters).")
|
| 45 |
|
| 46 |
-
# ZeroGPU streams weights to the real GPU on first entry but leaves the torchtune
|
| 47 |
-
# KV-cache's non-persistent buffers (e.g. cache_pos) behind, causing a cuda/cpu
|
| 48 |
-
# device mismatch. Re-place the model on the device here, inside the GPU worker.
|
| 49 |
-
generator._model.to(device)
|
| 50 |
-
generator._audio_tokenizer.to(device)
|
| 51 |
-
|
| 52 |
context = []
|
| 53 |
if ref_audio_path:
|
| 54 |
if not (ref_text or "").strip():
|
|
|
|
| 14 |
generator = load_miso_8b(device=device, model_path_or_repo_id="MisoLabs/MisoTTS")
|
| 15 |
SAMPLE_RATE = generator.sample_rate
|
| 16 |
|
| 17 |
+
generator._model.to(device)
|
| 18 |
+
generator._audio_tokenizer.to(device)
|
| 19 |
+
|
| 20 |
MAX_INPUT_CHARS = 1000
|
| 21 |
|
| 22 |
DESCRIPTION = """
|
|
|
|
| 25 |
Text-to-speech with the [MisoLabs/MisoTTS](https://huggingface.co/MisoLabs/MisoTTS) model — an
|
| 26 |
8B [Sesame CSM](https://github.com/SesameAILabs/csm)-style model that generates Mimi audio codes
|
| 27 |
from text, with optional voice continuation from a reference clip.
|
|
|
|
|
|
|
|
|
|
| 28 |
"""
|
| 29 |
|
| 30 |
|
|
|
|
| 43 |
if len(text) > MAX_INPUT_CHARS:
|
| 44 |
raise gr.Error(f"Text too long (>{MAX_INPUT_CHARS} characters).")
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
context = []
|
| 47 |
if ref_audio_path:
|
| 48 |
if not (ref_text or "").strip():
|