MisoTTS

Runtime error

multimodalart HF Staff committed 4 days ago

Commit

d84e6ad

verified ·

1 Parent(s): 50bb7e8

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -14,6 +14,9 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 generator = load_miso_8b(device=device, model_path_or_repo_id="MisoLabs/MisoTTS")
 SAMPLE_RATE = generator.sample_rate
 MAX_INPUT_CHARS = 1000
 DESCRIPTION = """
@@ -22,9 +25,6 @@ DESCRIPTION = """
 Text-to-speech with the [MisoLabs/MisoTTS](https://huggingface.co/MisoLabs/MisoTTS) model — an
 8B [Sesame CSM](https://github.com/SesameAILabs/csm)-style model that generates Mimi audio codes
 from text, with optional voice continuation from a reference clip.
-Provide a reference audio + its transcript to clone a voice, or leave them empty for a default voice.
-Outputs carry an imperceptible watermark identifying the audio as AI-generated.
 """
@@ -43,12 +43,6 @@ def synthesize(text, ref_audio_path, ref_text, speaker_id, max_length_ms, temper
     if len(text) > MAX_INPUT_CHARS:
         raise gr.Error(f"Text too long (>{MAX_INPUT_CHARS} characters).")
-    # ZeroGPU streams weights to the real GPU on first entry but leaves the torchtune
-    # KV-cache's non-persistent buffers (e.g. cache_pos) behind, causing a cuda/cpu
-    # device mismatch. Re-place the model on the device here, inside the GPU worker.
-    generator._model.to(device)
-    generator._audio_tokenizer.to(device)
     context = []
     if ref_audio_path:
         if not (ref_text or "").strip():

 generator = load_miso_8b(device=device, model_path_or_repo_id="MisoLabs/MisoTTS")
 SAMPLE_RATE = generator.sample_rate
+generator._model.to(device)
+generator._audio_tokenizer.to(device)
 MAX_INPUT_CHARS = 1000
 DESCRIPTION = """
 Text-to-speech with the [MisoLabs/MisoTTS](https://huggingface.co/MisoLabs/MisoTTS) model — an
 8B [Sesame CSM](https://github.com/SesameAILabs/csm)-style model that generates Mimi audio codes
 from text, with optional voice continuation from a reference clip.
 """
     if len(text) > MAX_INPUT_CHARS:
         raise gr.Error(f"Text too long (>{MAX_INPUT_CHARS} characters).")
     context = []
     if ref_audio_path:
         if not (ref_text or "").strip():