multimodalart HF Staff committed on
Commit
d84e6ad
·
verified ·
1 Parent(s): 50bb7e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -9
app.py CHANGED
@@ -14,6 +14,9 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
14
  generator = load_miso_8b(device=device, model_path_or_repo_id="MisoLabs/MisoTTS")
15
  SAMPLE_RATE = generator.sample_rate
16
 
 
 
 
17
  MAX_INPUT_CHARS = 1000
18
 
19
  DESCRIPTION = """
@@ -22,9 +25,6 @@ DESCRIPTION = """
22
  Text-to-speech with the [MisoLabs/MisoTTS](https://huggingface.co/MisoLabs/MisoTTS) model — an
23
  8B [Sesame CSM](https://github.com/SesameAILabs/csm)-style model that generates Mimi audio codes
24
  from text, with optional voice continuation from a reference clip.
25
-
26
- Provide a reference audio + its transcript to clone a voice, or leave them empty for a default voice.
27
- Outputs carry an imperceptible watermark identifying the audio as AI-generated.
28
  """
29
 
30
 
@@ -43,12 +43,6 @@ def synthesize(text, ref_audio_path, ref_text, speaker_id, max_length_ms, temper
43
  if len(text) > MAX_INPUT_CHARS:
44
  raise gr.Error(f"Text too long (>{MAX_INPUT_CHARS} characters).")
45
 
46
- # ZeroGPU streams weights to the real GPU on first entry but leaves the torchtune
47
- # KV-cache's non-persistent buffers (e.g. cache_pos) behind, causing a cuda/cpu
48
- # device mismatch. Re-place the model on the device here, inside the GPU worker.
49
- generator._model.to(device)
50
- generator._audio_tokenizer.to(device)
51
-
52
  context = []
53
  if ref_audio_path:
54
  if not (ref_text or "").strip():
 
14
  generator = load_miso_8b(device=device, model_path_or_repo_id="MisoLabs/MisoTTS")
15
  SAMPLE_RATE = generator.sample_rate
16
 
17
+ generator._model.to(device)
18
+ generator._audio_tokenizer.to(device)
19
+
20
  MAX_INPUT_CHARS = 1000
21
 
22
  DESCRIPTION = """
 
25
  Text-to-speech with the [MisoLabs/MisoTTS](https://huggingface.co/MisoLabs/MisoTTS) model — an
26
  8B [Sesame CSM](https://github.com/SesameAILabs/csm)-style model that generates Mimi audio codes
27
  from text, with optional voice continuation from a reference clip.
 
 
 
28
  """
29
 
30
 
 
43
  if len(text) > MAX_INPUT_CHARS:
44
  raise gr.Error(f"Text too long (>{MAX_INPUT_CHARS} characters).")
45
 
 
 
 
 
 
 
46
  context = []
47
  if ref_audio_path:
48
  if not (ref_text or "").strip():