Spaces:

Swagcrew
/

fish-quant-samples

Runtime error

App Files Files Community

Swagcrew committed 21 days ago

Commit

3a38d65

verified ·

1 Parent(s): ecf804f

Upload gen_samples.py with huggingface_hub

Browse files

Files changed (1) hide show

gen_samples.py +63 -59

gen_samples.py CHANGED Viewed

@@ -1,26 +1,17 @@
 #!/usr/bin/env python3
-"""Generate voice clone samples using fish-speech's generate_long API."""
-import os, sys, json, time, gc, traceback
 import torch
-import torchaudio
-import soundfile as sf
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 os.environ["HF_HOME"] = "/tmp/hf_cache"
 sys.path.insert(0, "/app/fish-speech")
-DEVICE = "cuda"
-DTYPE = torch.bfloat16
-REF_TEXT = "Let me get this straight. You think that your client, one of the wealthiest most powerful men in the world, is secretly a vigilante who spends his nights beating criminals to a pulp with his bare hands. And your plan is to blackmail this person."
 GEN_TEXT = "Every man's life ends the same way. It is only the details of how he lived that distinguish one man from another."
 OUT = "/tmp/samples"
 os.makedirs(OUT, exist_ok=True)
-from fish_speech.models.text2semantic.inference import (
-    init_model, load_codec_model, encode_audio, generate_long
-)
 MODELS = [
     ("baseline_bf16", "fishaudio/s2-pro"),
     ("fp8", "drbaph/s2-pro-fp8"),
@@ -30,8 +21,6 @@ def main():
     print(f"=== Fish Speech Voice Clone Sample Generator ===")
     print(f"GPU: {torch.cuda.get_device_name(0)}, VRAM: {torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB")
-    ref_path = "/app/reference/morgan_ref.wav"
     for name, model_id in MODELS:
         print(f"\n{'='*60}")
         print(f"  {name.upper()} ({model_id})")
@@ -43,52 +32,67 @@ def main():
             from huggingface_hub import snapshot_download
             snapshot_download(model_id, local_dir=local_dir, token=os.environ.get("HF_TOKEN"))
-        print(f"  Loading model...")
-        model, decode_fn = init_model(local_dir, DEVICE, DTYPE, compile=False)
-        codec = load_codec_model(f"{local_dir}/codec.pth", DEVICE, DTYPE)
-        with torch.device(DEVICE):
-            model.setup_caches(max_batch_size=1, max_seq_len=model.config.max_seq_len, dtype=DTYPE)
-        print(f"  Encoding reference audio...")
-        prompt_tokens = encode_audio(ref_path, codec, DEVICE).cpu()
-        print(f"  Generating voice clone...")
         out_path = f"{OUT}/fish_{name}_morgan_clone.wav"
-        try:
-            for response in generate_long(
-                model=model,
-                device=DEVICE,
-                decode_one_token=decode_fn,
-                text=GEN_TEXT,
-                max_new_tokens=1024,
-                top_p=0.7,
-                top_k=30,
-                temperature=0.7,
-                repetition_penalty=1.1,
-                compile=False,
-                iterative_prompt=False,
-                chunk_length=0,
-                prompt_text=[REF_TEXT],
-                prompt_tokens=[prompt_tokens],
-            ):
-                if response.action == "sample":
-                    codes = response.codes
-                    with torch.no_grad():
-                        with torch.amp.autocast(device_type="cuda", dtype=DTYPE):
-                            audio = codec.decode(codes.unsqueeze(0).to(DEVICE))
-                    np_audio = audio.squeeze().cpu().float().numpy()
-                    sr = getattr(codec, 'sample_rate', 44100)
-                    sf.write(out_path, np_audio, sr)
-                    dur = len(np_audio) / sr
-                    print(f"  ✅ Saved {out_path} ({dur:.1f}s)")
-        except Exception as e:
-            print(f"  ❌ FAILED: {e}")
-            traceback.print_exc()
-        del model, codec
         gc.collect()
         torch.cuda.empty_cache()
@@ -100,7 +104,7 @@ def main():
         from huggingface_hub import HfApi
         api = HfApi()
         repo = "Swagcrew/fish-speech-s2-quantized"
-        for fn in os.listdir(OUT):
             if fn.endswith(".wav"):
                 api.upload_file(
                     path_or_fileobj=os.path.join(OUT, fn),

 #!/usr/bin/env python3
+"""Generate voice clone samples using fish-speech CLI."""
+import os, sys, json, time, gc, traceback, subprocess
 import torch
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 os.environ["HF_HOME"] = "/tmp/hf_cache"
 sys.path.insert(0, "/app/fish-speech")
 GEN_TEXT = "Every man's life ends the same way. It is only the details of how he lived that distinguish one man from another."
+REF_TEXT = "Let me get this straight. You think that your client, one of the wealthiest most powerful men in the world, is secretly a vigilante who spends his nights beating criminals to a pulp with his bare hands. And your plan is to blackmail this person."
 OUT = "/tmp/samples"
 os.makedirs(OUT, exist_ok=True)
 MODELS = [
     ("baseline_bf16", "fishaudio/s2-pro"),
     ("fp8", "drbaph/s2-pro-fp8"),
     print(f"=== Fish Speech Voice Clone Sample Generator ===")
     print(f"GPU: {torch.cuda.get_device_name(0)}, VRAM: {torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB")
     for name, model_id in MODELS:
         print(f"\n{'='*60}")
         print(f"  {name.upper()} ({model_id})")
             from huggingface_hub import snapshot_download
             snapshot_download(model_id, local_dir=local_dir, token=os.environ.get("HF_TOKEN"))
         out_path = f"{OUT}/fish_{name}_morgan_clone.wav"
+        # Step 1: Generate semantic tokens using the CLI
+        semantic_dir = f"{OUT}/{name}_semantic"
+        os.makedirs(semantic_dir, exist_ok=True)
+        cmd = [
+            sys.executable, "-m", "fish_speech.models.text2semantic.inference",
+            "--text", f"<|speaker:0|>{GEN_TEXT}",
+            "--prompt-audio", "/app/reference/morgan_ref.wav",
+            "--prompt-text", REF_TEXT,
+            "--checkpoint-path", local_dir,
+            "--output-dir", semantic_dir,
+            "--num-samples", "1",
+            "--max-new-tokens", "1024",
+            "--top-p", "0.7",
+            "--top-k", "30",
+            "--temperature", "0.7",
+            "--no-iterative-prompt",
+            "--chunk-length", "0",
+            "--device", "cuda",
+        ]
+        print(f"  Generating semantic tokens...")
+        env = {**os.environ, "PYTHONPATH": "/app/fish-speech"}
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, env=env)
+        print(f"  CLI stdout (last 500): {result.stdout[-500:]}")
+        if result.stderr:
+            print(f"  CLI stderr (last 500): {result.stderr[-500:]}")
+        # Find generated .pt files
+        pt_files = [f for f in os.listdir(semantic_dir) if f.endswith('.pt')]
+        if not pt_files:
+            print(f"  ❌ No .pt files generated")
+            continue
+        print(f"  Generated {len(pt_files)} semantic files")
+        # Step 2: Decode semantic tokens to audio using codec
+        import torchaudio
+        import soundfile as sf
+        from fish_speech.models.text2semantic.inference import load_codec_model
+        codec = load_codec_model(f"{local_dir}/codec.pth", "cuda", torch.bfloat16)
+        for pt_file in pt_files:
+            codes = torch.load(os.path.join(semantic_dir, pt_file), map_location="cuda", weights_only=True)
+            print(f"  Decoding {pt_file}, codes shape: {codes.shape}")
+            with torch.no_grad():
+                with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+                    audio = codec.decode(codes.unsqueeze(0))
+            np_audio = audio.squeeze().cpu().float().numpy()
+            sr = getattr(codec, 'sample_rate', 44100)
+            sf.write(out_path, np_audio, sr)
+            dur = len(np_audio) / sr
+            print(f"  ✅ Saved {out_path} ({dur:.1f}s)")
+        del codec
         gc.collect()
         torch.cuda.empty_cache()
         from huggingface_hub import HfApi
         api = HfApi()
         repo = "Swagcrew/fish-speech-s2-quantized"
+        for fn in sorted(os.listdir(OUT)):
             if fn.endswith(".wav"):
                 api.upload_file(
                     path_or_fileobj=os.path.join(OUT, fn),