Spaces:

Swagcrew
/

fish-quant-samples

Runtime error

App Files Files Community

Swagcrew committed 10 days ago

Commit

e2f85c6

verified ·

1 Parent(s): 14700d3

Upload gen_samples.py with huggingface_hub

Browse files

Files changed (1) hide show

gen_samples.py +60 -119

gen_samples.py CHANGED Viewed

@@ -1,176 +1,117 @@
 #!/usr/bin/env python3
-"""Generate voice clone samples from all quantized Fish Speech S2 Pro models."""
-import os, sys, json, time, gc, traceback, subprocess
 import torch
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 DEVICE = "cuda"
 DTYPE = torch.bfloat16
 REF_TEXT = "Let me get this straight. You think that your client, one of the wealthiest most powerful men in the world, is secretly a vigilante who spends his nights beating criminals to a pulp with his bare hands. And your plan is to blackmail this person."
 GEN_TEXT = "Every man's life ends the same way. It is only the details of how he lived that distinguish one man from another."
 OUT = "/tmp/samples"
-MODEL_DIR = "/tmp/models"
-print("=== Fish Speech Voice Clone Sample Generator ===")
-print(f"GPU: {torch.cuda.get_device_name(0)}, VRAM: {torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB")
 os.makedirs(OUT, exist_ok=True)
-os.makedirs(MODEL_DIR, exist_ok=True)
-# Setup fish-speech
-sys.path.insert(0, "/app/fish-speech")
-from fish_speech.models.text2semantic.inference import init_model, load_codec_model, generate, decode_one_token_ar
-from fish_speech.conversation import Conversation, Message
-from fish_speech.content_sequence import TextPart, VQPart
-import torchaudio, soundfile as sf
-def load_ref_audio(ref_path):
-    wav, sr = torchaudio.load(ref_path)
-    if wav.shape[0] > 1: wav = wav.mean(dim=0, keepdim=True)
-    if sr != 44100: wav = torchaudio.functional.resample(wav, sr, 44100)
-    return wav
-def encode_ref(codec, wav):
-    """Encode reference audio outside inference mode."""
-    import torch
-    # Exit inference mode temporarily for codec (it needs autograd for some ops)
-    wav_clone = wav.clone().detach().to(DEVICE).requires_grad_(False)
-    # Use torch.no_grad context instead of inference_mode
-    with torch.no_grad():
-        with torch.amp.autocast(device_type="cuda", dtype=DTYPE):
-            enc = codec.encode(wav_clone.unsqueeze(0))
-    tokens = enc[0] if isinstance(enc, tuple) else enc
-    # Squeeze batch dim: (1, num_codebooks, T) -> (num_codebooks, T)
-    if tokens.ndim == 3 and tokens.shape[0] == 1:
-        tokens = tokens.squeeze(0)
-    return tokens.detach()
-def generate_clone(model, codec, ref_tokens, ref_text, gen_text, out_path):
-    """Generate voice clone using the Conversation API correctly."""
-    conv = Conversation()
-    conv.append(Message(role="user", parts=[
-        VQPart(codes=ref_tokens.cpu()),
-        TextPart(text=ref_text)
-    ]))
-    conv.append(Message(role="assistant", parts=[TextPart(text=gen_text)]))
-    nc = model.config.num_codebooks
-    tokenizer = model.tokenizer
-    result = conv.encode_for_inference(tokenizer, nc)
-    # encode_for_inference returns (prompt_tensor,) or prompt_tensor
-    if isinstance(result, tuple):
-        prompt = result[0].to(DEVICE)
-    else:
-        prompt = result.to(DEVICE)
-    cd = 1 + nc
-    am = torch.zeros(1, cd, prompt.shape[-1], dtype=torch.bool, device=DEVICE)
-    ap = torch.zeros(1, cd, prompt.shape[-1], dtype=torch.long, device=DEVICE)
-    if not getattr(model, '_cd', False):
-        model.setup_caches(1, model.config.max_seq_len, dtype=DTYPE)
-        model._cd = True
-    with torch.autocast(device_type="cuda", dtype=DTYPE):
-        r = generate(model=model, prompt=prompt, max_new_tokens=1024,
-            audio_masks=am, audio_parts=ap, temperature=0.7, top_p=0.7, top_k=30,
-            decode_one_token=decode_one_token_ar)
-    codes = r[0:1,:,:].unsqueeze(0)
-    with torch.autocast(device_type="cuda", dtype=DTYPE):
-        audio = codec.decode(codes.to(DEVICE))
-    np_audio = audio.squeeze().cpu().float().numpy()
-    sr = getattr(codec, 'sample_rate', 44100)
-    sf.write(out_path, np_audio, sr)
-    dur = len(np_audio) / sr
-    print(f"  Saved {out_path} ({dur:.1f}s)")
-    return True
-# --- Models to test ---
-MODELS = {
-    "baseline_bf16": {"source": "fishaudio/s2-pro", "quant": None},
-    "fp8": {"source": "drbaph/s2-pro-fp8", "quant": None},
-}
 def main():
-    # Load reference audio
-    ref_path = "/app/reference/morgan_ref.wav"
-    if not os.path.exists(ref_path):
-        ref_path = "/tmp/reference/morgan_ref.wav"
-    print(f"\n[1] Loading reference audio: {ref_path}")
-    ref_wav = load_ref_audio(ref_path)
-    results = {}
-    for name, cfg in MODELS.items():
         print(f"\n{'='*60}")
-        print(f"  {name.upper()}")
         print(f"{'='*60}")
-        model_id = cfg["source"]
-        local_dir = f"{MODEL_DIR}/{name}"
-        # Download if needed
         if not os.path.exists(f"{local_dir}/config.json"):
             print(f"  Downloading {model_id}...")
             from huggingface_hub import snapshot_download
             snapshot_download(model_id, local_dir=local_dir, token=os.environ.get("HF_TOKEN"))
-        # Load model
         print(f"  Loading model...")
-        model, _ = init_model(local_dir, DEVICE, DTYPE, compile=False)
         codec = load_codec_model(f"{local_dir}/codec.pth", DEVICE, DTYPE)
-        # Encode reference
-        ref_tokens = encode_ref(codec, ref_wav)
-        # Generate
         out_path = f"{OUT}/fish_{name}_morgan_clone.wav"
         try:
-            ok = generate_clone(model, codec, ref_tokens, REF_TEXT, GEN_TEXT, out_path)
-            results[name] = {"ok": ok, "file": out_path}
         except Exception as e:
-            print(f"  FAILED: {e}")
             traceback.print_exc()
-            results[name] = {"ok": False, "error": str(e)}
         del model, codec
         gc.collect()
         torch.cuda.empty_cache()
-    # Also generate from GGUF models using s2.cpp if available
-    # For now, just upload what we have
-    # Summary
     print(f"\n{'='*60}")
-    print("  RESULTS")
     print(f"{'='*60}")
-    for name, r in results.items():
-        status = "✅" if r["ok"] else "❌"
-        print(f"  {status} {name}: {r.get('file', r.get('error',''))}")
-    # Upload to Hub
-    print("\n[Final] Uploading samples to Hub...")
     try:
         from huggingface_hub import HfApi
         api = HfApi()
         repo = "Swagcrew/fish-speech-s2-quantized"
         for fn in os.listdir(OUT):
             if fn.endswith(".wav"):
-                fpath = os.path.join(OUT, fn)
                 api.upload_file(
-                    path_or_fileobj=fpath,
                     path_in_repo=f"samples/{fn}",
                     repo_id=repo,
                     repo_type="model"
                 )
                 print(f"  Uploaded samples/{fn}")
-        print(f"\n  All at https://huggingface.co/{repo}/tree/main/samples")
     except Exception as e:
         print(f"  Upload error: {e}")
-        traceback.print_exc()
     print("\nDONE!")

 #!/usr/bin/env python3
+"""Generate voice clone samples using fish-speech's generate_long API."""
+import os, sys, json, time, gc, traceback
 import torch
+import torchaudio
+import soundfile as sf
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
+os.environ["HF_HOME"] = "/tmp/hf_cache"
+sys.path.insert(0, "/app/fish-speech")
 DEVICE = "cuda"
 DTYPE = torch.bfloat16
 REF_TEXT = "Let me get this straight. You think that your client, one of the wealthiest most powerful men in the world, is secretly a vigilante who spends his nights beating criminals to a pulp with his bare hands. And your plan is to blackmail this person."
 GEN_TEXT = "Every man's life ends the same way. It is only the details of how he lived that distinguish one man from another."
 OUT = "/tmp/samples"
 os.makedirs(OUT, exist_ok=True)
+from fish_speech.models.text2semantic.inference import (
+    init_model, load_codec_model, encode_audio, generate_long
+)
+MODELS = [
+    ("baseline_bf16", "fishaudio/s2-pro"),
+    ("fp8", "drbaph/s2-pro-fp8"),
+]
 def main():
+    print(f"=== Fish Speech Voice Clone Sample Generator ===")
+    print(f"GPU: {torch.cuda.get_device_name(0)}, VRAM: {torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB")
+    ref_path = "/app/reference/morgan_ref.wav"
+    for name, model_id in MODELS:
         print(f"\n{'='*60}")
+        print(f"  {name.upper()} ({model_id})")
         print(f"{'='*60}")
+        local_dir = f"/tmp/models/{name}"
         if not os.path.exists(f"{local_dir}/config.json"):
             print(f"  Downloading {model_id}...")
             from huggingface_hub import snapshot_download
             snapshot_download(model_id, local_dir=local_dir, token=os.environ.get("HF_TOKEN"))
         print(f"  Loading model...")
+        model, decode_fn = init_model(local_dir, DEVICE, DTYPE, compile=False)
         codec = load_codec_model(f"{local_dir}/codec.pth", DEVICE, DTYPE)
+        with torch.device(DEVICE):
+            model.setup_caches(max_batch_size=1, max_seq_len=model.config.max_seq_len, dtype=DTYPE)
+        print(f"  Encoding reference audio...")
+        prompt_tokens = encode_audio(ref_path, codec, DEVICE).cpu()
+        print(f"  Generating voice clone...")
         out_path = f"{OUT}/fish_{name}_morgan_clone.wav"
         try:
+            for response in generate_long(
+                model=model,
+                device=DEVICE,
+                decode_one_token=decode_fn,
+                text=GEN_TEXT,
+                max_new_tokens=1024,
+                top_p=0.7,
+                top_k=30,
+                temperature=0.7,
+                repetition_penalty=1.1,
+                compile=False,
+                iterative_prompt=False,
+                chunk_length=0,
+                prompt_text=REF_TEXT,
+                prompt_tokens=prompt_tokens,
+            ):
+                if response.action == "sample":
+                    codes = response.codes
+                    with torch.no_grad():
+                        with torch.amp.autocast(device_type="cuda", dtype=DTYPE):
+                            audio = codec.decode(codes.unsqueeze(0).to(DEVICE))
+                    np_audio = audio.squeeze().cpu().float().numpy()
+                    sr = getattr(codec, 'sample_rate', 44100)
+                    sf.write(out_path, np_audio, sr)
+                    dur = len(np_audio) / sr
+                    print(f"  ✅ Saved {out_path} ({dur:.1f}s)")
         except Exception as e:
+            print(f"  ❌ FAILED: {e}")
             traceback.print_exc()
         del model, codec
         gc.collect()
         torch.cuda.empty_cache()
+    # Upload
     print(f"\n{'='*60}")
+    print(f"  UPLOADING TO HUB")
     print(f"{'='*60}")
     try:
         from huggingface_hub import HfApi
         api = HfApi()
         repo = "Swagcrew/fish-speech-s2-quantized"
         for fn in os.listdir(OUT):
             if fn.endswith(".wav"):
                 api.upload_file(
+                    path_or_fileobj=os.path.join(OUT, fn),
                     path_in_repo=f"samples/{fn}",
                     repo_id=repo,
                     repo_type="model"
                 )
                 print(f"  Uploaded samples/{fn}")
+        print(f"\n  https://huggingface.co/{repo}/tree/main/samples")
     except Exception as e:
         print(f"  Upload error: {e}")
     print("\nDONE!")