Spaces:

Swagcrew
/

fish-quant-samples

Runtime error

App Files Files Community

Swagcrew committed 18 days ago

Commit

8c17c76

verified ·

1 Parent(s): 9f00efa

Upload gen_samples.py with huggingface_hub

Browse files

Files changed (1) hide show

gen_samples.py +170 -0

gen_samples.py ADDED Viewed

	@@ -0,0 +1,170 @@

+#!/usr/bin/env python3
+"""Generate voice clone samples from all quantized Fish Speech S2 Pro models."""
+import os, sys, json, time, gc, traceback, subprocess
+import torch
+os.environ["TOKENIZERS_PARALLELISM"] = "false"  # set at import time, before the fish_speech imports below
+DEVICE = "cuda"  # device passed to init_model/load_codec_model and used for tensor placement
+DTYPE = torch.bfloat16  # dtype used for model loading, cache setup and autocast
+REF_TEXT = "Let me get this straight. You think that your client, one of the wealthiest most powerful men in the world, is secretly a vigilante who spends his nights beating criminals to a pulp with his bare hands. And your plan is to blackmail this person."  # text paired with the reference audio in the prompt (presumably its transcript; confirm)
+GEN_TEXT = "Every man's life ends the same way. It is only the details of how he lived that distinguish one man from another."  # text the cloned voice is asked to speak
+OUT = "/tmp/samples"  # generated .wav files are written here and uploaded from here
+MODEL_DIR = "/tmp/models"  # each model is downloaded into a sub-directory of this path
+print("=== Fish Speech Voice Clone Sample Generator ===")
+print(f"GPU: {torch.cuda.get_device_name(0)}, VRAM: {torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB")  # queries CUDA device 0 at import time
+os.makedirs(OUT, exist_ok=True)
+os.makedirs(MODEL_DIR, exist_ok=True)
+# Setup fish-speech
+sys.path.insert(0, "/app/fish-speech")  # put this checkout first on sys.path for the fish_speech imports below
+from fish_speech.models.text2semantic.inference import init_model, load_codec_model, generate, decode_one_token_ar
+from fish_speech.conversation import Conversation, Message
+from fish_speech.content_sequence import TextPart, VQPart
+import torchaudio, soundfile as sf
+def load_ref_audio(ref_path, target_sr=44100):
+    """Load a reference clip as a single-channel waveform at ``target_sr`` Hz.
+
+    Args:
+        ref_path: Path of the audio file, passed to ``torchaudio.load``.
+        target_sr: Sample rate to resample to. Defaults to 44100.
+
+    Returns:
+        The waveform tensor returned by ``torchaudio.load``, averaged over
+        its first (channel) axis when it has more than one channel (the
+        axis is kept), and resampled when the file's rate differs from
+        ``target_sr``.
+    """
+    wav, sr = torchaudio.load(ref_path)
+    if wav.shape[0] > 1:
+        # Down-mix multi-channel audio by averaging the channels.
+        wav = wav.mean(dim=0, keepdim=True)
+    if sr != target_sr:
+        wav = torchaudio.functional.resample(wav, sr, target_sr)
+    return wav
+def encode_ref(codec, wav):
+    """Encode a reference waveform into codec tokens.
+
+    Args:
+        codec: Codec model exposing ``encode``; its result may be a tensor
+            or a tuple whose first element is the token tensor.
+        wav: Waveform tensor; it is moved to ``DEVICE`` and given a leading
+            axis via ``unsqueeze(0)`` before encoding.
+
+    Returns:
+        The tokens as a NumPy array on the CPU.
+
+    NOTE(review): the caller passes this NumPy array to ``VQPart(codes=...)``;
+    confirm that VQPart accepts an ndarray rather than a tensor.
+    """
+    wav = wav.to(DEVICE)
+    # Inference only: no_grad stops autograd from recording the encoder pass.
+    with torch.no_grad(), torch.autocast(device_type="cuda", dtype=DTYPE):
+        enc = codec.encode(wav.unsqueeze(0))
+    tokens = enc[0] if isinstance(enc, tuple) else enc
+    return tokens.cpu().numpy()
+def generate_clone(model, codec, ref_tokens, ref_text, gen_text, out_path):
+    """Synthesize ``gen_text`` from a reference prompt and write it to ``out_path``.
+
+    Builds a two-message Conversation (user: reference codes plus
+    ``ref_text``; assistant: ``gen_text``), encodes it for inference, runs
+    ``generate``, decodes the result with ``codec`` and writes the audio
+    with soundfile.
+
+    Args:
+        model: Model exposing ``config.num_codebooks``, ``config.max_seq_len``,
+            ``tokenizer`` and ``setup_caches``.
+        codec: Codec exposing ``decode``; its ``sample_rate`` attribute is
+            used when present, otherwise 44100.
+        ref_tokens: Reference codes placed in the user message.
+        ref_text: Text placed next to the reference codes.
+        gen_text: Text placed in the assistant message.
+        out_path: Destination audio file path.
+
+    Returns:
+        True. Failures surface as exceptions rather than a False return.
+    """
+    conv = Conversation()
+    conv.append(Message(role="user", parts=[
+        VQPart(codes=ref_tokens),
+        TextPart(text=ref_text)
+    ]))
+    conv.append(Message(role="assistant", parts=[TextPart(text=gen_text)]))
+    nc = model.config.num_codebooks
+    tokenizer = model.tokenizer
+    encoded = conv.encode_for_inference(tokenizer, nc)
+    # encode_for_inference returns (prompt_tensor,) or prompt_tensor
+    prompt = encoded[0] if isinstance(encoded, tuple) else encoded
+    rows = 1 + nc
+    prompt_len = prompt.shape[-1]
+    # All-zero masks/parts sized to the prompt; presumably means "no extra audio inputs" (confirm).
+    audio_masks = torch.zeros(1, rows, prompt_len, dtype=torch.bool, device=DEVICE)
+    audio_parts = torch.zeros(1, rows, prompt_len, dtype=torch.long, device=DEVICE)
+    # Caches are set up once per model object; the private _cd flag records that.
+    if not getattr(model, '_cd', False):
+        model.setup_caches(1, model.config.max_seq_len, dtype=DTYPE)
+        model._cd = True
+    # Inference only: without no_grad the decoded audio may require grad, which
+    # makes Tensor.numpy() raise and keeps an autograd graph alive on the GPU.
+    with torch.no_grad():
+        with torch.autocast(device_type="cuda", dtype=DTYPE):
+            r = generate(model=model, prompt=prompt, max_new_tokens=1024,
+                audio_masks=audio_masks, audio_parts=audio_parts,
+                temperature=0.7, top_p=0.7, top_k=30,
+                decode_one_token=decode_one_token_ar)
+        # NOTE(review): three-index slicing assumes generate() returns a 3-D tensor; confirm.
+        codes = r[0:1,:,:].unsqueeze(0)
+        with torch.autocast(device_type="cuda", dtype=DTYPE):
+            audio = codec.decode(codes.to(DEVICE))
+    np_audio = audio.squeeze().detach().cpu().float().numpy()
+    sr = getattr(codec, 'sample_rate', 44100)
+    sf.write(out_path, np_audio, sr)
+    dur = len(np_audio) / sr
+    print(f"  Saved {out_path} ({dur:.1f}s)")
+    return True
+# --- Models to test ---
+MODELS = {  # sample name -> config; the name is used in the download dir and the output filename
+    "baseline_bf16": {"source": "fishaudio/s2-pro", "quant": None},  # "source" is the repo id given to snapshot_download
+    "fp8": {"source": "drbaph/s2-pro-fp8", "quant": None},  # "quant" is not read anywhere in the visible script
+}
+def main():
+    """Generate one clone sample per entry in MODELS, print a summary, upload the .wav files.
+
+    Each model is downloaded (when its config.json is missing locally),
+    loaded, used to encode the reference clip and to synthesize GEN_TEXT.
+    Any failure in those steps is recorded for that model only, so the
+    remaining models, the summary and the upload still run.
+
+    Raises:
+        FileNotFoundError: If the reference audio is in neither expected location.
+    """
+    # Load reference audio
+    ref_candidates = ("/app/reference/morgan_ref.wav", "/tmp/reference/morgan_ref.wav")
+    ref_path = next((p for p in ref_candidates if os.path.exists(p)), None)
+    if ref_path is None:
+        raise FileNotFoundError(
+            f"Reference audio not found; looked in: {', '.join(ref_candidates)}"
+        )
+    print(f"\n[1] Loading reference audio: {ref_path}")
+    ref_wav = load_ref_audio(ref_path)
+    results = {}
+    for name, cfg in MODELS.items():
+        print(f"\n{'='*60}")
+        print(f"  {name.upper()}")
+        print(f"{'='*60}")
+        model_id = cfg["source"]
+        local_dir = f"{MODEL_DIR}/{name}"
+        out_path = f"{OUT}/fish_{name}_morgan_clone.wav"
+        # Pre-bind so the finally clause can release them even if loading fails.
+        model = codec = None
+        try:
+            # Download if needed
+            if not os.path.exists(f"{local_dir}/config.json"):
+                print(f"  Downloading {model_id}...")
+                from huggingface_hub import snapshot_download
+                snapshot_download(model_id, local_dir=local_dir, token=os.environ.get("HF_TOKEN"))
+            # Load model
+            print("  Loading model...")
+            model, _ = init_model(local_dir, DEVICE, DTYPE, compile=False)
+            codec = load_codec_model(f"{local_dir}/codec.pth", DEVICE, DTYPE)
+            # Encode reference
+            ref_tokens = encode_ref(codec, ref_wav)
+            # Generate
+            ok = generate_clone(model, codec, ref_tokens, REF_TEXT, GEN_TEXT, out_path)
+            results[name] = {"ok": ok, "file": out_path}
+        except Exception as e:
+            # Per-model boundary: record the failure and move on to the next model.
+            print(f"  FAILED: {e}")
+            traceback.print_exc()
+            results[name] = {"ok": False, "error": str(e)}
+        finally:
+            # Drop references before loading the next model to keep peak GPU memory down.
+            model = codec = None
+            gc.collect()
+            torch.cuda.empty_cache()
+    # Also generate from GGUF models using s2.cpp if available
+    # For now, just upload what we have
+    # Summary
+    print(f"\n{'='*60}")
+    print("  RESULTS")
+    print(f"{'='*60}")
+    for name, r in results.items():
+        status = "✅" if r["ok"] else "❌"
+        print(f"  {status} {name}: {r.get('file', r.get('error',''))}")
+    # Upload to Hub
+    print("\n[Final] Uploading samples to Hub...")
+    try:
+        from huggingface_hub import HfApi
+        api = HfApi()
+        repo = "Swagcrew/fish-speech-s2-quantized"
+        for fn in os.listdir(OUT):
+            if fn.endswith(".wav"):
+                fpath = os.path.join(OUT, fn)
+                api.upload_file(
+                    path_or_fileobj=fpath,
+                    path_in_repo=f"samples/{fn}",
+                    repo_id=repo,
+                    repo_type="model"
+                )
+                print(f"  Uploaded samples/{fn}")
+        print(f"\n  All at https://huggingface.co/{repo}/tree/main/samples")
+    except Exception as e:
+        # Upload is best-effort: report the error but still finish the run.
+        print(f"  Upload error: {e}")
+        traceback.print_exc()
+    print("\nDONE!")
+if __name__ == "__main__":  # run the pipeline only when executed as a script, not on import
+    main()