#!/usr/bin/env python3
"""Generate voice clone samples from ALL quantized Fish Speech S2 Pro variants.

The script runs in three stages:

1. Python-based checkpoints (bf16, fp8, gptq) are downloaded and run through
   the ``fish_speech.models.text2semantic.inference`` module in a subprocess.
2. GGUF checkpoints are downloaded and run through a locally built ``s2.cpp``
   binary.
3. Every ``.wav`` found in ``OUT`` is summarised and uploaded to the Hub.

A failure of one variant is reported and the remaining variants still run.
"""
import os
import sys
import json
import time
import gc
import traceback
import subprocess

import torch

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["HF_HOME"] = "/tmp/hf_cache"
sys.path.insert(0, "/app/fish-speech")

GEN_TEXT = "Every man's life ends the same way. It is only the details of how he lived that distinguish one man from another."
REF_TEXT = "Let me get this straight. You think that your client, one of the wealthiest most powerful men in the world, is secretly a vigilante who spends his nights beating criminals to a pulp with his bare hands. And your plan is to blackmail this person."
OUT = "/tmp/samples"
REF_AUDIO = "/app/reference/morgan_ref.wav"
os.makedirs(OUT, exist_ok=True)


# === Shared helpers ===

def _discard(path):
    """Delete *path* if it exists; a missing file is not an error."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def run_generation(cmd, out_path, env=None, timeout=600):
    """Run one generation command and check that it produced *out_path*.

    Args:
        cmd: Argument list handed to ``subprocess.run``.
        out_path: File the command is expected to create.
        env: Environment for the child process; ``None`` inherits ours.
        timeout: Seconds to wait before the child is killed.

    Returns:
        ``(ok, detail)``. ``ok`` is True only when *out_path* exists after
        the command ran. On failure ``detail`` holds the last 200 characters
        of stderr, a timeout notice, or the launch error text.
    """
    # A file left over from an earlier run must not count as success.
    _discard(out_path)
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            errors="replace",
            timeout=timeout,
            env=env,
        )
    except subprocess.TimeoutExpired:
        # The killed child may have left a truncated file behind; drop it so
        # it is neither summarised nor uploaded later.
        _discard(out_path)
        return False, f"timed out after {timeout}s"
    except OSError as exc:
        # Raised when the executable itself cannot be launched.
        return False, str(exc)
    if os.path.exists(out_path):
        return True, ""
    return False, result.stderr[-200:]


def wav_duration(path):
    """Return the duration of the audio file at *path* in seconds."""
    import soundfile as sf

    data, sr = sf.read(path)
    return len(data) / sr


def report_sample(out_path, ok, detail):
    """Print the success or failure line for one generated sample."""
    if ok:
        print(f" ✅ {out_path} ({wav_duration(out_path):.1f}s)")
    else:
        print(f" ❌ Failed: {detail}")


# === PART 1: Python-based models (bf16, fp8, gptq) ===
PYTHON_MODELS = [
    ("baseline_bf16", "fishaudio/s2-pro"),
    ("fp8", "drbaph/s2-pro-fp8"),
    ("gptq_w4a16", "baicai1145/s2-pro-w4a16"),
]


def gen_python_models():
    """Generate one sample per entry in ``PYTHON_MODELS``.

    Each checkpoint is downloaded to ``/tmp/models/<name>`` unless its
    ``config.json`` is already present, then the inference module is run in
    a subprocess. Results are printed; nothing is returned.
    """
    print("\n" + "=" * 60)
    print(" PART 1: Python-based models (bf16, fp8, gptq)")
    print("=" * 60)

    for name, model_id in PYTHON_MODELS:
        print(f"\n [{name}] ({model_id})")
        local_dir = f"/tmp/models/{name}"
        if not os.path.exists(f"{local_dir}/config.json"):
            from huggingface_hub import snapshot_download

            try:
                snapshot_download(
                    model_id,
                    local_dir=local_dir,
                    token=os.environ.get("HF_TOKEN"),
                )
            except Exception as exc:
                # Per-variant boundary: a failed download must not stop the
                # remaining variants from being generated.
                print(f" ❌ Failed: {exc}")
                continue

        out_path = f"{OUT}/fish_{name}_morgan_clone.wav"
        semantic_dir = f"{OUT}/{name}_semantic"
        os.makedirs(semantic_dir, exist_ok=True)

        cmd = [
            sys.executable, "-m", "fish_speech.models.text2semantic.inference",
            "--text", f"<|speaker:0|>{GEN_TEXT}",
            "--prompt-audio", REF_AUDIO,
            "--prompt-text", REF_TEXT,
            "--checkpoint-path", local_dir,
            "--output-dir", semantic_dir,
            "--output", out_path,
            "--num-samples", "1",
            "--max-new-tokens", "1024",
            "--top-p", "0.7",
            "--top-k", "30",
            "--temperature", "0.7",
            "--no-iterative-prompt",
            "--chunk-length", "0",
            "--device", "cuda",
        ]
        env = {**os.environ, "PYTHONPATH": "/app/fish-speech"}
        ok, detail = run_generation(cmd, out_path, env=env)
        report_sample(out_path, ok, detail)


# === PART 2: GGUF models via s2.cpp ===
GGUF_MODELS = [
    ("gguf_q8_0", "s2-pro-q8_0.gguf"),
    ("gguf_q6_k", "s2-pro-q6_k.gguf"),
    ("gguf_q5_k_m", "s2-pro-q5_k_m.gguf"),
    ("gguf_q4_k_m", "s2-pro-q4_k_m.gguf"),
    ("gguf_q3_k", "s2-pro-q3_k.gguf"),
    ("gguf_q2_k", "s2-pro-q2_k.gguf"),
]


def build_s2cpp():
    """Build s2.cpp with CUDA support.

    Returns:
        Path to the ``s2`` binary when it exists after the build attempt,
        otherwise ``None``. Failing steps are reported on stdout.
    """
    print("\n Building s2.cpp with CUDA...")
    s2dir = "/tmp/s2.cpp"
    binary = f"{s2dir}/build/s2"

    if not os.path.exists(binary):
        steps = []
        # cmake is configured from s2dir, so a CMakeLists.txt there means a
        # checkout is already present and cloning again would only fail.
        if not os.path.exists(f"{s2dir}/CMakeLists.txt"):
            steps.append((
                ["git", "clone", "--recurse-submodules",
                 "https://github.com/rodrigomatta/s2.cpp.git", s2dir],
                None,
                120,
            ))
        steps.append((
            ["cmake", "-B", "build", "-DCMAKE_BUILD_TYPE=Release", "-DS2_CUDA=ON"],
            s2dir,
            60,
        ))
        steps.append((["cmake", "--build", "build", "--parallel"], s2dir, 300))

        for cmd, cwd, timeout in steps:
            label = " ".join(cmd[:2])
            try:
                proc = subprocess.run(
                    cmd,
                    cwd=cwd,
                    capture_output=True,
                    text=True,
                    errors="replace",
                    timeout=timeout,
                )
            except subprocess.TimeoutExpired:
                print(f" ❌ {label} timed out after {timeout}s")
                break
            except OSError as exc:
                # Tool not installed, or cwd missing because the clone failed.
                print(f" ❌ {label} could not be started: {exc}")
                break
            if proc.returncode != 0:
                print(f" ❌ {label} failed: {proc.stderr[-200:]}")
                break

    # Presence of the binary remains the success criterion.
    if os.path.exists(binary):
        print(" ✅ s2.cpp built")
        return binary
    return None


def gen_gguf_models():
    """Generate one sample per entry in ``GGUF_MODELS`` using s2.cpp.

    Returns early when the binary cannot be built or the tokenizer cannot be
    downloaded. Results are printed; nothing is returned.
    """
    print("\n" + "=" * 60)
    print(" PART 2: GGUF models via s2.cpp")
    print("=" * 60)

    s2bin = build_s2cpp()
    if not s2bin:
        print(" ❌ Failed to build s2.cpp")
        return

    # Download GGUF models
    from huggingface_hub import hf_hub_download

    gguf_dir = "/tmp/gguf_models"
    os.makedirs(gguf_dir, exist_ok=True)

    # Download tokenizer
    try:
        tok_path = hf_hub_download(
            "rodrigomt/s2-pro-gguf", "tokenizer.json", local_dir=gguf_dir
        )
    except Exception as exc:
        # Every GGUF run needs the tokenizer, so none can proceed without it.
        print(f" ❌ Failed: {exc}")
        return

    for name, gguf_file in GGUF_MODELS:
        print(f"\n [{name}] ({gguf_file})")

        # Download model
        try:
            model_path = hf_hub_download(
                "rodrigomt/s2-pro-gguf", gguf_file, local_dir=gguf_dir
            )
        except Exception as exc:
            # Per-variant boundary: keep going with the other quantizations.
            print(f" ❌ Failed: {exc}")
            continue

        out_path = f"{OUT}/fish_{name}_morgan_clone.wav"
        cmd = [
            s2bin,
            "-m", model_path,
            "-t", tok_path,
            "-pa", REF_AUDIO,
            "-pt", REF_TEXT,
            "-text", GEN_TEXT,
            "-c", "0",  # CUDA device 0
            "-o", out_path,
        ]
        ok, detail = run_generation(cmd, out_path)
        report_sample(out_path, ok, detail)


# === MAIN ===
def main():
    """Run both generation stages, then summarise and upload every sample."""
    print("=== Fish Speech S2 Pro - Full Quantization Comparison ===")
    print(f"GPU: {torch.cuda.get_device_name(0)}, VRAM: {torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB")
    print(f"Text: {GEN_TEXT}")
    print(f"Ref: {REF_AUDIO}")

    gen_python_models()
    gen_gguf_models()

    # Upload all samples
    print(f"\n{'='*60}")
    print(" UPLOADING ALL SAMPLES")
    print(f"{'='*60}")

    results = []
    for fn in sorted(os.listdir(OUT)):
        if fn.endswith(".wav"):
            fpath = os.path.join(OUT, fn)
            results.append((fn, wav_duration(fpath), os.path.getsize(fpath) / 1024))

    for fn, dur, sz in results:
        print(f" {fn}: {dur:.1f}s, {sz:.0f}KB")

    try:
        from huggingface_hub import HfApi

        api = HfApi()
    except Exception as e:
        print(f" Upload error: {e}")
    else:
        repo = "Swagcrew/fish-speech-s2-quantized"
        uploaded = 0
        for fn, _dur, _sz in results:
            try:
                api.upload_file(
                    path_or_fileobj=os.path.join(OUT, fn),
                    path_in_repo=f"samples/{fn}",
                    repo_id=repo,
                    repo_type="model",
                )
            except Exception as e:
                # One failed file must not block the remaining uploads.
                print(f" Upload error: {e}")
                continue
            uploaded += 1
            print(f" Uploaded samples/{fn}")
        # Skip the link only when every attempted upload failed.
        if uploaded or not results:
            print(f"\n 🔗 https://huggingface.co/{repo}/tree/main/samples")

    print("\nDONE!")


if __name__ == "__main__":
    main()