File size: 6,669 Bytes
8c17c76
696bef2
3a38d65
8c17c76
 
 
e2f85c6
 
 
8c17c76
3a38d65
8c17c76
696bef2
8c17c76
 
696bef2
 
e2f85c6
 
696bef2
e2f85c6
8c17c76
696bef2
 
 
 
8c17c76
696bef2
 
8c17c76
e2f85c6
8c17c76
 
 
 
 
3a38d65
 
 
 
 
 
696bef2
3a38d65
 
 
696bef2
3a38d65
 
 
 
 
 
 
 
 
 
 
 
 
696bef2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a38d65
60a491a
696bef2
 
 
 
60a491a
696bef2
8c17c76
696bef2
 
 
 
 
 
 
 
 
 
 
8c17c76
696bef2
8c17c76
696bef2
 
 
 
 
 
 
 
 
 
 
 
 
8c17c76
 
 
 
3a38d65
8c17c76
 
e2f85c6
8c17c76
 
 
 
 
696bef2
8c17c76
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#!/usr/bin/env python3
"""Generate voice clone samples from ALL quantized Fish Speech S2 Pro variants."""
import os, sys, json, time, gc, traceback, subprocess
import torch

# Process-wide setup: two environment variables are set and the fish-speech
# checkout is placed first on the import path.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    "HF_HOME": "/tmp/hf_cache",
})
sys.path.insert(0, "/app/fish-speech")

# Directory that receives every generated sample; created up front.
OUT = "/tmp/samples"
os.makedirs(OUT, exist_ok=True)

# Reference clip passed to both back-ends as the prompt audio, and the text
# passed alongside it as the prompt text.
REF_AUDIO = "/app/reference/morgan_ref.wav"
REF_TEXT = (
    "Let me get this straight. "
    "You think that your client, one of the wealthiest most powerful men in the world, "
    "is secretly a vigilante who spends his nights beating criminals to a pulp "
    "with his bare hands. "
    "And your plan is to blackmail this person."
)

# Sentence every model variant is asked to synthesize.
GEN_TEXT = (
    "Every man's life ends the same way. "
    "It is only the details of how he lived "
    "that distinguish one man from another."
)

# === PART 1: Python-based models (bf16, fp8, gptq) ===
# Variant label -> Hugging Face repo id. The label is used in the local
# checkpoint directory and in the output file name.
_PYTHON_MODEL_REPOS = {
    "baseline_bf16": "fishaudio/s2-pro",
    "fp8": "drbaph/s2-pro-fp8",
    "gptq_w4a16": "baicai1145/s2-pro-w4a16",
}
# Exposed as an ordered list of (label, repo id) tuples.
PYTHON_MODELS = list(_PYTHON_MODEL_REPOS.items())

def gen_python_models():
    """Generate one voice-clone sample per entry in ``PYTHON_MODELS``.

    For each ``(name, model_id)`` pair the checkpoint is downloaded to
    ``/tmp/models/<name>`` (skipped when ``config.json`` is already present)
    and ``fish_speech.models.text2semantic.inference`` is run in a subprocess
    that is asked to write ``<OUT>/fish_<name>_morgan_clone.wav``.

    A failure of a single variant (download error, subprocess timeout, or no
    output file) is reported and the loop continues with the next variant.

    Returns:
        None. Progress and results are printed to stdout.
    """
    print("\n" + "="*60)
    print("  PART 1: Python-based models (bf16, fp8, gptq)")
    print("="*60)

    timeout_s = 600  # per-variant limit for the inference subprocess

    for name, model_id in PYTHON_MODELS:
        print(f"\n  [{name}] ({model_id})")

        local_dir = f"/tmp/models/{name}"
        if not os.path.exists(f"{local_dir}/config.json"):
            try:
                from huggingface_hub import snapshot_download
                snapshot_download(model_id, local_dir=local_dir, token=os.environ.get("HF_TOKEN"))
            except Exception as e:
                # Best-effort batch: one unavailable checkpoint must not
                # abort the remaining variants.
                print(f"    ❌ Failed: download error: {e}")
                continue

        out_path = f"{OUT}/fish_{name}_morgan_clone.wav"
        semantic_dir = f"{OUT}/{name}_semantic"
        os.makedirs(semantic_dir, exist_ok=True)

        # Drop any sample left by a previous run so the existence check below
        # reflects this run only.
        try:
            os.remove(out_path)
        except FileNotFoundError:
            pass

        cmd = [
            sys.executable, "-m", "fish_speech.models.text2semantic.inference",
            "--text", f"<|speaker:0|>{GEN_TEXT}",
            "--prompt-audio", REF_AUDIO,
            "--prompt-text", REF_TEXT,
            "--checkpoint-path", local_dir,
            "--output-dir", semantic_dir,
            "--output", out_path,
            "--num-samples", "1",
            "--max-new-tokens", "1024",
            "--top-p", "0.7",
            "--top-k", "30",
            "--temperature", "0.7",
            "--no-iterative-prompt",
            "--chunk-length", "0",
            "--device", "cuda",
        ]

        # The child process needs the fish-speech checkout on its import path.
        env = {**os.environ, "PYTHONPATH": "/app/fish-speech"}
        try:
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout_s, env=env)
        except subprocess.TimeoutExpired:
            print(f"    ❌ Failed: timed out after {timeout_s}s")
            continue

        if not os.path.exists(out_path):
            # Only the tail of stderr is shown to keep the log readable.
            print(f"    ❌ Failed: {result.stderr[-200:]}")
            continue

        import soundfile as sf
        data, sr = sf.read(out_path)
        dur = len(data) / sr
        print(f"    ✅ {out_path} ({dur:.1f}s)")

# === PART 2: GGUF models via s2.cpp ===
# (label, GGUF file name) pairs; both strings are derived from the same
# quantization tag, listed here in the order the variants are processed.
GGUF_MODELS = [
    (f"gguf_{quant}", f"s2-pro-{quant}.gguf")
    for quant in ("q8_0", "q6_k", "q5_k_m", "q4_k_m", "q3_k", "q2_k")
]

def build_s2cpp():
    """Build s2.cpp with CUDA support.

    Unless ``/tmp/s2.cpp/build/s2`` already exists, the s2.cpp repository is
    cloned into ``/tmp/s2.cpp`` (skipped when a checkout is already there) and
    built with CMake using ``-DS2_CUDA=ON``. The build stops at the first step
    that fails, times out, or cannot be started, and the reason is printed.

    Returns:
        The path of the ``s2`` binary as a string, or ``None`` when the binary
        is not present after the build attempt.
    """
    print("\n  Building s2.cpp with CUDA...")
    s2dir = "/tmp/s2.cpp"
    s2bin = f"{s2dir}/build/s2"

    if not os.path.exists(s2bin):
        # Each step is (argv, working directory, timeout in seconds).
        steps = []
        # Cloning into an existing checkout would fail, so reuse it instead.
        if not os.path.isdir(f"{s2dir}/.git"):
            steps.append((
                ["git", "clone", "--recurse-submodules",
                 "https://github.com/rodrigomatta/s2.cpp.git", s2dir],
                None, 120,
            ))
        steps.append((
            ["cmake", "-B", "build", "-DCMAKE_BUILD_TYPE=Release", "-DS2_CUDA=ON"],
            s2dir, 60,
        ))
        steps.append((
            ["cmake", "--build", "build", "--parallel"],
            s2dir, 300,
        ))

        for argv, cwd, timeout_s in steps:
            label = " ".join(argv[:2])
            try:
                proc = subprocess.run(
                    argv, cwd=cwd, capture_output=True,
                    text=True, errors="replace", timeout=timeout_s,
                )
            except subprocess.TimeoutExpired:
                print(f"    ❌ {label} timed out after {timeout_s}s")
                break
            except OSError as e:
                # Raised when the executable or the working directory is missing.
                print(f"    ❌ {label} could not be started: {e}")
                break
            if proc.returncode != 0:
                # Later steps depend on this one, so stop here.
                print(f"    ❌ {label} failed (exit {proc.returncode}): {proc.stderr[-200:]}")
                break

    if os.path.exists(s2bin):
        print("    ✅ s2.cpp built")
        return s2bin
    return None

def gen_gguf_models():
    """Generate one voice-clone sample per entry in ``GGUF_MODELS`` via s2.cpp.

    The ``s2`` binary is obtained from ``build_s2cpp()``; the tokenizer and
    each GGUF file are downloaded from the ``rodrigomt/s2-pro-gguf`` repo into
    ``/tmp/gguf_models``. For every variant the binary is asked to write
    ``<OUT>/fish_<name>_morgan_clone.wav``.

    If the binary or the tokenizer is unavailable the function reports it and
    returns. A failure of a single variant (download error, subprocess
    timeout, or no output file) is reported and the loop continues.

    Returns:
        None. Progress and results are printed to stdout.
    """
    print("\n" + "="*60)
    print("  PART 2: GGUF models via s2.cpp")
    print("="*60)

    s2bin = build_s2cpp()
    if not s2bin:
        print("    ❌ Failed to build s2.cpp")
        return

    # Download GGUF models
    from huggingface_hub import hf_hub_download
    gguf_repo = "rodrigomt/s2-pro-gguf"
    gguf_dir = "/tmp/gguf_models"
    os.makedirs(gguf_dir, exist_ok=True)

    # Download tokenizer (shared by every variant, so nothing can run without it)
    try:
        tok_path = hf_hub_download(gguf_repo, "tokenizer.json", local_dir=gguf_dir)
    except Exception as e:
        print(f"    ❌ Failed: tokenizer download error: {e}")
        return

    timeout_s = 600  # per-variant limit for the s2 subprocess

    for name, gguf_file in GGUF_MODELS:
        print(f"\n  [{name}] ({gguf_file})")

        # Download model
        try:
            model_path = hf_hub_download(gguf_repo, gguf_file, local_dir=gguf_dir)
        except Exception as e:
            # Best-effort batch: one unavailable file must not abort the rest.
            print(f"    ❌ Failed: download error: {e}")
            continue

        out_path = f"{OUT}/fish_{name}_morgan_clone.wav"

        # Drop any sample left by a previous run so the existence check below
        # reflects this run only.
        try:
            os.remove(out_path)
        except FileNotFoundError:
            pass

        cmd = [
            s2bin,
            "-m", model_path,
            "-t", tok_path,
            "-pa", REF_AUDIO,
            "-pt", REF_TEXT,
            "-text", GEN_TEXT,
            "-c", "0",  # CUDA device 0
            "-o", out_path,
        ]

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout_s)
        except subprocess.TimeoutExpired:
            print(f"    ❌ Failed: timed out after {timeout_s}s")
            continue

        if not os.path.exists(out_path):
            # Only the tail of stderr is shown to keep the log readable.
            print(f"    ❌ Failed: {result.stderr[-200:]}")
            continue

        import soundfile as sf
        data, sr = sf.read(out_path)
        dur = len(data) / sr
        print(f"    ✅ {out_path} ({dur:.1f}s)")

# === MAIN ===
def main():
    """Run the full quantization comparison and upload the resulting samples.

    Prints the GPU name and memory, the generation text and the reference
    audio path, runs ``gen_python_models()`` and ``gen_gguf_models()``, then
    lists every ``.wav`` file in ``OUT`` with its duration and size and uploads
    the readable ones to the ``Swagcrew/fish-speech-s2-quantized`` model repo
    under ``samples/``.

    Unreadable wav files and failed uploads are reported per file and do not
    stop the remaining files from being processed.

    Returns:
        None.
    """
    print("=== Fish Speech S2 Pro - Full Quantization Comparison ===")
    print(f"GPU: {torch.cuda.get_device_name(0)}, VRAM: {torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB")
    print(f"Text: {GEN_TEXT}")
    print(f"Ref: {REF_AUDIO}")

    gen_python_models()
    gen_gguf_models()

    # Upload all samples
    print(f"\n{'='*60}")
    print("  UPLOADING ALL SAMPLES")
    print(f"{'='*60}")

    import soundfile as sf

    # List the output directory once; the summary and the upload share it.
    wav_names = sorted(fn for fn in os.listdir(OUT) if fn.endswith(".wav"))

    uploadable = []
    for fn in wav_names:
        fpath = os.path.join(OUT, fn)
        size_kb = os.path.getsize(fpath) / 1024
        try:
            # Header-only read: the duration is needed, not the audio data.
            dur = sf.info(fpath).duration
        except (RuntimeError, OSError) as e:
            # A truncated or corrupt file is reported and left out of the upload.
            print(f"  {fn}: unreadable ({e}), {size_kb:.0f}KB")
            continue
        print(f"  {fn}: {dur:.1f}s, {size_kb:.0f}KB")
        uploadable.append(fn)

    try:
        from huggingface_hub import HfApi
        api = HfApi()
    except Exception as e:
        print(f"  Upload error: {e}")
    else:
        repo = "Swagcrew/fish-speech-s2-quantized"
        uploaded = 0
        for fn in uploadable:
            try:
                api.upload_file(
                    path_or_fileobj=os.path.join(OUT, fn),
                    path_in_repo=f"samples/{fn}",
                    repo_id=repo,
                    repo_type="model"
                )
            except Exception as e:
                # Keep going so one failed transfer does not drop the rest.
                print(f"  Upload error ({fn}): {e}")
                continue
            uploaded += 1
            print(f"  Uploaded samples/{fn}")
        if uploaded:
            print(f"\n  🔗 https://huggingface.co/{repo}/tree/main/samples")

    print("\nDONE!")

# Run the comparison only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()