"""Inference handler: synthesize audio from text and return it as base64 WAV."""

import base64
import io
import re

import soundfile as sf
import torch
from fish_speech.inference import infer
from fish_speech.models.fish_speech import FishSpeech

# Sample rate written into the WAV header.
# NOTE(review): confirm this matches the rate the model actually produces;
# a mismatch would change playback speed/pitch.
SAMPLE_RATE = 24000

# Matches the "[singing]" tag in any letter case. re.ASCII keeps the
# case-insensitivity limited to ASCII letters, mirroring the str.lower()
# comparison this replaces.
_SINGING_TAG = re.compile(r"\[singing\]", re.IGNORECASE | re.ASCII)

# Load the model once at import time so every request reuses it.
model = FishSpeech.from_pretrained('fishaudio/fish-speech-1.5')


def predict(inputs: dict) -> dict:
    """Generate audio for the text in ``inputs`` and return it base64-encoded.

    The text is read from ``inputs['inputs']`` (default ``'Hello world'``).
    If it contains a ``[singing]`` tag in any letter case, every occurrence
    of the tag is removed and the model is called with ``mode="singing"``;
    otherwise the text is passed through unchanged with ``mode="speech"``.

    Args:
        inputs: Request payload; only the ``'inputs'`` key is read.

    Returns:
        ``{"audio": <str>}`` where the value is the base64 encoding of a
        WAV file written at ``SAMPLE_RATE``.

    Raises:
        TypeError: If the ``'inputs'`` value is not a string.
    """
    text = inputs.get('inputs', 'Hello world')

    # Detect and strip the tag with the same case-insensitive pattern, so a
    # tag such as "[Singing]" never leaks into the text sent to the model.
    text, tag_count = _SINGING_TAG.subn("", text)
    mode = "singing" if tag_count else "speech"

    # Generate the audio.
    audio = infer(model, text, mode=mode)

    # Encode as WAV in memory, then base64.
    # Assumes `infer` returns a torch tensor (it is moved to CPU and
    # converted to a NumPy array here) - TODO confirm.
    buffer = io.BytesIO()
    sf.write(buffer, audio.cpu().numpy(), SAMPLE_RATE, format='WAV')
    audio_b64 = base64.b64encode(buffer.getvalue()).decode()

    return {"audio": audio_b64}


def query(payload):
    """Entry point alias: forward ``payload`` to :func:`predict` unchanged."""
    return predict(payload)