import glob
import html
import os
import shutil
import sys
import uuid
from pathlib import Path
from typing import Any
from fastapi import FastAPI, File, Form, Request, UploadFile
from fastapi.responses import HTMLResponse, JSONResponse
import uvicorn
# Module-level configuration for the Space wrapper. Statement order matters here:
# sys.argv is rewritten before `import api_v2` runs.

# TTS inference config path; overridable through the SPACE_TTS_CONFIG environment variable.
SPACE_TTS_CONFIG = os.getenv("SPACE_TTS_CONFIG", "GPT_SoVITS/configs/tts_infer_cpu.yaml")
# Port value kept as a string (taken from the PORT environment variable, default "7860").
SPACE_PORT = os.getenv("PORT", "7860")
# Replace the process arguments with a fixed command line for api_v2.
# NOTE(review): presumably api_v2 parses sys.argv at import time (-a address, -p port,
# -c config) — confirm against api_v2 itself.
sys.argv = [
"api_v2.py",
"-a",
"0.0.0.0",
"-p",
SPACE_PORT,
"-c",
SPACE_TTS_CONFIG,
]
# Deliberately imported after sys.argv has been rewritten above.
import api_v2
app = FastAPI(title="GPT-SoVITS Space")
# Directories that get_models() scans recursively for *.ckpt / *.pth files.
PRETRAINED_DIR = "/app/GPT_SoVITS/pretrained_models"
CUSTOM_DIR = "/data/models"
# Upload directory; created at import time if it does not exist yet.
UPLOAD_DIR = Path("/tmp/uploads")
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
# Reference clip used by build_tts_request() when the caller supplies no ref_audio_path.
DEFAULT_REF_AUDIO = Path("/data/models/zh_vo_MAIN_YHX_2_12.wav")
# Not referenced in the visible part of this file.
# NOTE(review): presumably the fast-langdetect model location — confirm where it is used.
FAST_LANGDETECT_MODEL = Path("/app/GPT_SoVITS/pretrained_models/fast_langdetect/lid.176.bin")
# Fallback prompt text and language codes used by build_tts_request() / get_languages().
DEFAULT_REF_TEXT = "是吗,抱歉哦,我不记得了。"
DEFAULT_PROMPT_LANG = "zh"
DEFAULT_TEXT_LANG = "zh"
# Not referenced in the visible part of this file.
DEFAULT_CHARACTER = "aimisi"
DEFAULT_EMOTION = "default"
def get_models():
    """Collect checkpoint files from the custom and pretrained model directories.

    Each existing directory is searched recursively for ``*.ckpt`` and ``*.pth``
    files, custom directory first.

    Returns:
        A list of ``{"name": <file basename>, "path": <matched path>}`` dicts,
        ordered by ``name``.
    """
    discovered = [
        {"name": os.path.basename(match), "path": match}
        for root in (CUSTOM_DIR, PRETRAINED_DIR)
        if os.path.exists(root)
        for suffix in ("*.ckpt", "*.pth")
        for match in glob.glob(os.path.join(root, "**", suffix), recursive=True)
    ]
    # sorted() is stable, so entries sharing a basename keep their discovery order.
    return sorted(discovered, key=lambda entry: entry["name"])
def get_languages():
    """Return the sorted, de-duplicated, lower-cased language codes to offer.

    The codes come from ``api_v2.tts_config.languages``; when that attribute is
    missing or empty, the default text/prompt languages plus ``"auto"`` are used.
    """
    configured = list(getattr(api_v2.tts_config, "languages", []))
    candidates = configured or [DEFAULT_TEXT_LANG, DEFAULT_PROMPT_LANG, "auto"]
    # A set collapses duplicates that differ only by case.
    return sorted({code.lower() for code in candidates})
def current_model_paths():
    """Report the weight paths currently set on the TTS pipeline configuration.

    Returns:
        A dict with keys ``"gpt"`` and ``"sovits"``; a value is ``""`` when the
        pipeline has no ``configs`` object or the attribute is absent.
    """
    pipeline_configs = getattr(api_v2.tts_pipeline, "configs", None)
    attribute_for = (("gpt", "t2s_weights_path"), ("sovits", "vits_weights_path"))
    return {key: getattr(pipeline_configs, attr, "") for key, attr in attribute_for}
def language_options(selected: str):
    """Render the available languages as HTML ``<option>`` elements.

    The previous body appended an empty format string for every language, so the
    function always returned ``""`` and the computed label / selected marker were
    never used. Each language now yields one ``<option>`` element.

    Args:
        selected: Language code whose option should carry the ``selected``
            attribute. It is compared as-is against the codes returned by
            ``get_languages()``.

    Returns:
        The concatenated ``<option>`` markup, suitable for the inside of a
        ``<select>`` element.
    """
    options = []
    for language in get_languages():
        chosen = " selected" if language == selected else ""
        # html.escape also escapes quotes, so the label is safe inside the attribute.
        label = html.escape(language)
        options.append(f'<option value="{label}"{chosen}>{label}</option>')
    return "".join(options)
def normalize_language(language: str | None) -> str:
if not language:
return "auto"
return language.lower()
def build_tts_request(
    text: str,
    text_lang: str | None = None,
    prompt_lang: str | None = None,
    ref_text: str | None = None,
    ref_audio_path: str | Path | None = None,
    media_type: str | None = None,
    speed: float | None = None,
    top_k: int | None = None,
    top_p: float | None = None,
    temperature: float | None = None,
    batch_size: int | None = None,
    stream: bool | None = None,
    text_split_method: str | None = None,
    batch_threshold: float | None = None,
    split_bucket: bool | None = None,
    speed_factor: float | None = None,
    fragment_interval: float | None = None,
    seed: int | None = None,
    parallel_infer: bool | None = None,
    repetition_penalty: float | None = None,
) -> dict[str, Any]:
    """Assemble a TTS request dictionary from loosely specified arguments.

    Always-present keys fall back to the module defaults (reference audio,
    reference text, prompt language) or to fixed values when the caller passes
    ``None``. The keys ``speed_factor``, ``top_k``, ``top_p``, ``temperature``,
    ``batch_size`` and ``fragment_interval`` are only added when a value was
    supplied.

    Args:
        text: Text to synthesize; surrounding whitespace is stripped.
        text_lang: Language of ``text``; normalized via ``normalize_language``.
        prompt_lang: Language of the reference text; defaults to
            ``DEFAULT_PROMPT_LANG``.
        ref_text: Reference transcript; defaults to ``DEFAULT_REF_TEXT``.
        ref_audio_path: Reference audio; defaults to ``DEFAULT_REF_AUDIO``.
        media_type: Output container name, lower-cased; defaults to ``"wav"``.
        speed: Takes precedence over ``speed_factor`` when both are given.
        speed_factor: Used for ``"speed_factor"`` only when ``speed`` is ``None``.
        Remaining parameters are cast to their annotated type and stored under
        the same-named key (``stream`` is stored as ``"streaming_mode"``).

    Returns:
        The request dictionary.
    """

    def _cast_or(value, cast, fallback):
        # Cast caller-supplied values; hand the fallback back untouched otherwise.
        return fallback if value is None else cast(value)

    payload = {
        "text": text.strip(),
        "text_lang": normalize_language(text_lang),
        "ref_audio_path": str(ref_audio_path or DEFAULT_REF_AUDIO),
        "aux_ref_audio_paths": [],
        "prompt_text": (ref_text or DEFAULT_REF_TEXT).strip(),
        "prompt_lang": normalize_language(prompt_lang or DEFAULT_PROMPT_LANG),
        "media_type": (media_type or "wav").lower(),
        "streaming_mode": _cast_or(stream, bool, False),
        "text_split_method": text_split_method or "cut5",
        "batch_threshold": _cast_or(batch_threshold, float, 0.75),
        "split_bucket": _cast_or(split_bucket, bool, True),
        "seed": _cast_or(seed, int, -1),
        "parallel_infer": _cast_or(parallel_infer, bool, True),
        "repetition_penalty": _cast_or(repetition_penalty, float, 1.35),
    }

    # Optional keys, in insertion order; `speed` wins over `speed_factor`.
    optional_fields = (
        ("speed_factor", speed if speed is not None else speed_factor, float),
        ("top_k", top_k, int),
        ("top_p", top_p, float),
        ("temperature", temperature, float),
        ("batch_size", batch_size, int),
        ("fragment_interval", fragment_interval, float),
    )
    for key, value, cast in optional_fields:
        if value is not None:
            payload[key] = cast(value)
    return payload
@app.get("/", response_class=HTMLResponse)
def index():
models = get_models()
model_paths = current_model_paths()
default_ref_status = "available" if DEFAULT_REF_AUDIO.exists() else "missing"
return f"""
GPT-SoVITS Space
Hugging Face Space
GPT-SoVITS CPU Inference
The Space now boots through a small FastAPI wrapper instead of exposing a bare API root.
If you do not upload a reference clip, it falls back to the built-in Aimisi sample at
{html.escape(str(DEFAULT_REF_AUDIO))}.