| """
|
| OpenAI-compatible FastAPI server for MOSS-TTS longform (1.5B / 7B) backend.
|
|
|
| Exposes the same ``POST /v1/audio/speech`` endpoint shape as the realtime
|
| server so clients can switch backends by changing the base URL only.
|
|
|
| Environment variables
|
| ---------------------
|
| MOSS_TTS_LONGFORM_MODEL_PATH HF repo or local path for the backbone model
|
| (default: OpenMOSS-Team/MOSS-TTS-Local-Transformer)
|
| MOSS_TTS_CODEC_MODEL_PATH HF repo or local path for the audio tokenizer
|
| (default: OpenMOSS-Team/MOSS-Audio-Tokenizer)
|
| MOSS_TTS_DEVICE PyTorch device (default: cuda:0)
|
| MOSS_TTS_ATTN_IMPLEMENTATION sdpa | flash_attention_2 | eager | auto
|
| (default: auto)
|
| MOSS_TTS_TORCH_DTYPE bfloat16 | float16 | float32 | auto
|
| (default: auto → bfloat16 when CUDA present)
|
| MOSS_TTS_VOICE_DIR Directory that holds voice-prompt WAV/MP3 files
|
| named after OpenAI voice IDs (default: built-in
|
| audio/ next to openai_api.py in moss_tts_realtime)
|
| MOSS_TTS_MAX_NEW_TOKENS Max generation tokens (default: 4096)
|
| and upper bound for the per-request heuristic cap
|
| MOSS_TTS_TEMPERATURE Audio sampling temperature (default: 1.0)
|
| MOSS_TTS_TOP_P Audio top-p (default: 0.95)
|
| MOSS_TTS_TOP_K Audio top-k (default: 50)
|
| MOSS_TTS_REPETITION_PENALTY Audio repetition penalty (default: 1.1)
|
| MOSS_TTS_WARMUP_ON_START true/1/yes → run a short warmup (default: true)
|
| MOSS_TTS_MAX_CONCURRENT Max simultaneous synthesis requests (default: 1)
|
| MOSS_TTS_HOST Bind host (default: 0.0.0.0)
|
| MOSS_TTS_PORT Bind port (default: 8013)
|
| MOSS_TTS_LOG_LEVEL Logging verbosity (default: INFO)
|
| """
|
|
|
| from __future__ import annotations
|
|
|
| import io
|
| import logging
|
| import os
|
| import sys
|
| import threading
|
| import time
|
| from contextlib import asynccontextmanager
|
| from pathlib import Path
|
| from typing import Literal
|
|
|
| import numpy as np
|
| from fastapi import FastAPI, HTTPException, Response
|
| from fastapi.middleware.cors import CORSMiddleware
|
| from pydantic import BaseModel, ConfigDict, Field
|
|
|
|
|
| _PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
| if str(_PROJECT_ROOT) not in sys.path:
|
| sys.path.insert(0, str(_PROJECT_ROOT))
|
|
|
| from runner.adapters.longform_native import LongformNativeAdapter
|
|
|
| log = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
| DEFAULT_MODEL_PATH = os.getenv(
|
| "MOSS_TTS_LONGFORM_MODEL_PATH", "OpenMOSS-Team/MOSS-TTS-Local-Transformer"
|
| )
|
| DEFAULT_CODEC_PATH = os.getenv(
|
| "MOSS_TTS_CODEC_MODEL_PATH", "OpenMOSS-Team/MOSS-Audio-Tokenizer"
|
| )
|
| DEFAULT_DEVICE = os.getenv("MOSS_TTS_DEVICE", "cuda:0")
|
| DEFAULT_ATTN = os.getenv("MOSS_TTS_ATTN_IMPLEMENTATION", "auto")
|
| DEFAULT_DTYPE = os.getenv("MOSS_TTS_TORCH_DTYPE", "auto")
|
| DEFAULT_MAX_NEW_TOKENS = int(os.getenv("MOSS_TTS_MAX_NEW_TOKENS", "4096"))
|
| DEFAULT_TEMPERATURE = float(os.getenv("MOSS_TTS_TEMPERATURE", "1.0"))
|
| DEFAULT_TOP_P = float(os.getenv("MOSS_TTS_TOP_P", "0.95"))
|
| DEFAULT_TOP_K = int(os.getenv("MOSS_TTS_TOP_K", "50"))
|
| DEFAULT_REPETITION_PENALTY = float(os.getenv("MOSS_TTS_REPETITION_PENALTY", "1.1"))
|
| WARMUP_ON_START = os.getenv("MOSS_TTS_WARMUP_ON_START", "true").lower() in ("true", "1", "yes")
|
| MAX_CONCURRENT = max(1, int(os.getenv("MOSS_TTS_MAX_CONCURRENT", "1")))
|
|
|
|
|
|
|
| _DEFAULT_VOICE_DIR = Path(__file__).resolve().parent.parent / "moss_tts_realtime" / "audio"
|
| VOICE_DIR = Path(os.getenv("MOSS_TTS_VOICE_DIR", str(_DEFAULT_VOICE_DIR)))
|
|
|
| _SUPPORTED_MODELS = {
|
| "tts-1": DEFAULT_MODEL_PATH,
|
| "tts-1-hd": DEFAULT_MODEL_PATH,
|
| "moss-tts-longform": DEFAULT_MODEL_PATH,
|
| "moss-tts-delay": DEFAULT_MODEL_PATH,
|
| }
|
|
|
| _VOICE_PRESETS: dict[str, Path | None] = {
|
| "alloy": VOICE_DIR / "prompt_audio.mp3",
|
| "echo": VOICE_DIR / "prompt_audio1.mp3",
|
| "fable": VOICE_DIR / "prompt_audio.mp3",
|
| "nova": VOICE_DIR / "prompt_audio1.mp3",
|
| "onyx": VOICE_DIR / "prompt_audio.mp3",
|
| "shimmer": VOICE_DIR / "prompt_audio1.mp3",
|
| "default": None,
|
| }
|
|
|
| _generation_semaphore = threading.BoundedSemaphore(MAX_CONCURRENT)
|
|
|
|
|
| _adapter: LongformNativeAdapter | None = None
|
|
|
|
|
|
|
|
|
|
|
|
|
| class OpenAISpeechRequest(BaseModel):
|
| model_config = ConfigDict(extra="ignore")
|
|
|
| model: str = Field(default="tts-1")
|
| input: str = Field(..., min_length=1, max_length=8192)
|
| voice: str = Field(default="alloy")
|
| response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(default="mp3")
|
| speed: float = Field(default=1.0, ge=0.25, le=4.0)
|
|
|
|
|
| class VoiceInfo(BaseModel):
|
| id: str
|
| name: str
|
| description: str | None = None
|
|
|
|
|
|
|
|
|
|
|
|
|
| def _content_type(audio_format: str) -> str:
|
| return {
|
| "mp3": "audio/mpeg",
|
| "opus": "audio/opus",
|
| "aac": "audio/aac",
|
| "flac": "audio/flac",
|
| "wav": "audio/wav",
|
| "pcm": "audio/pcm",
|
| }[audio_format]
|
|
|
|
|
| def _wav_bytes(audio: np.ndarray, sample_rate: int) -> bytes:
|
| import wave
|
|
|
| audio = np.asarray(audio, dtype=np.float32).reshape(-1)
|
| audio = np.clip(audio, -1.0, 1.0)
|
| audio_i16 = (audio * 32767.0).astype(np.int16)
|
| buf = io.BytesIO()
|
| with wave.open(buf, "wb") as wf:
|
| wf.setnchannels(1)
|
| wf.setsampwidth(2)
|
| wf.setframerate(sample_rate)
|
| wf.writeframes(audio_i16.tobytes())
|
| return buf.getvalue()
|
|
|
|
|
| def _pcm_bytes(audio: np.ndarray) -> bytes:
|
| audio = np.asarray(audio, dtype=np.float32).reshape(-1)
|
| return (np.clip(audio, -1.0, 1.0) * 32767.0).astype(np.int16).tobytes()
|
|
|
|
|
| def _encode_audio(audio: np.ndarray, sample_rate: int, response_format: str) -> bytes:
|
| if response_format == "wav":
|
| return _wav_bytes(audio, sample_rate)
|
| if response_format == "pcm":
|
| return _pcm_bytes(audio)
|
| try:
|
| from pydub import AudioSegment
|
| except ImportError as exc:
|
| raise RuntimeError(
|
| f"Compressed output ('{response_format}') requires pydub: {exc}"
|
| ) from exc
|
|
|
| wav_b = _wav_bytes(audio, sample_rate)
|
| seg = AudioSegment.from_wav(io.BytesIO(wav_b))
|
| out = io.BytesIO()
|
| kwargs = {
|
| "mp3": {"format": "mp3", "bitrate": "192k"},
|
| "opus": {"format": "opus", "bitrate": "128k"},
|
| "aac": {"format": "adts", "bitrate": "192k"},
|
| "flac": {"format": "flac"},
|
| }[response_format]
|
| fmt = kwargs.pop("format")
|
| seg.export(out, format=fmt, **kwargs)
|
| return out.getvalue()
|
|
|
|
|
|
|
|
|
|
|
|
|
| def _voice_reference_path(voice: str) -> str | None:
|
| """Return the filesystem path to the reference audio for *voice*, or None."""
|
| normalized = voice.strip().lower()
|
| if not normalized:
|
| raise HTTPException(status_code=400, detail="voice is required")
|
|
|
| if normalized in _VOICE_PRESETS:
|
| p = _VOICE_PRESETS[normalized]
|
| if p is None:
|
| return None
|
| if not p.exists():
|
| log.warning("Bundled voice prompt missing: %s – using no reference.", p)
|
| return None
|
| return str(p.resolve())
|
|
|
|
|
| candidate = Path(voice).expanduser()
|
| if candidate.is_file():
|
| return str(candidate.resolve())
|
|
|
| raise HTTPException(
|
| status_code=400,
|
| detail=(
|
| f"Unsupported voice '{voice}'. "
|
| f"Available voices: {', '.join(sorted(_VOICE_PRESETS))}"
|
| ),
|
| )
|
|
|
|
|
| def _estimate_max_new_tokens(text: str) -> int:
|
| """Estimate a practical generation cap from prompt length.
|
|
|
| The local-transformer backend does not always emit EOS promptly, so a fixed
|
| 4096-token cap causes short prompts to run for minutes. Approximate speech
|
| length from word count and clamp it by the environment-configured ceiling.
|
| """
|
| words = max(1, len(text.split()))
|
| estimated = words * 6 + 64
|
| return max(128, min(DEFAULT_MAX_NEW_TOKENS, estimated))
|
|
|
|
|
|
|
|
|
|
|
|
|
| def _synthesize(payload: OpenAISpeechRequest) -> tuple[bytes, dict[str, float]]:
|
| """Run synthesis and return ``(encoded_audio_bytes, metrics)``."""
|
| assert _adapter is not None, "Adapter not initialised"
|
|
|
| if payload.model not in _SUPPORTED_MODELS:
|
| raise HTTPException(
|
| status_code=400,
|
| detail=(
|
| f"Unsupported model '{payload.model}'. "
|
| f"Supported: {', '.join(sorted(_SUPPORTED_MODELS))}"
|
| ),
|
| )
|
|
|
| reference_path = _voice_reference_path(payload.voice)
|
|
|
| t0 = time.perf_counter()
|
|
|
| acquired = _generation_semaphore.acquire(timeout=120)
|
| if not acquired:
|
| raise HTTPException(
|
| status_code=503,
|
| detail="Server busy – all generation slots occupied. Retry shortly.",
|
| )
|
|
|
| try:
|
| t_gen_start = time.perf_counter()
|
| waveform, sample_rate = _adapter.synthesize(
|
| text=payload.input,
|
| reference_audio=reference_path,
|
| max_new_tokens=_estimate_max_new_tokens(payload.input),
|
| audio_temperature=DEFAULT_TEMPERATURE,
|
| audio_top_p=DEFAULT_TOP_P,
|
| audio_top_k=DEFAULT_TOP_K,
|
| audio_repetition_penalty=DEFAULT_REPETITION_PENALTY,
|
| )
|
| t_gen_end = time.perf_counter()
|
| finally:
|
| _generation_semaphore.release()
|
|
|
| t_encode_start = time.perf_counter()
|
| encoded = _encode_audio(waveform, sample_rate, payload.response_format)
|
| t_encode_end = time.perf_counter()
|
|
|
| audio_seconds = float(waveform.size) / sample_rate
|
| gen_seconds = t_gen_end - t_gen_start
|
| total_seconds = t_encode_end - t0
|
|
|
| metrics = {
|
| "model_generation_seconds": gen_seconds,
|
| "audio_emit_seconds": t_encode_end - t_encode_start,
|
| "total_seconds": total_seconds,
|
| "audio_seconds": audio_seconds,
|
| "rtf": gen_seconds / max(audio_seconds, 1e-9),
|
| "ttfb_ms": (t_gen_end - t0) * 1000.0,
|
| }
|
|
|
| log.info(
|
| "synthesize: %.1f s audio in %.1f s (RTF=%.3f)",
|
| audio_seconds,
|
| gen_seconds,
|
| metrics["rtf"],
|
| )
|
| return encoded, metrics
|
|
|
|
|
|
|
|
|
|
|
|
|
| @asynccontextmanager
|
| async def lifespan(app: FastAPI):
|
| global _adapter
|
|
|
| _adapter = LongformNativeAdapter(
|
| model_path=DEFAULT_MODEL_PATH,
|
| device=DEFAULT_DEVICE,
|
| attn_implementation=DEFAULT_ATTN,
|
| codec_path=DEFAULT_CODEC_PATH,
|
| torch_dtype=DEFAULT_DTYPE,
|
| )
|
| _adapter.load()
|
|
|
| if WARMUP_ON_START:
|
| import asyncio
|
| loop = asyncio.get_event_loop()
|
| await loop.run_in_executor(None, _adapter.warmup)
|
|
|
| yield
|
|
|
| _adapter = None
|
|
|
|
|
| app = FastAPI(
|
| title="MOSS-TTS Longform",
|
| description="OpenAI-compatible TTS API backed by the MOSS-TTS 1.5B / 7B PyTorch model.",
|
| version="1.0.0",
|
| lifespan=lifespan,
|
| )
|
|
|
| app.add_middleware(
|
| CORSMiddleware,
|
| allow_origins=["*"],
|
| allow_methods=["*"],
|
| allow_headers=["*"],
|
| )
|
|
|
|
|
| @app.get("/")
|
| async def root():
|
| return {
|
| "service": "moss-tts-longform",
|
| "status": "ok",
|
| "model": DEFAULT_MODEL_PATH,
|
| }
|
|
|
|
|
| @app.get("/health")
|
| async def health():
|
| if _adapter is None or _adapter._model is None:
|
| raise HTTPException(status_code=503, detail="Backend not ready.")
|
| return {"status": "ok", "backend": "longform-native"}
|
|
|
|
|
| @app.get("/v1/models")
|
| async def list_models():
|
| return {
|
| "object": "list",
|
| "data": [
|
| {"id": mid, "object": "model", "owned_by": "OpenMOSS-Team"}
|
| for mid in sorted(_SUPPORTED_MODELS)
|
| ],
|
| }
|
|
|
|
|
| @app.get("/v1/voices")
|
| async def list_voices():
|
| voices = [
|
| VoiceInfo(id=v, name=v.capitalize())
|
| for v in sorted(_VOICE_PRESETS)
|
| ]
|
| return {"object": "list", "data": [v.model_dump() for v in voices]}
|
|
|
|
|
| @app.post("/v1/audio/speech")
|
| async def create_speech(payload: OpenAISpeechRequest):
|
| import asyncio
|
|
|
| loop = asyncio.get_event_loop()
|
| try:
|
| encoded, metrics = await loop.run_in_executor(None, _synthesize, payload)
|
| except HTTPException:
|
| raise
|
| except Exception as exc:
|
| log.exception("Synthesis failed")
|
| raise HTTPException(status_code=500, detail=str(exc)) from exc
|
|
|
| headers = {
|
| "Content-Disposition": f"attachment; filename=speech.{payload.response_format}",
|
| "X-MOSS-TTFB-MS": f"{metrics['ttfb_ms']:.1f}",
|
| "X-MOSS-RTF": f"{metrics['rtf']:.4f}",
|
| "X-MOSS-AUDIO-SECONDS": f"{metrics['audio_seconds']:.4f}",
|
| "X-MOSS-STAGE-MODEL-MS": f"{metrics['model_generation_seconds'] * 1000:.1f}",
|
| "X-MOSS-STAGE-EMIT-MS": f"{metrics['audio_emit_seconds'] * 1000:.1f}",
|
| "X-MOSS-STAGE-TOTAL-MS": f"{metrics['total_seconds'] * 1000:.1f}",
|
| }
|
|
|
| return Response(
|
| content=encoded,
|
| media_type=_content_type(payload.response_format),
|
| headers=headers,
|
| )
|
|
|
|
|
|
|
|
|
|
|
|
|
| def main() -> None:
|
| import uvicorn
|
|
|
| logging.basicConfig(
|
| level=os.getenv("MOSS_TTS_LOG_LEVEL", "INFO").upper(),
|
| format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
| )
|
| uvicorn.run(
|
| app,
|
| host=os.getenv("MOSS_TTS_HOST", "0.0.0.0"),
|
| port=int(os.getenv("MOSS_TTS_PORT", "8013")),
|
| log_level=os.getenv("MOSS_TTS_LOG_LEVEL", "info").lower(),
|
| )
|
|
|
|
|
| if __name__ == "__main__":
|
| main()
|
|
|