| import logging |
| import os |
| import shutil |
| import tempfile |
| from pathlib import Path |
| from typing import Dict |
|
|
| from fastapi import FastAPI, HTTPException |
| from fastapi.responses import JSONResponse, Response |
| from openai import OpenAI |
| from pydantic import BaseModel, Field |
| from pydub import AudioSegment |
|
|
|
|
# Module-level logger; handlers and level are attached by _configure_app_logger().
logger = logging.getLogger(__name__)
|
|
|
|
def _configure_app_logger() -> None:
    """Attach a stream handler so this module's INFO logs are emitted even when
    the root logger (e.g. under uvicorn) stays at WARNING.

    Idempotent: once a handler is present, subsequent calls do nothing.
    """
    if logger.handlers:
        return

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(
        logging.Formatter("%(levelname)s %(name)s: %(message)s")
    )
    logger.addHandler(stream_handler)

    # Don't bubble records up to the root logger (avoids duplicate log lines).
    logger.propagate = False

    # LOG_LEVEL env var selects the level; unrecognized names fall back to INFO.
    requested_level = os.environ.get("LOG_LEVEL", "INFO").upper()
    logger.setLevel(getattr(logging, requested_level, logging.INFO))
|
|
|
|
# Configure logging at import time so startup messages are visible under uvicorn.
_configure_app_logger()


app = FastAPI()
|
|
|
|
# Model metadata reported verbatim by /health (its response shape is frozen).
HF_REPO_ID = "michael-chan-000/tts-v5-step-6000"
HF_REVISION = "2beb3f05d627e5ac12002f0c8ee0b643cefa7ecf"
# Output sample rate in Hz, applied during the MP3 -> WAV conversion in /speak.
SAMPLE_RATE = 24000
ADAPTER = "qwen3_tts_repo_snapshot"


# NOTE(review): hard-coded True — no model is loaded in this process (synthesis
# is delegated to the OpenAI API); /health simply reports this constant.
MODEL_LOADED = True


# OpenAI TTS model name; overridable via the OPENAI_TTS_MODEL env var.
OPENAI_TTS_MODEL = os.environ.get("OPENAI_TTS_MODEL", "gpt-4o-mini-tts")
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
# Base-voice selection table used by choose_openai_voice(). Each entry can be
# overridden via its corresponding environment variable; the defaults are the
# preferred natural-sounding OpenAI voice for each trait bucket.

# Female buckets: default, soft (calm/sad/warm), bright (happy/fast/youthful).
VOICE_FEMALE_DEFAULT = os.environ.get("OPENAI_TTS_VOICE_FEMALE_DEFAULT", "nova")
VOICE_FEMALE_SOFT = os.environ.get("OPENAI_TTS_VOICE_FEMALE_SOFT", "shimmer")
VOICE_FEMALE_BRIGHT = os.environ.get("OPENAI_TTS_VOICE_FEMALE_BRIGHT", "coral")


# Male buckets all default to "ash"; the env overrides exist so deployments can
# differentiate serious/bright male deliveries without a code change.
VOICE_MALE_DEFAULT = os.environ.get("OPENAI_TTS_VOICE_MALE_DEFAULT", "ash")

VOICE_MALE_SERIOUS = os.environ.get("OPENAI_TTS_VOICE_MALE_SERIOUS", "ash")

VOICE_MALE_BRIGHT = os.environ.get("OPENAI_TTS_VOICE_MALE_BRIGHT", "ash")


# Neutral/unknown-gender buckets.
VOICE_NEUTRAL_DEFAULT = os.environ.get("OPENAI_TTS_VOICE_NEUTRAL_DEFAULT", "alloy")
VOICE_NEUTRAL_FORMAL = os.environ.get("OPENAI_TTS_VOICE_NEUTRAL_FORMAL", "sage")

VOICE_NEUTRAL_EXPRESSIVE = os.environ.get("OPENAI_TTS_VOICE_NEUTRAL_EXPRESSIVE", "verse")
VOICE_NEUTRAL_YOUTHFUL = os.environ.get("OPENAI_TTS_VOICE_NEUTRAL_YOUTHFUL", "verse")
# NOTE(review): VOICE_STORYLIKE is defined but not referenced by any visible
# code path in this file — presumably reserved for future use; confirm.
VOICE_STORYLIKE = os.environ.get("OPENAI_TTS_VOICE_STORYLIKE", "fable")
|
|
|
|
# Trait keys recognized by parse_trait_tags(); any other key in the instruction
# string is silently ignored. Keep this in sync with ALLOWED_TRAITS below.
TRAIT_KEYS = {
    "gender",
    "pitch",
    "speed",
    "age_group",
    "emotion",
    "tone",
    "accent",
}
|
|
|
|
| |
| |
# Whitelisted enum values per trait key. parse_trait_tags() drops any value not
# listed here, so downstream code can trust trait values without re-validating.
ALLOWED_TRAITS = {
    "gender": {
        "male",
        "female",
        "neutral",
        "unknown",
    },
    # "mid" and "normal" are both accepted and treated as medium pitch.
    "pitch": {
        "low",
        "mid",
        "high",
        "normal",
    },
    "speed": {
        "slow",
        "normal",
        "fast",
    },
    "age_group": {
        "child",
        "teen",
        "young_adult",
        "adult",
        "middle_aged",
        "senior",
        "elderly",
    },
    "emotion": {
        "neutral",
        "happy",
        "sad",
        "angry",
        "calm",
        "excited",
        "serious",
        "fearful",
    },
    "tone": {
        "neutral",
        "formal",
        "informal",
        "friendly",
        "serious",
        "warm",
        "cold",
        "urgent",
        "calm",
    },
    "accent": {
        "us",
        "uk",
        "au",
        "ca",
        "indian",
        "irish",
        "scottish",
        "neutral",
    },
}
|
|
|
|
class SpeakRequest(BaseModel):
    """Request body for /speak.

    `text` is the only content that is spoken; `instruction` carries
    pipe-delimited style metadata (e.g. "| gender: female | pitch: mid |")
    and is never read aloud.
    """

    # Pipe-delimited trait tags; parsed by parse_trait_tags().
    instruction: str = Field(min_length=1)
    # The transcription to synthesize verbatim.
    text: str = Field(min_length=1)
|
|
|
|
@app.get("/health")
def health():
    """
    Health check endpoint.

    IMPORTANT:
        Do not change this response shape.

    It must always return:
        status, hf_repo_id, hf_revision, model_loaded, sample_rate, adapter
    """
    payload = {
        "status": "healthy",
        "hf_repo_id": HF_REPO_ID,
        "hf_revision": HF_REVISION,
        "model_loaded": MODEL_LOADED,
        "sample_rate": SAMPLE_RATE,
        "adapter": ADAPTER,
    }
    return JSONResponse(content=payload, status_code=200)
|
|
|
|
def require_ffmpeg() -> None:
    """
    Fail fast when the FFmpeg toolchain is unavailable.

    pydub shells out to ffmpeg/ffprobe for MP3 decoding; without them /speak
    would otherwise return invalid or empty audio. Raising here surfaces the
    problem as a clear HTTP 500 instead.

    Raises:
        RuntimeError: if ffmpeg or ffprobe cannot be found on PATH.
    """
    for tool, label in (("ffmpeg", "FFmpeg"), ("ffprobe", "ffprobe")):
        if shutil.which(tool) is None:
            raise RuntimeError(
                f"{label} is not installed or not available on PATH. "
                "Install ffmpeg before using /speak."
            )
|
|
|
|
def normalize_trait_value(value: str) -> str:
    """
    Canonicalize a validator enum value to lowercase snake_case.

    Trims surrounding whitespace, lowercases, then turns hyphens and inner
    spaces into underscores, e.g. "Young-Adult" -> "young_adult".
    """
    cleaned = value.strip().lower()
    for separator in ("-", " "):
        cleaned = cleaned.replace(separator, "_")
    return cleaned
|
|
|
|
def parse_trait_tags(instruction: str) -> Dict[str, str]:
    """
    Extract validated trait tags from a pipe-delimited instruction string.

    Accepts input with or without a leading pipe, e.g.:

        | gender: female | pitch: mid | speed: normal | age_group: adult |
        emotion: angry | tone: formal | accent: uk

    Guarantees:
        - Only keys listed in TRAIT_KEYS are extracted.
        - Only values whitelisted in ALLOWED_TRAITS are kept.
        - Unknown keys/values and malformed segments are dropped silently.
        - The raw instruction text itself is never spoken.
    """
    parsed: Dict[str, str] = {}

    for segment in instruction.split("|"):
        segment = segment.strip()

        # Skip empty segments (leading/trailing pipes) and key-less fragments.
        if not segment or ":" not in segment:
            continue

        raw_key, raw_value = segment.split(":", 1)
        key = raw_key.strip().lower()
        value = normalize_trait_value(raw_value)

        if key not in TRAIT_KEYS:
            continue

        allowed = ALLOWED_TRAITS.get(key)
        if allowed and value not in allowed:
            # Out-of-vocabulary value: ignore rather than guess.
            continue

        if value:
            parsed[key] = value

    return parsed
|
|
|
|
def choose_openai_voice(traits: Dict[str, str]) -> str:
    """
    Pick the OpenAI base voice from structured validator traits.

    Why this exists:
        - OpenAI TTS requires an explicit voice=...
        - The instructions field does not reliably change speaker identity,
          so gender-like traits must drive the base-voice choice instead.

    Selection strategy:
        - Traits are collapsed into coarse buckets (serious / soft / bright /
          youthful); each gender picks the most natural voice for its bucket,
          with env-var overrides honored via the VOICE_* constants.

    Caveats:
        - There is no official OpenAI child voice; age_group only shapes the
          style instruction and cannot precisely control perceived age.
        - This function never alters the spoken text.
    """
    gender = traits.get("gender", "unknown")
    emotion = traits.get("emotion", "neutral")
    tone = traits.get("tone", "neutral")
    age_group = traits.get("age_group", "adult")
    pitch = traits.get("pitch", "mid")
    speed = traits.get("speed", "normal")

    # Collapse the raw traits into coarse delivery buckets.
    sounds_serious = (
        emotion in {"serious", "angry"}
        or tone in {"formal", "serious", "urgent", "cold"}
    )
    sounds_soft = emotion in {"calm", "sad", "fearful"} or tone in {"calm", "warm"}
    sounds_bright = (
        emotion in {"happy", "excited"}
        or tone in {"friendly", "informal"}
        or speed == "fast"
    )
    sounds_young = age_group in {"child", "teen", "young_adult"} or pitch == "high"

    if gender == "female":
        if sounds_bright or sounds_young:
            return VOICE_FEMALE_BRIGHT
        return VOICE_FEMALE_SOFT if sounds_soft else VOICE_FEMALE_DEFAULT

    if gender == "male":
        if sounds_serious:
            return VOICE_MALE_SERIOUS
        if sounds_bright or sounds_young:
            return VOICE_MALE_BRIGHT
        return VOICE_MALE_DEFAULT

    # gender is "neutral"/"unknown" (or absent).
    if sounds_serious:
        return VOICE_NEUTRAL_FORMAL
    if sounds_bright:
        return VOICE_NEUTRAL_EXPRESSIVE
    if age_group in {"child", "teen"}:
        return VOICE_NEUTRAL_YOUTHFUL
    return VOICE_NEUTRAL_DEFAULT
|
|
|
|
def build_openai_tts_instructions(traits: Dict[str, str]) -> str:
    """
    Render structured validator tags as OpenAI TTS delivery guidance.

    Critical rule:
        - The returned string describes delivery/style only.
        - It is not content to speak; the spoken content remains req.text.

    Returns:
        A single space-joined guidance string: fixed preamble sentences
        followed by one sentence per present trait, in the fixed order
        gender, age_group, pitch, speed, emotion, tone, accent.
    """
    # Fixed preamble: tells the model the metadata is style-only.
    sentences = [
        "Use the following metadata only as voice and delivery guidance.",
        "Do not read, say, mention, or paraphrase the metadata.",
        "Speak only the input text.",
        "Prioritize natural, human-sounding speech with clear articulation.",
        "Keep delivery conversational and believable; avoid robotic delivery.",
        "Do not add cartoonish or exaggerated theatricality unless the metadata clearly demands a strong stylistic effect.",
    ]

    gender = traits.get("gender")
    # Only a definite gender changes presentation; neutral/unknown add nothing.
    if gender and gender not in {"unknown", "neutral"}:
        sentences.append(f"Use a {gender} vocal presentation.")

    age_group = traits.get("age_group")
    if age_group:
        age_phrases = {
            "child": "Use a youthful, childlike delivery where possible, while keeping speech natural and clear.",
            "teen": "Use a teenage-sounding delivery where possible, while keeping speech natural and clear.",
            "young_adult": "Use a young adult delivery.",
            "senior": "Use a mature older-speaker delivery.",
            "elderly": "Use a mature older-speaker delivery.",
        }
        sentences.append(
            age_phrases.get(age_group, f"Sound like a {age_group.replace('_', ' ')} speaker.")
        )

    pitch = traits.get("pitch")
    if pitch:
        # "mid" and "normal" are synonyms for medium pitch.
        sentences.append(
            "Use a medium pitch." if pitch in {"mid", "normal"} else f"Use a {pitch} pitch."
        )

    speed = traits.get("speed")
    if speed:
        sentences.append(
            "Use a normal speaking speed." if speed == "normal" else f"Use a {speed} speaking speed."
        )

    emotion = traits.get("emotion")
    if emotion:
        sentences.append(f"Convey a {emotion} emotion.")

    tone = traits.get("tone")
    if tone:
        sentences.append(f"Use a {tone} tone.")

    accent = traits.get("accent")
    if accent:
        accent_phrases = {
            "uk": "Use a UK English accent.",
            "us": "Use a US English accent.",
            "au": "Use an Australian English accent.",
            "ca": "Use a Canadian English accent.",
            "indian": "Use an Indian English accent.",
            "irish": "Use an Irish English accent.",
            "scottish": "Use a Scottish English accent.",
            "neutral": "Use a neutral English accent.",
        }
        sentences.append(
            accent_phrases.get(accent, f"Use a {accent.replace('_', ' ')} accent.")
        )

    return " ".join(sentences)
|
|
|
|
@app.post("/speak")
def speak(req: SpeakRequest):
    """
    Generate speech and return raw WAV bytes.

    Validator sends:

        {
            "text": "The transcription to speak.",
            "instruction": "| gender: female | pitch: mid | speed: normal | age_group: adult | emotion: angry | tone: formal | accent: uk"
        }

    Important:
        - req.text is the only spoken content.
        - req.instruction is structured style metadata only.
        - voice is selected from parsed traits because OpenAI requires voice=...
        - The raw instruction string is never spoken.

    Response:
        - raw WAV bytes
        - Content-Type: audio/wav

    This endpoint does not return:
        - base64
        - JSON-wrapped audio
        - file paths
    """
    api_key = os.environ.get("OPENAI_API_KEY")

    if not api_key:
        # Misconfiguration, not a client error: report as a server-side 500.
        raise HTTPException(
            status_code=500,
            detail="OPENAI_API_KEY is not set",
        )

    try:
        # Fail fast if ffmpeg/ffprobe are missing (pydub needs them for MP3 decode).
        require_ffmpeg()

        traits = parse_trait_tags(req.instruction)

        # OpenAI requires an explicit base voice; parsed traits choose it, and
        # the remaining metadata becomes style-only guidance (never spoken).
        voice = choose_openai_voice(traits)
        tts_instructions = build_openai_tts_instructions(traits)

        logger.info(
            "Instruction pipeline: raw=%r | parsed_traits=%s | chosen_voice=%r | openai_instructions=%r",
            req.instruction,
            traits,
            voice,
            tts_instructions,
        )
        logger.info(
            "OpenAI audio.speech.create payload: model=%r voice=%r response_format=%r input=%r instructions=%r",
            OPENAI_TTS_MODEL,
            voice,
            "mp3",
            req.text,
            tts_instructions,
        )

        client = OpenAI(api_key=api_key)

        # Work inside a temp dir so intermediate MP3/WAV files are always
        # cleaned up, even on failure.
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir_path = Path(tmpdir)

            mp3_path = tmpdir_path / "speech.mp3"
            wav_path = tmpdir_path / "speech.wav"

            # Request MP3 from OpenAI; conversion to WAV happens locally below.
            speech = client.audio.speech.create(
                model=OPENAI_TTS_MODEL,
                voice=voice,
                input=req.text,
                instructions=tts_instructions,
                response_format="mp3",
            )

            speech.stream_to_file(mp3_path)

            # Guard against silent API failures before handing pydub the file.
            if not mp3_path.exists():
                raise RuntimeError("OpenAI TTS did not create an MP3 file")

            if mp3_path.stat().st_size == 0:
                raise RuntimeError("OpenAI TTS created an empty MP3 file")

            audio = AudioSegment.from_file(mp3_path, format="mp3")

            # Normalize to the advertised output: mono, 16-bit PCM at SAMPLE_RATE.
            audio = (
                audio
                .set_frame_rate(SAMPLE_RATE)
                .set_channels(1)
                .set_sample_width(2)
            )

            audio.export(wav_path, format="wav")

            if not wav_path.exists():
                raise RuntimeError("MP3 to WAV conversion failed: WAV file missing")

            if wav_path.stat().st_size == 0:
                raise RuntimeError("MP3 to WAV conversion failed: WAV file is empty")

            # Read the bytes before the TemporaryDirectory context deletes them.
            wav_bytes = wav_path.read_bytes()

            return Response(
                content=wav_bytes,
                media_type="audio/wav",
                headers={
                    "Content-Disposition": 'inline; filename="speech.wav"',
                },
            )

    except HTTPException:
        # Re-raise as-is so earlier HTTP errors keep their specific detail.
        raise

    except Exception as e:
        # Collapse all other failures into a single 500 carrying the root cause.
        raise HTTPException(
            status_code=500,
            detail=f"TTS generation failed: {str(e)}",
        )