"""Bark TTS HTTP API.

Exposes two routes on a FastAPI app:

* ``GET /meta``  - the supported languages, voice presets, expression /
  sound-effect tags, style hints and concurrency limits.
* ``POST /tts``  - synthesize speech for a text with a given voice preset and
  return it as a WAV file.

Bark models are preloaded when this module is imported.
"""

import asyncio
import gc
import io
import os
from typing import Dict, List, Optional

import bark
import numpy as np
import soundfile as sf
import torch
from bark import SAMPLE_RATE, generate_audio, preload_models
from fastapi import FastAPI, HTTPException
from fastapi.responses import Response
from pydantic import BaseModel

# =====================================================
# APP
# =====================================================

app = FastAPI(title="Bark TTS Production API")

# =====================================================
# LOAD MODELS ON STARTUP
# =====================================================

preload_models()

# =====================================================
# CONCURRENCY CONTROL
# =====================================================

# Max 20 concurrent generations in total. Kept as a named constant so the
# semaphore and the value reported by /meta cannot drift apart.
MAX_CONCURRENT_USERS = 20
GLOBAL_SEMAPHORE = asyncio.Semaphore(MAX_CONCURRENT_USERS)

# Per-voice sequential locks (one single-slot semaphore per voice preset).
VOICE_LOCKS: Dict[str, asyncio.Semaphore] = {}
VOICE_LOCKS_LOCK = asyncio.Lock()  # protects VOICE_LOCKS dict


async def get_voice_lock(voice: str) -> asyncio.Semaphore:
    """Return the single-slot semaphore for *voice*, creating it on first use.

    Args:
        voice: Voice preset name used as the key into ``VOICE_LOCKS``.

    Returns:
        The semaphore shared by every request that uses this voice.
    """
    async with VOICE_LOCKS_LOCK:
        if voice not in VOICE_LOCKS:
            VOICE_LOCKS[voice] = asyncio.Semaphore(1)
        return VOICE_LOCKS[voice]


# =====================================================
# METADATA
# =====================================================

EXPRESSIONS = {
    # --------------------------------------------------
    # Breathing & Airflow
    # --------------------------------------------------
    "breathing": "[breathing]",
    "slow_breathing": "[slow breathing]",
    "heavy_breathing": "[heavy breathing]",
    "shallow_breathing": "[shallow breathing]",
    "deep_inhale": "[inhales deeply]",
    "deep_exhale": "[exhales]",
    "sharp_inhale": "[sharp inhale]",
    "gasp": "[gasps]",
    "panting": "[panting]",
    "catching_breath": "[catching breath]",
    # --------------------------------------------------
    # Sighs & Relief
    # --------------------------------------------------
    "sigh": "[sighs]",
    "relieved_sigh": "[sighs with relief]",
    "frustrated_sigh": "[frustrated sigh]",
    "tired_sigh": "[tired sigh]",
    # --------------------------------------------------
    # Hesitation & Thinking
    # --------------------------------------------------
    "hesitation": "...",
    "hesitation_dots": "...",
    "hesitation_dashes": "—",
    "thinking": "[thinking]",
    "thinking_hum": "[hums thoughtfully]",
    "unsure_hum": "[hesitant hum]",
    "pause": "[pause]",
    # --------------------------------------------------
    # Laughter & Amusement
    # --------------------------------------------------
    "laughter": "[laughter]",
    "laughs": "[laughs]",
    "soft_laugh": "[soft laugh]",
    "nervous_laugh": "[nervous laugh]",
    "awkward_laugh": "[awkward laugh]",
    "giggle": "[giggles]",
    "chuckle": "[chuckles]",
    "snort": "[snorts]",
    # --------------------------------------------------
    # Effort & Exertion
    # --------------------------------------------------
    "grunt": "[grunts]",
    "strain": "[straining]",
    "effort": "[exertion]",
    "groan_effort": "[groans]",
    "release_breath": "[releases breath]",
    "fatigue": "[fatigued breath]",
    # --------------------------------------------------
    # Surprise & Reaction
    # --------------------------------------------------
    "surprised_gasp": "[surprised gasp]",
    "startled": "[startled]",
    "shock": "[shocked inhale]",
    "oh_reaction": "[sudden inhale]",
    # --------------------------------------------------
    # Pain / Distress (non-graphic)
    # --------------------------------------------------
    "pain_grunt": "[painful grunt]",
    "wince": "[winces]",
    "distress": "[distressed breathing]",
    "strained_voice": "[strained voice]",
    # --------------------------------------------------
    # Crying & Sadness (non-graphic)
    # --------------------------------------------------
    "sob": "[sobs]",
    "quiet_sob": "[quiet sob]",
    "whimper": "[whimpers]",
    "soft_cry": "[soft crying]",
    "tearful_breath": "[tearful breathing]",
    # --------------------------------------------------
    # Approval / Disapproval
    # --------------------------------------------------
    "approval_hum": "[approving hum]",
    "satisfied_hum": "[satisfied hum]",
    "disappointed": "[disappointed sigh]",
    "annoyed": "[annoyed exhale]",
    "dismissive": "[dismissive huff]",
    # --------------------------------------------------
    # Sleep & Tiredness
    # --------------------------------------------------
    "yawn": "[yawns]",
    "sleepy_breath": "[sleepy breathing]",
    "groggy": "[groggy inhale]",
    "relaxed_exhale": "[relaxed exhale]",
    # --------------------------------------------------
    # Ambient / Texture
    # --------------------------------------------------
    "murmur": "[murmuring]",
    "background_voice": "[background voice]",
    "indistinct_vocal": "[indistinct vocal sounds]",
    "muffled_voice": "[muffled voice]",
    # --------------------------------------------------
    # Voice Modifiers (approximate)
    # --------------------------------------------------
    "whisper": "(whispers)",
    "shout": "(shouting)",
    "breathy_voice": "[breathy voice]",
    "raspy_voice": "[raspy voice]",
    "trembling_voice": "[trembling voice]",
    # --------------------------------------------------
    # Misc
    # --------------------------------------------------
    "clears_throat": "[clears throat]",
}

SOUND_EFFECTS = {
    # --------------------------------------------------
    # Music & Musical Cues (approximate)
    # --------------------------------------------------
    "music": "[music]",
    "background_music": "[background music]",
    "music_fade_in": "[music fades in]",
    "music_fade_out": "[music fades out]",
    "humming_music": "[humming]",
    # NOTE(review): nothing in this module formats the "{text}" placeholder,
    # so /tts sends it to the model literally. Confirm the intended behavior.
    "song_notes": "♪ {text} ♪",
    "melody": "[melodic humming]",
    # --------------------------------------------------
    # Cinematic / Dramatic
    # --------------------------------------------------
    "dramatic_hit": "*dramatic cinematic sound*",
    "dramatic_pause": "*dramatic pause*",
    "dramatic_silence": "*sudden silence*",
    "impact": "*impact sound*",
    "tension_rise": "*rising tension sound*",
    "reveal": "*cinematic reveal sound*",
    # --------------------------------------------------
    # Radio / Audio Artifacts
    # --------------------------------------------------
    "static": "*radio static noise*",
    "interference": "*audio interference*",
    "signal_drop": "*signal breaking up*",
    "distortion": "*distorted audio*",
    "low_quality": "*low quality audio*",
    "glitch": "*audio glitch*",
    # --------------------------------------------------
    # Echo / Space / Environment
    # --------------------------------------------------
    "echo": "*echoing voice*",
    "reverb": "*reverberation*",
    "hall_echo": "*large hall echo*",
    "distance_voice": "*distant voice*",
    "muffled": "*muffled sound*",
    "underwater": "*underwater sound*",
    # --------------------------------------------------
    # Movement / Presence (vocal-texture based)
    # --------------------------------------------------
    "footsteps": "*footsteps*",
    "approaching": "*approaching sound*",
    "departing": "*fading footsteps*",
    "rustle": "*rustling sound*",
    "cloth_movement": "*fabric movement*",
    # --------------------------------------------------
    # Environmental Ambience (approximate)
    # --------------------------------------------------
    "wind": "*wind blowing*",
    "rain": "*rain falling*",
    "storm": "*distant thunder*",
    "crowd": "*crowd murmuring*",
    "room_tone": "*room tone*",
    "outdoor_ambience": "*outdoor ambience*",
    # --------------------------------------------------
    # Mechanical / Tech (very rough)
    # --------------------------------------------------
    "beep": "*beep*",
    "alarm": "*alarm sound*",
    "machine_hum": "*machine humming*",
    "engine": "*engine rumble*",
    "power_on": "*powering on sound*",
    "power_off": "*powering down sound*",
    # --------------------------------------------------
    # Creature / Non-human (unstable)
    # --------------------------------------------------
    "growl": "*low growl*",
    "roar": "*distant roar*",
    "rumble": "*low rumble*",
    "alien": "*alien-like sound*",
    # --------------------------------------------------
    # Silence / Timing Control
    # --------------------------------------------------
    "pause": "*pause*",
    "short_pause": "*brief pause*",
    "long_pause": "*long pause*",
}

STYLE_HINTS = {
    "narration": "in a calm, professional narration tone",
    "emotional": "with deep emotional expression",
    "robotic": "with a robotic, synthetic delivery",
    "space": "as if speaking through a helmet in space with echo",
    "cinematic": "cinematic, dramatic delivery",
}

LANGUAGES = [
    "English",
    "French",
    "Spanish",
    "German",
    "Italian",
    "Portuguese",
    "Polish",
    "Russian",
    "Turkish",
    "Chinese",
    "Japanese",
    "Korean",
    "Hindi",
]

_PRESET_SUFFIX = ".npz"


def load_all_bark_voice_presets() -> List[str]:
    """List the voice presets found in the installed ``bark`` package.

    Scans ``<bark package dir>/assets/prompts`` (top level only) for ``.npz``
    files.

    Returns:
        Sorted preset names with the ``.npz`` extension removed, or an empty
        list when the prompts directory does not exist.
    """
    bark_root = os.path.dirname(bark.__file__)
    prompts_dir = os.path.join(bark_root, "assets", "prompts")
    if not os.path.isdir(prompts_dir):
        return []
    # Strip only the trailing extension; str.replace would also remove any
    # ".npz" occurring elsewhere in the file name.
    return sorted(
        name[: -len(_PRESET_SUFFIX)]
        for name in os.listdir(prompts_dir)
        if name.endswith(_PRESET_SUFFIX)
    )


VOICE_PRESETS = load_all_bark_voice_presets()

# =====================================================
# UTILS
# =====================================================


def chunk_text(text: str, max_chars: int = 200) -> List[str]:
    """Split *text* on whitespace into chunks of at most *max_chars* characters.

    Words are re-joined with single spaces. A chunk is closed before the next
    word would push it past *max_chars*, so a chunk only exceeds the limit
    when a single word is itself longer than *max_chars*.

    Args:
        text: Text to split.
        max_chars: Upper bound on the length of each chunk.

    Returns:
        The chunks in order; an empty list when *text* has no words.
    """
    chunks: List[str] = []
    current: List[str] = []
    current_len = 0  # length of " ".join(current), tracked incrementally

    for word in text.split():
        # One extra character for the joining space when the chunk is non-empty.
        needed = len(word) + (1 if current else 0)
        if current and current_len + needed > max_chars:
            chunks.append(" ".join(current))
            current = []
            current_len = 0
            needed = len(word)
        current.append(word)
        current_len += needed

    if current:
        chunks.append(" ".join(current))
    return chunks


def clean_memory() -> None:
    """Run the garbage collector and, when CUDA is available, empty its cache."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


# =====================================================
# ROUTES
# =====================================================


@app.get("/meta")
def meta():
    """Return the service metadata: languages, voices, tags, styles, limits."""
    return {
        "languages": LANGUAGES,
        "voices": VOICE_PRESETS,
        "expressions": EXPRESSIONS,
        "sound_effects": SOUND_EFFECTS,
        "styles": STYLE_HINTS,
        "limits": {
            "max_concurrent_users": MAX_CONCURRENT_USERS,
            "per_voice_sequential": True,
        },
    }


class TTSRequest(BaseModel):
    """Request body for ``POST /tts``."""

    # Text to synthesize; must contain non-whitespace characters.
    text: str
    # Voice preset name; must be one of VOICE_PRESETS.
    voice: str
    # Keys of EXPRESSIONS; unknown keys are ignored.
    expressions: List[str] = []
    # Keys of SOUND_EFFECTS; unknown keys are ignored.
    sounds: List[str] = []
    # Key of STYLE_HINTS; ignored when missing or unknown.
    style: Optional[str] = None


@app.post("/tts")
async def tts(req: TTSRequest):
    """Synthesize speech for the request and return it as a WAV response.

    The expression, sound and style tags are joined into a prefix that is
    prepended to the first text chunk only. Chunks are generated one after
    another in a worker thread and concatenated into one audio stream.

    Args:
        req: The validated request body.

    Returns:
        A ``Response`` with ``audio/wav`` content.

    Raises:
        HTTPException: 400 when the text is blank or the voice preset is
            unknown; 500 when no audio was generated.
    """
    text = req.text
    voice = req.voice

    # 1. Validation
    if not text.strip():
        raise HTTPException(status_code=400, detail="Text is required")
    if voice not in VOICE_PRESETS:
        raise HTTPException(
            status_code=400, detail=f"Invalid voice preset: {voice}"
        )

    # 2. Build the Audio Prefix (Expressions, Sounds, Style)
    prompt_parts = [EXPRESSIONS[e] for e in req.expressions if e in EXPRESSIONS]
    prompt_parts.extend(
        SOUND_EFFECTS[s] for s in req.sounds if s in SOUND_EFFECTS
    )
    if req.style and req.style in STYLE_HINTS:
        prompt_parts.append(STYLE_HINTS[req.style])
    prefix = " ".join(prompt_parts).strip()

    # 3. Chunk ONLY the spoken text. The text was validated as non-blank, so
    #    there is always at least one chunk.
    chunks = chunk_text(text)

    # Prepend the prefix (sounds/expressions) only to the very first chunk
    if prefix:
        chunks[0] = f"{prefix} {chunks[0]}"

    # 4. Acquire Locks. The per-voice lock is taken BEFORE the global
    #    semaphore so requests queued behind a busy voice do not occupy
    #    global slots and starve requests for other voices. Every request
    #    uses the same order, so this cannot deadlock.
    voice_lock = await get_voice_lock(voice)
    async with voice_lock:
        async with GLOBAL_SEMAPHORE:
            try:
                audio_out = []

                # 5. Generate Audio sequentially for each chunk
                for chunk in chunks:
                    # Bark can sometimes fail on completely empty strings,
                    # skip if empty
                    if not chunk.strip():
                        continue
                    audio_array = await asyncio.to_thread(
                        generate_audio, chunk, history_prompt=voice
                    )
                    audio_out.append(audio_array)
                    clean_memory()

                # 6. Combine and package the audio
                if not audio_out:
                    raise HTTPException(
                        status_code=500, detail="Failed to generate audio."
                    )

                final_audio = np.concatenate(audio_out)
                buffer = io.BytesIO()
                # SAMPLE_RATE is the sample rate exported by bark (24 kHz).
                sf.write(buffer, final_audio, SAMPLE_RATE, format="WAV")
                wav_bytes = buffer.getvalue()
            finally:
                # Release memory even when generation or encoding fails.
                clean_memory()

    return Response(content=wav_bytes, media_type="audio/wav")