# bark-tts / main.py
# lainlives's picture
# Update main.py
# 9f90330 verified
import gc
import io
import asyncio
import numpy as np
import soundfile as sf
import torch
import os
import bark
from fastapi import FastAPI, HTTPException
from fastapi.responses import Response
from bark import generate_audio, preload_models
from typing import List, Dict
from pydantic import BaseModel
from typing import List, Optional
# =====================================================
# APP
# =====================================================
# FastAPI application object; the route decorators further down register on it.
app = FastAPI(title="Bark TTS Production API")
# =====================================================
# LOAD MODELS ON STARTUP
# =====================================================
# Called at import time, so importing this module does not finish until
# Bark's preload_models() has returned.
preload_models()
# =====================================================
# CONCURRENCY CONTROL
# =====================================================
# Max 20 concurrent users total: tts() holds one slot of this semaphore for
# the whole generation loop of a request.
GLOBAL_SEMAPHORE = asyncio.Semaphore(20)
# Per-voice sequential locks: one single-slot semaphore per voice name,
# created lazily by get_voice_lock(), so requests that share a voice run
# one after another.
VOICE_LOCKS: Dict[str, asyncio.Semaphore] = {}
VOICE_LOCKS_LOCK = asyncio.Lock()  # protects VOICE_LOCKS dict
async def get_voice_lock(voice: str) -> asyncio.Semaphore:
    """Return the single-slot semaphore that serializes work for ``voice``.

    The semaphore is created the first time a voice name is seen and the same
    object is handed back on every later call. The registry is only touched
    while ``VOICE_LOCKS_LOCK`` is held.
    """
    async with VOICE_LOCKS_LOCK:
        try:
            return VOICE_LOCKS[voice]
        except KeyError:
            # First request for this voice: register a fresh one-slot semaphore.
            created = VOICE_LOCKS[voice] = asyncio.Semaphore(1)
            return created
# =====================================================
# METADATA
# =====================================================
# Maps the expression keys a client may send in TTSRequest.expressions to the
# text tag that tts() places in front of the spoken text. The whole dict is
# also returned by /meta under "expressions".
# NOTE(review): Bark's README documents only a handful of non-speech cues
# (e.g. [laughter], [laughs], [sighs], [gasps], [clears throat], "—"/"...");
# the other tags here are presumably best-effort prompts — confirm by ear.
EXPRESSIONS = {
    # --------------------------------------------------
    # Breathing & Airflow
    # --------------------------------------------------
    "breathing": "[breathing]",
    "slow_breathing": "[slow breathing]",
    "heavy_breathing": "[heavy breathing]",
    "shallow_breathing": "[shallow breathing]",
    "deep_inhale": "[inhales deeply]",
    "deep_exhale": "[exhales]",
    "sharp_inhale": "[sharp inhale]",
    "gasp": "[gasps]",
    "panting": "[panting]",
    "catching_breath": "[catching breath]",
    # --------------------------------------------------
    # Sighs & Relief
    # --------------------------------------------------
    "sigh": "[sighs]",
    "relieved_sigh": "[sighs with relief]",
    "frustrated_sigh": "[frustrated sigh]",
    "tired_sigh": "[tired sigh]",
    # --------------------------------------------------
    # Hesitation & Thinking
    # --------------------------------------------------
    # "hesitation" and "hesitation_dots" intentionally or not map to the same tag.
    "hesitation": "...",
    "hesitation_dots": "...",
    "hesitation_dashes": "—",
    "thinking": "[thinking]",
    "thinking_hum": "[hums thoughtfully]",
    "unsure_hum": "[hesitant hum]",
    "pause": "[pause]",
    # --------------------------------------------------
    # Laughter & Amusement
    # --------------------------------------------------
    "laughter": "[laughter]",
    "laughs": "[laughs]",
    "soft_laugh": "[soft laugh]",
    "nervous_laugh": "[nervous laugh]",
    "awkward_laugh": "[awkward laugh]",
    "giggle": "[giggles]",
    "chuckle": "[chuckles]",
    "snort": "[snorts]",
    # --------------------------------------------------
    # Effort & Exertion
    # --------------------------------------------------
    "grunt": "[grunts]",
    "strain": "[straining]",
    "effort": "[exertion]",
    "groan_effort": "[groans]",
    "release_breath": "[releases breath]",
    "fatigue": "[fatigued breath]",
    # --------------------------------------------------
    # Surprise & Reaction
    # --------------------------------------------------
    "surprised_gasp": "[surprised gasp]",
    "startled": "[startled]",
    "shock": "[shocked inhale]",
    "oh_reaction": "[sudden inhale]",
    # --------------------------------------------------
    # Pain / Distress (non-graphic)
    # --------------------------------------------------
    "pain_grunt": "[painful grunt]",
    "wince": "[winces]",
    "distress": "[distressed breathing]",
    "strained_voice": "[strained voice]",
    # --------------------------------------------------
    # Crying & Sadness (non-graphic)
    # --------------------------------------------------
    "sob": "[sobs]",
    "quiet_sob": "[quiet sob]",
    "whimper": "[whimpers]",
    "soft_cry": "[soft crying]",
    "tearful_breath": "[tearful breathing]",
    # --------------------------------------------------
    # Approval / Disapproval
    # --------------------------------------------------
    "approval_hum": "[approving hum]",
    "satisfied_hum": "[satisfied hum]",
    "disappointed": "[disappointed sigh]",
    "annoyed": "[annoyed exhale]",
    "dismissive": "[dismissive huff]",
    # --------------------------------------------------
    # Sleep & Tiredness
    # --------------------------------------------------
    "yawn": "[yawns]",
    "sleepy_breath": "[sleepy breathing]",
    "groggy": "[groggy inhale]",
    "relaxed_exhale": "[relaxed exhale]",
    # --------------------------------------------------
    # Ambient / Texture
    # --------------------------------------------------
    "murmur": "[murmuring]",
    "background_voice": "[background voice]",
    "indistinct_vocal": "[indistinct vocal sounds]",
    "muffled_voice": "[muffled voice]",
    # --------------------------------------------------
    # Voice Modifiers (approximate)
    # --------------------------------------------------
    "whisper": "(whispers)",
    "shout": "(shouting)",
    "breathy_voice": "[breathy voice]",
    "raspy_voice": "[raspy voice]",
    "trembling_voice": "[trembling voice]",
    # --------------------------------------------------
    # Misc
    # --------------------------------------------------
    "clears_throat": "[clears throat]"
}
# Maps the sound keys a client may send in TTSRequest.sounds to the text cue
# that tts() places in front of the spoken text. The whole dict is also
# returned by /meta under "sound_effects".
# The "pause" key also exists in EXPRESSIONS with a different tag; the two
# dicts are looked up separately, so the keys do not collide.
SOUND_EFFECTS = {
    # --------------------------------------------------
    # Music & Musical Cues (approximate)
    # --------------------------------------------------
    "music": "[music]",
    "background_music": "[background music]",
    "music_fade_in": "[music fades in]",
    "music_fade_out": "[music fades out]",
    "humming_music": "[humming]",
    # NOTE(review): the only entry carrying a "{text}" placeholder — confirm
    # that whatever consumes this value substitutes the spoken text into it.
    "song_notes": "♪ {text} ♪",
    "melody": "[melodic humming]",
    # --------------------------------------------------
    # Cinematic / Dramatic
    # --------------------------------------------------
    "dramatic_hit": "*dramatic cinematic sound*",
    "dramatic_pause": "*dramatic pause*",
    "dramatic_silence": "*sudden silence*",
    "impact": "*impact sound*",
    "tension_rise": "*rising tension sound*",
    "reveal": "*cinematic reveal sound*",
    # --------------------------------------------------
    # Radio / Audio Artifacts
    # --------------------------------------------------
    "static": "*radio static noise*",
    "interference": "*audio interference*",
    "signal_drop": "*signal breaking up*",
    "distortion": "*distorted audio*",
    "low_quality": "*low quality audio*",
    "glitch": "*audio glitch*",
    # --------------------------------------------------
    # Echo / Space / Environment
    # --------------------------------------------------
    "echo": "*echoing voice*",
    "reverb": "*reverberation*",
    "hall_echo": "*large hall echo*",
    "distance_voice": "*distant voice*",
    "muffled": "*muffled sound*",
    "underwater": "*underwater sound*",
    # --------------------------------------------------
    # Movement / Presence (vocal-texture based)
    # --------------------------------------------------
    "footsteps": "*footsteps*",
    "approaching": "*approaching sound*",
    "departing": "*fading footsteps*",
    "rustle": "*rustling sound*",
    "cloth_movement": "*fabric movement*",
    # --------------------------------------------------
    # Environmental Ambience (approximate)
    # --------------------------------------------------
    "wind": "*wind blowing*",
    "rain": "*rain falling*",
    "storm": "*distant thunder*",
    "crowd": "*crowd murmuring*",
    "room_tone": "*room tone*",
    "outdoor_ambience": "*outdoor ambience*",
    # --------------------------------------------------
    # Mechanical / Tech (very rough)
    # --------------------------------------------------
    "beep": "*beep*",
    "alarm": "*alarm sound*",
    "machine_hum": "*machine humming*",
    "engine": "*engine rumble*",
    "power_on": "*powering on sound*",
    "power_off": "*powering down sound*",
    # --------------------------------------------------
    # Creature / Non-human (unstable)
    # --------------------------------------------------
    "growl": "*low growl*",
    "roar": "*distant roar*",
    "rumble": "*low rumble*",
    "alien": "*alien-like sound*",
    # --------------------------------------------------
    # Silence / Timing Control
    # --------------------------------------------------
    "pause": "*pause*",
    "short_pause": "*brief pause*",
    "long_pause": "*long pause*"
}
# Maps the style key a client may send in TTSRequest.style to a free-text
# delivery hint that tts() appends to the prompt prefix. Returned by /meta
# under "styles".
# NOTE(review): the hint is plain prose inside the prompt — confirm the model
# treats it as a direction rather than reading it aloud.
STYLE_HINTS = {
    "narration": "in a calm, professional narration tone",
    "emotional": "with deep emotional expression",
    "robotic": "with a robotic, synthetic delivery",
    "space": "as if speaking through a helmet in space with echo",
    "cinematic": "cinematic, dramatic delivery",
}
# Language names advertised by /meta under "languages". In the visible code
# this list is informational only: no request field selects a language and
# tts() never reads it.
LANGUAGES = [
    "English", "French", "Spanish", "German", "Italian",
    "Portuguese", "Polish", "Russian", "Turkish",
    "Chinese", "Japanese", "Korean", "Hindi"
]
def load_all_bark_voice_presets(prompts_dir: Optional[str] = None) -> List[str]:
    """Return the sorted names of every ``.npz`` voice preset that ships with Bark.

    Args:
        prompts_dir: Directory to scan. When omitted, the ``assets/prompts``
            folder inside the installed ``bark`` package is used.

    Returns:
        Sorted preset names without the ``.npz`` suffix. Presets stored in a
        sub-folder are reported relative to ``prompts_dir`` with forward
        slashes (for example ``"v2/en_speaker_0"``), which is the spelling
        Bark accepts as ``history_prompt``. An empty list is returned when the
        directory does not exist.
    """
    if prompts_dir is None:
        bark_root = os.path.dirname(bark.__file__)
        prompts_dir = os.path.join(bark_root, "assets", "prompts")
    if not os.path.isdir(prompts_dir):
        return []

    suffix = ".npz"
    presets = []
    # Walk recursively: a flat listdir() would miss presets kept in sub-folders.
    for current_dir, _subdirs, filenames in os.walk(prompts_dir):
        rel_dir = os.path.relpath(current_dir, prompts_dir)
        for filename in filenames:
            if not filename.endswith(suffix):
                continue
            # Strip only the trailing suffix; str.replace() would also remove
            # ".npz" occurring in the middle of a name.
            stem = filename[: -len(suffix)]
            if rel_dir == os.curdir:
                presets.append(stem)
            else:
                presets.append("/".join(rel_dir.split(os.sep) + [stem]))
    presets.sort()
    return presets
# Preset names collected once at import time; tts() validates the requested
# voice against this list and /meta returns it under "voices".
VOICE_PRESETS = load_all_bark_voice_presets()
# =====================================================
# UTILS
# =====================================================
def chunk_text(text: str, max_chars: int = 200):
    """Split ``text`` on whitespace into chunks of at most ``max_chars`` characters.

    Words are never split. A chunk is closed *before* adding a word that would
    push it past ``max_chars``, so the limit is a real upper bound; the only
    exception is a single word that is itself longer than ``max_chars``, which
    becomes a chunk of its own.

    Args:
        text: Text to split; runs of whitespace are collapsed to single spaces.
        max_chars: Maximum length of each chunk in characters.

    Returns:
        List of chunk strings in original order; empty when ``text`` contains
        no words.
    """
    chunks = []
    current = []
    current_len = 0  # length of " ".join(current), tracked incrementally
    for word in text.split():
        # A joining space is only needed when the chunk already has a word.
        added_len = len(word) + (1 if current else 0)
        if current and current_len + added_len > max_chars:
            chunks.append(" ".join(current))
            current = [word]
            current_len = len(word)
        else:
            current.append(word)
            current_len += added_len
    if current:
        chunks.append(" ".join(current))
    return chunks
def clean_memory():
    """Run a garbage-collection pass and, if CUDA is present, empty its cache."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    # Hand cached, unused GPU blocks back to the driver.
    torch.cuda.empty_cache()
# =====================================================
# ROUTES
# =====================================================
@app.get("/meta")
def meta():
    """Describe what the service offers: languages, voices, tags and limits."""
    # The 20 below is a literal; it is not read from GLOBAL_SEMAPHORE, so the
    # two have to be kept in sync by hand.
    limits = {
        "max_concurrent_users": 20,
        "per_voice_sequential": True
    }
    payload = {}
    payload["languages"] = LANGUAGES
    payload["voices"] = VOICE_PRESETS
    payload["expressions"] = EXPRESSIONS
    payload["sound_effects"] = SOUND_EFFECTS
    payload["styles"] = STYLE_HINTS
    payload["limits"] = limits
    return payload
class TTSRequest(BaseModel):
    """JSON body accepted by ``POST /tts``."""

    # Text to speak; tts() answers HTTP 400 when it is empty or whitespace only.
    text: str
    # Voice preset name; tts() answers HTTP 400 unless it is in VOICE_PRESETS.
    voice: str
    # Keys into EXPRESSIONS; keys that are not in that dict are skipped by tts().
    expressions: List[str] = []
    # Keys into SOUND_EFFECTS; keys that are not in that dict are skipped by tts().
    sounds: List[str] = []
    # Optional key into STYLE_HINTS; ignored by tts() when missing or unknown.
    style: Optional[str] = None
def _resolve_prompt_tags(expressions, sounds, style):
    """Turn the request's tag keys into a prompt prefix and text wrappers.

    Keys that are not present in EXPRESSIONS / SOUND_EFFECTS / STYLE_HINTS are
    ignored. Tags that contain the ``{text}`` placeholder (such as the
    ``"♪ {text} ♪"`` song template) cannot be used as a prefix, because the
    literal placeholder would end up in the prompt; they are returned
    separately as wrappers to be applied around the spoken text.

    Returns:
        ``(prefix, wrappers)`` where ``prefix`` is the space-joined prefix
        string (possibly empty) and ``wrappers`` is a list of templates.
    """
    placeholder = "{text}"
    prefix_parts = []
    wrappers = []
    # Same order as the request fields: expressions first, then sounds.
    for keys, table in ((expressions, EXPRESSIONS), (sounds, SOUND_EFFECTS)):
        for key in keys:
            if key not in table:
                continue
            tag = table[key]
            if placeholder in tag:
                wrappers.append(tag)
            else:
                prefix_parts.append(tag)
    if style and style in STYLE_HINTS:
        prefix_parts.append(STYLE_HINTS[style])
    return " ".join(prefix_parts).strip(), wrappers


@app.post("/tts")
async def tts(req: TTSRequest):
    """Synthesize ``req.text`` with the requested Bark voice and return WAV audio.

    The text is split into chunks, each chunk is generated in a worker thread,
    and the resulting arrays are concatenated into a single WAV response.

    Raises:
        HTTPException: 400 when the text is blank or the voice is not in
            VOICE_PRESETS; 500 when no audio was produced.
    """
    text = req.text
    voice = req.voice

    # 1. Validation
    if not text.strip():
        raise HTTPException(status_code=400, detail="Text is required")
    if voice not in VOICE_PRESETS:
        raise HTTPException(status_code=400, detail=f"Invalid voice preset: {voice}")

    # 2. Build the audio prefix (expressions, sounds, style)
    prefix, wrappers = _resolve_prompt_tags(req.expressions, req.sounds, req.style)

    # 3. Chunk ONLY the spoken text (non-blank text always yields >= 1 chunk)
    chunks = chunk_text(text)
    # Templates with a "{text}" slot wrap every chunk so each generated piece
    # keeps the cue; str.replace is used so braces in the user's text are safe.
    for template in wrappers:
        chunks = [template.replace("{text}", chunk) for chunk in chunks]
    # Prepend the prefix (sounds/expressions) only to the very first chunk
    if prefix:
        chunks[0] = f"{prefix} {chunks[0]}"

    # 4. Acquire locks: a global slot, then this voice's sequential lock
    voice_lock = await get_voice_lock(voice)
    audio_out = []
    async with GLOBAL_SEMAPHORE:
        async with voice_lock:
            # 5. Generate audio sequentially for each chunk
            for chunk in chunks:
                # Bark can sometimes fail on completely empty strings, skip if empty
                if not chunk.strip():
                    continue
                try:
                    # generate_audio blocks, so run it off the event loop.
                    audio_array = await asyncio.to_thread(
                        generate_audio,
                        chunk,
                        history_prompt=voice
                    )
                finally:
                    # Free memory after every chunk, including when generation
                    # raised, so a failed request does not leave caches behind.
                    clean_memory()
                audio_out.append(audio_array)

    # 6. Combine and package the audio (outside the locks: the encoding step
    #    does not need the model, so the slots are released first)
    if not audio_out:
        raise HTTPException(status_code=500, detail="Failed to generate audio.")
    final_audio = np.concatenate(audio_out)
    buffer = io.BytesIO()
    # Use Bark's own sample-rate constant rather than repeating 24 kHz here.
    sf.write(buffer, final_audio, bark.SAMPLE_RATE, format="WAV")
    clean_memory()
    return Response(
        content=buffer.getvalue(),
        media_type="audio/wav"
    )