bark-tts

Running

App Files Files Community

bark-tts / main.py

lainlives

Update main.py

9f90330 verified 9 days ago

raw

history blame contribute delete

13.1 kB

	import gc
	import io
	import asyncio
	import numpy as np
	import soundfile as sf
	import torch
	import os

	import bark
	from fastapi import FastAPI, HTTPException
	from fastapi.responses import Response
	from bark import generate_audio, preload_models
	from typing import List, Dict
	from pydantic import BaseModel
	from typing import List, Optional
	# =====================================================
	# APP
	# =====================================================
	# ASGI application object; the route decorators further down register on it.
	app = FastAPI(title="Bark TTS Production API")

	# =====================================================
	# LOAD MODELS ON STARTUP
	# =====================================================
	# Called at module import time (not from a FastAPI startup hook), so the
	# import of this module does not finish until Bark has loaded its models.
	preload_models()

	# =====================================================
	# CONCURRENCY CONTROL
	# =====================================================

	# Max 20 concurrent users total
	# NOTE: /meta reports this same number as "max_concurrent_users" using its
	# own literal, so the two must be changed together.
	GLOBAL_SEMAPHORE = asyncio.Semaphore(20)

	# Per-voice sequential locks
	# Populated lazily by get_voice_lock(): one Semaphore(1) per voice name.
	VOICE_LOCKS: Dict[str, asyncio.Semaphore] = {}

	VOICE_LOCKS_LOCK = asyncio.Lock() # protects VOICE_LOCKS dict


	async def get_voice_lock(voice: str) -> asyncio.Semaphore:
	    """Return the single-slot semaphore for ``voice``, creating it on first use.

	    The registry is only read or extended while ``VOICE_LOCKS_LOCK`` is held,
	    so every caller asking for the same voice receives the same semaphore.
	    """
	    async with VOICE_LOCKS_LOCK:
	        semaphore = VOICE_LOCKS.get(voice)
	        if semaphore is None:
	            # First request for this voice: register its semaphore.
	            semaphore = asyncio.Semaphore(1)
	            VOICE_LOCKS[voice] = semaphore
	    return semaphore

	# =====================================================
	# METADATA
	# =====================================================
	# Maps the expression names accepted in a /tts request (and listed by /meta)
	# to the literal text that is placed in front of the spoken text in the Bark
	# prompt. Names that are not keys of this dict are ignored by /tts.
	# NOTE: "hesitation" and "hesitation_dots" intentionally or not map to the
	# same value ("...").
	EXPRESSIONS = {
	    # --------------------------------------------------
	    # Breathing & Airflow
	    # --------------------------------------------------
	    "breathing": "[breathing]",
	    "slow_breathing": "[slow breathing]",
	    "heavy_breathing": "[heavy breathing]",
	    "shallow_breathing": "[shallow breathing]",
	    "deep_inhale": "[inhales deeply]",
	    "deep_exhale": "[exhales]",
	    "sharp_inhale": "[sharp inhale]",
	    "gasp": "[gasps]",
	    "panting": "[panting]",
	    "catching_breath": "[catching breath]",

	    # --------------------------------------------------
	    # Sighs & Relief
	    # --------------------------------------------------
	    "sigh": "[sighs]",
	    "relieved_sigh": "[sighs with relief]",
	    "frustrated_sigh": "[frustrated sigh]",
	    "tired_sigh": "[tired sigh]",

	    # --------------------------------------------------
	    # Hesitation & Thinking
	    # --------------------------------------------------
	    "hesitation": "...",
	    "hesitation_dots": "...",
	    "hesitation_dashes": "—",
	    "thinking": "[thinking]",
	    "thinking_hum": "[hums thoughtfully]",
	    "unsure_hum": "[hesitant hum]",
	    "pause": "[pause]",

	    # --------------------------------------------------
	    # Laughter & Amusement
	    # --------------------------------------------------
	    "laughter": "[laughter]",
	    "laughs": "[laughs]",
	    "soft_laugh": "[soft laugh]",
	    "nervous_laugh": "[nervous laugh]",
	    "awkward_laugh": "[awkward laugh]",
	    "giggle": "[giggles]",
	    "chuckle": "[chuckles]",
	    "snort": "[snorts]",

	    # --------------------------------------------------
	    # Effort & Exertion
	    # --------------------------------------------------
	    "grunt": "[grunts]",
	    "strain": "[straining]",
	    "effort": "[exertion]",
	    "groan_effort": "[groans]",
	    "release_breath": "[releases breath]",
	    "fatigue": "[fatigued breath]",

	    # --------------------------------------------------
	    # Surprise & Reaction
	    # --------------------------------------------------
	    "surprised_gasp": "[surprised gasp]",
	    "startled": "[startled]",
	    "shock": "[shocked inhale]",
	    "oh_reaction": "[sudden inhale]",

	    # --------------------------------------------------
	    # Pain / Distress (non-graphic)
	    # --------------------------------------------------
	    "pain_grunt": "[painful grunt]",
	    "wince": "[winces]",
	    "distress": "[distressed breathing]",
	    "strained_voice": "[strained voice]",

	    # --------------------------------------------------
	    # Crying & Sadness (non-graphic)
	    # --------------------------------------------------
	    "sob": "[sobs]",
	    "quiet_sob": "[quiet sob]",
	    "whimper": "[whimpers]",
	    "soft_cry": "[soft crying]",
	    "tearful_breath": "[tearful breathing]",

	    # --------------------------------------------------
	    # Approval / Disapproval
	    # --------------------------------------------------
	    "approval_hum": "[approving hum]",
	    "satisfied_hum": "[satisfied hum]",
	    "disappointed": "[disappointed sigh]",
	    "annoyed": "[annoyed exhale]",
	    "dismissive": "[dismissive huff]",

	    # --------------------------------------------------
	    # Sleep & Tiredness
	    # --------------------------------------------------
	    "yawn": "[yawns]",
	    "sleepy_breath": "[sleepy breathing]",
	    "groggy": "[groggy inhale]",
	    "relaxed_exhale": "[relaxed exhale]",

	    # --------------------------------------------------
	    # Ambient / Texture
	    # --------------------------------------------------
	    "murmur": "[murmuring]",
	    "background_voice": "[background voice]",
	    "indistinct_vocal": "[indistinct vocal sounds]",
	    "muffled_voice": "[muffled voice]",

	    # --------------------------------------------------
	    # Voice Modifiers (approximate)
	    # --------------------------------------------------
	    "whisper": "(whispers)",
	    "shout": "(shouting)",
	    "breathy_voice": "[breathy voice]",
	    "raspy_voice": "[raspy voice]",
	    "trembling_voice": "[trembling voice]",

	    # --------------------------------------------------
	    # Misc
	    # --------------------------------------------------
	    "clears_throat": "[clears throat]"
	}


	# Maps the sound-effect names accepted in a /tts request (and listed by
	# /meta) to the prompt text used for them. Unlike EXPRESSIONS, most values
	# here are plain phrases without brackets.
	# NOTE(review): "song_notes" is the only value containing a "{text}"
	# placeholder; it presumably is meant to wrap the spoken text rather than be
	# used verbatim - confirm against the consumer.
	# NOTE: the key "pause" also exists in EXPRESSIONS with a different value
	# ("[pause]" there, "pause" here); the two dicts are looked up separately.
	SOUND_EFFECTS = {
	    # --------------------------------------------------
	    # Music & Musical Cues (approximate)
	    # --------------------------------------------------
	    "music": "[music]",
	    "background_music": "[background music]",
	    "music_fade_in": "[music fades in]",
	    "music_fade_out": "[music fades out]",
	    "humming_music": "[humming]",
	    "song_notes": "♪ {text} ♪",
	    "melody": "[melodic humming]",

	    # --------------------------------------------------
	    # Cinematic / Dramatic
	    # --------------------------------------------------
	    "dramatic_hit": "dramatic cinematic sound",
	    "dramatic_pause": "dramatic pause",
	    "dramatic_silence": "sudden silence",
	    "impact": "impact sound",
	    "tension_rise": "rising tension sound",
	    "reveal": "cinematic reveal sound",

	    # --------------------------------------------------
	    # Radio / Audio Artifacts
	    # --------------------------------------------------
	    "static": "radio static noise",
	    "interference": "audio interference",
	    "signal_drop": "signal breaking up",
	    "distortion": "distorted audio",
	    "low_quality": "low quality audio",
	    "glitch": "audio glitch",

	    # --------------------------------------------------
	    # Echo / Space / Environment
	    # --------------------------------------------------
	    "echo": "echoing voice",
	    "reverb": "reverberation",
	    "hall_echo": "large hall echo",
	    "distance_voice": "distant voice",
	    "muffled": "muffled sound",
	    "underwater": "underwater sound",

	    # --------------------------------------------------
	    # Movement / Presence (vocal-texture based)
	    # --------------------------------------------------
	    "footsteps": "footsteps",
	    "approaching": "approaching sound",
	    "departing": "fading footsteps",
	    "rustle": "rustling sound",
	    "cloth_movement": "fabric movement",

	    # --------------------------------------------------
	    # Environmental Ambience (approximate)
	    # --------------------------------------------------
	    "wind": "wind blowing",
	    "rain": "rain falling",
	    "storm": "distant thunder",
	    "crowd": "crowd murmuring",
	    "room_tone": "room tone",
	    "outdoor_ambience": "outdoor ambience",

	    # --------------------------------------------------
	    # Mechanical / Tech (very rough)
	    # --------------------------------------------------
	    "beep": "beep",
	    "alarm": "alarm sound",
	    "machine_hum": "machine humming",
	    "engine": "engine rumble",
	    "power_on": "powering on sound",
	    "power_off": "powering down sound",

	    # --------------------------------------------------
	    # Creature / Non-human (unstable)
	    # --------------------------------------------------
	    "growl": "low growl",
	    "roar": "distant roar",
	    "rumble": "low rumble",
	    "alien": "alien-like sound",

	    # --------------------------------------------------
	    # Silence / Timing Control
	    # --------------------------------------------------
	    "pause": "pause",
	    "short_pause": "brief pause",
	    "long_pause": "long pause"
	}


	# Maps the optional "style" name of a /tts request to a free-text delivery
	# hint. /tts appends the hint to the prompt prefix after any expressions and
	# sounds; a style name that is not a key here is ignored.
	STYLE_HINTS = {
	    "narration": "in a calm, professional narration tone",
	    "emotional": "with deep emotional expression",
	    "robotic": "with a robotic, synthetic delivery",
	    "space": "as if speaking through a helmet in space with echo",
	    "cinematic": "cinematic, dramatic delivery",
	}

	# Language names advertised by /meta. Within this file the list is purely
	# informational: /tts neither reads nor validates against it.
	LANGUAGES = [
	    "English", "French", "Spanish", "German", "Italian",
	    "Portuguese", "Polish", "Russian", "Turkish",
	    "Chinese", "Japanese", "Korean", "Hindi"
	]

	def load_all_bark_voice_presets():
	    """Return the sorted names of the voice presets bundled with ``bark``.

	    Looks in ``<bark package dir>/assets/prompts`` (top level only, no
	    sub-directories) for ``*.npz`` files and returns each file name with the
	    ``.npz`` extension removed.

	    Returns:
	        A sorted list of preset names; an empty list when the prompts
	        directory does not exist.
	    """
	    suffix = ".npz"
	    prompts_dir = os.path.join(os.path.dirname(bark.__file__), "assets", "prompts")

	    if not os.path.isdir(prompts_dir):
	        return []

	    # Slice off only the trailing extension. str.replace() would also remove
	    # any ".npz" that happens to occur earlier in the file name.
	    return sorted(
	        entry[: -len(suffix)]
	        for entry in os.listdir(prompts_dir)
	        if entry.endswith(suffix)
	    )



	# Computed once at import time; /tts validates the requested voice against
	# this list and /meta returns it as "voices".
	VOICE_PRESETS = load_all_bark_voice_presets()

	# =====================================================
	# UTILS
	# =====================================================
	def chunk_text(text: str, max_chars: int = 200):
	    """Split ``text`` into whitespace-normalized chunks of at most ``max_chars``.

	    Words are never split. A chunk is closed *before* adding the word that
	    would push it past ``max_chars``, so every chunk respects the limit; the
	    only exception is a single word that is itself longer than ``max_chars``,
	    which becomes a chunk of its own.

	    Args:
	        text: Text to split; runs of whitespace are collapsed to one space.
	        max_chars: Maximum length of each returned chunk, in characters.

	    Returns:
	        A list of chunk strings, empty when ``text`` contains no words.
	    """
	    chunks = []
	    current = []
	    current_len = 0  # always equals len(" ".join(current))

	    for word in text.split():
	        # The extra 1 is the space that would join the word to the chunk.
	        needed = current_len + 1 + len(word) if current else len(word)
	        if current and needed > max_chars:
	            chunks.append(" ".join(current))
	            current = [word]
	            current_len = len(word)
	        else:
	            current.append(word)
	            current_len = needed

	    if current:
	        chunks.append(" ".join(current))
	    return chunks


	def clean_memory():
	    """Run a garbage-collection pass and, if CUDA is present, empty its cache."""
	    gc.collect()
	    if not torch.cuda.is_available():
	        # CPU-only environment: there is no CUDA cache to release.
	        return
	    torch.cuda.empty_cache()

	# =====================================================
	# ROUTES
	# =====================================================
	@app.get("/meta")
	def meta():
	    # Describes what the service offers: every language, voice preset,
	    # expression, sound effect and style name, plus the concurrency limits.
	    limits = {}
	    limits["max_concurrent_users"] = 20
	    limits["per_voice_sequential"] = True

	    # Keys are inserted in the order they appear in the response.
	    payload = {}
	    payload["languages"] = LANGUAGES
	    payload["voices"] = VOICE_PRESETS
	    payload["expressions"] = EXPRESSIONS
	    payload["sound_effects"] = SOUND_EFFECTS
	    payload["styles"] = STYLE_HINTS
	    payload["limits"] = limits
	    return payload


	# Request body of POST /tts.
	class TTSRequest(BaseModel):
	    # Text to be spoken; /tts rejects it when it is empty or whitespace only.
	    text: str
	    # Voice preset name; /tts requires it to be one of VOICE_PRESETS.
	    voice: str
	    # Names looked up in EXPRESSIONS; names that are not keys are ignored.
	    expressions: List[str] = []
	    # Names looked up in SOUND_EFFECTS; names that are not keys are ignored.
	    sounds: List[str] = []
	    # Optional name looked up in STYLE_HINTS; ignored when missing or unknown.
	    style: Optional[str] = None


	@app.post("/tts")
	async def tts(req: TTSRequest):
	    """Synthesize ``req.text`` with the requested Bark voice and return WAV audio.

	    Responds with 400 when the text is blank or the voice preset is unknown,
	    and with 500 when no audio could be generated. Unknown expression, sound
	    and style names are ignored.
	    """
	    text = req.text
	    voice = req.voice

	    # 1. Validation
	    if not text.strip():
	        raise HTTPException(status_code=400, detail="Text is required")

	    if voice not in VOICE_PRESETS:
	        raise HTTPException(status_code=400, detail=f"Invalid voice preset: {voice}")

	    # 2. Resolve the requested names to prompt templates, keeping the order
	    #    expressions -> sounds -> style.
	    templates = [EXPRESSIONS[name] for name in req.expressions if name in EXPRESSIONS]
	    templates += [SOUND_EFFECTS[name] for name in req.sounds if name in SOUND_EFFECTS]
	    if req.style and req.style in STYLE_HINTS:
	        templates.append(STYLE_HINTS[req.style])

	    # Templates with a "{text}" placeholder (e.g. "♪ {text} ♪") wrap the
	    # spoken text. Putting them in the prefix verbatim would hand the literal
	    # "{text}" to the model, so they are applied to the chunks instead.
	    placeholder = "{text}"
	    wrappers = [t for t in templates if placeholder in t]
	    prefix = " ".join(t for t in templates if placeholder not in t).strip()

	    # 3. Chunk ONLY the spoken text; the validation above guarantees at
	    #    least one chunk.
	    chunks = chunk_text(text)
	    for wrapper in wrappers:
	        chunks = [wrapper.replace(placeholder, chunk) for chunk in chunks]

	    # Prepend the prefix (sounds/expressions) only to the very first chunk
	    if prefix:
	        chunks[0] = f"{prefix} {chunks[0]}"

	    # 4. Acquire locks. The per-voice lock is taken BEFORE a global slot so
	    #    that requests queued behind a busy voice do not sit on global slots
	    #    and block requests for other, idle voices.
	    voice_lock = await get_voice_lock(voice)
	    audio_out = []

	    async with voice_lock:
	        async with GLOBAL_SEMAPHORE:
	            # 5. Generate audio sequentially for each chunk
	            for chunk in chunks:
	                # Bark can sometimes fail on completely empty strings
	                if not chunk.strip():
	                    continue
	                try:
	                    audio_array = await asyncio.to_thread(
	                        generate_audio,
	                        chunk,
	                        history_prompt=voice,
	                    )
	                finally:
	                    # Release memory even when generation raised.
	                    clean_memory()
	                audio_out.append(audio_array)

	    # 6. Combine and package the audio
	    if not audio_out:
	        raise HTTPException(status_code=500, detail="Failed to generate audio.")

	    final_audio = np.concatenate(audio_out)

	    buffer = io.BytesIO()
	    # Use Bark's own sample-rate constant (24 kHz) instead of a literal.
	    sf.write(buffer, final_audio, bark.SAMPLE_RATE, format="WAV")

	    clean_memory()

	    return Response(
	        content=buffer.getvalue(),
	        media_type="audio/wav",
	    )