| | import gc |
| | import io |
| | import asyncio |
| | import numpy as np |
| | import soundfile as sf |
| | import torch |
| | import os |
| |
|
| | import bark |
| | from fastapi import FastAPI, HTTPException |
| | from fastapi.responses import Response |
| | from bark import generate_audio, preload_models |
| | from typing import List, Dict |
| | from pydantic import BaseModel |
| | from typing import List, Optional |
| | |
| | |
| | |
# FastAPI application object; the route decorators further down register on it.
app = FastAPI(title="Bark TTS Production API")

# Call Bark's preload_models() once, at import time, before any route runs.
preload_models()

# Upper bound (20) on how many /tts requests may hold a generation slot at once.
GLOBAL_SEMAPHORE = asyncio.Semaphore(20)

# Voice preset name -> single-slot semaphore, filled lazily by get_voice_lock().
VOICE_LOCKS: Dict[str, asyncio.Semaphore] = {}

# Guards insertion of new entries into VOICE_LOCKS.
VOICE_LOCKS_LOCK = asyncio.Lock()
| |
|
| |
|
async def get_voice_lock(voice: str) -> asyncio.Semaphore:
    """Return the single-slot semaphore for ``voice``, creating it on first use.

    The lookup and the insertion both happen while ``VOICE_LOCKS_LOCK`` is
    held, so every caller asking for the same voice gets the same semaphore.
    """
    async with VOICE_LOCKS_LOCK:
        try:
            return VOICE_LOCKS[voice]
        except KeyError:
            created = VOICE_LOCKS[voice] = asyncio.Semaphore(1)
            return created
| |
|
| | |
| | |
| | |
# Expression key (as sent by API clients) -> text token inserted into the prompt.
EXPRESSIONS = {
    # Breathing
    "breathing": "[breathing]",
    "slow_breathing": "[slow breathing]",
    "heavy_breathing": "[heavy breathing]",
    "shallow_breathing": "[shallow breathing]",
    "deep_inhale": "[inhales deeply]",
    "deep_exhale": "[exhales]",
    "sharp_inhale": "[sharp inhale]",
    "gasp": "[gasps]",
    "panting": "[panting]",
    "catching_breath": "[catching breath]",

    # Sighs
    "sigh": "[sighs]",
    "relieved_sigh": "[sighs with relief]",
    "frustrated_sigh": "[frustrated sigh]",
    "tired_sigh": "[tired sigh]",

    # Hesitation and thinking
    "hesitation": "...",
    "hesitation_dots": "...",
    "hesitation_dashes": "—",
    "thinking": "[thinking]",
    "thinking_hum": "[hums thoughtfully]",
    "unsure_hum": "[hesitant hum]",
    "pause": "[pause]",

    # Laughter
    "laughter": "[laughter]",
    "laughs": "[laughs]",
    "soft_laugh": "[soft laugh]",
    "nervous_laugh": "[nervous laugh]",
    "awkward_laugh": "[awkward laugh]",
    "giggle": "[giggles]",
    "chuckle": "[chuckles]",
    "snort": "[snorts]",

    # Physical effort
    "grunt": "[grunts]",
    "strain": "[straining]",
    "effort": "[exertion]",
    "groan_effort": "[groans]",
    "release_breath": "[releases breath]",
    "fatigue": "[fatigued breath]",

    # Surprise
    "surprised_gasp": "[surprised gasp]",
    "startled": "[startled]",
    "shock": "[shocked inhale]",
    "oh_reaction": "[sudden inhale]",

    # Pain and distress
    "pain_grunt": "[painful grunt]",
    "wince": "[winces]",
    "distress": "[distressed breathing]",
    "strained_voice": "[strained voice]",

    # Crying
    "sob": "[sobs]",
    "quiet_sob": "[quiet sob]",
    "whimper": "[whimpers]",
    "soft_cry": "[soft crying]",
    "tearful_breath": "[tearful breathing]",

    # Approval and disapproval
    "approval_hum": "[approving hum]",
    "satisfied_hum": "[satisfied hum]",
    "disappointed": "[disappointed sigh]",
    "annoyed": "[annoyed exhale]",
    "dismissive": "[dismissive huff]",

    # Tiredness
    "yawn": "[yawns]",
    "sleepy_breath": "[sleepy breathing]",
    "groggy": "[groggy inhale]",
    "relaxed_exhale": "[relaxed exhale]",

    # Indistinct / background vocals
    "murmur": "[murmuring]",
    "background_voice": "[background voice]",
    "indistinct_vocal": "[indistinct vocal sounds]",
    "muffled_voice": "[muffled voice]",

    # Voice quality
    "whisper": "(whispers)",
    "shout": "(shouting)",
    "breathy_voice": "[breathy voice]",
    "raspy_voice": "[raspy voice]",
    "trembling_voice": "[trembling voice]",

    # Miscellaneous
    "clears_throat": "[clears throat]",
}
| |
|
| |
|
# Sound-effect key (as sent by API clients) -> text cue inserted into the prompt.
# "song_notes" is a template: it carries a literal "{text}" placeholder.
SOUND_EFFECTS = {
    # Music
    "music": "[music]",
    "background_music": "[background music]",
    "music_fade_in": "[music fades in]",
    "music_fade_out": "[music fades out]",
    "humming_music": "[humming]",
    "song_notes": "♪ {text} ♪",
    "melody": "[melodic humming]",

    # Dramatic / cinematic
    "dramatic_hit": "*dramatic cinematic sound*",
    "dramatic_pause": "*dramatic pause*",
    "dramatic_silence": "*sudden silence*",
    "impact": "*impact sound*",
    "tension_rise": "*rising tension sound*",
    "reveal": "*cinematic reveal sound*",

    # Signal degradation
    "static": "*radio static noise*",
    "interference": "*audio interference*",
    "signal_drop": "*signal breaking up*",
    "distortion": "*distorted audio*",
    "low_quality": "*low quality audio*",
    "glitch": "*audio glitch*",

    # Space and acoustics
    "echo": "*echoing voice*",
    "reverb": "*reverberation*",
    "hall_echo": "*large hall echo*",
    "distance_voice": "*distant voice*",
    "muffled": "*muffled sound*",
    "underwater": "*underwater sound*",

    # Movement
    "footsteps": "*footsteps*",
    "approaching": "*approaching sound*",
    "departing": "*fading footsteps*",
    "rustle": "*rustling sound*",
    "cloth_movement": "*fabric movement*",

    # Ambience
    "wind": "*wind blowing*",
    "rain": "*rain falling*",
    "storm": "*distant thunder*",
    "crowd": "*crowd murmuring*",
    "room_tone": "*room tone*",
    "outdoor_ambience": "*outdoor ambience*",

    # Devices and machines
    "beep": "*beep*",
    "alarm": "*alarm sound*",
    "machine_hum": "*machine humming*",
    "engine": "*engine rumble*",
    "power_on": "*powering on sound*",
    "power_off": "*powering down sound*",

    # Creature-like sounds
    "growl": "*low growl*",
    "roar": "*distant roar*",
    "rumble": "*low rumble*",
    "alien": "*alien-like sound*",

    # Pauses
    "pause": "*pause*",
    "short_pause": "*brief pause*",
    "long_pause": "*long pause*",
}
| |
|
| |
|
# Style key (as sent by API clients) -> free-text delivery hint added to the prompt.
STYLE_HINTS = dict(
    narration="in a calm, professional narration tone",
    emotional="with deep emotional expression",
    robotic="with a robotic, synthetic delivery",
    space="as if speaking through a helmet in space with echo",
    cinematic="cinematic, dramatic delivery",
)
| |
|
# Language names reported by the /meta endpoint.
LANGUAGES = [
    "English",
    "French",
    "Spanish",
    "German",
    "Italian",
    "Portuguese",
    "Polish",
    "Russian",
    "Turkish",
    "Chinese",
    "Japanese",
    "Korean",
    "Hindi",
]
| |
|
def load_all_bark_voice_presets():
    """List the voice presets shipped with the installed ``bark`` package.

    Looks in ``<bark package dir>/assets/prompts`` and returns the name of
    every ``*.npz`` entry found there with the extension removed.

    Returns:
        A sorted list of preset names; an empty list when the prompts
        directory does not exist.
    """
    suffix = ".npz"
    prompts_dir = os.path.join(os.path.dirname(bark.__file__), "assets", "prompts")

    if not os.path.isdir(prompts_dir):
        return []

    # NOTE(review): only the top level of the directory is listed; presets kept
    # in subdirectories (if the installed Bark ships any) are not discovered —
    # confirm whether that is intended.
    return sorted(
        # Strip the extension only. str.replace() would also delete any other
        # ".npz" occurring inside the name, yielding a preset that does not
        # correspond to a real file.
        entry[: -len(suffix)]
        for entry in os.listdir(prompts_dir)
        if entry.endswith(suffix)
    )


# Preset names accepted by /tts, computed once at import time.
VOICE_PRESETS = load_all_bark_voice_presets()
| |
|
| | |
| | |
| | |
def chunk_text(text: str, max_chars: int = 200):
    """Split ``text`` on whitespace into chunks of at most ``max_chars`` characters.

    Words are never split: they are packed greedily, joined by single spaces,
    and a chunk is closed *before* the word that would push it past
    ``max_chars``. A single word longer than ``max_chars`` therefore forms a
    chunk of its own and is the only way a chunk can exceed the limit.

    Args:
        text: Input text; runs of whitespace are collapsed to single spaces.
        max_chars: Maximum length of each returned chunk.

    Returns:
        A list of non-empty chunk strings; empty when ``text`` has no words.
    """
    chunks = []
    current = []
    current_len = 0  # length of " ".join(current), tracked to avoid re-joining

    for word in text.split():
        # One extra character for the joining space when the chunk is non-empty.
        projected = current_len + len(word) + (1 if current else 0)
        if current and projected > max_chars:
            chunks.append(" ".join(current))
            current = []
            projected = len(word)
        current.append(word)
        current_len = projected

    if current:
        chunks.append(" ".join(current))
    return chunks
| |
|
| |
|
def clean_memory():
    """Run a garbage-collection pass and, when CUDA is available, empty torch's CUDA cache."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
| |
|
| | |
| | |
| | |
@app.get("/meta")
def meta():
    """Describe what the API offers: languages, voices, prompt keys and limits."""
    limits = {
        "max_concurrent_users": 20,
        "per_voice_sequential": True,
    }
    payload = {
        "languages": LANGUAGES,
        "voices": VOICE_PRESETS,
        "expressions": EXPRESSIONS,
        "sound_effects": SOUND_EFFECTS,
        "styles": STYLE_HINTS,
        "limits": limits,
    }
    return payload
| |
|
| |
|
class TTSRequest(BaseModel):
    """Request body accepted by the ``/tts`` endpoint."""

    # Text to synthesise; /tts rejects it when blank.
    text: str
    # Voice preset name; /tts requires it to be one of VOICE_PRESETS.
    voice: str
    # Keys looked up in EXPRESSIONS.
    expressions: List[str] = []
    # Keys looked up in SOUND_EFFECTS.
    sounds: List[str] = []
    # Optional key looked up in STYLE_HINTS.
    style: Optional[str] = None
| |
|
| |
|
@app.post("/tts")
async def tts(req: TTSRequest):
    """Synthesise ``req.text`` with the requested Bark voice and return a WAV file.

    The text is split with ``chunk_text``; each chunk is generated in a worker
    thread and the resulting arrays are concatenated into one WAV response.
    Expression, sound-effect and style keys are turned into prompt text:
    plain cues are prepended to the first chunk, while sound-effect templates
    containing a ``{text}`` placeholder wrap every chunk.

    Args:
        req: Parsed request body.

    Returns:
        A ``Response`` with media type ``audio/wav``.

    Raises:
        HTTPException: 400 when the text is blank or the voice is not in
            ``VOICE_PRESETS``; 500 when no audio was produced.
    """
    text = req.text
    voice = req.voice

    if not text.strip():
        raise HTTPException(status_code=400, detail="Text is required")

    if voice not in VOICE_PRESETS:
        raise HTTPException(status_code=400, detail=f"Invalid voice preset: {voice}")

    # Unknown expression / sound / style keys are skipped rather than rejected.
    prefix_parts = [EXPRESSIONS[key] for key in req.expressions if key in EXPRESSIONS]

    wrappers = []
    for key in req.sounds:
        template = SOUND_EFFECTS.get(key)
        if template is None:
            continue
        if "{text}" in template:
            # Templates are filled in around the spoken text below; prepending
            # them verbatim would send a literal "{text}" to the model.
            wrappers.append(template)
        else:
            prefix_parts.append(template)

    if req.style and req.style in STYLE_HINTS:
        prefix_parts.append(STYLE_HINTS[req.style])

    prefix = " ".join(prefix_parts).strip()

    chunks = [piece for piece in chunk_text(text) if piece.strip()]
    for template in wrappers:
        chunks = [template.replace("{text}", piece) for piece in chunks]
    if prefix and chunks:
        chunks[0] = f"{prefix} {chunks[0]}"

    voice_lock = await get_voice_lock(voice)

    audio_out = []
    # Take the per-voice lock before a global slot: a request queued behind a
    # busy voice must not occupy one of the global slots while it waits,
    # otherwise a burst for one voice starves every other voice.
    async with voice_lock:
        async with GLOBAL_SEMAPHORE:
            for chunk in chunks:
                try:
                    audio_array = await asyncio.to_thread(
                        generate_audio,
                        chunk,
                        history_prompt=voice,
                    )
                finally:
                    # Release memory after every generation, failed or not.
                    clean_memory()
                audio_out.append(audio_array)

    if not audio_out:
        raise HTTPException(status_code=500, detail="Failed to generate audio.")

    # Encoding happens after the locks are released so the next request can
    # start generating immediately.
    final_audio = np.concatenate(audio_out)
    buffer = io.BytesIO()
    sf.write(buffer, final_audio, bark.SAMPLE_RATE, format="WAV")

    clean_memory()

    return Response(
        content=buffer.getvalue(),
        media_type="audio/wav",
    )