import hashlib
import json
import logging
import os
import uuid
import html
import base64
from functools import lru_cache
from pydub import AudioSegment
from pydub.silence import split_on_silence
from pydub.utils import mediainfo
from concurrent.futures import ThreadPoolExecutor
from typing import Optional

from fnmatch import fnmatch
import aiohttp
import aiofiles
import requests
import mimetypes

from fastapi import (
    Depends,
    FastAPI,
    File,
    Form,
    HTTPException,
    Request,
    UploadFile,
    status,
    APIRouter,
)
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from pydantic import BaseModel

from open_webui.utils.misc import strict_match_mime_type
from open_webui.utils.auth import get_admin_user, get_verified_user
from open_webui.utils.access_control import has_permission
from open_webui.utils.headers import include_user_info_headers
from open_webui.config import (
    WHISPER_MODEL_AUTO_UPDATE,
    WHISPER_COMPUTE_TYPE,
    WHISPER_MODEL_DIR,
    WHISPER_VAD_FILTER,
    CACHE_DIR,
    WHISPER_LANGUAGE,
    WHISPER_MULTILINGUAL,
    ELEVENLABS_API_BASE_URL,
)

from open_webui.constants import ERROR_MESSAGES
from open_webui.env import (
    ENV,
    AIOHTTP_CLIENT_SESSION_SSL,
    AIOHTTP_CLIENT_TIMEOUT,
    AIOHTTP_CLIENT_TIMEOUT_MODEL_LIST,
    DEVICE_TYPE,
    ENABLE_FORWARD_USER_INFO_HEADERS,
)

router = APIRouter()

MAX_FILE_SIZE_MB = 20
MAX_FILE_SIZE = MAX_FILE_SIZE_MB * 1024 * 1024
AZURE_MAX_FILE_SIZE_MB = 200
AZURE_MAX_FILE_SIZE = AZURE_MAX_FILE_SIZE_MB * 1024 * 1024

log = logging.getLogger(__name__)

SPEECH_CACHE_DIR = CACHE_DIR / "audio" / "speech"
SPEECH_CACHE_DIR.mkdir(parents=True, exist_ok=True)


def is_audio_conversion_required(file_path):
    """
    Check if the given audio file needs conversion to mp3.
    """
    SUPPORTED_FORMATS = {"flac", "m4a", "mp3", "mp4", "mpeg", "wav", "webm"}

    if not os.path.isfile(file_path):
        log.error(f"File not found: {file_path}")
        return False

    try:
        info = mediainfo(file_path)
        codec_name = info.get("codec_name", "").lower()
        codec_type = info.get("codec_type", "").lower()
        codec_tag_string = info.get("codec_tag_string", "").lower()

        if codec_name == "aac" and codec_type == "audio" and codec_tag_string == "mp4a":
            # AAC audio in an MP4 container (codec tag "mp4a") is not handled
            # reliably downstream, so force a conversion.
            return True

        # Anything already in a supported format can be used as-is.
        if codec_name in SUPPORTED_FORMATS:
            return False

        return True
    except Exception as e:
        log.error(f"Error getting audio format: {e}")
        return False


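# A minimal sketch of the expected behavior (hypothetical files):
#   is_audio_conversion_required("meeting.ogg")  -> True  (opus/vorbis is not in SUPPORTED_FORMATS)
#   is_audio_conversion_required("note.mp3")     -> False (already a supported codec)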
def convert_audio_to_mp3(file_path):
    """Convert audio file to mp3 format."""
    try:
        output_path = os.path.splitext(file_path)[0] + ".mp3"
        audio = AudioSegment.from_file(file_path)
        audio.export(output_path, format="mp3")
        log.info(f"Converted {file_path} to {output_path}")
        return output_path
    except Exception as e:
        log.error(f"Error converting audio file: {e}")
        return None


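# Usage sketch (hypothetical path); note that None is returned on failure,
# so callers should fall back to the original file:
#   mp3_path = convert_audio_to_mp3("/tmp/clip.ogg") or "/tmp/clip.ogg"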
def set_faster_whisper_model(model: str, auto_update: bool = False):
    whisper_model = None
    if model:
        from faster_whisper import WhisperModel

        faster_whisper_kwargs = {
            "model_size_or_path": model,
            "device": "cuda" if DEVICE_TYPE == "cuda" else "cpu",
            "compute_type": WHISPER_COMPUTE_TYPE,
            "download_root": WHISPER_MODEL_DIR,
            "local_files_only": not auto_update,
        }

        try:
            whisper_model = WhisperModel(**faster_whisper_kwargs)
        except Exception:
            # Initialization fails when the model is not present locally;
            # retry with local_files_only=False so it can be downloaded.
            log.warning(
                "WhisperModel initialization failed, attempting download with local_files_only=False"
            )
            faster_whisper_kwargs["local_files_only"] = False
            whisper_model = WhisperModel(**faster_whisper_kwargs)
    return whisper_model


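# Example call (assumes the faster-whisper package and a reachable model cache):
#   model = set_faster_whisper_model("base", auto_update=True)
#   # -> WhisperModel instance, or None when no model name is configured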
class TTSConfigForm(BaseModel):
    OPENAI_API_BASE_URL: str
    OPENAI_API_KEY: str
    OPENAI_PARAMS: Optional[dict] = None
    API_KEY: str
    ENGINE: str
    MODEL: str
    VOICE: str
    SPLIT_ON: str
    AZURE_SPEECH_REGION: str
    AZURE_SPEECH_BASE_URL: str
    AZURE_SPEECH_OUTPUT_FORMAT: str


class STTConfigForm(BaseModel):
    OPENAI_API_BASE_URL: str
    OPENAI_API_KEY: str
    ENGINE: str
    MODEL: str
    SUPPORTED_CONTENT_TYPES: list[str] = []
    WHISPER_MODEL: str
    DEEPGRAM_API_KEY: str
    AZURE_API_KEY: str
    AZURE_REGION: str
    AZURE_LOCALES: str
    AZURE_BASE_URL: str
    AZURE_MAX_SPEAKERS: str
    MISTRAL_API_KEY: str
    MISTRAL_API_BASE_URL: str
    MISTRAL_USE_CHAT_COMPLETIONS: bool


class AudioConfigUpdateForm(BaseModel):
    tts: TTSConfigForm
    stt: STTConfigForm


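# A trimmed, illustrative body for POST /config/update (all TTSConfigForm and
# STTConfigForm fields without defaults are required; the "..." below is not
# literal JSON):
#   {
#     "tts": {"ENGINE": "openai", "MODEL": "tts-1", "VOICE": "alloy", ...},
#     "stt": {"ENGINE": "", "WHISPER_MODEL": "base", ...}
#   }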
@router.get("/config")
async def get_audio_config(request: Request, user=Depends(get_admin_user)):
    return {
        "tts": {
            "OPENAI_API_BASE_URL": request.app.state.config.TTS_OPENAI_API_BASE_URL,
            "OPENAI_API_KEY": request.app.state.config.TTS_OPENAI_API_KEY,
            "OPENAI_PARAMS": request.app.state.config.TTS_OPENAI_PARAMS,
            "API_KEY": request.app.state.config.TTS_API_KEY,
            "ENGINE": request.app.state.config.TTS_ENGINE,
            "MODEL": request.app.state.config.TTS_MODEL,
            "VOICE": request.app.state.config.TTS_VOICE,
            "SPLIT_ON": request.app.state.config.TTS_SPLIT_ON,
            "AZURE_SPEECH_REGION": request.app.state.config.TTS_AZURE_SPEECH_REGION,
            "AZURE_SPEECH_BASE_URL": request.app.state.config.TTS_AZURE_SPEECH_BASE_URL,
            "AZURE_SPEECH_OUTPUT_FORMAT": request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT,
        },
        "stt": {
            "OPENAI_API_BASE_URL": request.app.state.config.STT_OPENAI_API_BASE_URL,
            "OPENAI_API_KEY": request.app.state.config.STT_OPENAI_API_KEY,
            "ENGINE": request.app.state.config.STT_ENGINE,
            "MODEL": request.app.state.config.STT_MODEL,
            "SUPPORTED_CONTENT_TYPES": request.app.state.config.STT_SUPPORTED_CONTENT_TYPES,
            "WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
            "DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
            "AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
            "AZURE_REGION": request.app.state.config.AUDIO_STT_AZURE_REGION,
            "AZURE_LOCALES": request.app.state.config.AUDIO_STT_AZURE_LOCALES,
            "AZURE_BASE_URL": request.app.state.config.AUDIO_STT_AZURE_BASE_URL,
            "AZURE_MAX_SPEAKERS": request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS,
            "MISTRAL_API_KEY": request.app.state.config.AUDIO_STT_MISTRAL_API_KEY,
            "MISTRAL_API_BASE_URL": request.app.state.config.AUDIO_STT_MISTRAL_API_BASE_URL,
            "MISTRAL_USE_CHAT_COMPLETIONS": request.app.state.config.AUDIO_STT_MISTRAL_USE_CHAT_COMPLETIONS,
        },
    }


@router.post("/config/update")
async def update_audio_config(
    request: Request, form_data: AudioConfigUpdateForm, user=Depends(get_admin_user)
):
    request.app.state.config.TTS_OPENAI_API_BASE_URL = form_data.tts.OPENAI_API_BASE_URL
    request.app.state.config.TTS_OPENAI_API_KEY = form_data.tts.OPENAI_API_KEY
    request.app.state.config.TTS_OPENAI_PARAMS = form_data.tts.OPENAI_PARAMS
    request.app.state.config.TTS_API_KEY = form_data.tts.API_KEY
    request.app.state.config.TTS_ENGINE = form_data.tts.ENGINE
    request.app.state.config.TTS_MODEL = form_data.tts.MODEL
    request.app.state.config.TTS_VOICE = form_data.tts.VOICE
    request.app.state.config.TTS_SPLIT_ON = form_data.tts.SPLIT_ON
    request.app.state.config.TTS_AZURE_SPEECH_REGION = form_data.tts.AZURE_SPEECH_REGION
    request.app.state.config.TTS_AZURE_SPEECH_BASE_URL = (
        form_data.tts.AZURE_SPEECH_BASE_URL
    )
    request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = (
        form_data.tts.AZURE_SPEECH_OUTPUT_FORMAT
    )

    request.app.state.config.STT_OPENAI_API_BASE_URL = form_data.stt.OPENAI_API_BASE_URL
    request.app.state.config.STT_OPENAI_API_KEY = form_data.stt.OPENAI_API_KEY
    request.app.state.config.STT_ENGINE = form_data.stt.ENGINE
    request.app.state.config.STT_MODEL = form_data.stt.MODEL
    request.app.state.config.STT_SUPPORTED_CONTENT_TYPES = (
        form_data.stt.SUPPORTED_CONTENT_TYPES
    )

    request.app.state.config.WHISPER_MODEL = form_data.stt.WHISPER_MODEL
    request.app.state.config.DEEPGRAM_API_KEY = form_data.stt.DEEPGRAM_API_KEY
    request.app.state.config.AUDIO_STT_AZURE_API_KEY = form_data.stt.AZURE_API_KEY
    request.app.state.config.AUDIO_STT_AZURE_REGION = form_data.stt.AZURE_REGION
    request.app.state.config.AUDIO_STT_AZURE_LOCALES = form_data.stt.AZURE_LOCALES
    request.app.state.config.AUDIO_STT_AZURE_BASE_URL = form_data.stt.AZURE_BASE_URL
    request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS = (
        form_data.stt.AZURE_MAX_SPEAKERS
    )
    request.app.state.config.AUDIO_STT_MISTRAL_API_KEY = form_data.stt.MISTRAL_API_KEY
    request.app.state.config.AUDIO_STT_MISTRAL_API_BASE_URL = (
        form_data.stt.MISTRAL_API_BASE_URL
    )
    request.app.state.config.AUDIO_STT_MISTRAL_USE_CHAT_COMPLETIONS = (
        form_data.stt.MISTRAL_USE_CHAT_COMPLETIONS
    )

    # An empty STT_ENGINE means local transcription with faster-whisper, so
    # (re)load the model now; otherwise release any loaded model.
    if request.app.state.config.STT_ENGINE == "":
        request.app.state.faster_whisper_model = set_faster_whisper_model(
            form_data.stt.WHISPER_MODEL, WHISPER_MODEL_AUTO_UPDATE
        )
    else:
        request.app.state.faster_whisper_model = None

    return {
        "tts": {
            "ENGINE": request.app.state.config.TTS_ENGINE,
            "MODEL": request.app.state.config.TTS_MODEL,
            "VOICE": request.app.state.config.TTS_VOICE,
            "OPENAI_API_BASE_URL": request.app.state.config.TTS_OPENAI_API_BASE_URL,
            "OPENAI_API_KEY": request.app.state.config.TTS_OPENAI_API_KEY,
            "OPENAI_PARAMS": request.app.state.config.TTS_OPENAI_PARAMS,
            "API_KEY": request.app.state.config.TTS_API_KEY,
            "SPLIT_ON": request.app.state.config.TTS_SPLIT_ON,
            "AZURE_SPEECH_REGION": request.app.state.config.TTS_AZURE_SPEECH_REGION,
            "AZURE_SPEECH_BASE_URL": request.app.state.config.TTS_AZURE_SPEECH_BASE_URL,
            "AZURE_SPEECH_OUTPUT_FORMAT": request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT,
        },
        "stt": {
            "OPENAI_API_BASE_URL": request.app.state.config.STT_OPENAI_API_BASE_URL,
            "OPENAI_API_KEY": request.app.state.config.STT_OPENAI_API_KEY,
            "ENGINE": request.app.state.config.STT_ENGINE,
            "MODEL": request.app.state.config.STT_MODEL,
            "SUPPORTED_CONTENT_TYPES": request.app.state.config.STT_SUPPORTED_CONTENT_TYPES,
            "WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
            "DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
            "AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
            "AZURE_REGION": request.app.state.config.AUDIO_STT_AZURE_REGION,
            "AZURE_LOCALES": request.app.state.config.AUDIO_STT_AZURE_LOCALES,
            "AZURE_BASE_URL": request.app.state.config.AUDIO_STT_AZURE_BASE_URL,
            "AZURE_MAX_SPEAKERS": request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS,
            "MISTRAL_API_KEY": request.app.state.config.AUDIO_STT_MISTRAL_API_KEY,
            "MISTRAL_API_BASE_URL": request.app.state.config.AUDIO_STT_MISTRAL_API_BASE_URL,
            "MISTRAL_USE_CHAT_COMPLETIONS": request.app.state.config.AUDIO_STT_MISTRAL_USE_CHAT_COMPLETIONS,
        },
    }


def load_speech_pipeline(request):
    from transformers import pipeline
    from datasets import load_dataset

    if request.app.state.speech_synthesiser is None:
        request.app.state.speech_synthesiser = pipeline(
            "text-to-speech", "microsoft/speecht5_tts"
        )

    if request.app.state.speech_speaker_embeddings_dataset is None:
        request.app.state.speech_speaker_embeddings_dataset = load_dataset(
            "Matthijs/cmu-arctic-xvectors", split="validation"
        )


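# Example request body for POST /speech (OpenAI-compatible; for the openai
# engine the "model" field is overwritten server-side from TTS_MODEL):
#   {"input": "Hello there!", "voice": "alloy"}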
@router.post("/speech")
async def speech(request: Request, user=Depends(get_verified_user)):
    if request.app.state.config.TTS_ENGINE == "":
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=ERROR_MESSAGES.NOT_FOUND,
        )

    if user.role != "admin" and not has_permission(
        user.id, "chat.tts", request.app.state.config.USER_PERMISSIONS
    ):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail=ERROR_MESSAGES.ACCESS_PROHIBITED,
        )

    body = await request.body()

    # Cache key: request body plus engine and model, so config changes
    # invalidate previously synthesized audio.
    name = hashlib.sha256(
        body
        + str(request.app.state.config.TTS_ENGINE).encode("utf-8")
        + str(request.app.state.config.TTS_MODEL).encode("utf-8")
    ).hexdigest()

    file_path = SPEECH_CACHE_DIR.joinpath(f"{name}.mp3")
    file_body_path = SPEECH_CACHE_DIR.joinpath(f"{name}.json")

    # Serve from the cache if this exact request was synthesized before.
    if file_path.is_file():
        return FileResponse(file_path)

    payload = None
    try:
        payload = json.loads(body.decode("utf-8"))
    except Exception as e:
        log.exception(e)
        raise HTTPException(status_code=400, detail="Invalid JSON payload")

    r = None
    if request.app.state.config.TTS_ENGINE == "openai":
        payload["model"] = request.app.state.config.TTS_MODEL

        try:
            timeout = aiohttp.ClientTimeout(total=AIOHTTP_CLIENT_TIMEOUT)
            async with aiohttp.ClientSession(
                timeout=timeout, trust_env=True
            ) as session:
                payload = {
                    **payload,
                    **(request.app.state.config.TTS_OPENAI_PARAMS or {}),
                }

                headers = {
                    "Content-Type": "application/json",
                    "Authorization": f"Bearer {request.app.state.config.TTS_OPENAI_API_KEY}",
                }
                if ENABLE_FORWARD_USER_INFO_HEADERS:
                    headers = include_user_info_headers(headers, user)

                r = await session.post(
                    url=f"{request.app.state.config.TTS_OPENAI_API_BASE_URL}/audio/speech",
                    json=payload,
                    headers=headers,
                    ssl=AIOHTTP_CLIENT_SESSION_SSL,
                )

                r.raise_for_status()

                async with aiofiles.open(file_path, "wb") as f:
                    await f.write(await r.read())

                async with aiofiles.open(file_body_path, "w") as f:
                    await f.write(json.dumps(payload))

                return FileResponse(file_path)

        except Exception as e:
            log.exception(e)

            status_code = 500
            detail = "Open WebUI: Server Connection Error"

            if r is not None:
                status_code = r.status

                try:
                    res = await r.json()
                    if "error" in res:
                        detail = f"External: {res['error']}"
                except Exception:
                    detail = f"External: {e}"

            raise HTTPException(
                status_code=status_code,
                detail=detail,
            )

    elif request.app.state.config.TTS_ENGINE == "elevenlabs":
        voice_id = payload.get("voice", "")

        if voice_id not in get_available_voices(request):
            raise HTTPException(
                status_code=400,
                detail="Invalid voice id",
            )

        try:
            timeout = aiohttp.ClientTimeout(total=AIOHTTP_CLIENT_TIMEOUT)
            async with aiohttp.ClientSession(
                timeout=timeout, trust_env=True
            ) as session:
                async with session.post(
                    f"{ELEVENLABS_API_BASE_URL}/v1/text-to-speech/{voice_id}",
                    json={
                        "text": payload["input"],
                        "model_id": request.app.state.config.TTS_MODEL,
                        "voice_settings": {"stability": 0.5, "similarity_boost": 0.5},
                    },
                    headers={
                        "Accept": "audio/mpeg",
                        "Content-Type": "application/json",
                        "xi-api-key": request.app.state.config.TTS_API_KEY,
                    },
                    ssl=AIOHTTP_CLIENT_SESSION_SSL,
                ) as r:
                    r.raise_for_status()

                    async with aiofiles.open(file_path, "wb") as f:
                        await f.write(await r.read())

                    async with aiofiles.open(file_body_path, "w") as f:
                        await f.write(json.dumps(payload))

            return FileResponse(file_path)

        except Exception as e:
            log.exception(e)
            detail = None

            try:
                if r.status != 200:
                    res = await r.json()
                    if "error" in res:
                        detail = f"External: {res['error'].get('message', '')}"
            except Exception:
                detail = f"External: {e}"

            raise HTTPException(
                status_code=getattr(r, "status", 500) if r else 500,
                detail=detail if detail else "Open WebUI: Server Connection Error",
            )

    elif request.app.state.config.TTS_ENGINE == "azure":
        region = request.app.state.config.TTS_AZURE_SPEECH_REGION or "eastus"
        base_url = request.app.state.config.TTS_AZURE_SPEECH_BASE_URL
        language = request.app.state.config.TTS_VOICE
        # Derive the xml:lang from the voice name (e.g. "en" from "en-US-...").
        locale = "-".join(request.app.state.config.TTS_VOICE.split("-")[:1])
        output_format = request.app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT

        try:
            data = f"""<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="{locale}">
                <voice name="{language}">{html.escape(payload["input"])}</voice>
            </speak>"""
            timeout = aiohttp.ClientTimeout(total=AIOHTTP_CLIENT_TIMEOUT)
            async with aiohttp.ClientSession(
                timeout=timeout, trust_env=True
            ) as session:
                async with session.post(
                    (base_url or f"https://{region}.tts.speech.microsoft.com")
                    + "/cognitiveservices/v1",
                    headers={
                        "Ocp-Apim-Subscription-Key": request.app.state.config.TTS_API_KEY,
                        "Content-Type": "application/ssml+xml",
                        "X-Microsoft-OutputFormat": output_format,
                    },
                    data=data,
                    ssl=AIOHTTP_CLIENT_SESSION_SSL,
                ) as r:
                    r.raise_for_status()

                    async with aiofiles.open(file_path, "wb") as f:
                        await f.write(await r.read())

                    async with aiofiles.open(file_body_path, "w") as f:
                        await f.write(json.dumps(payload))

            return FileResponse(file_path)

        except Exception as e:
            log.exception(e)
            detail = None

            try:
                if r.status != 200:
                    res = await r.json()
                    if "error" in res:
                        detail = f"External: {res['error'].get('message', '')}"
            except Exception:
                detail = f"External: {e}"

            raise HTTPException(
                status_code=getattr(r, "status", 500) if r else 500,
                detail=detail if detail else "Open WebUI: Server Connection Error",
            )

    elif request.app.state.config.TTS_ENGINE == "transformers":
        import torch
        import soundfile as sf

        load_speech_pipeline(request)

        embeddings_dataset = request.app.state.speech_speaker_embeddings_dataset

        # Default to speaker 6799 of the CMU ARCTIC xvector dataset unless
        # TTS_MODEL names a specific speaker embedding file.
        speaker_index = 6799
        try:
            speaker_index = embeddings_dataset["filename"].index(
                request.app.state.config.TTS_MODEL
            )
        except Exception:
            pass

        speaker_embedding = torch.tensor(
            embeddings_dataset[speaker_index]["xvector"]
        ).unsqueeze(0)

        speech = request.app.state.speech_synthesiser(
            payload["input"],
            forward_params={"speaker_embeddings": speaker_embedding},
        )

        sf.write(file_path, speech["audio"], samplerate=speech["sampling_rate"])

        async with aiofiles.open(file_body_path, "w") as f:
            await f.write(json.dumps(payload))

        return FileResponse(file_path)


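# transcription_handler() performs one STT request for a single (possibly
# chunked) audio file and writes a sibling "<id>.json" transcript next to it.
# Sketch (hypothetical path):
#   transcription_handler(request, "/tmp/abc.mp3", {"language": "en"})
#   # -> {"text": "..."}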
def transcription_handler(request, file_path, metadata, user=None):
    filename = os.path.basename(file_path)
    file_dir = os.path.dirname(file_path)
    id = filename.split(".")[0]

    metadata = metadata or {}

    # Try the requested (or globally forced) language first, then fall back
    # to automatic language detection.
    languages = [
        metadata.get("language", None) if not WHISPER_LANGUAGE else WHISPER_LANGUAGE,
        None,
    ]

    if request.app.state.config.STT_ENGINE == "":
        if request.app.state.faster_whisper_model is None:
            request.app.state.faster_whisper_model = set_faster_whisper_model(
                request.app.state.config.WHISPER_MODEL
            )

        model = request.app.state.faster_whisper_model
        segments, info = model.transcribe(
            file_path,
            beam_size=5,
            vad_filter=WHISPER_VAD_FILTER,
            language=languages[0],
            multilingual=WHISPER_MULTILINGUAL,
        )
        log.info(
            "Detected language '%s' with probability %f"
            % (info.language, info.language_probability)
        )

        transcript = "".join([segment.text for segment in list(segments)])
        data = {"text": transcript.strip()}

        # Save the transcript to a json file next to the audio.
        transcript_file = f"{file_dir}/{id}.json"
        with open(transcript_file, "w") as f:
            json.dump(data, f)

        log.debug(data)
        return data
    elif request.app.state.config.STT_ENGINE == "openai":
        r = None
        try:
            for language in languages:
                payload = {
                    "model": request.app.state.config.STT_MODEL,
                }

                if language:
                    payload["language"] = language

                headers = {
                    "Authorization": f"Bearer {request.app.state.config.STT_OPENAI_API_KEY}"
                }
                if user and ENABLE_FORWARD_USER_INFO_HEADERS:
                    headers = include_user_info_headers(headers, user)

                with open(file_path, "rb") as audio_file:
                    r = requests.post(
                        url=f"{request.app.state.config.STT_OPENAI_API_BASE_URL}/audio/transcriptions",
                        headers=headers,
                        files={"file": (filename, audio_file)},
                        data=payload,
                        timeout=AIOHTTP_CLIENT_TIMEOUT,
                    )

                if r.status_code == 200:
                    # Successful transcription; skip the fallback language.
                    break

            r.raise_for_status()
            data = r.json()

            # Save the transcript to a json file.
            transcript_file = f"{file_dir}/{id}.json"
            with open(transcript_file, "w") as f:
                json.dump(data, f)

            return data
        except Exception as e:
            log.exception(e)

            detail = None
            if r is not None:
                try:
                    res = r.json()
                    if "error" in res:
                        detail = f"External: {res['error'].get('message', '')}"
                except Exception:
                    detail = f"External: {e}"

            raise Exception(detail if detail else "Open WebUI: Server Connection Error")
    elif request.app.state.config.STT_ENGINE == "deepgram":
        r = None
        try:
            # Determine the MIME type of the file.
            mime, _ = mimetypes.guess_type(file_path)
            if not mime:
                mime = "audio/wav"

            # Read the audio file.
            with open(file_path, "rb") as f:
                file_data = f.read()

            # Build headers.
            headers = {
                "Authorization": f"Token {request.app.state.config.DEEPGRAM_API_KEY}",
                "Content-Type": mime,
            }

            for language in languages:
                params = {}
                if request.app.state.config.STT_MODEL:
                    params["model"] = request.app.state.config.STT_MODEL

                if language:
                    params["language"] = language

                # Make the request to the Deepgram API.
                r = requests.post(
                    "https://api.deepgram.com/v1/listen?smart_format=true",
                    headers=headers,
                    params=params,
                    data=file_data,
                    timeout=AIOHTTP_CLIENT_TIMEOUT,
                )

                if r.status_code == 200:
                    # Successful transcription; skip the fallback language.
                    break

            r.raise_for_status()
            response_data = r.json()

            # Extract the transcript from the Deepgram response.
            try:
                transcript = response_data["results"]["channels"][0]["alternatives"][
                    0
                ].get("transcript", "")
            except (KeyError, IndexError) as e:
                log.error(f"Malformed response from Deepgram: {str(e)}")
                raise Exception(
                    "Failed to parse Deepgram response - unexpected response format"
                )
            data = {"text": transcript.strip()}

            # Save the transcript to a json file.
            transcript_file = f"{file_dir}/{id}.json"
            with open(transcript_file, "w") as f:
                json.dump(data, f)

            return data

        except Exception as e:
            log.exception(e)
            detail = None
            if r is not None:
                try:
                    res = r.json()
                    if "error" in res:
                        detail = f"External: {res['error'].get('message', '')}"
                except Exception:
                    detail = f"External: {e}"
            raise Exception(detail if detail else "Open WebUI: Server Connection Error")
    elif request.app.state.config.STT_ENGINE == "azure":
        # Check the file exists.
        if not os.path.exists(file_path):
            raise HTTPException(status_code=400, detail="Audio file not found")

        # Check the file size against Azure's limit.
        file_size = os.path.getsize(file_path)
        if file_size > AZURE_MAX_FILE_SIZE:
            raise HTTPException(
                status_code=400,
                detail=f"File size exceeds Azure's limit of {AZURE_MAX_FILE_SIZE_MB}MB",
            )

        api_key = request.app.state.config.AUDIO_STT_AZURE_API_KEY
        region = request.app.state.config.AUDIO_STT_AZURE_REGION or "eastus"
        locales = request.app.state.config.AUDIO_STT_AZURE_LOCALES
        base_url = request.app.state.config.AUDIO_STT_AZURE_BASE_URL
        max_speakers = request.app.state.config.AUDIO_STT_AZURE_MAX_SPEAKERS or 3

        # If no locales are configured, fall back to a default candidate set.
        if len(locales) < 2:
            locales = [
                "en-US",
                "es-ES",
                "es-MX",
                "fr-FR",
                "hi-IN",
                "it-IT",
                "de-DE",
                "en-GB",
                "en-IN",
                "ja-JP",
                "ko-KR",
                "pt-BR",
                "zh-CN",
            ]
            locales = ",".join(locales)

        if not api_key or not region:
            raise HTTPException(
                status_code=400,
                detail="Azure API key is required for Azure STT",
            )

        r = None
        try:
            # Prepare the request definition (locales plus diarization settings).
            data = {
                "definition": json.dumps(
                    {
                        "locales": locales.split(","),
                        "diarization": {"maxSpeakers": max_speakers, "enabled": True},
                    }
                    if locales
                    else {}
                )
            }

            url = (
                base_url or f"https://{region}.api.cognitive.microsoft.com"
            ) + "/speechtotext/transcriptions:transcribe?api-version=2024-11-15"

            # Use a context manager to ensure the file is properly closed.
            with open(file_path, "rb") as audio_file:
                r = requests.post(
                    url=url,
                    files={"audio": audio_file},
                    data=data,
                    headers={
                        "Ocp-Apim-Subscription-Key": api_key,
                    },
                    timeout=AIOHTTP_CLIENT_TIMEOUT,
                )

            r.raise_for_status()
            response = r.json()

            # The response must contain combinedPhrases.
            if not response.get("combinedPhrases"):
                raise ValueError("No transcription found in response")

            # Get the full transcript from the first combined phrase.
            transcript = response["combinedPhrases"][0].get("text", "").strip()
            if not transcript:
                raise ValueError("Empty transcript in response")

            data = {"text": transcript}

            # Save the transcript to a json file.
            transcript_file = f"{file_dir}/{id}.json"
            with open(transcript_file, "w") as f:
                json.dump(data, f)

            log.debug(data)
            return data

        except (KeyError, IndexError, ValueError) as e:
            log.exception("Error parsing Azure response")
            raise HTTPException(
                status_code=500,
                detail=f"Failed to parse Azure response: {str(e)}",
            )
        except requests.exceptions.RequestException as e:
            log.exception(e)
            detail = None

            try:
                if r is not None and r.status_code != 200:
                    res = r.json()
                    if "error" in res:
                        detail = f"External: {res['error'].get('message', '')}"
            except Exception:
                detail = f"External: {e}"

            raise HTTPException(
                status_code=getattr(r, "status_code", 500) if r else 500,
                detail=detail if detail else "Open WebUI: Server Connection Error",
            )
    elif request.app.state.config.STT_ENGINE == "mistral":
        # Check the file exists.
        if not os.path.exists(file_path):
            raise HTTPException(status_code=400, detail="Audio file not found")

        # Check the file size.
        file_size = os.path.getsize(file_path)
        if file_size > MAX_FILE_SIZE:
            raise HTTPException(
                status_code=400,
                detail=f"File size exceeds limit of {MAX_FILE_SIZE_MB}MB",
            )

        api_key = request.app.state.config.AUDIO_STT_MISTRAL_API_KEY
        api_base_url = (
            request.app.state.config.AUDIO_STT_MISTRAL_API_BASE_URL
            or "https://api.mistral.ai/v1"
        )
        use_chat_completions = (
            request.app.state.config.AUDIO_STT_MISTRAL_USE_CHAT_COMPLETIONS
        )

        if not api_key:
            raise HTTPException(
                status_code=400,
                detail="Mistral API key is required for Mistral STT",
            )

        r = None
        try:
            # Use the configured model, defaulting to voxtral-mini-latest.
            model = request.app.state.config.STT_MODEL or "voxtral-mini-latest"

            log.info(
                f"Mistral STT - model: {model}, "
                f"method: {'chat_completions' if use_chat_completions else 'transcriptions'}"
            )

            if use_chat_completions:
                # The chat completions route accepts inline audio, but only in
                # mp3/wav form, so convert other formats first.
                audio_file_to_use = file_path

                if is_audio_conversion_required(file_path):
                    log.debug("Converting audio to mp3 for chat completions API")
                    converted_path = convert_audio_to_mp3(file_path)
                    if converted_path:
                        audio_file_to_use = converted_path
                    else:
                        log.error("Audio conversion failed")
                        raise HTTPException(
                            status_code=500,
                            detail="Audio conversion failed. Chat completions API requires mp3 or wav format.",
                        )

                # Read and base64-encode the audio file.
                with open(audio_file_to_use, "rb") as audio_file:
                    audio_base64 = base64.b64encode(audio_file.read()).decode("utf-8")

                url = f"{api_base_url}/chat/completions"

                # Build the transcription instruction, honoring a requested language.
                language = metadata.get("language", None) if metadata else None
                if language:
                    text_instruction = f"Transcribe this audio exactly as spoken in {language}. Do not translate it."
                else:
                    text_instruction = "Transcribe this audio exactly as spoken in its original language. Do not translate it to another language."

                payload = {
                    "model": model,
                    "messages": [
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "input_audio",
                                    "input_audio": audio_base64,
                                },
                                {"type": "text", "text": text_instruction},
                            ],
                        }
                    ],
                }

                r = requests.post(
                    url=url,
                    json=payload,
                    headers={
                        "Authorization": f"Bearer {api_key}",
                        "Content-Type": "application/json",
                    },
                    timeout=AIOHTTP_CLIENT_TIMEOUT,
                )

                r.raise_for_status()
                response = r.json()

                # Extract the transcript from the chat response.
                transcript = (
                    response.get("choices", [{}])[0]
                    .get("message", {})
                    .get("content", "")
                    .strip()
                )
                if not transcript:
                    raise ValueError("Empty transcript in response")

                data = {"text": transcript}

            else:
                # Use the dedicated transcriptions endpoint.
                url = f"{api_base_url}/audio/transcriptions"

                # Determine the MIME type, defaulting to audio/webm.
                mime_type, _ = mimetypes.guess_type(file_path)
                if not mime_type:
                    mime_type = "audio/webm"

                # Upload as multipart form data; keep the file open for the
                # duration of the request.
                with open(file_path, "rb") as audio_file:
                    files = {"file": (filename, audio_file, mime_type)}
                    data_form = {"model": model}

                    # Add the language if provided in metadata.
                    language = metadata.get("language", None) if metadata else None
                    if language:
                        data_form["language"] = language

                    r = requests.post(
                        url=url,
                        files=files,
                        data=data_form,
                        headers={
                            "Authorization": f"Bearer {api_key}",
                        },
                        timeout=AIOHTTP_CLIENT_TIMEOUT,
                    )

                r.raise_for_status()
                response = r.json()

                # Extract the transcript.
                transcript = response.get("text", "").strip()
                if not transcript:
                    raise ValueError("Empty transcript in response")

                data = {"text": transcript}

            # Save the transcript to a json file.
            transcript_file = f"{file_dir}/{id}.json"
            with open(transcript_file, "w") as f:
                json.dump(data, f)

            log.debug(data)
            return data

        except ValueError as e:
            log.exception("Error parsing Mistral response")
            raise HTTPException(
                status_code=500,
                detail=f"Failed to parse Mistral response: {str(e)}",
            )
        except requests.exceptions.RequestException as e:
            log.exception(e)
            detail = None

            try:
                if r is not None and r.status_code != 200:
                    res = r.json()
                    if "error" in res:
                        detail = f"External: {res['error'].get('message', '')}"
                    else:
                        detail = f"External: {r.text}"
            except Exception:
                detail = f"External: {e}"

            raise HTTPException(
                status_code=getattr(r, "status_code", 500) if r else 500,
                detail=detail if detail else "Open WebUI: Server Connection Error",
            )


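# transcribe() is the high-level pipeline: convert to mp3 if needed, compress,
# split into chunks of at most MAX_FILE_SIZE, transcribe the chunks in
# parallel, then join the chunk texts in order. Sketch (hypothetical path):
#   transcribe(request, "/tmp/long_recording.wav")  # -> {"text": "..."}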
def transcribe(
    request: Request, file_path: str, metadata: Optional[dict] = None, user=None
):
    log.info(f"transcribe: {file_path} {metadata}")

    if is_audio_conversion_required(file_path):
        # Fall back to the original file if the conversion fails.
        converted_path = convert_audio_to_mp3(file_path)
        if converted_path:
            file_path = converted_path

    try:
        file_path = compress_audio(file_path)
    except Exception as e:
        log.exception(e)

    # Split the audio into chunks no larger than MAX_FILE_SIZE.
    try:
        chunk_paths = split_audio(file_path, MAX_FILE_SIZE)
        log.debug(f"Chunk paths: {chunk_paths}")
    except Exception as e:
        log.exception(e)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT(e),
        )

    results = []
    try:
        with ThreadPoolExecutor() as executor:
            # Submit one transcription task per chunk.
            futures = [
                executor.submit(
                    transcription_handler, request, chunk_path, metadata, user
                )
                for chunk_path in chunk_paths
            ]
            # Gather results in submission order so the final text reads in
            # the original chunk order.
            for future in futures:
                try:
                    results.append(future.result())
                except Exception as transcribe_exc:
                    raise HTTPException(
                        status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                        detail=f"Error transcribing chunk: {transcribe_exc}",
                    )
    finally:
        # Clean up only the temporary chunk files, never the original upload.
        for chunk_path in chunk_paths:
            if chunk_path != file_path and os.path.isfile(chunk_path):
                try:
                    os.remove(chunk_path)
                except Exception:
                    pass

    return {
        "text": " ".join([result["text"] for result in results]),
    }


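# Rough size math for the compression target (assumption-level estimate):
# 32 kbps mono mp3 is about 4 KB/s, i.e. roughly 14.4 MB per hour, so the
# 20 MB budget (MAX_FILE_SIZE) holds roughly 80-85 minutes of audio.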
def compress_audio(file_path):
    if os.path.getsize(file_path) > MAX_FILE_SIZE:
        id = os.path.splitext(os.path.basename(file_path))[0]
        file_dir = os.path.dirname(file_path)

        # Downsample to 16 kHz mono, which is sufficient for speech models.
        audio = AudioSegment.from_file(file_path)
        audio = audio.set_frame_rate(16000).set_channels(1)

        compressed_path = os.path.join(file_dir, f"{id}_compressed.mp3")
        audio.export(compressed_path, format="mp3", bitrate="32k")

        return compressed_path
    else:
        return file_path


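# Chunk-length heuristic used below: scale the total duration by the size
# ratio, minus a one-second safety margin. Example with illustrative numbers:
# a 60 MB, 60-minute file and a 20 MB cap give roughly 20-minute chunks:
#   approx_chunk_ms = 3_600_000 * (20 / 60) - 1000 = 1_199_000 ms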
def split_audio(file_path, max_bytes, format="mp3", bitrate="32k"):
    """
    Splits audio into chunks not exceeding max_bytes.
    Returns a list of chunk file paths. If audio fits, returns list with original path.
    """
    file_size = os.path.getsize(file_path)
    if file_size <= max_bytes:
        return [file_path]

    audio = AudioSegment.from_file(file_path)
    duration_ms = len(audio)
    orig_size = file_size

    approx_chunk_ms = max(int(duration_ms * (max_bytes / orig_size)) - 1000, 1000)
    chunks = []
    start = 0
    i = 0

    base, _ = os.path.splitext(file_path)

    while start < duration_ms:
        end = min(start + approx_chunk_ms, duration_ms)
        chunk = audio[start:end]
        chunk_path = f"{base}_chunk_{i}.{format}"
        chunk.export(chunk_path, format=format, bitrate=bitrate)

        # Halve the chunk duration until it fits, down to a 5-second floor.
        while os.path.getsize(chunk_path) > max_bytes and (end - start) > 5000:
            end = start + ((end - start) // 2)
            chunk = audio[start:end]
            chunk.export(chunk_path, format=format, bitrate=bitrate)

        if os.path.getsize(chunk_path) > max_bytes:
            os.remove(chunk_path)
            raise Exception("Audio chunk cannot be reduced below max file size.")

        chunks.append(chunk_path)
        start = end
        i += 1

    return chunks


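# Example request (hypothetical host and token, assuming Open WebUI's usual
# /api/v1/audio mount; multipart upload with an optional "language" field):
#   curl -H "Authorization: Bearer $TOKEN" \
#        -F "file=@meeting.mp3" -F "language=en" \
#        http://localhost:8080/api/v1/audio/transcriptions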
@router.post("/transcriptions")
def transcription(
    request: Request,
    file: UploadFile = File(...),
    language: Optional[str] = Form(None),
    user=Depends(get_verified_user),
):
    if user.role != "admin" and not has_permission(
        user.id, "chat.stt", request.app.state.config.USER_PERMISSIONS
    ):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail=ERROR_MESSAGES.ACCESS_PROHIBITED,
        )

    log.info(f"file.content_type: {file.content_type}")

    stt_supported_content_types = getattr(
        request.app.state.config, "STT_SUPPORTED_CONTENT_TYPES", []
    )

    if not strict_match_mime_type(stt_supported_content_types, file.content_type):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
        )

    try:
        ext = file.filename.split(".")[-1]
        id = uuid.uuid4()

        filename = f"{id}.{ext}"
        contents = file.file.read()

        file_dir = f"{CACHE_DIR}/audio/transcriptions"
        os.makedirs(file_dir, exist_ok=True)
        file_path = f"{file_dir}/{filename}"

        with open(file_path, "wb") as f:
            f.write(contents)

        try:
            metadata = None

            if language:
                metadata = {"language": language}

            result = transcribe(request, file_path, metadata, user)

            return {
                **result,
                "filename": os.path.basename(file_path),
            }
        except Exception as e:
            log.exception(e)

            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=ERROR_MESSAGES.DEFAULT(e),
            )
    except Exception as e:
        log.exception(e)

        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT(e),
        )


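# The /models endpoint returns entries shaped like {"id": ...} (plus "name"
# for ElevenLabs); for the stock OpenAI engine only static ids are known:
#   {"models": [{"id": "tts-1"}, {"id": "tts-1-hd"}]}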
def get_available_models(request: Request) -> list[dict]:
    available_models = []
    if request.app.state.config.TTS_ENGINE == "openai":
        # Only custom OpenAI-compatible endpoints are expected to expose an
        # /audio/models route; api.openai.com does not.
        if not request.app.state.config.TTS_OPENAI_API_BASE_URL.startswith(
            "https://api.openai.com"
        ):
            try:
                response = requests.get(
                    f"{request.app.state.config.TTS_OPENAI_API_BASE_URL}/audio/models",
                    timeout=AIOHTTP_CLIENT_TIMEOUT_MODEL_LIST,
                )
                response.raise_for_status()
                data = response.json()
                available_models = data.get("models", [])
            except Exception as e:
                log.error(f"Error fetching models from custom endpoint: {str(e)}")
                available_models = [{"id": "tts-1"}, {"id": "tts-1-hd"}]
        else:
            available_models = [{"id": "tts-1"}, {"id": "tts-1-hd"}]
    elif request.app.state.config.TTS_ENGINE == "elevenlabs":
        try:
            response = requests.get(
                f"{ELEVENLABS_API_BASE_URL}/v1/models",
                headers={
                    "xi-api-key": request.app.state.config.TTS_API_KEY,
                    "Content-Type": "application/json",
                },
                timeout=5,
            )
            response.raise_for_status()
            models = response.json()

            available_models = [
                {"name": model["name"], "id": model["model_id"]} for model in models
            ]
        except requests.RequestException as e:
            log.error(f"Error fetching models: {str(e)}")
    return available_models


@router.get("/models")
async def get_models(request: Request, user=Depends(get_verified_user)):
    return {"models": get_available_models(request)}


def get_available_voices(request) -> dict:
    """Returns {voice_id: voice_name} dict"""
    available_voices = {}
    if request.app.state.config.TTS_ENGINE == "openai":
        # Custom OpenAI-compatible endpoints may expose an /audio/voices route;
        # api.openai.com does not, so use the standard voice set there.
        if not request.app.state.config.TTS_OPENAI_API_BASE_URL.startswith(
            "https://api.openai.com"
        ):
            try:
                response = requests.get(
                    f"{request.app.state.config.TTS_OPENAI_API_BASE_URL}/audio/voices",
                    timeout=AIOHTTP_CLIENT_TIMEOUT_MODEL_LIST,
                )
                response.raise_for_status()
                data = response.json()
                voices_list = data.get("voices", [])
                available_voices = {voice["id"]: voice["name"] for voice in voices_list}
            except Exception as e:
                log.error(f"Error fetching voices from custom endpoint: {str(e)}")
                available_voices = {
                    "alloy": "alloy",
                    "echo": "echo",
                    "fable": "fable",
                    "onyx": "onyx",
                    "nova": "nova",
                    "shimmer": "shimmer",
                }
        else:
            available_voices = {
                "alloy": "alloy",
                "echo": "echo",
                "fable": "fable",
                "onyx": "onyx",
                "nova": "nova",
                "shimmer": "shimmer",
            }
    elif request.app.state.config.TTS_ENGINE == "elevenlabs":
        try:
            available_voices = get_elevenlabs_voices(
                api_key=request.app.state.config.TTS_API_KEY
            )
        except Exception:
            # get_elevenlabs_voices raises on failure (so lru_cache does not
            # memoize errors); fall back to an empty voice map here.
            pass
    elif request.app.state.config.TTS_ENGINE == "azure":
        try:
            region = request.app.state.config.TTS_AZURE_SPEECH_REGION
            base_url = request.app.state.config.TTS_AZURE_SPEECH_BASE_URL
            url = (
                base_url or f"https://{region}.tts.speech.microsoft.com"
            ) + "/cognitiveservices/voices/list"
            headers = {
                "Ocp-Apim-Subscription-Key": request.app.state.config.TTS_API_KEY
            }

            response = requests.get(
                url, headers=headers, timeout=AIOHTTP_CLIENT_TIMEOUT_MODEL_LIST
            )
            response.raise_for_status()
            voices = response.json()

            for voice in voices:
                available_voices[voice["ShortName"]] = (
                    f"{voice['DisplayName']} ({voice['ShortName']})"
                )
        except requests.RequestException as e:
            log.error(f"Error fetching voices: {str(e)}")

    return available_voices


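# Note: @lru_cache memoizes the voice map per api_key for the process
# lifetime; call get_elevenlabs_voices.cache_clear() to force a refresh
# after the key or the account's voice list changes.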
@lru_cache
def get_elevenlabs_voices(api_key: str) -> dict:
    """
    Note, set the following in your .env file to use Elevenlabs:
    AUDIO_TTS_ENGINE=elevenlabs
    AUDIO_TTS_API_KEY=sk_...  # Your Elevenlabs API key
    AUDIO_TTS_VOICE=EXAVITQu4vr4xnSDxMaL  # From https://api.elevenlabs.io/v1/voices
    AUDIO_TTS_MODEL=eleven_multilingual_v2
    """
    try:
        response = requests.get(
            f"{ELEVENLABS_API_BASE_URL}/v1/voices",
            headers={
                "xi-api-key": api_key,
                "Content-Type": "application/json",
            },
            timeout=AIOHTTP_CLIENT_TIMEOUT_MODEL_LIST,
        )
        response.raise_for_status()
        voices_data = response.json()

        voices = {}
        for voice in voices_data.get("voices", []):
            voices[voice["voice_id"]] = voice["name"]
    except requests.RequestException as e:
        # Re-raise so the lru_cache does not memoize a failed lookup.
        log.error(f"Error fetching voices: {str(e)}")
        raise RuntimeError(f"Error fetching voices: {str(e)}")

    return voices


@router.get("/voices")
async def get_voices(request: Request, user=Depends(get_verified_user)):
    return {
        "voices": [
            {"id": k, "name": v} for k, v in get_available_voices(request).items()
        ]
    }