# mcptts / server.py
# Nguyen Trung
# Rm
# 45acd11
# server.py
from __future__ import annotations
from typing import Optional, List, Dict, Any
from pathlib import Path
from urllib.parse import quote_plus
from datetime import datetime
import os
import json
import uuid
import mimetypes
from fastmcp import FastMCP
from huggingface_hub import HfApi
from tts_core import get_eleven_client, ensure_output_dir, generate_and_save_audio
from voices import (
load_voices_map,
list_voices_data,
resolve_voice,
VOICES_MAP_PATH_DEFAULT,
)
# ====== Hugging Face Space config (where the Gradio UI is running) ======
ONLINE_UI_BASE = "https://trung06042002-mcptts.hf.space"
# Repo ID of your Space (must match the Space link above)
HF_SPACE_REPO_ID = "trung06042002/mcptts"
HF_REPO_TYPE = "space"
# Folders inside the Space repo that hold the audio files and their metadata
HF_AUDIO_DIR = "audios"
HF_META_DIR = "meta"
# Name of the environment variable holding the upload token
# (create the token under HF settings/tokens, with write permission)
HF_TOKEN_ENV = "HF_TOKEN"
# FastMCP server instance; the @mcp.tool functions in this file register on it
mcp = FastMCP("elevenlabs-tts")
def _require_hf_token() -> str:
    """Return the Hugging Face token read from the environment.

    The token is looked up under the variable named by ``HF_TOKEN_ENV``.
    Surrounding whitespace is stripped so that a value carrying a stray
    newline or spaces still works, and a blank value is reported as missing
    here instead of surfacing later as an upload failure.

    Returns:
        The non-empty, stripped token string.

    Raises:
        RuntimeError: If the variable is unset, empty, or whitespace-only.
    """
    token = (os.getenv(HF_TOKEN_ENV) or "").strip()
    if not token:
        raise RuntimeError(
            f"Missing {HF_TOKEN_ENV}. Please export {HF_TOKEN_ENV}=<your_hf_write_token> "
            "before running the MCP server."
        )
    return token
def _safe_stem(text: str, max_len: int = 40) -> str:
s = "".join(c if c.isalnum() else "_" for c in text.strip().lower())
s = "_".join([p for p in s.split("_") if p])
return (s[:max_len] or "tts").rstrip("_")
def _guess_ext_from_format(output_format: str) -> str:
# output_format examples: "mp3_44100_128", "wav_44100"
head = (output_format or "").split("_", 1)[0].lower()
if head in {"mp3", "wav", "ogg", "flac", "m4a"}:
return f".{head}"
# fallback
return ".mp3"
def _upload_file_to_space(local_path: Path, path_in_repo: str, commit_message: str) -> None:
    """Upload one local file into the configured Hugging Face Space repo.

    Args:
        local_path: File on disk to upload.
        path_in_repo: Destination path inside the repo.
        commit_message: Commit message passed along with the upload.

    Raises:
        RuntimeError: Propagated from ``_require_hf_token`` when no token
            is configured.
    """
    # Resolve the token first so a missing token fails before any upload work.
    client = HfApi(token=_require_hf_token())
    upload_kwargs = {
        "path_or_fileobj": str(local_path),
        "path_in_repo": path_in_repo,
        "repo_id": HF_SPACE_REPO_ID,
        "repo_type": HF_REPO_TYPE,
        "commit_message": commit_message,
    }
    client.upload_file(**upload_kwargs)
def upload_audio_and_meta(
    audio_path: str,
    *,
    text: str,
    voice_key: str,
    voice_label: str,
    voice_id: str,
    model_id: str,
    output_format: str,
) -> Dict[str, str]:
    """Upload an audio file and a JSON metadata sidecar to the HF Space repo.

    The audio is stored under ``HF_AUDIO_DIR`` with a unique name built from
    the sanitized text, the voice key, a timestamp and a short random id.
    The metadata is stored under ``HF_META_DIR`` as ``<filename>.json``.

    Args:
        audio_path: Path of the local audio file to upload.
        text: Text that was synthesized (stored in the metadata and used for
            the file name).
        voice_key: Voice key (stored in the metadata and used in the file name).
        voice_label: Display name of the voice (metadata only).
        voice_id: Voice id (metadata only).
        model_id: Model id (metadata only).
        output_format: Output format string; also used to guess the extension
            when ``audio_path`` has none.

    Returns:
        A dict with:
        - ``hf_audio_path``: path in the repo (e.g. ``audios/xxx.mp3``)
        - ``hf_meta_path``: path in the repo (e.g. ``meta/xxx.mp3.json``)
        - ``hf_audio_url``: URL for the file on hf.space
        - ``ui_url``: URL of the (Gradio) UI

    Raises:
        FileNotFoundError: If ``audio_path`` does not point to an existing
            regular file.
    """
    # Function-scope import: the module only imports ``quote_plus``.
    from urllib.parse import quote

    source = Path(audio_path)
    # is_file() also rejects directories, which exists() would let through.
    if not source.is_file():
        raise FileNotFoundError(f"Audio file not found: {audio_path}")

    # Build a unique file name so earlier uploads are never overwritten.
    ext = source.suffix or _guess_ext_from_format(output_format)
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    uid = uuid.uuid4().hex[:8]
    filename = f"{_safe_stem(text)}_{voice_key}_{ts}_{uid}{ext}"
    hf_audio_path = f"{HF_AUDIO_DIR}/{filename}"
    hf_meta_path = f"{HF_META_DIR}/{filename}.json"

    meta = {
        "text": text,
        "voice_key": voice_key,
        "voice_label": voice_label,
        "voice_id": voice_id,
        "model_id": model_id,
        "output_format": output_format,
        "filename": filename,
        "size_bytes": source.stat().st_size,
        "mime_type": mimetypes.guess_type(filename)[0] or "application/octet-stream",
        "created_at": ts,
    }

    # The metadata goes through a temporary file next to the audio because
    # the upload helper takes a local path.
    tmp_meta = source.with_suffix(source.suffix + ".json.tmp")
    try:
        tmp_meta.write_text(
            json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8"
        )
        _upload_file_to_space(
            source,
            hf_audio_path,
            commit_message=f"Upload audio: {filename}",
        )
        _upload_file_to_space(
            tmp_meta,
            hf_meta_path,
            commit_message=f"Upload meta: {filename}.json",
        )
    finally:
        # Always remove the temp file, including when an upload fails.
        try:
            tmp_meta.unlink()
        except FileNotFoundError:
            pass

    # NOTE(review): assumes the Space serves repo files under
    # /file/<path_in_repo> - confirm against the running UI.
    # The path is percent-encoded because the stem may contain non-ASCII
    # letters; plain ASCII names are left unchanged by quote().
    hf_audio_url = f"{ONLINE_UI_BASE}/file/{quote(hf_audio_path)}"
    return {
        "hf_audio_path": hf_audio_path,
        "hf_meta_path": hf_meta_path,
        "hf_audio_url": hf_audio_url,
        # The UI only lists/plays/downloads files; no query params needed.
        "ui_url": ONLINE_UI_BASE,
    }
@mcp.tool
def list_voices(
    voices_map_path: str = VOICES_MAP_PATH_DEFAULT,
) -> List[Dict[str, Any]]:
    """
    List the available voices from voices.yaml.

    Returns a list whose entries contain:
    - key: the key used in code (e.g. 'sarah')
    - voice_id: the ElevenLabs voice id
    - label: the display name
    """
    return list_voices_data(load_voices_map(voices_map_path))
@mcp.tool
def generate_tts(
    text: str,
    voices: Optional[List[str]] = None,
    voice: Optional[str] = None,  # a single voice
    model_id: str = "eleven_turbo_v2",
    output_dir: str = "./outputs",
    output_format: str = "mp3_44100_128",
    language_code: Optional[str] = None,
    env_path: str = ".env",
    voices_map_path: str = VOICES_MAP_PATH_DEFAULT,
    stability: float = 0.3,
    similarity_boost: float = 0.7,
    style: float = 0.8,
    use_speaker_boost: bool = True,
    speed: Optional[float] = None,
    upload_to_hf: bool = True,  # whether to upload the result to the HF Space
) -> Dict[str, Any]:
    """
    Generate one or more TTS audio files from text and (optionally) upload
    them to the Hugging Face Space for online listening/download.

    - If NEITHER 'voices' NOR 'voice' is given:
      -> no audio is generated; the result is
         {
           "status": "need_voice_selection",
           "available_voices": [...],
           "message": "..."
         }
    - If 'voices' (list) is given -> generate for every listed voice.
    - If 'voice' (string) is given -> generate for that single voice.
    - If both are given, 'voices' takes precedence and 'voice' is ignored.

    If upload_to_hf=True:
    - requires the HF_TOKEN env var (write token)
    - uploads the audio to the Space repo: audios/<file>
    - uploads the metadata to the Space repo: meta/<file>.json
    - the result additionally contains hf_audio_url + ui_url

    stability, similarity_boost, style, use_speaker_boost and (when set)
    speed are forwarded as the voice settings of the synthesis call.

    On success returns {"status": "ok", "audios": [...]} with one entry per
    requested voice.

    Raises ValueError when a voice is selected but text is empty/blank, or
    when a requested voice cannot be resolved to a voice_id. All voices are
    resolved before any audio is generated, so an invalid voice fails the
    call before any synthesis or upload has happened.
    """
    voices_map = load_voices_map(voices_map_path)

    # A bare string passed as 'voices' would otherwise be iterated character
    # by character; treat it as a single voice name.
    if isinstance(voices, str):
        voices = [voices]

    # Determine the list of requested voices.
    if voices:
        requested: List[str] = list(voices)
    elif voice:
        requested = [voice]
    else:
        return {
            "status": "need_voice_selection",
            "message": (
                "No voice was specified. Please choose one or more voices from "
                "'available_voices' and call generate_tts again with the 'voices' "
                "parameter (or 'voice' for a single voice)."
            ),
            "available_voices": list_voices_data(voices_map),
        }

    # Reject blank text before spending an API call on it.
    if not text or not text.strip():
        raise ValueError("Text to synthesize must not be empty.")

    # Resolve every voice up front so a bad entry fails fast, before earlier
    # voices have been synthesized and uploaded.
    resolved_voices: List[Dict[str, Any]] = []
    for requested_voice in requested:
        resolved = resolve_voice(requested_voice, voices_map)
        if not resolved["voice_id"]:
            unresolved_key = resolved["voice_key"]
            raise ValueError(
                f"Could not resolve voice '{unresolved_key}' to a valid voice_id."
            )
        resolved_voices.append(resolved)

    # Prepare the client and the local output directory.
    eleven = get_eleven_client(env_path)
    base_output_dir = ensure_output_dir(output_dir)

    voice_settings: Dict[str, Any] = {
        "stability": stability,
        "similarity_boost": similarity_boost,
        "style": style,
        "use_speaker_boost": use_speaker_boost,
    }
    if speed is not None:
        voice_settings["speed"] = speed

    results: List[Dict[str, Any]] = []
    for resolved in resolved_voices:
        voice_id = resolved["voice_id"]
        voice_key = resolved["voice_key"]
        voice_label = resolved["voice_label"]

        audio_path = generate_and_save_audio(
            eleven=eleven,
            text=text,
            voice_id=voice_id,
            model_id=model_id,
            output_dir=base_output_dir,
            output_format=output_format,
            language_code=language_code,
            voice_settings=voice_settings,
        )

        item: Dict[str, Any] = {
            "text": text,
            "voice_key": voice_key,
            "voice_label": voice_label,
            "voice_id": voice_id,
            "model_id": model_id,
            "output_format": output_format,
            "audio_path": audio_path,  # local path
            "ui_url": ONLINE_UI_BASE,  # online UI (list/play/download)
        }

        if upload_to_hf:
            item.update(
                upload_audio_and_meta(
                    audio_path,
                    text=text,
                    voice_key=voice_key,
                    voice_label=voice_label,
                    voice_id=voice_id,
                    model_id=model_id,
                    output_format=output_format,
                )
            )

        # Optional: keep the query-param link for easy sharing, even though
        # the UI itself does not need it.
        item["ui_url_with_params"] = (
            f"{ONLINE_UI_BASE}/"
            f"?text={quote_plus(text)}"
            f"&voice={quote_plus(voice_key)}"
        )
        results.append(item)

    return {
        "status": "ok",
        "audios": results,
    }
if __name__ == "__main__":
    # Recommendation: do not print anything extra to stdout, so that
    # Claude Desktop (STDIO) stays stable.
    mcp.run()