| | #include <algorithm> |
| | #include <atomic> |
| | #include <chrono> |
| | #include <cstdlib> |
| | #include <filesystem> |
| | #include <fstream> |
| | #include <iostream> |
| | #include <mutex> |
| | #include <sstream> |
| | #include <string> |
| | #include <thread> |
| | #include <vector> |
| |
|
| | #ifndef NOMINMAX |
| | #define NOMINMAX |
| | #endif |
| | #include <windows.h> |
| |
|
| | #include <nlohmann/json.hpp> |
| |
|
| | #include <KittenTTS> |
| |
|
| | using json = nlohmann::json; |
| |
|
| | namespace { |
| |
|
| | constexpr const char* kServerName = "kitten-tts-mcp"; |
| | constexpr const char* kServerVersion = "0.1.0"; |
| | constexpr const char* kProtocolVersion = "2025-03-26"; |
| | constexpr const char* kFixedModelName = "nano"; |
| | constexpr const char* kDefaultVoice = "Jasper"; |
| | constexpr const char* kDefaultLocale = "en-us"; |
| | constexpr float kDefaultSpeed = 1.0f; |
| |
|
| | std::string TrimAsciiWhitespace(const std::string& value) { |
| | const auto is_space = [](unsigned char ch) { return std::isspace(ch) != 0; }; |
| |
|
| | size_t begin = 0; |
| | while (begin < value.size() && is_space(static_cast<unsigned char>(value[begin]))) { |
| | ++begin; |
| | } |
| |
|
| | size_t end = value.size(); |
| | while (end > begin && is_space(static_cast<unsigned char>(value[end - 1]))) { |
| | --end; |
| | } |
| |
|
| | return value.substr(begin, end - begin); |
| | } |
| |
|
| | std::filesystem::path GetExecutableDirectory() { |
| | wchar_t module_path[MAX_PATH] = {}; |
| | const DWORD length = GetModuleFileNameW(nullptr, module_path, MAX_PATH); |
| | if (length > 0 && length < MAX_PATH) { |
| | return std::filesystem::path(std::wstring(module_path, length)).parent_path(); |
| | } |
| | return std::filesystem::current_path(); |
| | } |
| |
|
| | std::filesystem::path FindFileNearExecutable(const std::filesystem::path& exe_dir, const std::string& filename) { |
| | std::error_code ec; |
| | std::filesystem::path probe = exe_dir; |
| |
|
| | for (int depth = 0; depth < 8; ++depth) { |
| | const std::filesystem::path candidate_same = probe / filename; |
| | if (std::filesystem::exists(candidate_same, ec)) { |
| | return candidate_same; |
| | } |
| |
|
| | const std::filesystem::path candidate_models = probe / "models" / filename; |
| | if (std::filesystem::exists(candidate_models, ec)) { |
| | return candidate_models; |
| | } |
| |
|
| | const std::filesystem::path parent = probe.parent_path(); |
| | if (parent.empty() || parent == probe) { |
| | break; |
| | } |
| | probe = parent; |
| | } |
| |
|
| | return {}; |
| | } |
| |
|
| | std::string GetEnvironmentString(const char* name, const std::string& fallback) { |
| | if (name == nullptr || name[0] == '\0') { |
| | return fallback; |
| | } |
| |
|
| | const char* value = std::getenv(name); |
| | if (value == nullptr || value[0] == '\0') { |
| | return fallback; |
| | } |
| |
|
| | return value; |
| | } |
| |
|
| | float GetEnvironmentFloat(const char* name, float fallback) { |
| | const std::string raw = GetEnvironmentString(name, ""); |
| | if (raw.empty()) { |
| | return fallback; |
| | } |
| |
|
| | try { |
| | return std::stof(raw); |
| | } catch (...) { |
| | return fallback; |
| | } |
| | } |
| |
|
| | std::vector<std::string> DefaultVoiceList() { |
| | return {"Bella", "Bruno", "Hugo", "Jasper", "Kiki", "Leo", "Luna", "Rosie"}; |
| | } |
| |
|
| | std::vector<std::string> LoadVoiceNamesFromJson(const std::filesystem::path& voices_path) { |
| | std::ifstream input(voices_path, std::ios::binary); |
| | if (!input) { |
| | return {}; |
| | } |
| |
|
| | json voices_json; |
| | try { |
| | input >> voices_json; |
| | } catch (...) { |
| | return {}; |
| | } |
| |
|
| | if (!voices_json.is_object()) { |
| | return {}; |
| | } |
| |
|
| | std::vector<std::string> voices; |
| | voices.reserve(voices_json.size()); |
| | for (auto it = voices_json.begin(); it != voices_json.end(); ++it) { |
| | voices.push_back(it.key()); |
| | } |
| |
|
| | std::sort(voices.begin(), voices.end()); |
| | voices.erase(std::unique(voices.begin(), voices.end()), voices.end()); |
| | return voices; |
| | } |
| |
|
| | void WriteJsonLine(const json& message) { |
| | std::cout << message.dump() << "\n"; |
| | std::cout.flush(); |
| | } |
| |
|
| | void WriteResult(const json& id, const json& result) { |
| | WriteJsonLine({ |
| | {"jsonrpc", "2.0"}, |
| | {"id", id}, |
| | {"result", result} |
| | }); |
| | } |
| |
|
| | void WriteError(const json& id, int code, const std::string& message, const json& data = nullptr) { |
| | json error = { |
| | {"code", code}, |
| | {"message", message} |
| | }; |
| | if (!data.is_null()) { |
| | error["data"] = data; |
| | } |
| |
|
| | WriteJsonLine({ |
| | {"jsonrpc", "2.0"}, |
| | {"id", id}, |
| | {"error", error} |
| | }); |
| | } |
| |
|
| | json MakeTextContent(const std::string& text) { |
| | return json{ |
| | {"type", "text"}, |
| | {"text", text} |
| | }; |
| | } |
| |
|
| | class KittenRuntime { |
| | public: |
| | bool Initialize() { |
| | const std::filesystem::path exe_dir = GetExecutableDirectory(); |
| |
|
| | default_voice_ = GetEnvironmentString("KITTEN_TTS_MCP_VOICE", kDefaultVoice); |
| | default_locale_ = GetEnvironmentString("KITTEN_TTS_MCP_LOCALE", kDefaultLocale); |
| | default_speed_ = GetEnvironmentFloat("KITTEN_TTS_MCP_SPEED", kDefaultSpeed); |
| |
|
| | const std::string model_filename = "kitten_tts_nano_v0_8.onnx"; |
| | const std::string voices_filename = "voices_nano.json"; |
| | model_path_ = FindFileNearExecutable(exe_dir, model_filename); |
| | voices_path_ = FindFileNearExecutable(exe_dir, voices_filename); |
| |
|
| | if (model_path_.empty() || voices_path_.empty()) { |
| | std::ostringstream oss; |
| | oss << "Could not find the fixed Kitten nano model files."; |
| | oss << " Expected " << model_filename << " and " << voices_filename << "."; |
| | last_error_ = oss.str(); |
| | return false; |
| | } |
| |
|
| | voices_ = LoadVoiceNamesFromJson(voices_path_); |
| | if (voices_.empty()) { |
| | voices_ = DefaultVoiceList(); |
| | } |
| |
|
| | kittentts::TTSSimpleServiceConfig config; |
| | config.model_name = kFixedModelName; |
| | config.model_path = model_path_.string(); |
| | config.voices_path = voices_path_.string(); |
| | config.voice = default_voice_; |
| | config.locale = default_locale_; |
| | config.speed = default_speed_; |
| |
|
| | service_.SetEventCallback([this](const kittentts::TTSSimpleService::Event& event) { |
| | OnServiceEvent(event); |
| | }); |
| |
|
| | if (!service_.Initialize(config)) { |
| | last_error_ = service_.GetLastError(); |
| | if (last_error_.empty()) { |
| | last_error_ = "TTSSimpleService initialization failed."; |
| | } |
| | return false; |
| | } |
| |
|
| | initialized_ = true; |
| | return true; |
| | } |
| |
|
| | void Shutdown() { |
| | if (!initialized_) { |
| | return; |
| | } |
| | service_.Stop(); |
| | service_.Shutdown(); |
| | initialized_ = false; |
| | } |
| |
|
| | bool IsInitialized() const { |
| | return initialized_; |
| | } |
| |
|
| | const std::string& GetLastError() const { |
| | return last_error_; |
| | } |
| |
|
| | const std::vector<std::string>& GetVoices() const { |
| | return voices_; |
| | } |
| |
|
| | bool Speak( |
| | const std::string& text, |
| | const std::string& voice, |
| | const std::string& locale, |
| | float speed, |
| | bool blocking, |
| | std::string& message_out) { |
| | message_out.clear(); |
| |
|
| | if (!initialized_) { |
| | message_out = "TTS runtime is not initialized."; |
| | return false; |
| | } |
| |
|
| | const std::string trimmed_text = TrimAsciiWhitespace(text); |
| | if (trimmed_text.empty()) { |
| | message_out = "Text must not be empty."; |
| | return false; |
| | } |
| |
|
| | if (speed < 0.5f || speed > 2.0f) { |
| | message_out = "Speed must be between 0.5 and 2.0."; |
| | return false; |
| | } |
| |
|
| | const std::string effective_voice = voice.empty() ? default_voice_ : voice; |
| | const std::string effective_locale = locale.empty() ? default_locale_ : locale; |
| | if (!voices_.empty() && |
| | std::find(voices_.begin(), voices_.end(), effective_voice) == voices_.end()) { |
| | message_out = "Unknown voice: " + effective_voice; |
| | return false; |
| | } |
| |
|
| | { |
| | std::lock_guard<std::mutex> lock(state_mutex_); |
| | last_error_.clear(); |
| | terminal_state_ = TerminalState::None; |
| | terminal_message_.clear(); |
| | } |
| |
|
| | service_.ClearLastError(); |
| | service_.SetVoice(effective_voice); |
| | service_.SetLocale(effective_locale); |
| | service_.SetSpeed(speed); |
| |
|
| | if (!service_.PlayTextUtf8(trimmed_text)) { |
| | message_out = service_.GetLastError(); |
| | if (message_out.empty()) { |
| | message_out = "Speech request failed."; |
| | } |
| | return false; |
| | } |
| |
|
| | if (!blocking) { |
| | message_out = "Speech started."; |
| | return true; |
| | } |
| |
|
| | while (service_.IsPlaybackActive()) { |
| | std::this_thread::sleep_for(std::chrono::milliseconds(25)); |
| | } |
| |
|
| | { |
| | std::lock_guard<std::mutex> lock(state_mutex_); |
| | if (terminal_state_ == TerminalState::Error) { |
| | message_out = terminal_message_.empty() ? "Speech failed." : terminal_message_; |
| | return false; |
| | } |
| | if (terminal_state_ == TerminalState::Stopped) { |
| | message_out = terminal_message_.empty() ? "Speech was stopped." : terminal_message_; |
| | return false; |
| | } |
| | } |
| |
|
| | message_out = "Speech completed."; |
| | return true; |
| | } |
| |
|
| | bool Stop(std::string& message_out) { |
| | message_out.clear(); |
| |
|
| | if (!initialized_) { |
| | message_out = "TTS runtime is not initialized."; |
| | return false; |
| | } |
| |
|
| | service_.Stop(); |
| | message_out = "Playback stopped."; |
| | return true; |
| | } |
| |
|
| | private: |
| | enum class TerminalState { |
| | None = 0, |
| | Completed, |
| | Stopped, |
| | Error |
| | }; |
| |
|
| | void OnServiceEvent(const kittentts::TTSSimpleService::Event& event) { |
| | std::lock_guard<std::mutex> lock(state_mutex_); |
| |
|
| | switch (event.type) { |
| | case kittentts::TTSSimpleService::EventType::PlaybackCompleted: |
| | terminal_state_ = TerminalState::Completed; |
| | terminal_message_ = "Speech completed."; |
| | break; |
| | case kittentts::TTSSimpleService::EventType::PlaybackStopped: |
| | if (terminal_state_ != TerminalState::Error) { |
| | terminal_state_ = TerminalState::Stopped; |
| | terminal_message_ = event.message.empty() ? "Speech was stopped." : event.message; |
| | } |
| | break; |
| | case kittentts::TTSSimpleService::EventType::Error: |
| | terminal_state_ = TerminalState::Error; |
| | terminal_message_ = event.message.empty() ? "Speech failed." : event.message; |
| | last_error_ = terminal_message_; |
| | break; |
| | default: |
| | break; |
| | } |
| | } |
| |
|
| | kittentts::TTSSimpleService service_; |
| | bool initialized_ = false; |
| | std::filesystem::path model_path_; |
| | std::filesystem::path voices_path_; |
| | std::string default_voice_; |
| | std::string default_locale_; |
| | float default_speed_ = kDefaultSpeed; |
| | std::vector<std::string> voices_; |
| | mutable std::mutex state_mutex_; |
| | TerminalState terminal_state_ = TerminalState::None; |
| | std::string terminal_message_; |
| | std::string last_error_; |
| | }; |
| |
|
| | KittenRuntime g_runtime; |
| |
|
| | void HandleInitialize(const json& request) { |
| | const json id = request.contains("id") ? request["id"] : json(nullptr); |
| | const json params = request.value("params", json::object()); |
| | const std::string requested_version = params.value("protocolVersion", ""); |
| |
|
| | json result = { |
| | {"protocolVersion", kProtocolVersion}, |
| | {"capabilities", { |
| | {"tools", { |
| | {"listChanged", false} |
| | }} |
| | }}, |
| | {"serverInfo", { |
| | {"name", kServerName}, |
| | {"version", kServerVersion} |
| | }}, |
| | {"instructions", "KittenTTS MCP server. Tools: speak, stop_speaking, list_voices."} |
| | }; |
| |
|
| | if (!requested_version.empty() && requested_version != kProtocolVersion) { |
| | result["warnings"] = json::array({ |
| | "Client requested a different protocolVersion; server is responding with 2025-03-26." |
| | }); |
| | } |
| |
|
| | WriteResult(id, result); |
| | } |
| |
|
| | void HandleToolsList(const json& request) { |
| | const json id = request.contains("id") ? request["id"] : json(nullptr); |
| |
|
| | json tools = json::array(); |
| | tools.push_back({ |
| | {"name", "speak"}, |
| | {"title", "Speak Text"}, |
| | {"description", "Speak text aloud on the local machine using KittenTTS."}, |
| | {"inputSchema", { |
| | {"type", "object"}, |
| | {"properties", { |
| | {"text", { |
| | {"type", "string"}, |
| | {"description", "The text to speak."} |
| | }}, |
| | {"voice", { |
| | {"type", "string"}, |
| | {"description", "Optional KittenTTS voice name."} |
| | }}, |
| | {"locale", { |
| | {"type", "string"}, |
| | {"description", "Optional eSpeak locale. Default is en-us."} |
| | }}, |
| | {"speed", { |
| | {"type", "number"}, |
| | {"description", "Playback speed from 0.5 to 2.0."}, |
| | {"default", kDefaultSpeed} |
| | }}, |
| | {"blocking", { |
| | {"type", "boolean"}, |
| | {"description", "If true, wait until playback completes or fails."}, |
| | {"default", false} |
| | }} |
| | }}, |
| | {"required", json::array({"text"})}, |
| | {"additionalProperties", false} |
| | }} |
| | }); |
| |
|
| | tools.push_back({ |
| | {"name", "stop_speaking"}, |
| | {"title", "Stop Speaking"}, |
| | {"description", "Stop current local KittenTTS playback."}, |
| | {"inputSchema", { |
| | {"type", "object"}, |
| | {"properties", json::object()}, |
| | {"additionalProperties", false} |
| | }} |
| | }); |
| |
|
| | tools.push_back({ |
| | {"name", "list_voices"}, |
| | {"title", "List Voices"}, |
| | {"description", "List the predefined KittenTTS voices available to this server."}, |
| | {"inputSchema", { |
| | {"type", "object"}, |
| | {"properties", json::object()}, |
| | {"additionalProperties", false} |
| | }} |
| | }); |
| |
|
| | WriteResult(id, {{"tools", tools}}); |
| | } |
| |
|
| | void HandleToolsCall(const json& request) { |
| | const json id = request.contains("id") ? request["id"] : json(nullptr); |
| | const json params = request.value("params", json::object()); |
| | const std::string name = params.value("name", ""); |
| | const json args = params.value("arguments", json::object()); |
| |
|
| | if (name == "speak") { |
| | if (!args.contains("text") || !args["text"].is_string()) { |
| | WriteError(id, -32602, "Invalid arguments: 'text' must be a string"); |
| | return; |
| | } |
| |
|
| | const std::string text = args["text"].get<std::string>(); |
| | const std::string voice = args.value("voice", ""); |
| | const std::string locale = args.value("locale", ""); |
| | const float speed = args.contains("speed") ? args["speed"].get<float>() : kDefaultSpeed; |
| | const bool blocking = args.value("blocking", false); |
| |
|
| | std::string runtime_message; |
| | const bool ok = g_runtime.Speak(text, voice, locale, speed, blocking, runtime_message); |
| |
|
| | const std::string effective_voice = voice.empty() ? kDefaultVoice : voice; |
| | const std::string effective_locale = locale.empty() ? kDefaultLocale : locale; |
| | const std::string content_text = ok ? runtime_message : ("TTS error: " + runtime_message); |
| |
|
| | WriteResult(id, { |
| | {"content", json::array({MakeTextContent(content_text)})}, |
| | {"structuredContent", { |
| | {"ok", ok}, |
| | {"voice", effective_voice}, |
| | {"locale", effective_locale}, |
| | {"speed", speed}, |
| | {"blocking", blocking}, |
| | {"message", runtime_message} |
| | }}, |
| | {"isError", !ok} |
| | }); |
| | return; |
| | } |
| |
|
| | if (name == "stop_speaking") { |
| | std::string runtime_message; |
| | const bool ok = g_runtime.Stop(runtime_message); |
| |
|
| | WriteResult(id, { |
| | {"content", json::array({MakeTextContent(runtime_message)})}, |
| | {"structuredContent", { |
| | {"ok", ok}, |
| | {"message", runtime_message} |
| | }}, |
| | {"isError", !ok} |
| | }); |
| | return; |
| | } |
| |
|
| | if (name == "list_voices") { |
| | const auto& voices = g_runtime.GetVoices(); |
| | std::ostringstream oss; |
| | oss << "Available voices:"; |
| | for (const auto& voice_name : voices) { |
| | oss << " " << voice_name; |
| | } |
| |
|
| | WriteResult(id, { |
| | {"content", json::array({MakeTextContent(oss.str())})}, |
| | {"structuredContent", { |
| | {"voices", voices} |
| | }}, |
| | {"isError", false} |
| | }); |
| | return; |
| | } |
| |
|
| | WriteError(id, -32601, "Unknown tool: " + name); |
| | } |
| |
|
| | } |
| |
|
| | int main() { |
| | std::ios::sync_with_stdio(false); |
| |
|
| | std::cerr << "[mcp] " << kServerName << " starting\n"; |
| | if (!g_runtime.Initialize()) { |
| | std::cerr << "[mcp] failed to initialize KittenTTS runtime: " << g_runtime.GetLastError() << "\n"; |
| | return 1; |
| | } |
| |
|
| | std::string line; |
| | while (std::getline(std::cin, line)) { |
| | if (line.empty()) { |
| | continue; |
| | } |
| |
|
| | try { |
| | const json request = json::parse(line); |
| |
|
| | if (!request.contains("jsonrpc") || request["jsonrpc"] != "2.0") { |
| | if (request.contains("id")) { |
| | WriteError(request["id"], -32600, "Invalid Request"); |
| | } |
| | continue; |
| | } |
| |
|
| | const std::string method = request.value("method", ""); |
| | const bool is_notification = !request.contains("id"); |
| |
|
| | if (method == "initialize") { |
| | if (!is_notification) { |
| | HandleInitialize(request); |
| | } |
| | } else if (method == "notifications/initialized") { |
| | std::cerr << "[mcp] client initialized\n"; |
| | } else if (method == "tools/list") { |
| | if (!is_notification) { |
| | HandleToolsList(request); |
| | } |
| | } else if (method == "tools/call") { |
| | if (!is_notification) { |
| | HandleToolsCall(request); |
| | } |
| | } else if (method == "ping") { |
| | if (!is_notification) { |
| | WriteResult(request["id"], json::object()); |
| | } |
| | } else { |
| | if (!is_notification) { |
| | WriteError(request["id"], -32601, "Method not found: " + method); |
| | } |
| | } |
| | } catch (const std::exception& e) { |
| | std::cerr << "[mcp] parse/dispatch error: " << e.what() << "\n"; |
| | } |
| | } |
| |
|
| | std::cerr << "[mcp] stdin closed, exiting\n"; |
| | g_runtime.Shutdown(); |
| | return 0; |
| | } |
| |
|