#include #include #include #include #include #include #include #include #include #include #include #include #ifndef NOMINMAX #define NOMINMAX #endif #include #include #include using json = nlohmann::json; namespace { constexpr const char* kServerName = "kitten-tts-mcp"; constexpr const char* kServerVersion = "0.1.0"; constexpr const char* kProtocolVersion = "2025-03-26"; constexpr const char* kFixedModelName = "nano"; constexpr const char* kDefaultVoice = "Jasper"; constexpr const char* kDefaultLocale = "en-us"; constexpr float kDefaultSpeed = 1.0f; std::string TrimAsciiWhitespace(const std::string& value) { const auto is_space = [](unsigned char ch) { return std::isspace(ch) != 0; }; size_t begin = 0; while (begin < value.size() && is_space(static_cast(value[begin]))) { ++begin; } size_t end = value.size(); while (end > begin && is_space(static_cast(value[end - 1]))) { --end; } return value.substr(begin, end - begin); } std::filesystem::path GetExecutableDirectory() { wchar_t module_path[MAX_PATH] = {}; const DWORD length = GetModuleFileNameW(nullptr, module_path, MAX_PATH); if (length > 0 && length < MAX_PATH) { return std::filesystem::path(std::wstring(module_path, length)).parent_path(); } return std::filesystem::current_path(); } std::filesystem::path FindFileNearExecutable(const std::filesystem::path& exe_dir, const std::string& filename) { std::error_code ec; std::filesystem::path probe = exe_dir; for (int depth = 0; depth < 8; ++depth) { const std::filesystem::path candidate_same = probe / filename; if (std::filesystem::exists(candidate_same, ec)) { return candidate_same; } const std::filesystem::path candidate_models = probe / "models" / filename; if (std::filesystem::exists(candidate_models, ec)) { return candidate_models; } const std::filesystem::path parent = probe.parent_path(); if (parent.empty() || parent == probe) { break; } probe = parent; } return {}; } std::string GetEnvironmentString(const char* name, const std::string& fallback) { if (name == nullptr || name[0] == '\0') { return fallback; } const char* value = std::getenv(name); if (value == nullptr || value[0] == '\0') { return fallback; } return value; } float GetEnvironmentFloat(const char* name, float fallback) { const std::string raw = GetEnvironmentString(name, ""); if (raw.empty()) { return fallback; } try { return std::stof(raw); } catch (...) { return fallback; } } std::vector DefaultVoiceList() { return {"Bella", "Bruno", "Hugo", "Jasper", "Kiki", "Leo", "Luna", "Rosie"}; } std::vector LoadVoiceNamesFromJson(const std::filesystem::path& voices_path) { std::ifstream input(voices_path, std::ios::binary); if (!input) { return {}; } json voices_json; try { input >> voices_json; } catch (...) { return {}; } if (!voices_json.is_object()) { return {}; } std::vector voices; voices.reserve(voices_json.size()); for (auto it = voices_json.begin(); it != voices_json.end(); ++it) { voices.push_back(it.key()); } std::sort(voices.begin(), voices.end()); voices.erase(std::unique(voices.begin(), voices.end()), voices.end()); return voices; } void WriteJsonLine(const json& message) { std::cout << message.dump() << "\n"; std::cout.flush(); } void WriteResult(const json& id, const json& result) { WriteJsonLine({ {"jsonrpc", "2.0"}, {"id", id}, {"result", result} }); } void WriteError(const json& id, int code, const std::string& message, const json& data = nullptr) { json error = { {"code", code}, {"message", message} }; if (!data.is_null()) { error["data"] = data; } WriteJsonLine({ {"jsonrpc", "2.0"}, {"id", id}, {"error", error} }); } json MakeTextContent(const std::string& text) { return json{ {"type", "text"}, {"text", text} }; } class KittenRuntime { public: bool Initialize() { const std::filesystem::path exe_dir = GetExecutableDirectory(); default_voice_ = GetEnvironmentString("KITTEN_TTS_MCP_VOICE", kDefaultVoice); default_locale_ = GetEnvironmentString("KITTEN_TTS_MCP_LOCALE", kDefaultLocale); default_speed_ = GetEnvironmentFloat("KITTEN_TTS_MCP_SPEED", kDefaultSpeed); const std::string model_filename = "kitten_tts_nano_v0_8.onnx"; const std::string voices_filename = "voices_nano.json"; model_path_ = FindFileNearExecutable(exe_dir, model_filename); voices_path_ = FindFileNearExecutable(exe_dir, voices_filename); if (model_path_.empty() || voices_path_.empty()) { std::ostringstream oss; oss << "Could not find the fixed Kitten nano model files."; oss << " Expected " << model_filename << " and " << voices_filename << "."; last_error_ = oss.str(); return false; } voices_ = LoadVoiceNamesFromJson(voices_path_); if (voices_.empty()) { voices_ = DefaultVoiceList(); } kittentts::TTSSimpleServiceConfig config; config.model_name = kFixedModelName; config.model_path = model_path_.string(); config.voices_path = voices_path_.string(); config.voice = default_voice_; config.locale = default_locale_; config.speed = default_speed_; service_.SetEventCallback([this](const kittentts::TTSSimpleService::Event& event) { OnServiceEvent(event); }); if (!service_.Initialize(config)) { last_error_ = service_.GetLastError(); if (last_error_.empty()) { last_error_ = "TTSSimpleService initialization failed."; } return false; } initialized_ = true; return true; } void Shutdown() { if (!initialized_) { return; } service_.Stop(); service_.Shutdown(); initialized_ = false; } bool IsInitialized() const { return initialized_; } const std::string& GetLastError() const { return last_error_; } const std::vector& GetVoices() const { return voices_; } bool Speak( const std::string& text, const std::string& voice, const std::string& locale, float speed, bool blocking, std::string& message_out) { message_out.clear(); if (!initialized_) { message_out = "TTS runtime is not initialized."; return false; } const std::string trimmed_text = TrimAsciiWhitespace(text); if (trimmed_text.empty()) { message_out = "Text must not be empty."; return false; } if (speed < 0.5f || speed > 2.0f) { message_out = "Speed must be between 0.5 and 2.0."; return false; } const std::string effective_voice = voice.empty() ? default_voice_ : voice; const std::string effective_locale = locale.empty() ? default_locale_ : locale; if (!voices_.empty() && std::find(voices_.begin(), voices_.end(), effective_voice) == voices_.end()) { message_out = "Unknown voice: " + effective_voice; return false; } { std::lock_guard lock(state_mutex_); last_error_.clear(); terminal_state_ = TerminalState::None; terminal_message_.clear(); } service_.ClearLastError(); service_.SetVoice(effective_voice); service_.SetLocale(effective_locale); service_.SetSpeed(speed); if (!service_.PlayTextUtf8(trimmed_text)) { message_out = service_.GetLastError(); if (message_out.empty()) { message_out = "Speech request failed."; } return false; } if (!blocking) { message_out = "Speech started."; return true; } while (service_.IsPlaybackActive()) { std::this_thread::sleep_for(std::chrono::milliseconds(25)); } { std::lock_guard lock(state_mutex_); if (terminal_state_ == TerminalState::Error) { message_out = terminal_message_.empty() ? "Speech failed." : terminal_message_; return false; } if (terminal_state_ == TerminalState::Stopped) { message_out = terminal_message_.empty() ? "Speech was stopped." : terminal_message_; return false; } } message_out = "Speech completed."; return true; } bool Stop(std::string& message_out) { message_out.clear(); if (!initialized_) { message_out = "TTS runtime is not initialized."; return false; } service_.Stop(); message_out = "Playback stopped."; return true; } private: enum class TerminalState { None = 0, Completed, Stopped, Error }; void OnServiceEvent(const kittentts::TTSSimpleService::Event& event) { std::lock_guard lock(state_mutex_); switch (event.type) { case kittentts::TTSSimpleService::EventType::PlaybackCompleted: terminal_state_ = TerminalState::Completed; terminal_message_ = "Speech completed."; break; case kittentts::TTSSimpleService::EventType::PlaybackStopped: if (terminal_state_ != TerminalState::Error) { terminal_state_ = TerminalState::Stopped; terminal_message_ = event.message.empty() ? "Speech was stopped." : event.message; } break; case kittentts::TTSSimpleService::EventType::Error: terminal_state_ = TerminalState::Error; terminal_message_ = event.message.empty() ? "Speech failed." : event.message; last_error_ = terminal_message_; break; default: break; } } kittentts::TTSSimpleService service_; bool initialized_ = false; std::filesystem::path model_path_; std::filesystem::path voices_path_; std::string default_voice_; std::string default_locale_; float default_speed_ = kDefaultSpeed; std::vector voices_; mutable std::mutex state_mutex_; TerminalState terminal_state_ = TerminalState::None; std::string terminal_message_; std::string last_error_; }; KittenRuntime g_runtime; void HandleInitialize(const json& request) { const json id = request.contains("id") ? request["id"] : json(nullptr); const json params = request.value("params", json::object()); const std::string requested_version = params.value("protocolVersion", ""); json result = { {"protocolVersion", kProtocolVersion}, {"capabilities", { {"tools", { {"listChanged", false} }} }}, {"serverInfo", { {"name", kServerName}, {"version", kServerVersion} }}, {"instructions", "KittenTTS MCP server. Tools: speak, stop_speaking, list_voices."} }; if (!requested_version.empty() && requested_version != kProtocolVersion) { result["warnings"] = json::array({ "Client requested a different protocolVersion; server is responding with 2025-03-26." }); } WriteResult(id, result); } void HandleToolsList(const json& request) { const json id = request.contains("id") ? request["id"] : json(nullptr); json tools = json::array(); tools.push_back({ {"name", "speak"}, {"title", "Speak Text"}, {"description", "Speak text aloud on the local machine using KittenTTS."}, {"inputSchema", { {"type", "object"}, {"properties", { {"text", { {"type", "string"}, {"description", "The text to speak."} }}, {"voice", { {"type", "string"}, {"description", "Optional KittenTTS voice name."} }}, {"locale", { {"type", "string"}, {"description", "Optional eSpeak locale. Default is en-us."} }}, {"speed", { {"type", "number"}, {"description", "Playback speed from 0.5 to 2.0."}, {"default", kDefaultSpeed} }}, {"blocking", { {"type", "boolean"}, {"description", "If true, wait until playback completes or fails."}, {"default", false} }} }}, {"required", json::array({"text"})}, {"additionalProperties", false} }} }); tools.push_back({ {"name", "stop_speaking"}, {"title", "Stop Speaking"}, {"description", "Stop current local KittenTTS playback."}, {"inputSchema", { {"type", "object"}, {"properties", json::object()}, {"additionalProperties", false} }} }); tools.push_back({ {"name", "list_voices"}, {"title", "List Voices"}, {"description", "List the predefined KittenTTS voices available to this server."}, {"inputSchema", { {"type", "object"}, {"properties", json::object()}, {"additionalProperties", false} }} }); WriteResult(id, {{"tools", tools}}); } void HandleToolsCall(const json& request) { const json id = request.contains("id") ? request["id"] : json(nullptr); const json params = request.value("params", json::object()); const std::string name = params.value("name", ""); const json args = params.value("arguments", json::object()); if (name == "speak") { if (!args.contains("text") || !args["text"].is_string()) { WriteError(id, -32602, "Invalid arguments: 'text' must be a string"); return; } const std::string text = args["text"].get(); const std::string voice = args.value("voice", ""); const std::string locale = args.value("locale", ""); const float speed = args.contains("speed") ? args["speed"].get() : kDefaultSpeed; const bool blocking = args.value("blocking", false); std::string runtime_message; const bool ok = g_runtime.Speak(text, voice, locale, speed, blocking, runtime_message); const std::string effective_voice = voice.empty() ? kDefaultVoice : voice; const std::string effective_locale = locale.empty() ? kDefaultLocale : locale; const std::string content_text = ok ? runtime_message : ("TTS error: " + runtime_message); WriteResult(id, { {"content", json::array({MakeTextContent(content_text)})}, {"structuredContent", { {"ok", ok}, {"voice", effective_voice}, {"locale", effective_locale}, {"speed", speed}, {"blocking", blocking}, {"message", runtime_message} }}, {"isError", !ok} }); return; } if (name == "stop_speaking") { std::string runtime_message; const bool ok = g_runtime.Stop(runtime_message); WriteResult(id, { {"content", json::array({MakeTextContent(runtime_message)})}, {"structuredContent", { {"ok", ok}, {"message", runtime_message} }}, {"isError", !ok} }); return; } if (name == "list_voices") { const auto& voices = g_runtime.GetVoices(); std::ostringstream oss; oss << "Available voices:"; for (const auto& voice_name : voices) { oss << " " << voice_name; } WriteResult(id, { {"content", json::array({MakeTextContent(oss.str())})}, {"structuredContent", { {"voices", voices} }}, {"isError", false} }); return; } WriteError(id, -32601, "Unknown tool: " + name); } } // namespace int main() { std::ios::sync_with_stdio(false); std::cerr << "[mcp] " << kServerName << " starting\n"; if (!g_runtime.Initialize()) { std::cerr << "[mcp] failed to initialize KittenTTS runtime: " << g_runtime.GetLastError() << "\n"; return 1; } std::string line; while (std::getline(std::cin, line)) { if (line.empty()) { continue; } try { const json request = json::parse(line); if (!request.contains("jsonrpc") || request["jsonrpc"] != "2.0") { if (request.contains("id")) { WriteError(request["id"], -32600, "Invalid Request"); } continue; } const std::string method = request.value("method", ""); const bool is_notification = !request.contains("id"); if (method == "initialize") { if (!is_notification) { HandleInitialize(request); } } else if (method == "notifications/initialized") { std::cerr << "[mcp] client initialized\n"; } else if (method == "tools/list") { if (!is_notification) { HandleToolsList(request); } } else if (method == "tools/call") { if (!is_notification) { HandleToolsCall(request); } } else if (method == "ping") { if (!is_notification) { WriteResult(request["id"], json::object()); } } else { if (!is_notification) { WriteError(request["id"], -32601, "Method not found: " + method); } } } catch (const std::exception& e) { std::cerr << "[mcp] parse/dispatch error: " << e.what() << "\n"; } } std::cerr << "[mcp] stdin closed, exiting\n"; g_runtime.Shutdown(); return 0; }