require("dotenv").config();
const express = require("express");
const WebSocket = require("ws");
const crypto = require("crypto");
const fs = require("fs");
const path = require("path");

const app = express();

// ─── Config ──────────────────────────────────────────────────────────────────

/**
 * Read an integer setting from the environment.
 *
 * Falls back to `fallback` when the variable is unset, not a number, or below
 * `min`. Without this guard a typo in the environment yields NaN, which makes
 * every `x > NaN` limit check false and turns the WebSocket timeout into an
 * immediate one.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is missing/invalid.
 * @param {number} [min=1] - Smallest accepted value.
 * @returns {number}
 */
function envInt(name, fallback, min = 1) {
  const parsed = Number.parseInt(process.env[name] ?? "", 10);
  return Number.isInteger(parsed) && parsed >= min ? parsed : fallback;
}

const API_KEY = process.env.API_KEY || "sk-test-key";
const DEEPGRAM_API_KEY = process.env.DEEPGRAM_API_KEY || "";
const PORT = process.env.PORT || 7860;
const WS_TIMEOUT_MS = envInt("WS_TIMEOUT_MS", 60000);
const MAX_INPUT_LENGTH = envInt("MAX_INPUT_LENGTH", 4096);
// 0 is allowed here: it means "keep nothing in the cache".
const MAX_CACHE_FILES = envInt("MAX_CACHE_FILES", 10, 0);
const CACHE_DIR = path.join("/tmp/", "cache");

if (!process.env.API_KEY) {
  console.warn("[Config] API_KEY is not set — falling back to the built-in test key. Do not use this in production.");
}

// `recursive: true` makes this a no-op when the directory already exists.
fs.mkdirSync(CACHE_DIR, { recursive: true });

// ─── Middleware ──────────────────────────────────────────────────────────────
app.use(express.json({ limit: "1mb" }));

// Permissive CORS so browser-based clients can call the proxy directly.
app.use((_req, res, next) => {
  res.setHeader("Access-Control-Allow-Origin", "*");
  res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS, DELETE");
  res.setHeader("Access-Control-Allow-Headers", "Content-Type, Authorization");
  next();
});
app.options("*", (_req, res) => res.sendStatus(204));

// Tag every request with an OpenAI-style id used in logs and `x-request-id`.
app.use((req, _res, next) => {
  req.id = `chatcmpl-${crypto.randomBytes(12).toString("hex")}`;
  next();
});

// ─── OpenAI Error Format ─────────────────────────────────────────────────────

/**
 * Send an error response shaped like the OpenAI API error envelope.
 *
 * @param {import("express").Response} res - Response to write to.
 * @param {number} status - HTTP status code.
 * @param {string} message - Human-readable error message.
 * @param {string} [type="invalid_request_error"] - OpenAI error type.
 * @param {string|null} [param=null] - Name of the offending parameter, if any.
 */
function openaiError(res, status, message, type = "invalid_request_error", param = null) {
  let code = null;
  if (status === 401) code = "invalid_api_key";
  else if (status === 429) code = "rate_limit_exceeded";

  res.status(status).json({ error: { message, type, param, code } });
}

// ─── Auth ────────────────────────────────────────────────────────────────────

/**
 * Compare two secrets without leaking their contents through timing.
 * Both sides are hashed first so the buffers handed to `timingSafeEqual`
 * always have the same length (it throws otherwise).
 *
 * @param {string} a
 * @param {string} b
 * @returns {boolean}
 */
function secretsMatch(a, b) {
  const digest = (value) => crypto.createHash("sha256").update(value).digest();
  return crypto.timingSafeEqual(digest(a), digest(b));
}

/**
 * Express middleware enforcing `Authorization: Bearer <API_KEY>`.
 * Responds with an OpenAI-style 401 when the header is missing or wrong.
 */
function auth(req, res, next) {
  const header = req.headers.authorization;
  if (!header || !header.startsWith("Bearer ")) {
    return openaiError(res, 401, "You didn't provide an API key. Provide your API key in an Authorization header using Bearer auth.");
  }
  if (!secretsMatch(header.slice(7).trim(), API_KEY)) {
    return openaiError(res, 401, "Incorrect API key provided.");
  }
  next();
}

// ─── WAV Header ──────────────────────────────────────────────────────────────

/**
 * Build a 44-byte canonical PCM WAV (RIFF) header.
 *
 * @param {number} dataSize - Size of the PCM payload in bytes.
 * @param {number} [sampleRate=24000] - Samples per second.
 * @param {number} [bits=16] - Bits per sample.
 * @param {number} [ch=1] - Channel count.
 * @returns {Buffer} The header, ready to be prepended to the PCM payload.
 */
function wavHeader(dataSize, sampleRate = 24000, bits = 16, ch = 1) {
  const UINT32_MAX = 0xFFFFFFFF;
  const blockAlign = ch * (bits / 8);
  const h = Buffer.alloc(44);

  h.write("RIFF", 0);
  // Both size fields are 32-bit; clamp instead of throwing a RangeError when a
  // caller passes a "size unknown" sentinel or an oversized payload.
  h.writeUInt32LE(Math.min(36 + dataSize, UINT32_MAX), 4);
  h.write("WAVE", 8);
  h.write("fmt ", 12);
  h.writeUInt32LE(16, 16);                      // fmt chunk length
  h.writeUInt16LE(1, 20);                       // audio format 1 = PCM
  h.writeUInt16LE(ch, 22);
  h.writeUInt32LE(sampleRate, 24);
  h.writeUInt32LE(sampleRate * blockAlign, 28); // byte rate
  h.writeUInt16LE(blockAlign, 32);
  h.writeUInt16LE(bits, 34);
  h.write("data", 36);
  h.writeUInt32LE(Math.min(dataSize, UINT32_MAX), 40);
  return h;
}

// ─── Cache Cleanup ───────────────────────────────────────────────────────────

/**
 * Delete the oldest cached audio files so that at most MAX_CACHE_FILES remain.
 * Best-effort: any filesystem error is logged and swallowed.
 */
function cleanupCache() {
  try {
    const newestFirst = fs.readdirSync(CACHE_DIR)
      .map((name) => ({ name, time: fs.statSync(path.join(CACHE_DIR, name)).mtimeMs }))
      .sort((a, b) => b.time - a.time);

    for (const { name } of newestFirst.slice(MAX_CACHE_FILES)) {
      fs.unlinkSync(path.join(CACHE_DIR, name));
      console.log(`[Cache] Deleted: ${name}`);
    }
  } catch (e) {
    console.error("[Cache] Cleanup error:", e.message);
  }
}

// ─── Voice Registry ──────────────────────────────────────────────────────────
// Maps a public voice name to the `agent.speak` configuration sent to Deepgram.
// Built once at module load rather than on every synthesis call.
const DEFAULT_VOICE = "jessica";
const VOICE_REGISTRY = Object.freeze({
  // ── ElevenLabs ──────────────────────────────────────────────────
  jessica: { provider: { type: "eleven_labs", model_id: "eleven_multilingual_v2", voice_id: "cgSgspJ2msm6clMCkdW9" } },
  daniel: { provider: { type: "eleven_labs", model_id: "eleven_multilingual_v2", voice_id: "onwK4e9ZLuTAKqWW03F9" } },
  piper: { provider: { type: "eleven_labs", model_id: "eleven_multilingual_v2", voice_id: "DtsPFCrhbCbbJkwZsb3d" } },
  mark: { provider: { type: "eleven_labs", model_id: "eleven_multilingual_v2", voice_id: "UgBBYS2sOqTuMpoF3BR0" } },
  // ── Cartesia ────────────────────────────────────────────────────
  kentucky_man: { provider: { type: "cartesia", model_id: "sonic-2", voice: { mode: "id", id: "726d5ae5-055f-4c3d-8355-d9677de68937" } } },
  helpful_woman: { provider: { type: "cartesia", model_id: "sonic-2", voice: { mode: "id", id: "156fb8d2-335b-4950-9cb3-a2d33befec77" } } },
});

/**
 * Resolve a user-supplied voice name to a registry entry.
 * Names are case-insensitive and spaces/hyphens are treated as underscores.
 * Unknown or non-string names fall back to the default voice. Only own keys
 * are considered, so names like "constructor" cannot resolve to inherited
 * Object.prototype members.
 *
 * @param {unknown} voiceName
 * @returns {object} The `speak` configuration for the Deepgram settings message.
 */
function resolveVoice(voiceName) {
  const raw = typeof voiceName === "string" ? voiceName : DEFAULT_VOICE;
  const key = raw.toLowerCase().replace(/[\s-]+/g, "_");
  return Object.prototype.hasOwnProperty.call(VOICE_REGISTRY, key)
    ? VOICE_REGISTRY[key]
    : VOICE_REGISTRY[DEFAULT_VOICE];
}

// ─── Synthesize via Deepgram Agent WebSocket ─────────────────────────────────

/**
 * Synthesize `text` by opening a Deepgram agent session whose greeting is the
 * text, and collecting the binary audio frames it sends back.
 *
 * When `res` is supplied, audio is also streamed to it as it arrives. In that
 * mode this function owns the response once the first byte is written: it ends
 * the response on success and destroys it on failure, so a started stream is
 * never left hanging.
 *
 * @param {string} text - Text to speak.
 * @param {string} requestId - Id used in logs and the `x-request-id` header.
 * @param {string} [voiceName="jessica"] - Voice registry key (case-insensitive).
 * @param {import("express").Response|null} [res=null] - Optional response to stream into.
 * @param {string} [response_format="wav"] - "pcm" streams raw PCM; anything else streams WAV.
 * @returns {Promise<Buffer>} Raw PCM audio (requested as linear16 at 24 kHz, no WAV header).
 *   Rejects on timeout, upstream error, client disconnect, or a socket that
 *   closes before any audio arrived.
 */
function synthesize(text, requestId, voiceName = "jessica", res = null, response_format = "wav") {
  // Resolve the voice before opening the socket so bad input cannot leave a
  // dangling connection and timer behind.
  const voiceConfig = resolveVoice(voiceName);

  return new Promise((resolve, reject) => {
    const audioChunks = [];
    let settled = false;
    let headersSent = false;
    const t0 = Date.now();

    const ws = new WebSocket("wss://agent.deepgram.com/v1/agent/converse", [
      "token",
      DEEPGRAM_API_KEY,
    ]);

    // Single exit point: settles the promise exactly once and releases the
    // timer, the upstream socket and (if streaming already started) the response.
    const finish = (err, pcm) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      if (res && headersSent && !res.writableEnded) {
        // A truncated stream is aborted rather than ended cleanly, so the
        // client can tell the audio is incomplete.
        if (err) res.destroy();
        else res.end();
      }
      ws.close();
      if (err) reject(err);
      else resolve(pcm);
    };

    const timer = setTimeout(() => finish(new Error("TTS timed out")), WS_TIMEOUT_MS);

    // Stop paying for upstream synthesis when the HTTP client goes away.
    // After a normal completion this fires too, but `finish` is then a no-op.
    if (res) res.once("close", () => finish(new Error("Client disconnected")));

    // Forward one audio chunk to the HTTP response, emitting headers first.
    const streamChunk = (chunk) => {
      if (!res) return;
      if (!headersSent) {
        res.setHeader("Content-Type", response_format === "pcm" ? "audio/pcm" : "audio/wav");
        res.setHeader("Transfer-Encoding", "chunked");
        res.setHeader("x-request-id", requestId);
        // The final size is unknown while streaming, so the WAV header carries
        // a maximal placeholder size. 36 is subtracted so the RIFF size field
        // (36 + dataSize) still fits in 32 bits.
        if (response_format !== "pcm") {
          res.write(wavHeader(0xFFFFFFFF - 36));
        }
        headersSent = true;
      }
      res.write(chunk);
    };

    ws.on("open", () => {
      ws.send(JSON.stringify({
        type: "Settings",
        audio: {
          input: { encoding: "linear16", sample_rate: 48000 },
          output: { encoding: "linear16", sample_rate: 24000, container: "none" },
        },
        agent: {
          language: "en",
          speak: voiceConfig,
          listen: { provider: { type: "deepgram", version: "v1", model: "nova-3" } },
          think: { provider: { type: "open_ai", model: "gpt-4o-mini" }, prompt: "TTS engine. Do not respond." },
          greeting: text,
        },
      }));
    });

    ws.on("message", (data, isBinary) => {
      if (settled) return;

      if (isBinary) {
        const chunk = Buffer.from(data);
        audioChunks.push(chunk);
        streamChunk(chunk);
        return;
      }

      let msg;
      try {
        msg = JSON.parse(data.toString());
      } catch (e) {
        // Control frames are expected to be JSON; log and skip anything else.
        console.warn(`[${requestId}] Ignoring non-JSON text frame: ${e.message}`);
        return;
      }

      if (msg.type === "AgentAudioDone") {
        const pcm = Buffer.concat(audioChunks);
        console.log(`[${requestId}] ✅ ${pcm.length}B ${audioChunks.length} chunks ${Date.now() - t0}ms`);
        finish(null, pcm);
      } else if (msg.type === "Error") {
        finish(new Error(msg.message || "Deepgram error"));
      }
    });

    ws.on("error", (e) => finish(e));

    ws.on("close", (code) => {
      // If the socket closes without AgentAudioDone, keep whatever audio was
      // received; only fail when nothing arrived at all.
      if (audioChunks.length > 0) finish(null, Buffer.concat(audioChunks));
      else finish(new Error(`WS closed (${code})`));
    });
  });
}

// ─── Models ──────────────────────────────────────────────────────────────────
const MODELS = { "gpt-4o-mini-tts": true, "tts-1": true, "tts-1-hd": true };

/** Own-key check so ids like "constructor" are not treated as known models. */
const isKnownModel = (id) => Object.prototype.hasOwnProperty.call(MODELS, id);

/** Build the OpenAI-style model descriptor for a model id. */
const describeModel = (id) => ({
  id,
  object: "model",
  created: 1700000000,
  owned_by: "system",
  permission: [],
  root: id,
  parent: null,
});

app.get("/v1/models", auth, (_req, res) => {
  res.json({ object: "list", data: Object.keys(MODELS).map(describeModel) });
});

app.get("/v1/models/:model", auth, (req, res) => {
  const { model } = req.params;
  if (!isKnownModel(model)) {
    return openaiError(res, 404, `Model '${model}' not found`, "invalid_request_error", "model");
  }
  res.json(describeModel(model));
});

// ─── POST /v1/audio/speech ───────────────────────────────────────────────────
// OpenAI-compatible request shape: accepts model, input, voice, response_format
// and speed (speed is accepted but not applied).
// Audio is streamed to the client with chunked transfer encoding as it arrives,
// so no Content-Length is sent. Only WAV (default) and raw PCM are produced;
// the other accepted response_format values are served as WAV.
app.post("/v1/audio/speech", auth, async (req, res) => {
  const rid = req.id;
  try {
    // `req.body` may be undefined when no JSON body was parsed.
    const {
      model = "gpt-4o-mini-tts",
      input,
      voice = "jessica",
      response_format = "wav",
    } = req.body ?? {};

    // Validate
    if (!input || typeof input !== "string") {
      return openaiError(res, 400, "Missing required parameter: 'input'", "invalid_request_error", "input");
    }
    if (input.length > MAX_INPUT_LENGTH) {
      return openaiError(res, 400, `Input too long (max ${MAX_INPUT_LENGTH} chars)`, "invalid_request_error", "input");
    }
    // Own-key check: a plain `MODELS[model]` lookup would accept inherited
    // names such as "constructor".
    if (typeof model !== "string" || !Object.prototype.hasOwnProperty.call(MODELS, model)) {
      return openaiError(res, 404, `Model '${model}' not found`, "invalid_request_error", "model");
    }
    if (typeof voice !== "string") {
      return openaiError(res, 400, "Invalid parameter: 'voice' must be a string", "invalid_request_error", "voice");
    }
    if (!DEEPGRAM_API_KEY) {
      return openaiError(res, 500, "DEEPGRAM_API_KEY not configured", "server_error");
    }

    const validFormats = ["mp3", "opus", "aac", "flac", "wav", "pcm"];
    if (!validFormats.includes(response_format)) {
      return openaiError(res, 400, `Invalid response_format '${response_format}'. Supported: ${validFormats.join(", ")}`, "invalid_request_error", "response_format");
    }

    console.log(`[${rid}] POST /v1/audio/speech model=${model} voice=${voice} fmt=${response_format} len=${input.length}`);

    // Synthesize, streaming audio straight into the response as it arrives.
    const pcmData = await synthesize(input, rid, voice, res, response_format);
    if (pcmData.length === 0 && !res.headersSent) {
      return openaiError(res, 500, "No audio data received", "server_error");
    }

    // Synthesis can resolve with partial audio when the upstream socket closes
    // early; make sure the client is never left waiting on an open stream.
    if (!res.writableEnded) res.end();

    // The audio has already been streamed to the client; rebuild a file with a
    // correct WAV header (real data size) for the on-disk cache.
    const isPcm = response_format === "pcm";
    const audioBuffer = isPcm ? pcmData : Buffer.concat([wavHeader(pcmData.length), pcmData]);
    const ext = isPcm ? "pcm" : "wav";
    const filename = `${Date.now()}_${crypto.randomBytes(4).toString("hex")}.${ext}`;

    // Async write so a large file does not block the event loop.
    await fs.promises.writeFile(path.join(CACHE_DIR, filename), audioBuffer);
    setImmediate(cleanupCache);
  } catch (err) {
    console.error(`[${rid}] ❌ ${err.message}`);
    if (!res.headersSent) {
      openaiError(res, 500, err.message, "server_error");
    } else if (!res.writableEnded) {
      // Streaming had started: a JSON error can no longer be sent, so abort
      // the connection instead of leaving the client hanging.
      res.destroy();
    }
  }
});

// ─── POST /v1/text:synthesize (Google Cloud TTS Compatible) ──────────────────
// Maps Google Cloud TTS requests to our internal Voice Registry

/**
 * Check the credentials of a Google-style request.
 * Google Cloud clients usually pass `?key=API_KEY`; a Bearer token is accepted
 * as an alternative. Non-string values (e.g. `?key[]=x`) are rejected, and the
 * comparison is constant-time.
 *
 * @param {import("express").Request} req
 * @returns {boolean} True when the request carries the proxy API key.
 */
function hasValidGoogleCredentials(req) {
  const queryKey = req.query.key;
  const header = req.headers.authorization;

  let provided = null;
  if (typeof queryKey === "string" && queryKey) {
    provided = queryKey;
  } else if (typeof header === "string") {
    // Auth scheme names are case-insensitive.
    const match = /^Bearer\s+(.+)$/i.exec(header);
    if (match) provided = match[1].trim();
  }
  if (!provided) return false;

  // Hash both sides so timingSafeEqual always receives equal-length buffers.
  const digest = (value) => crypto.createHash("sha256").update(value).digest();
  return crypto.timingSafeEqual(digest(provided), digest(API_KEY));
}

app.post("/v1/text:synthesize", async (req, res) => {
  if (!hasValidGoogleCredentials(req)) {
    return res.status(401).json({ error: { code: 401, message: "Request had invalid authentication credentials.", status: "UNAUTHENTICATED" } });
  }

  const rid = req.id || `gcp-${crypto.randomBytes(4).toString("hex")}`;

  try {
    const payload = req.body;
    const text = payload?.input?.text;
    const requestedVoice = payload?.voice?.name;
    const voiceName = typeof requestedVoice === "string" && requestedVoice ? requestedVoice : "jessica";
    const encodingRequested = payload?.audioConfig?.audioEncoding || "LINEAR16"; // MP3, LINEAR16, OGG_OPUS

    if (typeof text !== "string" || text.length === 0) {
      return res.status(400).json({ error: { code: 400, message: "Invalid JSON payload received. Missing input.text", status: "INVALID_ARGUMENT" } });
    }
    // Same input limit as the OpenAI-compatible endpoint.
    if (text.length > MAX_INPUT_LENGTH) {
      return res.status(400).json({ error: { code: 400, message: `input.text too long (max ${MAX_INPUT_LENGTH} chars)`, status: "INVALID_ARGUMENT" } });
    }
    if (!DEEPGRAM_API_KEY) {
      return res.status(500).json({ error: { code: 500, message: "DEEPGRAM_API_KEY not configured", status: "INTERNAL" } });
    }

    // Map Google encoding string to our internal formats.
    // NOTE(review): LINEAR16 is returned as headerless PCM here; Google's API
    // is documented to include a WAV header for LINEAR16 — confirm which one
    // clients of this proxy expect before changing.
    let internalFormat = "wav";
    if (encodingRequested === "LINEAR16") internalFormat = "pcm";
    else if (encodingRequested === "MP3") internalFormat = "mp3";

    console.log(`[${rid}] POST /v1/text:synthesize voice=${voiceName} enc=${encodingRequested} len=${text.length}`);

    // Google TTS does not stream over HTTP; it returns one base64 blob in
    // JSON, so the whole buffer is awaited instead of passing `res`.
    const pcmData = await synthesize(text, rid, voiceName, null, internalFormat);
    if (pcmData.length === 0) throw new Error("No audio data received");

    // Anything other than raw PCM (wav, and the not-really-supported mp3) is
    // returned as WAV.
    const audioBuffer = internalFormat === "pcm"
      ? pcmData
      : Buffer.concat([wavHeader(pcmData.length), pcmData]);

    // Return Base64 Encoded JSON matching Google Cloud TTS
    res.json({ audioContent: audioBuffer.toString("base64") });
  } catch (err) {
    console.error(`[${rid}] ❌ Google Cloud API Error: ${err.message}`);
    if (!res.headersSent) {
      res.status(500).json({ error: { code: 500, message: err.message, status: "INTERNAL" } });
    }
  }
});

// ─── Cached files ────────────────────────────────────────────────────────────

// List cached audio files, newest first.
app.get("/v1/audio/files", auth, (_req, res) => {
  try {
    const files = fs.readdirSync(CACHE_DIR)
      .map((name) => {
        const stats = fs.statSync(path.join(CACHE_DIR, name));
        return { name, size: stats.size, created: stats.mtimeMs };
      })
      .sort((a, b) => b.created - a.created);
    res.json({ files, count: files.length, max: MAX_CACHE_FILES });
  } catch (e) {
    openaiError(res, 500, e.message, "server_error");
  }
});

// Download a single cached file by name.
app.get("/v1/audio/files/:filename", auth, (req, res) => {
  const { filename } = req.params;

  // Route params are URL-decoded, so "..%2F..%2Fetc%2Fpasswd" would otherwise
  // escape the cache directory. Accept bare file names only.
  const isBareName = filename !== "." && filename !== ".." && path.basename(filename) === filename;
  if (!isBareName || !fs.existsSync(path.join(CACHE_DIR, filename))) {
    return openaiError(res, 404, "File not found");
  }

  res.setHeader("Content-Type", filename.endsWith(".wav") ? "audio/wav" : "audio/pcm");
  // `root` confines the lookup to the cache directory as a second line of defence.
  res.sendFile(filename, { root: CACHE_DIR }, (err) => {
    if (err && !res.headersSent) openaiError(res, 404, "File not found");
  });
});

// ─── Health ──────────────────────────────────────────────────────────────────
app.get("/health", (_req, res) => {
  const cacheFiles = fs.existsSync(CACHE_DIR) ? fs.readdirSync(CACHE_DIR).length : 0;
  res.json({
    status: "ok",
    version: "1.0.0",
    service: "openai-tts-proxy",
    deepgram: !!DEEPGRAM_API_KEY,
    providers: {
      eleven_labs: ["jessica", "daniel", "piper", "mark"],
      cartesia: ["kentucky_man", "helpful_woman"],
    },
    voices: ["jessica", "daniel", "kentucky_man", "helpful_woman", "piper", "mark"],
    models: Object.keys(MODELS),
    cache: { files: cacheFiles, max: MAX_CACHE_FILES },
  });
});

// ─── Root (Render health check) ──────────────────────────────────────────────
app.get("/", (_req, res) => res.json({ status: "ok", service: "openai-tts-proxy", docs: "POST /v1/audio/speech, GET /v1/models, GET /health" }));

// ─── Catch-all & error handler ───────────────────────────────────────────────
app.use((req, res) => openaiError(res, 404, `Unknown request URL: ${req.method} ${req.path}`));

// Express recognises error handlers by their four-argument signature.
app.use((err, _req, res, next) => {
  // Once a response has started, only Express's default handler can safely
  // tear the connection down.
  if (res.headersSent) return next(err);

  // Client errors raised by middleware (e.g. malformed or oversized JSON from
  // the body parser) carry a 4xx `status`; report them as such, not as 500.
  const status = err?.status;
  if (Number.isInteger(status) && status >= 400 && status < 500) {
    return openaiError(res, status, err.message || "Bad request");
  }

  console.error("Unhandled:", err);
  openaiError(res, 500, "Internal server error", "server_error");
});

// ─── Graceful Shutdown ───────────────────────────────────────────────────────
let server;

/**
 * Stop accepting connections and exit once in-flight requests have drained.
 * Forces a non-zero exit after 10 s if draining does not complete.
 *
 * @param {string} sig - Name of the signal that triggered the shutdown.
 */
function shutdown(sig) {
  console.log(`\n${sig} — shutting down...`);
  server?.close(() => process.exit(0));
  // unref() so this safety timer alone never keeps the process alive.
  setTimeout(() => process.exit(1), 10000).unref();
}
process.on("SIGTERM", () => shutdown("SIGTERM"));
process.on("SIGINT", () => shutdown("SIGINT"));

// ─── Start ───────────────────────────────────────────────────────────────────
server = app.listen(PORT, "0.0.0.0", () => {
  console.log(`\n🔊 HuggingFace TTS API (ElevenLabs & Cartesia) v1.0.0`);
  console.log(` http://localhost:${PORT}`);
  console.log(` POST /v1/audio/speech (OpenAI Compatible)`);
  console.log(` POST /v1/text:synthesize (Google TTS Compatible)`);
  console.log(` GET /v1/models`);
  console.log(` GET /v1/audio/files`);
  console.log(` GET /health`);
  console.log(` Backend: ElevenLabs (Jessica & Daniel) via Deepgram`);
  console.log(` Cache: max ${MAX_CACHE_FILES} files`);
  console.log(` Deepgram: ${DEEPGRAM_API_KEY ? "✅" : "❌ missing"}\n`);
});