Spaces:
Running
Running
| require("dotenv").config(); | |
| const express = require("express"); | |
| const WebSocket = require("ws"); | |
| const crypto = require("crypto"); | |
| const fs = require("fs"); | |
| const path = require("path"); | |
const app = express();

// ─── Config ──────────────────────────────────────────────────────────────────
/**
 * Read a non-negative integer setting from the environment.
 *
 * A malformed value would otherwise become NaN, and every `x > NaN` comparison
 * is false — silently disabling the input-length limit and cache eviction —
 * while a NaN timer delay fires almost immediately.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is unset, empty, or invalid.
 * @returns {number} The parsed integer, or `fallback`.
 */
function envInt(name, fallback) {
  const raw = process.env[name];
  if (raw === undefined || raw === "") return fallback;
  const parsed = Number.parseInt(raw, 10);
  if (Number.isNaN(parsed) || parsed < 0) {
    console.warn(`[Config] Ignoring invalid ${name}=${JSON.stringify(raw)}; using ${fallback}`);
    return fallback;
  }
  return parsed;
}

// NOTE(review): the fallback key is a well-known placeholder; set API_KEY for any real deployment.
const API_KEY = process.env.API_KEY || "sk-test-key";
const DEEPGRAM_API_KEY = process.env.DEEPGRAM_API_KEY || "";
const PORT = process.env.PORT || 7860;
const WS_TIMEOUT_MS = envInt("WS_TIMEOUT_MS", 60000);
const MAX_INPUT_LENGTH = envInt("MAX_INPUT_LENGTH", 4096);
const MAX_CACHE_FILES = envInt("MAX_CACHE_FILES", 10);
const CACHE_DIR = path.join("/tmp/", "cache");
if (!fs.existsSync(CACHE_DIR)) fs.mkdirSync(CACHE_DIR, { recursive: true });
| // βββ Middleware ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
// Parse JSON request bodies, capped at 1 MB.
app.use(express.json({ limit: "1mb" }));

// Attach permissive CORS headers to every response.
app.use(function applyCors(_req, res, next) {
  const corsHeaders = [
    ["Access-Control-Allow-Origin", "*"],
    ["Access-Control-Allow-Methods", "GET, POST, OPTIONS, DELETE"],
    ["Access-Control-Allow-Headers", "Content-Type, Authorization"],
  ];
  for (const [headerName, headerValue] of corsHeaders) {
    res.setHeader(headerName, headerValue);
  }
  next();
});

// Answer every preflight request with an empty 204.
app.options("*", function answerPreflight(_req, res) {
  return res.sendStatus(204);
});

// Tag each request with a random id that handlers use in their log lines.
app.use(function assignRequestId(req, _res, next) {
  const suffix = crypto.randomBytes(12).toString("hex");
  req.id = `chatcmpl-${suffix}`;
  next();
});
| // βββ OpenAI Error Format βββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
/**
 * Send a JSON error response shaped like `{ error: { message, type, param, code } }`.
 *
 * @param {object} res - Response object exposing chainable `status()` and `json()`.
 * @param {number} status - HTTP status code to send.
 * @param {string} message - Human-readable error message.
 * @param {string} [type="invalid_request_error"] - Error category.
 * @param {?string} [param=null] - Name of the offending request parameter, if any.
 */
function openaiError(res, status, message, type = "invalid_request_error", param = null) {
  // Only 401 and 429 carry a machine-readable code; everything else reports null.
  let code;
  switch (status) {
    case 401:
      code = "invalid_api_key";
      break;
    case 429:
      code = "rate_limit_exceeded";
      break;
    default:
      code = null;
  }

  const payload = { error: { message, type, param, code } };
  res.status(status).json(payload);
}
| // βββ Auth ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
/**
 * Express middleware that requires `Authorization: Bearer <API_KEY>`.
 *
 * Responds with a 401 error envelope when the header is missing, is not a
 * Bearer credential, or carries the wrong key; otherwise calls `next()`.
 *
 * @param {object} req - Incoming request; only `req.headers.authorization` is read.
 * @param {object} res - Response used to send the 401 error.
 * @param {Function} next - Continuation invoked when the key matches.
 */
function auth(req, res, next) {
  const header = req.headers.authorization;
  if (typeof header !== "string" || !header.startsWith("Bearer ")) {
    return openaiError(res, 401, "You didn't provide an API key. Provide your API key in an Authorization header using Bearer auth.");
  }

  // Compare fixed-length digests in constant time: a plain `!==` can leak how
  // much of the key matched, and timingSafeEqual throws on unequal lengths.
  const sha256 = (value) => crypto.createHash("sha256").update(value, "utf8").digest();
  const provided = sha256(header.slice(7).trim());
  const expected = sha256(API_KEY);
  if (!crypto.timingSafeEqual(provided, expected)) {
    return openaiError(res, 401, "Incorrect API key provided.");
  }
  next();
}
| // βββ WAV Header ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
/**
 * Build the 44-byte RIFF/WAVE header for uncompressed PCM audio.
 *
 * @param {number} dataSize - Size in bytes of the PCM payload that follows the header.
 * @param {number} [sampleRate=24000] - Samples per second.
 * @param {number} [bits=16] - Bits per sample.
 * @param {number} [ch=1] - Channel count.
 * @returns {Buffer} A new 44-byte buffer containing the header.
 */
function wavHeader(dataSize, sampleRate = 24000, bits = 16, ch = 1) {
  const bytesPerSample = bits / 8;
  const byteRate = sampleRate * ch * bytesPerSample;
  const blockAlign = ch * bytesPerSample;

  const header = Buffer.alloc(44);

  // RIFF chunk descriptor: chunk size is everything after these first 8 bytes.
  header.write("RIFF", 0);
  header.writeUInt32LE(36 + dataSize, 4);
  header.write("WAVE", 8);

  // "fmt " sub-chunk: 16-byte body, audio format 1.
  header.write("fmt ", 12);
  header.writeUInt32LE(16, 16);
  header.writeUInt16LE(1, 20);
  header.writeUInt16LE(ch, 22);
  header.writeUInt32LE(sampleRate, 24);
  header.writeUInt32LE(byteRate, 28);
  header.writeUInt16LE(blockAlign, 32);
  header.writeUInt16LE(bits, 34);

  // "data" sub-chunk header; the samples themselves follow the returned buffer.
  header.write("data", 36);
  header.writeUInt32LE(dataSize, 40);

  return header;
}
| // βββ Cache Cleanup βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
/**
 * Trim the cache directory to the MAX_CACHE_FILES most recently modified
 * entries, deleting the rest. Failures are logged and never thrown.
 */
function cleanupCache() {
  try {
    const entries = [];
    for (const name of fs.readdirSync(CACHE_DIR)) {
      const { mtimeMs } = fs.statSync(path.join(CACHE_DIR, name));
      entries.push({ name, time: mtimeMs });
    }
    // Newest first, so everything past the limit is the oldest material.
    entries.sort((left, right) => right.time - left.time);

    if (!(entries.length > MAX_CACHE_FILES)) return;

    const expired = entries.slice(MAX_CACHE_FILES);
    for (const { name } of expired) {
      fs.unlinkSync(path.join(CACHE_DIR, name));
      console.log(`[Cache] Deleted: ${name}`);
    }
  } catch (e) {
    console.error("[Cache] Cleanup error:", e.message);
  }
}
| // βββ Synthesize via Deepgram Agent WebSocket βββββββββββββββββββββββββββββββββ | |
// Voice registry: maps a public voice name to the `agent.speak` settings sent
// to Deepgram. Built once at load time rather than on every request.
const VOICE_REGISTRY = Object.freeze({
  // ── ElevenLabs ──────────────────────────────────────────────────
  jessica: {
    provider: { type: "eleven_labs", model_id: "eleven_multilingual_v2", voice_id: "cgSgspJ2msm6clMCkdW9" },
  },
  daniel: {
    provider: { type: "eleven_labs", model_id: "eleven_multilingual_v2", voice_id: "onwK4e9ZLuTAKqWW03F9" },
  },
  piper: {
    provider: { type: "eleven_labs", model_id: "eleven_multilingual_v2", voice_id: "DtsPFCrhbCbbJkwZsb3d" },
  },
  mark: {
    provider: { type: "eleven_labs", model_id: "eleven_multilingual_v2", voice_id: "UgBBYS2sOqTuMpoF3BR0" },
  },
  // ── Cartesia ────────────────────────────────────────────────────
  kentucky_man: {
    provider: { type: "cartesia", model_id: "sonic-2", voice: { mode: "id", id: "726d5ae5-055f-4c3d-8355-d9677de68937" } },
  },
  helpful_woman: {
    provider: { type: "cartesia", model_id: "sonic-2", voice: { mode: "id", id: "156fb8d2-335b-4950-9cb3-a2d33befec77" } },
  },
});

// Look up a voice by name (case-insensitive; spaces and hyphens become "_"),
// falling back to "jessica" for anything that is not a registered voice.
function resolveVoiceConfig(voiceName) {
  const key = String(voiceName ?? "").toLowerCase().replace(/[\s-]+/g, "_");
  // Own-property check: names like "constructor" must not resolve to inherited members.
  return Object.prototype.hasOwnProperty.call(VOICE_REGISTRY, key)
    ? VOICE_REGISTRY[key]
    : VOICE_REGISTRY.jessica;
}

/**
 * Synthesize speech by opening a Deepgram agent WebSocket, sending `text` as
 * the agent greeting, and collecting the binary audio frames that come back.
 *
 * When `res` is given, audio is also streamed to it as it arrives. This
 * function then owns the streamed body: it ends the response on success and
 * destroys it on failure, so a client is never left waiting on an open
 * response. If no audio was ever written, `res` is left untouched so the
 * caller can still send an error status.
 *
 * @param {string} text - Text to speak.
 * @param {string} requestId - Id used in log lines and the `x-request-id` header.
 * @param {string} [voiceName="jessica"] - Voice name; unknown names fall back to "jessica".
 * @param {?object} [res=null] - HTTP response to stream audio into, or null to only buffer.
 * @param {string} [response_format="wav"] - "pcm" streams headerless samples; anything else
 *   gets a WAV header written before the first chunk.
 * @returns {Promise<Buffer>} The concatenated audio bytes (without any WAV header).
 *   Rejects on timeout, on a Deepgram "Error" message, on a socket error, or when the
 *   socket closes before any audio arrived.
 */
function synthesize(text, requestId, voiceName = "jessica", res = null, response_format = "wav") {
  return new Promise((resolve, reject) => {
    // Resolve the voice before opening the socket so a bad value cannot leave
    // a half-configured connection behind.
    const voiceConfig = resolveVoiceConfig(voiceName);
    const audioChunks = [];
    let settled = false;
    let headersSent = false;
    const t0 = Date.now();

    const ws = new WebSocket("wss://agent.deepgram.com/v1/agent/converse", [
      "token", DEEPGRAM_API_KEY,
    ]);

    // Single exit point: settle the promise once, stop the timer, finish the
    // streamed response, and close the socket if it is still open.
    const finish = (err, pcm) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      if (res && headersSent && !res.writableEnded) {
        // On failure the body is cut off rather than ended cleanly, so the
        // client can tell the audio is incomplete.
        if (err) res.destroy();
        else res.end();
      }
      if (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING) ws.close();
      if (err) reject(err);
      else resolve(pcm);
    };

    const timer = setTimeout(() => finish(new Error("TTS timed out")), WS_TIMEOUT_MS);

    ws.on("open", () => {
      ws.send(JSON.stringify({
        type: "Settings",
        audio: {
          input: { encoding: "linear16", sample_rate: 48000 },
          output: { encoding: "linear16", sample_rate: 24000, container: "none" },
        },
        agent: {
          language: "en",
          speak: voiceConfig,
          listen: { provider: { type: "deepgram", version: "v1", model: "nova-3" } },
          think: { provider: { type: "open_ai", model: "gpt-4o-mini" }, prompt: "TTS engine. Do not respond." },
          greeting: text,
        },
      }));
    });

    ws.on("message", (data, isBinary) => {
      if (settled) return;

      if (isBinary) {
        const chunk = Buffer.from(data);
        audioChunks.push(chunk);
        if (res) {
          if (!headersSent) {
            res.setHeader("Content-Type", response_format === "pcm" ? "audio/pcm" : "audio/wav");
            res.setHeader("Transfer-Encoding", "chunked");
            res.setHeader("x-request-id", requestId);
            // The final size is unknown while streaming, so the WAV header
            // advertises the largest data size that keeps the RIFF size field
            // (36 + dataSize) within a UInt32.
            if (response_format !== "pcm") {
              res.write(wavHeader(0xFFFFFFFF - 36));
            }
            headersSent = true;
          }
          res.write(chunk);
        }
        return;
      }

      // Text frames are control messages; frames that are not JSON are ignored.
      let msg;
      try {
        msg = JSON.parse(data.toString());
      } catch {
        return;
      }

      if (msg?.type === "AgentAudioDone") {
        const pcm = Buffer.concat(audioChunks);
        console.log(`[${requestId}] β ${pcm.length}B ${audioChunks.length} chunks ${Date.now() - t0}ms`);
        finish(null, pcm);
      } else if (msg?.type === "Error") {
        // NOTE(review): some Agent API versions appear to put the text in
        // `description` rather than `message` — confirm against Deepgram docs.
        finish(new Error(msg.message || msg.description || "Deepgram error"));
      }
    });

    ws.on("error", (e) => finish(e));

    ws.on("close", (code) => {
      // A close before "AgentAudioDone" still yields whatever audio arrived.
      if (audioChunks.length > 0) finish(null, Buffer.concat(audioChunks));
      else finish(new Error(`WS closed (${code})`));
    });
  });
}
| // βββ Models ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
// Model ids this proxy accepts; the keys are the ids and every value is `true`.
const MODELS = { "gpt-4o-mini-tts": true, "tts-1": true, "tts-1-hd": true };

// GET /v1/models — list every accepted model id.
app.get("/v1/models", auth, (_req, res) => {
  const data = [];
  for (const id of Object.keys(MODELS)) {
    data.push({
      id,
      object: "model",
      created: 1700000000,
      owned_by: "system",
      permission: [],
      root: id,
      parent: null,
    });
  }
  res.json({ object: "list", data });
});
// GET /v1/models/:model — describe one model, or 404 if it is not in MODELS.
app.get("/v1/models/:model", auth, (req, res) => {
  const { model } = req.params;
  // Own-property check: a plain `MODELS[model]` is truthy for inherited names
  // such as "constructor" or "toString", which would then be reported as models.
  if (!Object.prototype.hasOwnProperty.call(MODELS, model)) {
    return openaiError(res, 404, `Model '${model}' not found`, "invalid_request_error", "model");
  }
  res.json({ id: model, object: "model", created: 1700000000, owned_by: "system", permission: [], root: model, parent: null });
});
| // βββ POST /v1/audio/speech βββββββββββββββββββββββββββββββββββββββββββββββββββ | |
// OpenAI-style request body: model, input, voice, response_format, speed.
// Audio is streamed to the client with chunked transfer encoding as it arrives
// from synthesize(); no Content-Length is sent. `speed` is accepted but ignored.
// Only two encodings are produced: "pcm" yields headerless samples, and every
// other accepted response_format is served as audio/wav.
app.post("/v1/audio/speech", auth, async (req, res) => {
  const rid = req.id;
  try {
    const {
      model = "gpt-4o-mini-tts",
      input,
      voice = "jessica",
      response_format = "wav",
    } = req.body ?? {};

    // Validate
    if (!input || typeof input !== "string") return openaiError(res, 400, "Missing required parameter: 'input'", "invalid_request_error", "input");
    if (input.length > MAX_INPUT_LENGTH) return openaiError(res, 400, `Input too long (max ${MAX_INPUT_LENGTH} chars)`, "invalid_request_error", "input");
    if (typeof voice !== "string") return openaiError(res, 400, "Invalid 'voice': expected a string", "invalid_request_error", "voice");
    // Own-property check so inherited names ("constructor", …) are not accepted as models.
    if (!Object.prototype.hasOwnProperty.call(MODELS, model)) return openaiError(res, 404, `Model '${model}' not found`, "invalid_request_error", "model");
    if (!DEEPGRAM_API_KEY) return openaiError(res, 500, "DEEPGRAM_API_KEY not configured", "server_error");
    const validFormats = ["mp3", "opus", "aac", "flac", "wav", "pcm"];
    if (!validFormats.includes(response_format)) {
      return openaiError(res, 400, `Invalid response_format '${response_format}'. Supported: ${validFormats.join(", ")}`, "invalid_request_error", "response_format");
    }

    console.log(`[${rid}] POST /v1/audio/speech model=${model} voice=${voice} fmt=${response_format} len=${input.length}`);

    // synthesize() writes the audio to `res` while it buffers it.
    const pcmData = await synthesize(input, rid, voice, res, response_format);

    if (pcmData.length === 0) {
      if (!res.headersSent) return openaiError(res, 500, "No audio data received", "server_error");
      if (!res.writableEnded) res.end();
      return; // nothing worth caching
    }

    // Make sure the streamed body is closed even if synthesize() resolved
    // without ending it (e.g. the upstream socket closed early).
    if (!res.writableEnded) res.end();

    // The client already has the bytes; rebuild a file for the cache. For WAV
    // the header is regenerated with the real payload length.
    const isPcm = response_format === "pcm";
    const audioBuffer = isPcm ? pcmData : Buffer.concat([wavHeader(pcmData.length), pcmData]);
    const ext = isPcm ? "pcm" : "wav";
    const filename = `${Date.now()}_${crypto.randomBytes(4).toString("hex")}.${ext}`;
    // Asynchronous write so a large file does not block other requests.
    await fs.promises.writeFile(path.join(CACHE_DIR, filename), audioBuffer);
    setImmediate(cleanupCache);
  } catch (err) {
    console.error(`[${rid}] β ${err.message}`);
    if (!res.headersSent) {
      openaiError(res, 500, err.message, "server_error");
    } else if (!res.writableEnded && !res.destroyed) {
      // Part of the body is already on the wire; abort instead of leaving the client hanging.
      res.destroy();
    }
  }
});
| // βββ POST /v1/text:synthesize (Google Cloud TTS Compatible) ββββββββββββββββββ | |
| // Maps Google Cloud TTS requests to our internal Voice Registry | |
// NOTE(review): Express may parse ":synthesize" in this path as a route
// parameter, which would make the route match more than the literal URL —
// confirm against the installed Express version.
app.post("/v1/text:synthesize", async (req, res) => {
  // Google clients usually send ?key=API_KEY; a Bearer token is accepted too.
  const authHeader = req.headers.authorization;
  const bearerKey = typeof authHeader === "string" && authHeader.startsWith("Bearer ")
    ? authHeader.slice(7).trim()
    : null;
  const queryKey = typeof req.query.key === "string" && req.query.key !== "" ? req.query.key : null;
  const providedKey = queryKey ?? bearerKey;

  // Constant-time comparison over fixed-length digests (timingSafeEqual
  // requires equal lengths).
  const sha256 = (value) => crypto.createHash("sha256").update(value, "utf8").digest();
  if (!providedKey || !crypto.timingSafeEqual(sha256(providedKey), sha256(API_KEY))) {
    return res.status(401).json({
      error: { code: 401, message: "Request had invalid authentication credentials.", status: "UNAUTHENTICATED" }
    });
  }

  const rid = req.id || `gcp-${crypto.randomBytes(4).toString("hex")}`;
  try {
    const payload = req.body;
    const text = payload?.input?.text;
    const requestedVoice = payload?.voice?.name;
    const voiceName = typeof requestedVoice === "string" && requestedVoice !== "" ? requestedVoice : "jessica";
    const encodingRequested = payload?.audioConfig?.audioEncoding || "LINEAR16"; // MP3, LINEAR16, OGG_OPUS, PCM

    if (typeof text !== "string" || text.length === 0) {
      return res.status(400).json({ error: { code: 400, message: "Invalid JSON payload received. Missing input.text", status: "INVALID_ARGUMENT" }});
    }
    // Same length limit as the OpenAI-compatible route.
    if (text.length > MAX_INPUT_LENGTH) {
      return res.status(400).json({ error: { code: 400, message: `Input too long (max ${MAX_INPUT_LENGTH} chars)`, status: "INVALID_ARGUMENT" }});
    }
    if (!DEEPGRAM_API_KEY) {
      return res.status(500).json({ error: { code: 500, message: "DEEPGRAM_API_KEY not configured", status: "INTERNAL" }});
    }

    // Google's LINEAR16 output carries a WAV header; only "PCM" is headerless.
    // Compressed encodings (MP3, OGG_OPUS, …) are not produced here and are
    // answered with WAV as well.
    const internalFormat = encodingRequested === "PCM" ? "pcm" : "wav";

    console.log(`[${rid}] POST /v1/text:synthesize voice=${voiceName} enc=${encodingRequested} len=${text.length}`);

    // The Google-style response is one base64 blob, so buffer the whole
    // result instead of streaming into `res`.
    const pcmData = await synthesize(text, rid, voiceName, null, internalFormat);
    if (pcmData.length === 0) throw new Error("No audio data received");

    const audioBuffer = internalFormat === "pcm"
      ? pcmData
      : Buffer.concat([wavHeader(pcmData.length), pcmData]);

    res.json({
      audioContent: audioBuffer.toString("base64")
    });
  } catch (err) {
    console.error(`[${rid}] β Google Cloud API Error: ${err.message}`);
    if (!res.headersSent) {
      res.status(500).json({
        error: { code: 500, message: err.message, status: "INTERNAL" }
      });
    }
  }
});
| // βββ Cached files ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
// GET /v1/audio/files — list cached audio files, most recently modified first.
app.get("/v1/audio/files", auth, (_req, res) => {
  try {
    const listing = [];
    for (const name of fs.readdirSync(CACHE_DIR)) {
      const info = fs.statSync(path.join(CACHE_DIR, name));
      listing.push({ name, size: info.size, created: info.mtimeMs });
    }
    listing.sort((left, right) => right.created - left.created);
    res.json({ files: listing, count: listing.length, max: MAX_CACHE_FILES });
  } catch (e) {
    openaiError(res, 500, e.message, "server_error");
  }
});
// GET /v1/audio/files/:filename — download one cached audio file.
app.get("/v1/audio/files/:filename", auth, (req, res) => {
  const { filename } = req.params;

  // The parameter is URL-decoded, so an encoded "..%2F" could otherwise walk
  // out of CACHE_DIR. Accept only a bare file name.
  const isBareName = path.basename(filename) === filename && filename !== "." && filename !== "..";
  if (!isBareName) return openaiError(res, 404, "File not found");

  const fp = path.join(CACHE_DIR, filename);
  let isFile = false;
  try {
    isFile = fs.statSync(fp).isFile();
  } catch {
    // Missing or unreadable entries are reported as not found.
  }
  if (!isFile) return openaiError(res, 404, "File not found");

  res.setHeader("Content-Type", filename.endsWith(".wav") ? "audio/wav" : "audio/pcm");
  res.sendFile(fp);
});
| // βββ Health ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
// GET /health — unauthenticated status summary: configuration flags, the
// advertised voices and models, and the current cache occupancy.
app.get("/health", (_req, res) => {
  let cacheFiles = 0;
  if (fs.existsSync(CACHE_DIR)) {
    cacheFiles = fs.readdirSync(CACHE_DIR).length;
  }

  const providers = {
    eleven_labs: ["jessica", "daniel", "piper", "mark"],
    cartesia: ["kentucky_man", "helpful_woman"],
  };
  const voices = ["jessica", "daniel", "kentucky_man", "helpful_woman", "piper", "mark"];
  const cache = { files: cacheFiles, max: MAX_CACHE_FILES };

  res.json({
    status: "ok",
    version: "1.0.0",
    service: "openai-tts-proxy",
    deepgram: Boolean(DEEPGRAM_API_KEY),
    providers,
    voices,
    models: Object.keys(MODELS),
    cache,
  });
});
| // βββ Root (Render health check) ββββββββββββββββββββββββββββββββββββββββββββββ | |
// Root route: a small JSON status document.
app.get("/", (_req, res) => res.json({ status: "ok", service: "openai-tts-proxy", docs: "POST /v1/audio/speech, GET /v1/models, GET /health" }));

// ─── Catch-all & error handler ───────────────────────────────────────────────
// Anything that matched no route above is a 404.
app.use((req, res) => openaiError(res, 404, `Unknown request URL: ${req.method} ${req.path}`));

// Final error handler (Express recognises it by its four parameters).
app.use((err, _req, res, next) => {
  // Once a response has started, no new status or body can be written; hand
  // the error to Express's default handler, which closes the connection.
  if (res.headersSent) return next(err);

  // Errors carrying a 4xx status (e.g. a malformed or oversized JSON body
  // rejected by the body parser) are the client's fault, not a server failure.
  const status = err?.status ?? err?.statusCode;
  if (Number.isInteger(status) && status >= 400 && status < 500) {
    return openaiError(res, status, err.expose && err.message ? err.message : "Bad request");
  }

  console.error("Unhandled:", err);
  openaiError(res, 500, "Internal server error", "server_error");
});
| // βββ Graceful Shutdown βββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
// Assigned once the HTTP server starts listening.
let server;

/**
 * Stop accepting connections and exit: code 0 once the server has closed,
 * or code 1 if that has not happened within 10 seconds.
 *
 * @param {string} sig - Name of the signal that triggered the shutdown (for the log line).
 */
function shutdown(sig) {
  console.log(`\n${sig} β shutting down...`);
  if (server != null) {
    server.close(() => process.exit(0));
  }
  setTimeout(() => process.exit(1), 10000);
}

for (const signalName of ["SIGTERM", "SIGINT"]) {
  process.on(signalName, () => shutdown(signalName));
}
| // βββ Start βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
// Start listening on all interfaces and print a startup banner once bound.
server = app.listen(PORT, "0.0.0.0", () => {
  const bannerLines = [
    `\nπ HuggingFace TTS API (ElevenLabs & Cartesia) v1.0.0`,
    ` http://localhost:${PORT}`,
    ` POST /v1/audio/speech (OpenAI Compatible)`,
    ` POST /v1/text:synthesize (Google TTS Compatible)`,
    ` GET /v1/models`,
    ` GET /v1/audio/files`,
    ` GET /health`,
    ` Backend: ElevenLabs (Jessica & Daniel) via Deepgram`,
    ` Cache: max ${MAX_CACHE_FILES} files`,
    ` Deepgram: ${DEEPGRAM_API_KEY ? "β " : "β missing"}\n`,
  ];
  for (const line of bannerLines) {
    console.log(line);
  }
});