FeatherLabs-tts / server.js
owlninjam's picture
Upload server.js
bcdf6e1 verified
require("dotenv").config();
const express = require("express");
const WebSocket = require("ws");
const crypto = require("crypto");
const fs = require("fs");
const path = require("path");
const app = express();
// ─── Config ──────────────────────────────────────────────────────────────────
const API_KEY = process.env.API_KEY || "sk-test-key";
const DEEPGRAM_API_KEY = process.env.DEEPGRAM_API_KEY || "";
const PORT = process.env.PORT || 7860;
const WS_TIMEOUT_MS = parseInt(process.env.WS_TIMEOUT_MS || "60000", 10);
const MAX_INPUT_LENGTH = parseInt(process.env.MAX_INPUT_LENGTH || "4096", 10);
const MAX_CACHE_FILES = parseInt(process.env.MAX_CACHE_FILES || "10", 10);
const CACHE_DIR = path.join("/tmp/", "cache");
if (!fs.existsSync(CACHE_DIR)) fs.mkdirSync(CACHE_DIR, { recursive: true });
// ─── Middleware ──────────────────────────────────────────────────────────────
app.use(express.json({ limit: "1mb" }));
app.use((_req, res, next) => {
res.setHeader("Access-Control-Allow-Origin", "*");
res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS, DELETE");
res.setHeader("Access-Control-Allow-Headers", "Content-Type, Authorization");
next();
});
app.options("*", (_req, res) => res.sendStatus(204));
app.use((req, _res, next) => { req.id = `chatcmpl-${crypto.randomBytes(12).toString("hex")}`; next(); });
// ─── OpenAI Error Format ─────────────────────────────────────────────────────
function openaiError(res, status, message, type = "invalid_request_error", param = null) {
res.status(status).json({
error: {
message,
type,
param,
code: status === 401 ? "invalid_api_key" : status === 429 ? "rate_limit_exceeded" : null,
},
});
}
// ─── Auth ────────────────────────────────────────────────────────────────────
function auth(req, res, next) {
const header = req.headers.authorization;
if (!header || !header.startsWith("Bearer ")) {
return openaiError(res, 401, "You didn't provide an API key. Provide your API key in an Authorization header using Bearer auth.");
}
if (header.slice(7).trim() !== API_KEY) {
return openaiError(res, 401, "Incorrect API key provided.");
}
next();
}
// ─── WAV Header ──────────────────────────────────────────────────────────────
function wavHeader(dataSize, sampleRate = 24000, bits = 16, ch = 1) {
const h = Buffer.alloc(44);
h.write("RIFF", 0);
h.writeUInt32LE(36 + dataSize, 4);
h.write("WAVE", 8);
h.write("fmt ", 12);
h.writeUInt32LE(16, 16);
h.writeUInt16LE(1, 20);
h.writeUInt16LE(ch, 22);
h.writeUInt32LE(sampleRate, 24);
h.writeUInt32LE(sampleRate * ch * (bits / 8), 28);
h.writeUInt16LE(ch * (bits / 8), 32);
h.writeUInt16LE(bits, 34);
h.write("data", 36);
h.writeUInt32LE(dataSize, 40);
return h;
}
// ─── Cache Cleanup ───────────────────────────────────────────────────────────
function cleanupCache() {
try {
const files = fs.readdirSync(CACHE_DIR)
.map((f) => ({ name: f, time: fs.statSync(path.join(CACHE_DIR, f)).mtimeMs }))
.sort((a, b) => b.time - a.time);
if (files.length > MAX_CACHE_FILES) {
for (const f of files.slice(MAX_CACHE_FILES)) {
fs.unlinkSync(path.join(CACHE_DIR, f.name));
console.log(`[Cache] Deleted: ${f.name}`);
}
}
} catch (e) {
console.error("[Cache] Cleanup error:", e.message);
}
}
// ─── Synthesize via Deepgram Agent WebSocket ─────────────────────────────────
function synthesize(text, requestId, voiceName = "jessica", res = null, response_format = "wav") {
return new Promise((resolve, reject) => {
const audioChunks = [];
let settled = false;
let headersSent = false;
const t0 = Date.now();
const ws = new WebSocket("wss://agent.deepgram.com/v1/agent/converse", [
"token", DEEPGRAM_API_KEY,
]);
const timer = setTimeout(() => {
if (!settled) { settled = true; ws.close(); reject(new Error("TTS timed out")); }
}, WS_TIMEOUT_MS);
// Voice registry β€” supports multiple TTS providers
const voiceRegistry = {
// ── ElevenLabs ──────────────────────────────────────────────────
"jessica": {
provider: {
type: "eleven_labs",
model_id: "eleven_multilingual_v2",
voice_id: "cgSgspJ2msm6clMCkdW9",
}
},
"daniel": {
provider: {
type: "eleven_labs",
model_id: "eleven_multilingual_v2",
voice_id: "onwK4e9ZLuTAKqWW03F9",
}
},
"piper": {
provider: {
type: "eleven_labs",
model_id: "eleven_multilingual_v2",
voice_id: "DtsPFCrhbCbbJkwZsb3d",
}
},
"mark": {
provider: {
type: "eleven_labs",
model_id: "eleven_multilingual_v2",
voice_id: "UgBBYS2sOqTuMpoF3BR0",
}
},
// ── Cartesia ────────────────────────────────────────────────────
"kentucky_man": {
provider: {
type: "cartesia",
model_id: "sonic-2",
voice: { mode: "id", id: "726d5ae5-055f-4c3d-8355-d9677de68937" },
}
},
"helpful_woman": {
provider: {
type: "cartesia",
model_id: "sonic-2",
voice: { mode: "id", id: "156fb8d2-335b-4950-9cb3-a2d33befec77" },
}
},
};
const voiceKey = voiceName.toLowerCase().replace(/[\s-]+/g, "_");
const voiceConfig = voiceRegistry[voiceKey] || voiceRegistry["jessica"];
ws.on("open", () => {
ws.send(JSON.stringify({
type: "Settings",
audio: {
input: { encoding: "linear16", sample_rate: 48000 },
output: { encoding: "linear16", sample_rate: 24000, container: "none" },
},
agent: {
language: "en",
speak: voiceConfig,
listen: { provider: { type: "deepgram", version: "v1", model: "nova-3" } },
think: { provider: { type: "open_ai", model: "gpt-4o-mini" }, prompt: "TTS engine. Do not respond." },
greeting: text,
},
}));
});
ws.on("message", (data, isBinary) => {
if (settled) return;
if (isBinary) {
const chunk = Buffer.from(data);
audioChunks.push(chunk);
if (res) {
if (!headersSent) {
res.setHeader("Content-Type", response_format === "pcm" ? "audio/pcm" : "audio/wav");
res.setHeader("Transfer-Encoding", "chunked");
res.setHeader("x-request-id", requestId);
// Send a dummy WAV header of maximum length if needed, or omit it.
// Because we stream real-time, we cannot know the final dataSize for WAV.
// However, chunked transfer audio playback usually tolerates "0xFFFFFFFF" data block size.
// We subtract 36 to prevent UInt32 overflow inside the wavHeader function.
if (response_format !== "pcm") {
res.write(wavHeader(0xFFFFFFFF - 36));
}
headersSent = true;
}
res.write(chunk);
}
return;
}
try {
const msg = JSON.parse(data.toString());
if (msg.type === "AgentAudioDone") {
settled = true; clearTimeout(timer);
const pcm = Buffer.concat(audioChunks);
console.log(`[${requestId}] βœ… ${pcm.length}B ${audioChunks.length} chunks ${Date.now() - t0}ms`);
if (res) res.end();
ws.close(); resolve(pcm);
} else if (msg.type === "Error") {
settled = true; clearTimeout(timer); ws.close();
reject(new Error(msg.message || "Deepgram error"));
}
} catch { }
});
ws.on("error", (e) => { if (!settled) { settled = true; clearTimeout(timer); reject(e); } });
ws.on("close", (code) => {
clearTimeout(timer);
if (!settled) {
settled = true;
audioChunks.length > 0 ? resolve(Buffer.concat(audioChunks)) : reject(new Error(`WS closed (${code})`));
}
});
});
}
// ─── Models ──────────────────────────────────────────────────────────────────
const MODELS = { "gpt-4o-mini-tts": true, "tts-1": true, "tts-1-hd": true };
app.get("/v1/models", auth, (_req, res) => {
res.json({
object: "list",
data: Object.keys(MODELS).map((id) => ({
id, object: "model", created: 1700000000, owned_by: "system",
permission: [], root: id, parent: null,
})),
});
});
app.get("/v1/models/:model", auth, (req, res) => {
if (!MODELS[req.params.model]) return openaiError(res, 404, `Model '${req.params.model}' not found`, "invalid_request_error", "model");
res.json({ id: req.params.model, object: "model", created: 1700000000, owned_by: "system", permission: [], root: req.params.model, parent: null });
});
// ─── POST /v1/audio/speech ───────────────────────────────────────────────────
// 100% OpenAI compatible: accepts model, input, voice, response_format, speed
// Always buffers complete audio and returns with Content-Length (like OpenAI)
// Clients can use with_streaming_response to read progressively on their end
app.post("/v1/audio/speech", auth, async (req, res) => {
const rid = req.id;
try {
const {
model = "gpt-4o-mini-tts",
input,
voice = "jessica",
response_format = "wav",
speed,
} = req.body;
// Validate
if (!input || typeof input !== "string") return openaiError(res, 400, "Missing required parameter: 'input'", "invalid_request_error", "input");
if (input.length > MAX_INPUT_LENGTH) return openaiError(res, 400, `Input too long (max ${MAX_INPUT_LENGTH} chars)`, "invalid_request_error", "input");
if (!MODELS[model]) return openaiError(res, 404, `Model '${model}' not found`, "invalid_request_error", "model");
if (!DEEPGRAM_API_KEY) return openaiError(res, 500, "DEEPGRAM_API_KEY not configured", "server_error");
const validFormats = ["mp3", "opus", "aac", "flac", "wav", "pcm"];
if (!validFormats.includes(response_format)) {
return openaiError(res, 400, `Invalid response_format '${response_format}'. Supported: ${validFormats.join(", ")}`, "invalid_request_error", "response_format");
}
console.log(`[${rid}] POST /v1/audio/speech model=${model} voice=${voice} fmt=${response_format} len=${input.length}`);
// Synthesize via streaming to response directly
const pcmData = await synthesize(input, rid, voice, res, response_format);
if (pcmData.length === 0 && !res.headersSent) return openaiError(res, 500, "No audio data received", "server_error");
// The actual binary data was already streamed to 'res'.
// However, we still need to build the audio file for the cache system.
let audioBuffer;
if (response_format === "pcm") {
audioBuffer = pcmData;
} else {
// Reconstruct the proper valid WAV block locally with correct final content length to save to disk.
audioBuffer = Buffer.concat([wavHeader(pcmData.length), pcmData]);
}
// Save to cache & cleanup
const ext = response_format === "pcm" ? "pcm" : "wav";
const filename = `${Date.now()}_${crypto.randomBytes(4).toString("hex")}.${ext}`;
fs.writeFileSync(path.join(CACHE_DIR, filename), audioBuffer);
setImmediate(cleanupCache);
} catch (err) {
console.error(`[${rid}] ❌ ${err.message}`);
if (!res.headersSent) openaiError(res, 500, err.message, "server_error");
}
});
// ─── POST /v1/text:synthesize (Google Cloud TTS Compatible) ──────────────────
// Maps Google Cloud TTS requests to our internal Voice Registry
app.post("/v1/text:synthesize", async (req, res) => {
// Note: Google Cloud typically uses query parameters like ?key=API_KEY.
// For this proxy, we check the query string OR the Bearer token.
const providedKey = req.query.key || (req.headers.authorization ? req.headers.authorization.slice(7).trim() : null);
if (!providedKey || providedKey !== API_KEY) {
return res.status(401).json({
error: { code: 401, message: "Request had invalid authentication credentials.", status: "UNAUTHENTICATED" }
});
}
const rid = req.id || `gcp-${crypto.randomBytes(4).toString("hex")}`;
try {
const payload = req.body;
const text = payload?.input?.text;
const voiceName = payload?.voice?.name || "jessica";
let encodingRequested = payload?.audioConfig?.audioEncoding || "LINEAR16"; // MP3, LINEAR16, OGG_OPUS
if (!text) {
return res.status(400).json({ error: { code: 400, message: "Invalid JSON payload received. Missing input.text", status: "INVALID_ARGUMENT" }});
}
if (!DEEPGRAM_API_KEY) {
return res.status(500).json({ error: { code: 500, message: "DEEPGRAM_API_KEY not configured", status: "INTERNAL" }});
}
// Map Google encoding string to our internal formats
let internalFormat = "wav";
if (encodingRequested === "LINEAR16") internalFormat = "pcm";
if (encodingRequested === "MP3") internalFormat = "mp3";
console.log(`[${rid}] POST /v1/text:synthesize voice=${voiceName} enc=${encodingRequested} len=${text.length}`);
// Google TTS does not stream via HTTP, it returns a massive base64 blob in JSON.
// So we will await the entire buffer instead of passing res.
const pcmData = await synthesize(text, rid, voiceName, null, internalFormat);
if (pcmData.length === 0) throw new Error("No audio data received");
let audioBuffer;
if (internalFormat === "pcm") {
audioBuffer = pcmData;
} else {
// For wav, mp3 (fake), opus - return WAV
audioBuffer = Buffer.concat([wavHeader(pcmData.length), pcmData]);
}
// Return Base64 Encoded JSON matching Google Cloud TTS
const base64Audio = audioBuffer.toString("base64");
res.json({
audioContent: base64Audio
});
} catch (err) {
console.error(`[${rid}] ❌ Google Cloud API Error: ${err.message}`);
res.status(500).json({
error: { code: 500, message: err.message, status: "INTERNAL" }
});
}
});
// ─── Cached files ────────────────────────────────────────────────────────────
app.get("/v1/audio/files", auth, (_req, res) => {
try {
const files = fs.readdirSync(CACHE_DIR)
.map((f) => { const s = fs.statSync(path.join(CACHE_DIR, f)); return { name: f, size: s.size, created: s.mtimeMs }; })
.sort((a, b) => b.created - a.created);
res.json({ files, count: files.length, max: MAX_CACHE_FILES });
} catch (e) { openaiError(res, 500, e.message, "server_error"); }
});
app.get("/v1/audio/files/:filename", auth, (req, res) => {
const fp = path.join(CACHE_DIR, req.params.filename);
if (!fs.existsSync(fp)) return openaiError(res, 404, "File not found");
res.setHeader("Content-Type", req.params.filename.endsWith(".wav") ? "audio/wav" : "audio/pcm");
res.sendFile(fp);
});
// ─── Health ──────────────────────────────────────────────────────────────────
app.get("/health", (_req, res) => {
const cacheFiles = fs.existsSync(CACHE_DIR) ? fs.readdirSync(CACHE_DIR).length : 0;
res.json({
status: "ok",
version: "1.0.0",
service: "openai-tts-proxy",
deepgram: !!DEEPGRAM_API_KEY,
providers: {
eleven_labs: ["jessica", "daniel", "piper", "mark"],
cartesia: ["kentucky_man", "helpful_woman"],
},
voices: ["jessica", "daniel", "kentucky_man", "helpful_woman", "piper", "mark"],
models: Object.keys(MODELS),
cache: { files: cacheFiles, max: MAX_CACHE_FILES },
});
});
// ─── Root (Render health check) ──────────────────────────────────────────────
app.get("/", (_req, res) => res.json({ status: "ok", service: "openai-tts-proxy", docs: "POST /v1/audio/speech, GET /v1/models, GET /health" }));
// ─── Catch-all & error handler ───────────────────────────────────────────────
app.use((req, res) => openaiError(res, 404, `Unknown request URL: ${req.method} ${req.path}`));
app.use((err, _req, res, _next) => { console.error("Unhandled:", err); openaiError(res, 500, "Internal server error", "server_error"); });
// ─── Graceful Shutdown ───────────────────────────────────────────────────────
let server;
function shutdown(sig) {
console.log(`\n${sig} β€” shutting down...`);
server?.close(() => process.exit(0));
setTimeout(() => process.exit(1), 10000);
}
process.on("SIGTERM", () => shutdown("SIGTERM"));
process.on("SIGINT", () => shutdown("SIGINT"));
// ─── Start ───────────────────────────────────────────────────────────────────
server = app.listen(PORT, "0.0.0.0", () => {
console.log(`\nπŸ”Š HuggingFace TTS API (ElevenLabs & Cartesia) v1.0.0`);
console.log(` http://localhost:${PORT}`);
console.log(` POST /v1/audio/speech (OpenAI Compatible)`);
console.log(` POST /v1/text:synthesize (Google TTS Compatible)`);
console.log(` GET /v1/models`);
console.log(` GET /v1/audio/files`);
console.log(` GET /health`);
console.log(` Backend: ElevenLabs (Jessica & Daniel) via Deepgram`);
console.log(` Cache: max ${MAX_CACHE_FILES} files`);
console.log(` Deepgram: ${DEEPGRAM_API_KEY ? "βœ…" : "❌ missing"}\n`);
});