Spaces:

owlninjam
/

FeatherLabs-tts

Sleeping

File size: 21,107 Bytes

require("dotenv").config();
const express = require("express");
const WebSocket = require("ws");
const crypto = require("crypto");
const fs = require("fs");
const path = require("path");
const app = express();

// ─── Config ──────────────────────────────────────────────────────────────────
/**
 * Read a non-negative integer setting from the environment.
 *
 * A malformed value (non-numeric, negative) would otherwise become NaN or a
 * nonsensical limit: NaN disables the `>` comparisons guarding input length
 * and cache size, and makes the socket timeout fire almost immediately.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is unset or invalid.
 * @returns {number} The parsed integer, or `fallback`.
 */
function envInt(name, fallback) {
    const parsed = parseInt(process.env[name] || "", 10);
    return Number.isInteger(parsed) && parsed >= 0 ? parsed : fallback;
}

const API_KEY = process.env.API_KEY || "sk-test-key";
if (!process.env.API_KEY) {
    // The fallback key is visible in the source, so make its use obvious.
    console.warn("[Config] API_KEY is not set; using the built-in test key. Set API_KEY before exposing this server.");
}
const DEEPGRAM_API_KEY = process.env.DEEPGRAM_API_KEY || "";
const PORT = process.env.PORT || 7860;
const WS_TIMEOUT_MS = envInt("WS_TIMEOUT_MS", 60000);
const MAX_INPUT_LENGTH = envInt("MAX_INPUT_LENGTH", 4096);
const MAX_CACHE_FILES = envInt("MAX_CACHE_FILES", 10);
const CACHE_DIR = path.join("/tmp/", "cache");

// Make sure the cache directory exists before any request tries to write to it.
if (!fs.existsSync(CACHE_DIR)) fs.mkdirSync(CACHE_DIR, { recursive: true });

// ─── Middleware ──────────────────────────────────────────────────────────────
// Parse JSON request bodies (capped at 1 MB).
app.use(express.json({ limit: "1mb" }));

// Attach permissive CORS headers to every response.
app.use(function applyCorsHeaders(_req, res, next) {
    const corsHeaders = [
        ["Access-Control-Allow-Origin", "*"],
        ["Access-Control-Allow-Methods", "GET, POST, OPTIONS, DELETE"],
        ["Access-Control-Allow-Headers", "Content-Type, Authorization"],
    ];
    for (const [headerName, headerValue] of corsHeaders) {
        res.setHeader(headerName, headerValue);
    }
    next();
});

// Answer CORS preflight requests without a body.
app.options("*", function answerPreflight(_req, res) {
    res.sendStatus(204);
});

// Give each request a random identifier, used later in logs and response headers.
app.use(function assignRequestId(req, _res, next) {
    const suffix = crypto.randomBytes(12).toString("hex");
    req.id = `chatcmpl-${suffix}`;
    next();
});

// ─── OpenAI Error Format ─────────────────────────────────────────────────────
/**
 * Send a JSON error body shaped as `{ error: { message, type, param, code } }`.
 *
 * @param {object} res - Response object exposing chainable `status()` and `json()`.
 * @param {number} status - HTTP status code to respond with.
 * @param {string} message - Human-readable error description.
 * @param {string} [type="invalid_request_error"] - Error category.
 * @param {?string} [param=null] - Name of the offending request parameter, if any.
 */
function openaiError(res, status, message, type = "invalid_request_error", param = null) {
    // Only 401 and 429 carry a machine-readable code; everything else reports null.
    let code = null;
    if (status === 401) {
        code = "invalid_api_key";
    } else if (status === 429) {
        code = "rate_limit_exceeded";
    }

    const error = { message, type, param, code };
    res.status(status).json({ error });
}

// ─── Auth ────────────────────────────────────────────────────────────────────
/**
 * Express middleware that enforces `Authorization: Bearer <API_KEY>`.
 *
 * Responds with a 401 error body when the header is missing, does not use the
 * Bearer scheme, or carries the wrong key; otherwise hands control to `next`.
 *
 * @param {object} req - Incoming request; only `req.headers.authorization` is read.
 * @param {object} res - Response used to report authentication failures.
 * @param {Function} next - Invoked when the key matches.
 */
function auth(req, res, next) {
    const header = req.headers.authorization;
    if (typeof header !== "string" || !header.startsWith("Bearer ")) {
        return openaiError(res, 401, "You didn't provide an API key. Provide your API key in an Authorization header using Bearer auth.");
    }

    // Compare fixed-length digests in constant time so response timing does not
    // reveal how much of the key was correct. Hashing first is what lets
    // timingSafeEqual accept inputs of different lengths.
    const digest = (value) => crypto.createHash("sha256").update(value).digest();
    const presented = digest(header.slice(7).trim());
    const expected = digest(API_KEY);
    if (!crypto.timingSafeEqual(presented, expected)) {
        return openaiError(res, 401, "Incorrect API key provided.");
    }
    next();
}

// ─── WAV Header ──────────────────────────────────────────────────────────────
/**
 * Build a 44-byte RIFF/WAVE header for uncompressed PCM audio.
 *
 * Both size fields are unsigned 32-bit, so values that do not fit are clamped
 * to 0xFFFFFFFF instead of throwing. This lets callers that stream audio of
 * unknown length pass any large placeholder size.
 *
 * @param {number} dataSize - Number of audio bytes that follow the header.
 * @param {number} [sampleRate=24000] - Samples per second.
 * @param {number} [bits=16] - Bits per sample.
 * @param {number} [ch=1] - Channel count.
 * @returns {Buffer} The 44-byte header.
 */
function wavHeader(dataSize, sampleRate = 24000, bits = 16, ch = 1) {
    const MAX_UINT32 = 0xFFFFFFFF;
    const bytesPerFrame = ch * (bits / 8);

    const h = Buffer.alloc(44);
    h.write("RIFF", 0);
    // RIFF chunk size = everything after this field: 36 header bytes + audio data.
    h.writeUInt32LE(Math.min(36 + dataSize, MAX_UINT32), 4);
    h.write("WAVE", 8);
    h.write("fmt ", 12);
    h.writeUInt32LE(16, 16);                          // size of the fmt chunk body
    h.writeUInt16LE(1, 20);                           // audio format 1 = PCM
    h.writeUInt16LE(ch, 22);
    h.writeUInt32LE(sampleRate, 24);
    h.writeUInt32LE(sampleRate * bytesPerFrame, 28);  // byte rate
    h.writeUInt16LE(bytesPerFrame, 32);               // block align
    h.writeUInt16LE(bits, 34);
    h.write("data", 36);
    h.writeUInt32LE(Math.min(dataSize, MAX_UINT32), 40);
    return h;
}

// ─── Cache Cleanup ───────────────────────────────────────────────────────────
/**
 * Prune the cache directory down to the MAX_CACHE_FILES most recently
 * modified entries. Failures are logged and never propagate to the caller.
 */
function cleanupCache() {
    try {
        // Collect every entry together with its modification time.
        const entries = [];
        for (const name of fs.readdirSync(CACHE_DIR)) {
            const { mtimeMs } = fs.statSync(path.join(CACHE_DIR, name));
            entries.push({ name, time: mtimeMs });
        }

        // Newest first, so everything past the limit is the oldest material.
        entries.sort((left, right) => right.time - left.time);

        const overLimit = entries.length > MAX_CACHE_FILES;
        if (!overLimit) return;

        entries.slice(MAX_CACHE_FILES).forEach((stale) => {
            fs.unlinkSync(path.join(CACHE_DIR, stale.name));
            console.log(`[Cache] Deleted: ${stale.name}`);
        });
    } catch (e) {
        console.error("[Cache] Cleanup error:", e.message);
    }
}

// ─── Synthesize via Deepgram Agent WebSocket ─────────────────────────────────
// Voice registry — maps a public voice name to the `speak` settings sent
// upstream. Built once at load time rather than on every request.
const VOICE_REGISTRY = Object.freeze({
    // ── ElevenLabs ──────────────────────────────────────────────────────────
    "jessica": {
        provider: {
            type: "eleven_labs",
            model_id: "eleven_multilingual_v2",
            voice_id: "cgSgspJ2msm6clMCkdW9",
        }
    },
    "daniel": {
        provider: {
            type: "eleven_labs",
            model_id: "eleven_multilingual_v2",
            voice_id: "onwK4e9ZLuTAKqWW03F9",
        }
    },
    "piper": {
        provider: {
            type: "eleven_labs",
            model_id: "eleven_multilingual_v2",
            voice_id: "DtsPFCrhbCbbJkwZsb3d",
        }
    },
    "mark": {
        provider: {
            type: "eleven_labs",
            model_id: "eleven_multilingual_v2",
            voice_id: "UgBBYS2sOqTuMpoF3BR0",
        }
    },
    // ── Cartesia ────────────────────────────────────────────────────────────
    "kentucky_man": {
        provider: {
            type: "cartesia",
            model_id: "sonic-2",
            voice: { mode: "id", id: "726d5ae5-055f-4c3d-8355-d9677de68937" },
        }
    },
    "helpful_woman": {
        provider: {
            type: "cartesia",
            model_id: "sonic-2",
            voice: { mode: "id", id: "156fb8d2-335b-4950-9cb3-a2d33befec77" },
        }
    },
});
const DEFAULT_VOICE = "jessica";

/**
 * Look up the upstream voice settings for a caller-supplied voice name.
 * The name is lower-cased and runs of spaces/hyphens become underscores;
 * unknown or non-string names fall back to the default voice.
 */
function resolveVoice(voiceName) {
    const key = String(voiceName ?? "").toLowerCase().replace(/[\s-]+/g, "_");
    // Own-property check so names like "constructor" cannot match Object.prototype.
    const known = Object.prototype.hasOwnProperty.call(VOICE_REGISTRY, key);
    return VOICE_REGISTRY[known ? key : DEFAULT_VOICE];
}

/**
 * Synthesize speech by opening an agent WebSocket, sending `text` as the
 * agent greeting, and collecting the binary audio frames that come back.
 *
 * When `res` is supplied, audio is also written to it as it arrives. Once the
 * first chunk has been written this function owns the response: it ends it on
 * success and destroys it on failure, so the client never waits on a
 * half-finished stream. If no audio was ever written, `res` is left untouched
 * so the caller can still send an error body.
 *
 * @param {string} text - Text to speak.
 * @param {string} requestId - Identifier used in logs and the `x-request-id` header.
 * @param {string} [voiceName="jessica"] - Voice name; unknown names use the default voice.
 * @param {?object} [res=null] - Optional HTTP response to stream audio into.
 * @param {string} [response_format="wav"] - "pcm" streams raw audio as audio/pcm;
 *   any other value streams audio/wav with a placeholder-length header.
 * @returns {Promise<Buffer>} All received audio bytes (the output is requested
 *   as linear16 at 24000 Hz with no container).
 *   Rejects on timeout, an upstream "Error" message, a socket error, or when
 *   the socket closes before any audio arrived.
 */
function synthesize(text, requestId, voiceName = "jessica", res = null, response_format = "wav") {
    // Resolved before the socket is opened so that unusual input can never
    // leave a connection and timer dangling behind a rejected promise.
    const voiceConfig = resolveVoice(voiceName);

    return new Promise((resolve, reject) => {
        const audioChunks = [];
        let settled = false;
        let streamStarted = false;
        const t0 = Date.now();

        const ws = new WebSocket("wss://agent.deepgram.com/v1/agent/converse", [
            "token", DEEPGRAM_API_KEY,
        ]);

        // Single exit point: every outcome releases the timer and the socket,
        // and terminates the HTTP stream if one was started.
        const finish = (err) => {
            if (settled) return;
            settled = true;
            clearTimeout(timer);
            try {
                ws.close();
            } catch (closeErr) {
                console.error(`[${requestId}] WS close failed: ${closeErr.message}`);
            }
            if (res && streamStarted) {
                // On failure the body is truncated; destroying the response lets
                // the client detect that instead of seeing a clean end of stream.
                if (err) res.destroy();
                else res.end();
            }
            if (err) reject(err);
            else resolve(Buffer.concat(audioChunks));
        };

        const timer = setTimeout(() => finish(new Error("TTS timed out")), WS_TIMEOUT_MS);

        ws.on("open", () => {
            ws.send(JSON.stringify({
                type: "Settings",
                audio: {
                    input: { encoding: "linear16", sample_rate: 48000 },
                    output: { encoding: "linear16", sample_rate: 24000, container: "none" },
                },
                agent: {
                    language: "en",
                    speak: voiceConfig,
                    listen: { provider: { type: "deepgram", version: "v1", model: "nova-3" } },
                    think: { provider: { type: "open_ai", model: "gpt-4o-mini" }, prompt: "TTS engine. Do not respond." },
                    greeting: text,
                },
            }));
        });

        ws.on("message", (data, isBinary) => {
            if (settled) return;

            if (isBinary) {
                const chunk = Buffer.from(data);
                audioChunks.push(chunk);

                if (res) {
                    if (!streamStarted) {
                        res.setHeader("Content-Type", response_format === "pcm" ? "audio/pcm" : "audio/wav");
                        res.setHeader("Transfer-Encoding", "chunked");
                        res.setHeader("x-request-id", requestId);
                        // The final length is unknown while streaming, so the WAV
                        // header carries a maximal placeholder size. 36 is subtracted
                        // so that the RIFF size field (36 + dataSize) fits in a uint32.
                        if (response_format !== "pcm") {
                            res.write(wavHeader(0xFFFFFFFF - 36));
                        }
                        streamStarted = true;
                    }
                    res.write(chunk);
                }
                return;
            }

            let msg;
            try {
                msg = JSON.parse(data.toString());
            } catch {
                // Text frames that are not JSON are ignored on purpose.
                return;
            }

            if (msg?.type === "AgentAudioDone") {
                const totalBytes = audioChunks.reduce((sum, part) => sum + part.length, 0);
                console.log(`[${requestId}] ✅ ${totalBytes}B ${audioChunks.length} chunks ${Date.now() - t0}ms`);
                finish(null);
            } else if (msg?.type === "Error") {
                finish(new Error(msg.message || "Deepgram error"));
            }
        });

        ws.on("error", (e) => finish(e));
        ws.on("close", (code) => {
            // A close without an explicit completion message still counts as
            // success when some audio was received.
            finish(audioChunks.length > 0 ? null : new Error(`WS closed (${code})`));
        });
    });
}

// ─── Models ──────────────────────────────────────────────────────────────────
// Model identifiers this server advertises and accepts.
const MODELS = { "gpt-4o-mini-tts": true, "tts-1": true, "tts-1-hd": true };

// GET /v1/models — list every advertised model.
app.get("/v1/models", auth, (_req, res) => {
    const data = [];
    for (const id of Object.keys(MODELS)) {
        data.push({
            id,
            object: "model",
            created: 1700000000,
            owned_by: "system",
            permission: [],
            root: id,
            parent: null,
        });
    }
    res.json({ object: "list", data });
});

// GET /v1/models/:model — describe a single advertised model, or 404.
app.get("/v1/models/:model", auth, (req, res) => {
    const { model } = req.params;
    // Own-property check: a plain `MODELS[model]` lookup would also accept
    // inherited keys such as "constructor" or "toString".
    const isKnown = Object.prototype.hasOwnProperty.call(MODELS, model) && Boolean(MODELS[model]);
    if (!isKnown) {
        return openaiError(res, 404, `Model '${model}' not found`, "invalid_request_error", "model");
    }
    res.json({ id: model, object: "model", created: 1700000000, owned_by: "system", permission: [], root: model, parent: null });
});

// ─── POST /v1/audio/speech ───────────────────────────────────────────────────
// OpenAI-compatible request shape: accepts model, input, voice, response_format, speed
// (speed is accepted but currently has no effect).
// Audio is streamed to the client with chunked transfer encoding as it arrives, so no
// Content-Length is sent; every format other than "pcm" is delivered as audio/wav.
app.post("/v1/audio/speech", auth, async (req, res) => {
    const rid = req.id;
    try {
        // A missing or unparsed body is a client error, not a server crash.
        const {
            model = "gpt-4o-mini-tts",
            input,
            voice = "jessica",
            response_format = "wav",
        } = req.body ?? {};

        // Validate
        if (!input || typeof input !== "string") return openaiError(res, 400, "Missing required parameter: 'input'", "invalid_request_error", "input");
        if (input.length > MAX_INPUT_LENGTH) return openaiError(res, 400, `Input too long (max ${MAX_INPUT_LENGTH} chars)`, "invalid_request_error", "input");
        if (typeof voice !== "string") return openaiError(res, 400, "Invalid 'voice': expected a string", "invalid_request_error", "voice");
        // Own-property check so inherited keys ("constructor", …) are not accepted as models.
        const knownModel = typeof model === "string" && Object.prototype.hasOwnProperty.call(MODELS, model) && Boolean(MODELS[model]);
        if (!knownModel) return openaiError(res, 404, `Model '${model}' not found`, "invalid_request_error", "model");
        if (!DEEPGRAM_API_KEY) return openaiError(res, 500, "DEEPGRAM_API_KEY not configured", "server_error");

        const validFormats = ["mp3", "opus", "aac", "flac", "wav", "pcm"];
        if (!validFormats.includes(response_format)) {
            return openaiError(res, 400, `Invalid response_format '${response_format}'. Supported: ${validFormats.join(", ")}`, "invalid_request_error", "response_format");
        }

        console.log(`[${rid}] POST /v1/audio/speech model=${model} voice=${voice} fmt=${response_format} len=${input.length}`);

        // Synthesize, streaming the audio straight into the response.
        const pcmData = await synthesize(input, rid, voice, res, response_format);

        // Guarantee the stream is terminated even if synthesis resolved without ending it.
        if (res.headersSent && !res.writableEnded) res.end();

        if (pcmData.length === 0) {
            if (!res.headersSent) openaiError(res, 500, "No audio data received", "server_error");
            return; // nothing worth caching
        }

        // The audio has already been streamed; what follows only feeds the cache.
        // For WAV, rebuild the file with a header that carries the real length.
        const isPcm = response_format === "pcm";
        const audioBuffer = isPcm ? pcmData : Buffer.concat([wavHeader(pcmData.length), pcmData]);
        const ext = isPcm ? "pcm" : "wav";
        const filename = `${Date.now()}_${crypto.randomBytes(4).toString("hex")}.${ext}`;

        // Asynchronous write: a multi-megabyte file should not block other requests.
        await fs.promises.writeFile(path.join(CACHE_DIR, filename), audioBuffer);
        setImmediate(cleanupCache);

    } catch (err) {
        console.error(`[${rid}] ❌ ${err.message}`);
        if (!res.headersSent) {
            openaiError(res, 500, err.message, "server_error");
        } else if (!res.writableEnded) {
            // Part of the body is already on the wire; abort rather than leave the client hanging.
            res.destroy();
        }
    }
});

// ─── POST /v1/text:synthesize (Google Cloud TTS Compatible) ──────────────────
// Maps Google Cloud TTS requests to our internal Voice Registry
/**
 * Constant-time comparison of a presented API key against the expected one.
 * Both sides are hashed first so inputs of different lengths can be compared.
 */
function apiKeyMatches(candidate, expected) {
    const digest = (value) => crypto.createHash("sha256").update(String(value)).digest();
    return crypto.timingSafeEqual(digest(candidate), digest(expected));
}

// NOTE(review): depending on the router's path syntax, ":synthesize" may be
// parsed as a route parameter rather than literal text — confirm the match.
app.post("/v1/text:synthesize", async (req, res) => {
    // The key may arrive as ?key=API_KEY or as a Bearer token.
    const queryKey = typeof req.query.key === "string" ? req.query.key : null;
    const authHeader = req.headers.authorization;
    const bearerKey = typeof authHeader === "string" && /^bearer /i.test(authHeader)
        ? authHeader.slice(7).trim()
        : null;
    const providedKey = queryKey || bearerKey;
    if (!providedKey || !apiKeyMatches(providedKey, API_KEY)) {
        return res.status(401).json({
            error: { code: 401, message: "Request had invalid authentication credentials.", status: "UNAUTHENTICATED" }
        });
    }

    const rid = req.id || `gcp-${crypto.randomBytes(4).toString("hex")}`;
    try {
        const payload = req.body;
        const text = payload?.input?.text;
        const requestedVoice = payload?.voice?.name;
        const voiceName = typeof requestedVoice === "string" && requestedVoice ? requestedVoice : "jessica";
        const encodingRequested = payload?.audioConfig?.audioEncoding || "LINEAR16"; // MP3, LINEAR16, PCM, OGG_OPUS

        if (typeof text !== "string" || text.length === 0) {
            return res.status(400).json({ error: { code: 400, message: "Invalid JSON payload received. Missing input.text", status: "INVALID_ARGUMENT" }});
        }
        // Same input cap as the OpenAI-compatible route.
        if (text.length > MAX_INPUT_LENGTH) {
            return res.status(400).json({ error: { code: 400, message: `Input too long (max ${MAX_INPUT_LENGTH} chars)`, status: "INVALID_ARGUMENT" }});
        }
        if (!DEEPGRAM_API_KEY) {
            return res.status(500).json({ error: { code: 500, message: "DEEPGRAM_API_KEY not configured", status: "INTERNAL" }});
        }

        // Google's API returns LINEAR16 with a WAV header and reserves the
        // separate PCM encoding for header-less samples. Every other encoding
        // (MP3, OGG_OPUS, …) is not transcoded here and is answered with WAV.
        const internalFormat = encodingRequested === "PCM" ? "pcm" : "wav";

        console.log(`[${rid}] POST /v1/text:synthesize voice=${voiceName} enc=${encodingRequested} len=${text.length}`);

        // The reply is one base64 blob inside JSON, so the whole buffer is
        // awaited rather than streamed (no response object is passed).
        const pcmData = await synthesize(text, rid, voiceName, null, internalFormat);
        if (pcmData.length === 0) throw new Error("No audio data received");

        const audioBuffer = internalFormat === "pcm"
            ? pcmData
            : Buffer.concat([wavHeader(pcmData.length), pcmData]);

        res.json({
            audioContent: audioBuffer.toString("base64")
        });

    } catch (err) {
        console.error(`[${rid}] ❌ Google Cloud API Error: ${err.message}`);
        if (res.headersSent) return;
        res.status(500).json({
            error: { code: 500, message: err.message, status: "INTERNAL" }
        });
    }
});

// ─── Cached files ────────────────────────────────────────────────────────────
// GET /v1/audio/files — list cached audio files, most recently modified first.
app.get("/v1/audio/files", auth, (_req, res) => {
    try {
        const files = [];
        for (const name of fs.readdirSync(CACHE_DIR)) {
            const info = fs.statSync(path.join(CACHE_DIR, name));
            files.push({ name, size: info.size, created: info.mtimeMs });
        }
        files.sort((left, right) => right.created - left.created);

        const listing = { files, count: files.length, max: MAX_CACHE_FILES };
        res.json(listing);
    } catch (e) {
        openaiError(res, 500, e.message, "server_error");
    }
});

// GET /v1/audio/files/:filename — download one cached audio file.
app.get("/v1/audio/files/:filename", auth, (req, res) => {
    const { filename } = req.params;
    const cacheRoot = path.resolve(CACHE_DIR);
    const fp = path.resolve(cacheRoot, filename);

    // Route parameters are URL-decoded, so "..%2F" sequences could otherwise
    // escape the cache directory. Only direct children of it are served.
    if (path.dirname(fp) !== cacheRoot) return openaiError(res, 404, "File not found");

    let info;
    try {
        info = fs.statSync(fp);
    } catch {
        // Missing file, or a name the filesystem rejects.
        return openaiError(res, 404, "File not found");
    }
    if (!info.isFile()) return openaiError(res, 404, "File not found");

    res.setHeader("Content-Type", filename.endsWith(".wav") ? "audio/wav" : "audio/pcm");
    res.sendFile(fp);
});

// ─── Health ──────────────────────────────────────────────────────────────────
// GET /health — unauthenticated status summary: configuration flags, the
// advertised voices and models, and current cache usage.
app.get("/health", (_req, res) => {
    let cacheFiles = 0;
    if (fs.existsSync(CACHE_DIR)) {
        cacheFiles = fs.readdirSync(CACHE_DIR).length;
    }

    const providers = {
        eleven_labs: ["jessica", "daniel", "piper", "mark"],
        cartesia: ["kentucky_man", "helpful_woman"],
    };
    const voices = ["jessica", "daniel", "kentucky_man", "helpful_woman", "piper", "mark"];

    const report = {
        status: "ok",
        version: "1.0.0",
        service: "openai-tts-proxy",
        deepgram: Boolean(DEEPGRAM_API_KEY),
        providers,
        voices,
        models: Object.keys(MODELS),
        cache: { files: cacheFiles, max: MAX_CACHE_FILES },
    };
    res.json(report);
});

// ─── Root (Render health check) ──────────────────────────────────────────────
// GET / — minimal liveness response with a pointer to the main endpoints.
app.get("/", (_req, res) => {
    const summary = {
        status: "ok",
        service: "openai-tts-proxy",
        docs: "POST /v1/audio/speech, GET /v1/models, GET /health",
    };
    res.json(summary);
});

// ─── Catch-all & error handler ───────────────────────────────────────────────
// Any request that reached this point matched no route.
app.use((req, res) => openaiError(res, 404, `Unknown request URL: ${req.method} ${req.path}`));

// Final error handler (identified by its four-argument signature).
app.use((err, _req, res, next) => {
    // Once a response has started, no error body can be sent; hand over to the
    // framework's default handler so the connection gets closed.
    if (res.headersSent) return next(err);

    // Errors raised by middleware for bad client input (malformed JSON, body
    // over the size limit, …) carry a 4xx status; report them as such instead
    // of a generic 500.
    const status = Number(err?.status ?? err?.statusCode);
    if (Number.isInteger(status) && status >= 400 && status < 500) {
        const message = err.expose && err.message ? err.message : "Invalid request";
        return openaiError(res, status, message);
    }

    console.error("Unhandled:", err);
    openaiError(res, 500, "Internal server error", "server_error");
});

// ─── Graceful Shutdown ───────────────────────────────────────────────────────
// HTTP server handle; assigned when the server starts listening.
let server;

/**
 * Stop accepting connections and exit: code 0 once the server has closed,
 * code 1 if that has not happened within 10 seconds.
 *
 * @param {string} sig - Name of the signal that triggered the shutdown (for the log line).
 */
function shutdown(sig) {
    console.log(`\n${sig} — shutting down...`);
    if (server != null) {
        server.close(() => {
            process.exit(0);
        });
    }
    setTimeout(() => {
        process.exit(1);
    }, 10000);
}

for (const signalName of ["SIGTERM", "SIGINT"]) {
    process.on(signalName, () => shutdown(signalName));
}

// ─── Start ───────────────────────────────────────────────────────────────────
// Bind on all interfaces and print a startup banner once listening.
server = app.listen(PORT, "0.0.0.0", () => {
    const banner = [
        `\n🔊 HuggingFace TTS API (ElevenLabs & Cartesia) v1.0.0`,
        `   http://localhost:${PORT}`,
        `   POST /v1/audio/speech        (OpenAI Compatible)`,
        `   POST /v1/text:synthesize     (Google TTS Compatible)`,
        `   GET  /v1/models`,
        `   GET  /v1/audio/files`,
        `   GET  /health`,
        `   Backend: ElevenLabs (Jessica & Daniel) via Deepgram`,
        `   Cache: max ${MAX_CACHE_FILES} files`,
        `   Deepgram: ${DEEPGRAM_API_KEY ? "✅" : "❌ missing"}\n`,
    ];
    for (const line of banner) {
        console.log(line);
    }
});