| import { completeSimple, type AssistantMessage } from "@mariozechner/pi-ai"; |
| import { describe, expect, it, vi, beforeEach } from "vitest"; |
| import { ensureCustomApiRegistered } from "../agents/custom-api-registry.js"; |
| import { getApiKeyForModel } from "../agents/model-auth.js"; |
| import { resolveModel } from "../agents/pi-embedded-runner/model.js"; |
| import type { OpenClawConfig } from "../config/config.js"; |
| import { withEnv } from "../test-utils/env.js"; |
| import * as tts from "./tts.js"; |
|
|
| vi.mock("@mariozechner/pi-ai", async (importOriginal) => { |
| const original = await importOriginal<typeof import("@mariozechner/pi-ai")>(); |
| return { |
| ...original, |
| completeSimple: vi.fn(), |
| }; |
| }); |
|
|
| vi.mock("@mariozechner/pi-ai/oauth", () => ({ |
| getOAuthProviders: () => [], |
| getOAuthApiKey: vi.fn(async () => null), |
| })); |
|
|
| vi.mock("../agents/pi-embedded-runner/model.js", () => ({ |
| resolveModel: vi.fn((provider: string, modelId: string) => ({ |
| model: { |
| provider, |
| id: modelId, |
| name: modelId, |
| api: "openai-completions", |
| reasoning: false, |
| input: ["text"], |
| cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, |
| contextWindow: 128000, |
| maxTokens: 8192, |
| } as any, |
| authStorage: { profiles: {} }, |
| modelRegistry: { find: vi.fn() }, |
| })), |
| })); |
|
|
| vi.mock("../agents/model-auth.js", () => ({ |
| getApiKeyForModel: vi.fn(async () => ({ |
| apiKey: "test-api-key", |
| source: "test", |
| mode: "api-key", |
| })), |
| requireApiKey: vi.fn((auth: { apiKey?: string }) => auth.apiKey ?? ""), |
| })); |
|
|
| vi.mock("../agents/custom-api-registry.js", () => ({ |
| ensureCustomApiRegistered: vi.fn(), |
| })); |
|
|
// Public TTS API under test, pulled off the namespace import.
const { _test, resolveTtsConfig, maybeApplyTtsToPayload, getTtsProvider } = tts;

// Internal helpers exposed solely for unit testing via the `_test` bag.
const {
  isValidVoiceId,
  isValidOpenAIVoice,
  isValidOpenAIModel,
  OPENAI_TTS_MODELS,
  OPENAI_TTS_VOICES,
  parseTtsDirectives,
  resolveOpenAITtsInstructions,
  resolveModelOverridePolicy,
  summarizeText,
  resolveOutputFormat,
  resolveEdgeOutputFormat,
} = _test;
|
|
| const mockAssistantMessage = (content: AssistantMessage["content"]): AssistantMessage => ({ |
| role: "assistant", |
| content, |
| api: "openai-completions", |
| provider: "openai", |
| model: "gpt-4o-mini", |
| usage: { |
| input: 1, |
| output: 1, |
| cacheRead: 0, |
| cacheWrite: 0, |
| totalTokens: 2, |
| cost: { |
| input: 0, |
| output: 0, |
| cacheRead: 0, |
| cacheWrite: 0, |
| total: 0, |
| }, |
| }, |
| stopReason: "stop", |
| timestamp: Date.now(), |
| }); |
|
|
| function createOpenAiTelephonyCfg(model: "tts-1" | "gpt-4o-mini-tts"): OpenClawConfig { |
| return { |
| messages: { |
| tts: { |
| provider: "openai", |
| openai: { |
| apiKey: "test-key", |
| model, |
| voice: "alloy", |
| instructions: "Speak warmly", |
| }, |
| }, |
| }, |
| }; |
| } |
|
|
| describe("tts", () => { |
// Reset call history before each test and give completeSimple a benign default
// response ("Summary") so summarization paths succeed unless a test overrides it.
// NOTE(review): vi.clearAllMocks() clears calls/results but NOT implementations
// set via mockReturnValue/mockResolvedValue in individual tests — confirm no
// sticky stubs leak across tests.
beforeEach(() => {
  vi.clearAllMocks();
  vi.mocked(completeSimple).mockResolvedValue(
    mockAssistantMessage([{ type: "text", text: "Summary" }]),
  );
});
|
|
| describe("isValidVoiceId", () => { |
| it("validates ElevenLabs voice ID length and character rules", () => { |
| const cases = [ |
| { value: "pMsXgVXv3BLzUgSXRplE", expected: true }, |
| { value: "21m00Tcm4TlvDq8ikWAM", expected: true }, |
| { value: "EXAVITQu4vr4xnSDxMaL", expected: true }, |
| { value: "a1b2c3d4e5", expected: true }, |
| { value: "a".repeat(40), expected: true }, |
| { value: "", expected: false }, |
| { value: "abc", expected: false }, |
| { value: "123456789", expected: false }, |
| { value: "a".repeat(41), expected: false }, |
| { value: "a".repeat(100), expected: false }, |
| { value: "pMsXgVXv3BLz-gSXRplE", expected: false }, |
| { value: "pMsXgVXv3BLz_gSXRplE", expected: false }, |
| { value: "pMsXgVXv3BLz gSXRplE", expected: false }, |
| { value: "../../../etc/passwd", expected: false }, |
| { value: "voice?param=value", expected: false }, |
| ] as const; |
| for (const testCase of cases) { |
| expect(isValidVoiceId(testCase.value), testCase.value).toBe(testCase.expected); |
| } |
| }); |
| }); |
|
|
| describe("isValidOpenAIVoice", () => { |
| it("accepts all valid OpenAI voices including newer additions", () => { |
| for (const voice of OPENAI_TTS_VOICES) { |
| expect(isValidOpenAIVoice(voice)).toBe(true); |
| } |
| for (const newerVoice of ["ballad", "cedar", "juniper", "marin", "verse"]) { |
| expect(isValidOpenAIVoice(newerVoice), newerVoice).toBe(true); |
| } |
| }); |
|
|
| it("rejects invalid voice names", () => { |
| expect(isValidOpenAIVoice("invalid")).toBe(false); |
| expect(isValidOpenAIVoice("")).toBe(false); |
| expect(isValidOpenAIVoice("ALLOY")).toBe(false); |
| expect(isValidOpenAIVoice("alloy ")).toBe(false); |
| expect(isValidOpenAIVoice(" alloy")).toBe(false); |
| }); |
|
|
| it("treats the default endpoint with trailing slash as the default endpoint", () => { |
| expect(isValidOpenAIVoice("kokoro-custom-voice", "https://api.openai.com/v1/")).toBe(false); |
| }); |
| }); |
|
|
| describe("isValidOpenAIModel", () => { |
| it("matches the supported model set and rejects unsupported values", () => { |
| expect(OPENAI_TTS_MODELS).toContain("gpt-4o-mini-tts"); |
| expect(OPENAI_TTS_MODELS).toContain("tts-1"); |
| expect(OPENAI_TTS_MODELS).toContain("tts-1-hd"); |
| expect(OPENAI_TTS_MODELS).toHaveLength(3); |
| expect(Array.isArray(OPENAI_TTS_MODELS)).toBe(true); |
| expect(OPENAI_TTS_MODELS.length).toBeGreaterThan(0); |
| const cases = [ |
| { model: "gpt-4o-mini-tts", expected: true }, |
| { model: "tts-1", expected: true }, |
| { model: "tts-1-hd", expected: true }, |
| { model: "invalid", expected: false }, |
| { model: "", expected: false }, |
| { model: "gpt-4", expected: false }, |
| ] as const; |
| for (const testCase of cases) { |
| expect(isValidOpenAIModel(testCase.model), testCase.model).toBe(testCase.expected); |
| } |
| }); |
|
|
| it("treats the default endpoint with trailing slash as the default endpoint", () => { |
| expect(isValidOpenAIModel("kokoro-custom-model", "https://api.openai.com/v1/")).toBe(false); |
| }); |
| }); |
|
|
| describe("resolveOpenAITtsInstructions", () => { |
| it("keeps instructions only for gpt-4o-mini-tts variants", () => { |
| expect(resolveOpenAITtsInstructions("gpt-4o-mini-tts", " Speak warmly ")).toBe( |
| "Speak warmly", |
| ); |
| expect(resolveOpenAITtsInstructions("gpt-4o-mini-tts-2025-12-15", "Speak warmly")).toBe( |
| "Speak warmly", |
| ); |
| expect(resolveOpenAITtsInstructions("tts-1", "Speak warmly")).toBeUndefined(); |
| expect(resolveOpenAITtsInstructions("tts-1-hd", "Speak warmly")).toBeUndefined(); |
| expect(resolveOpenAITtsInstructions("gpt-4o-mini-tts", " ")).toBeUndefined(); |
| }); |
| }); |
|
|
| describe("resolveOutputFormat", () => { |
| it("selects opus for voice-bubble channels (telegram/feishu/whatsapp) and mp3 for others", () => { |
| const cases = [ |
| { |
| channel: "telegram", |
| expected: { |
| openai: "opus", |
| elevenlabs: "opus_48000_64", |
| extension: ".opus", |
| voiceCompatible: true, |
| }, |
| }, |
| { |
| channel: "feishu", |
| expected: { |
| openai: "opus", |
| elevenlabs: "opus_48000_64", |
| extension: ".opus", |
| voiceCompatible: true, |
| }, |
| }, |
| { |
| channel: "whatsapp", |
| expected: { |
| openai: "opus", |
| elevenlabs: "opus_48000_64", |
| extension: ".opus", |
| voiceCompatible: true, |
| }, |
| }, |
| { |
| channel: "discord", |
| expected: { |
| openai: "mp3", |
| elevenlabs: "mp3_44100_128", |
| extension: ".mp3", |
| voiceCompatible: false, |
| }, |
| }, |
| ] as const; |
| for (const testCase of cases) { |
| const output = resolveOutputFormat(testCase.channel); |
| expect(output.openai, testCase.channel).toBe(testCase.expected.openai); |
| expect(output.elevenlabs, testCase.channel).toBe(testCase.expected.elevenlabs); |
| expect(output.extension, testCase.channel).toBe(testCase.expected.extension); |
| expect(output.voiceCompatible, testCase.channel).toBe(testCase.expected.voiceCompatible); |
| } |
| }); |
| }); |
|
|
| describe("resolveEdgeOutputFormat", () => { |
| const baseCfg: OpenClawConfig = { |
| agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } }, |
| messages: { tts: {} }, |
| }; |
|
|
| it("uses default edge output format unless overridden", () => { |
| const cases = [ |
| { |
| name: "default", |
| cfg: baseCfg, |
| expected: "audio-24khz-48kbitrate-mono-mp3", |
| }, |
| { |
| name: "override", |
| cfg: { |
| ...baseCfg, |
| messages: { |
| tts: { |
| edge: { outputFormat: "audio-24khz-96kbitrate-mono-mp3" }, |
| }, |
| }, |
| } as OpenClawConfig, |
| expected: "audio-24khz-96kbitrate-mono-mp3", |
| }, |
| ] as const; |
| for (const testCase of cases) { |
| const config = resolveTtsConfig(testCase.cfg); |
| expect(resolveEdgeOutputFormat(config), testCase.name).toBe(testCase.expected); |
| } |
| }); |
| }); |
|
|
| describe("parseTtsDirectives", () => { |
| it("extracts overrides and strips directives when enabled", () => { |
| const policy = resolveModelOverridePolicy({ enabled: true, allowProvider: true }); |
| const input = |
| "Hello [[tts:provider=elevenlabs voiceId=pMsXgVXv3BLzUgSXRplE stability=0.4 speed=1.1]] world\n\n" + |
| "[[tts:text]](laughs) Read the song once more.[[/tts:text]]"; |
| const result = parseTtsDirectives(input, policy); |
|
|
| expect(result.cleanedText).not.toContain("[[tts:"); |
| expect(result.ttsText).toBe("(laughs) Read the song once more."); |
| expect(result.overrides.provider).toBe("elevenlabs"); |
| expect(result.overrides.elevenlabs?.voiceId).toBe("pMsXgVXv3BLzUgSXRplE"); |
| expect(result.overrides.elevenlabs?.voiceSettings?.stability).toBe(0.4); |
| expect(result.overrides.elevenlabs?.voiceSettings?.speed).toBe(1.1); |
| }); |
|
|
| it("accepts edge as provider override", () => { |
| const policy = resolveModelOverridePolicy({ enabled: true, allowProvider: true }); |
| const input = "Hello [[tts:provider=edge]] world"; |
| const result = parseTtsDirectives(input, policy); |
|
|
| expect(result.overrides.provider).toBe("edge"); |
| }); |
|
|
| it("rejects provider override by default while keeping voice overrides enabled", () => { |
| const policy = resolveModelOverridePolicy({ enabled: true }); |
| const input = "Hello [[tts:provider=edge voice=alloy]] world"; |
| const result = parseTtsDirectives(input, policy); |
|
|
| expect(result.overrides.provider).toBeUndefined(); |
| expect(result.overrides.openai?.voice).toBe("alloy"); |
| }); |
|
|
| it("keeps text intact when overrides are disabled", () => { |
| const policy = resolveModelOverridePolicy({ enabled: false }); |
| const input = "Hello [[tts:voice=alloy]] world"; |
| const result = parseTtsDirectives(input, policy); |
|
|
| expect(result.cleanedText).toBe(input); |
| expect(result.overrides.provider).toBeUndefined(); |
| }); |
|
|
| it("accepts custom voices and models when openaiBaseUrl is a non-default endpoint", () => { |
| const policy = resolveModelOverridePolicy({ enabled: true }); |
| const input = "Hello [[tts:voice=kokoro-chinese model=kokoro-v1]] world"; |
| const customBaseUrl = "http://localhost:8880/v1"; |
|
|
| const result = parseTtsDirectives(input, policy, customBaseUrl); |
|
|
| expect(result.overrides.openai?.voice).toBe("kokoro-chinese"); |
| expect(result.overrides.openai?.model).toBe("kokoro-v1"); |
| expect(result.warnings).toHaveLength(0); |
| }); |
|
|
| it("rejects unknown voices and models when openaiBaseUrl is the default OpenAI endpoint", () => { |
| const policy = resolveModelOverridePolicy({ enabled: true }); |
| const input = "Hello [[tts:voice=kokoro-chinese model=kokoro-v1]] world"; |
| const defaultBaseUrl = "https://api.openai.com/v1"; |
|
|
| const result = parseTtsDirectives(input, policy, defaultBaseUrl); |
|
|
| expect(result.overrides.openai?.voice).toBeUndefined(); |
| expect(result.warnings).toContain('invalid OpenAI voice "kokoro-chinese"'); |
| }); |
|
|
| it("strips orphaned closing tags and bare directives from visible text", () => { |
| const policy = resolveModelOverridePolicy({ enabled: true }); |
| const input = "[[tts:widowmaker]] [[tts:es]] Entendido. [[/tts:text]] [[/tts:texto]]"; |
|
|
| const result = parseTtsDirectives(input, policy); |
|
|
| expect(result.cleanedText.trim()).toBe("Entendido."); |
| expect(result.hasDirective).toBe(true); |
| }); |
| }); |
|
|
| describe("summarizeText", () => { |
| const baseCfg: OpenClawConfig = { |
| agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } }, |
| messages: { tts: {} }, |
| }; |
| const baseConfig = resolveTtsConfig(baseCfg); |
|
|
| it("summarizes text and returns result with metrics", async () => { |
| const mockSummary = "This is a summarized version of the text."; |
| vi.mocked(completeSimple).mockResolvedValue( |
| mockAssistantMessage([{ type: "text", text: mockSummary }]), |
| ); |
|
|
| const longText = "A".repeat(2000); |
| const result = await summarizeText({ |
| text: longText, |
| targetLength: 1500, |
| cfg: baseCfg, |
| config: baseConfig, |
| timeoutMs: 30_000, |
| }); |
|
|
| expect(result.summary).toBe(mockSummary); |
| expect(result.inputLength).toBe(2000); |
| expect(result.outputLength).toBe(mockSummary.length); |
| expect(result.latencyMs).toBeGreaterThanOrEqual(0); |
| expect(completeSimple).toHaveBeenCalledTimes(1); |
| }); |
|
|
| it("calls the summary model with the expected parameters", async () => { |
| await summarizeText({ |
| text: "Long text to summarize", |
| targetLength: 500, |
| cfg: baseCfg, |
| config: baseConfig, |
| timeoutMs: 30_000, |
| }); |
|
|
| const callArgs = vi.mocked(completeSimple).mock.calls[0]; |
| expect(callArgs?.[1]?.messages?.[0]?.role).toBe("user"); |
| expect(callArgs?.[2]?.maxTokens).toBe(250); |
| expect(callArgs?.[2]?.temperature).toBe(0.3); |
| expect(getApiKeyForModel).toHaveBeenCalledTimes(1); |
| }); |
|
|
| it("uses summaryModel override when configured", async () => { |
| const cfg: OpenClawConfig = { |
| agents: { defaults: { model: { primary: "anthropic/claude-opus-4-5" } } }, |
| messages: { tts: { summaryModel: "openai/gpt-4.1-mini" } }, |
| }; |
| const config = resolveTtsConfig(cfg); |
| await summarizeText({ |
| text: "Long text to summarize", |
| targetLength: 500, |
| cfg, |
| config, |
| timeoutMs: 30_000, |
| }); |
|
|
| expect(resolveModel).toHaveBeenCalledWith("openai", "gpt-4.1-mini", undefined, cfg); |
| }); |
|
|
| it("registers the Ollama api before direct summarization", async () => { |
| vi.mocked(resolveModel).mockReturnValue({ |
| model: { |
| provider: "ollama", |
| id: "qwen3:8b", |
| name: "qwen3:8b", |
| api: "ollama", |
| baseUrl: "http://127.0.0.1:11434", |
| reasoning: false, |
| input: ["text"], |
| cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, |
| contextWindow: 128000, |
| maxTokens: 8192, |
| } as any, |
| authStorage: { profiles: {} } as never, |
| modelRegistry: { find: vi.fn() } as never, |
| } as never); |
|
|
| await summarizeText({ |
| text: "Long text to summarize", |
| targetLength: 500, |
| cfg: baseCfg, |
| config: baseConfig, |
| timeoutMs: 30_000, |
| }); |
|
|
| expect(ensureCustomApiRegistered).toHaveBeenCalledWith("ollama", expect.any(Function)); |
| }); |
|
|
| it("validates targetLength bounds", async () => { |
| const cases = [ |
| { targetLength: 99, shouldThrow: true }, |
| { targetLength: 100, shouldThrow: false }, |
| { targetLength: 10000, shouldThrow: false }, |
| { targetLength: 10001, shouldThrow: true }, |
| ] as const; |
| for (const testCase of cases) { |
| const call = summarizeText({ |
| text: "text", |
| targetLength: testCase.targetLength, |
| cfg: baseCfg, |
| config: baseConfig, |
| timeoutMs: 30_000, |
| }); |
| if (testCase.shouldThrow) { |
| await expect(call, String(testCase.targetLength)).rejects.toThrow( |
| `Invalid targetLength: ${testCase.targetLength}`, |
| ); |
| } else { |
| await expect(call, String(testCase.targetLength)).resolves.toBeDefined(); |
| } |
| } |
| }); |
|
|
| it("throws when summary output is missing or empty", async () => { |
| const cases = [ |
| { name: "no summary blocks", message: mockAssistantMessage([]) }, |
| { |
| name: "empty summary content", |
| message: mockAssistantMessage([{ type: "text", text: " " }]), |
| }, |
| ] as const; |
| for (const testCase of cases) { |
| vi.mocked(completeSimple).mockResolvedValue(testCase.message); |
| await expect( |
| summarizeText({ |
| text: "text", |
| targetLength: 500, |
| cfg: baseCfg, |
| config: baseConfig, |
| timeoutMs: 30_000, |
| }), |
| testCase.name, |
| ).rejects.toThrow("No summary returned"); |
| } |
| }); |
| }); |
|
|
| describe("getTtsProvider", () => { |
| const baseCfg: OpenClawConfig = { |
| agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } }, |
| messages: { tts: {} }, |
| }; |
|
|
| it("selects provider based on available API keys", () => { |
| const cases = [ |
| { |
| env: { |
| OPENAI_API_KEY: "test-openai-key", |
| ELEVENLABS_API_KEY: undefined, |
| XI_API_KEY: undefined, |
| }, |
| prefsPath: "/tmp/tts-prefs-openai.json", |
| expected: "openai", |
| }, |
| { |
| env: { |
| OPENAI_API_KEY: undefined, |
| ELEVENLABS_API_KEY: "test-elevenlabs-key", |
| XI_API_KEY: undefined, |
| }, |
| prefsPath: "/tmp/tts-prefs-elevenlabs.json", |
| expected: "elevenlabs", |
| }, |
| { |
| env: { |
| OPENAI_API_KEY: undefined, |
| ELEVENLABS_API_KEY: undefined, |
| XI_API_KEY: undefined, |
| }, |
| prefsPath: "/tmp/tts-prefs-edge.json", |
| expected: "edge", |
| }, |
| ] as const; |
|
|
| for (const testCase of cases) { |
| withEnv(testCase.env, () => { |
| const config = resolveTtsConfig(baseCfg); |
| const provider = getTtsProvider(config, testCase.prefsPath); |
| expect(provider).toBe(testCase.expected); |
| }); |
| } |
| }); |
| }); |
|
|
| describe("resolveTtsConfig – openai.baseUrl", () => { |
| const baseCfg: OpenClawConfig = { |
| agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } }, |
| messages: { tts: {} }, |
| }; |
|
|
| it("defaults to the official OpenAI endpoint", () => { |
| withEnv({ OPENAI_TTS_BASE_URL: undefined }, () => { |
| const config = resolveTtsConfig(baseCfg); |
| expect(config.openai.baseUrl).toBe("https://api.openai.com/v1"); |
| }); |
| }); |
|
|
| it("picks up OPENAI_TTS_BASE_URL env var when no config baseUrl is set", () => { |
| withEnv({ OPENAI_TTS_BASE_URL: "http://localhost:8880/v1" }, () => { |
| const config = resolveTtsConfig(baseCfg); |
| expect(config.openai.baseUrl).toBe("http://localhost:8880/v1"); |
| }); |
| }); |
|
|
| it("config baseUrl takes precedence over env var", () => { |
| const cfg: OpenClawConfig = { |
| ...baseCfg, |
| messages: { |
| tts: { openai: { baseUrl: "http://my-server:9000/v1" } }, |
| }, |
| }; |
| withEnv({ OPENAI_TTS_BASE_URL: "http://localhost:8880/v1" }, () => { |
| const config = resolveTtsConfig(cfg); |
| expect(config.openai.baseUrl).toBe("http://my-server:9000/v1"); |
| }); |
| }); |
|
|
| it("strips trailing slashes from the resolved baseUrl", () => { |
| const cfg: OpenClawConfig = { |
| ...baseCfg, |
| messages: { |
| tts: { openai: { baseUrl: "http://my-server:9000/v1///" } }, |
| }, |
| }; |
| const config = resolveTtsConfig(cfg); |
| expect(config.openai.baseUrl).toBe("http://my-server:9000/v1"); |
| }); |
|
|
| it("strips trailing slashes from env var baseUrl", () => { |
| withEnv({ OPENAI_TTS_BASE_URL: "http://localhost:8880/v1/" }, () => { |
| const config = resolveTtsConfig(baseCfg); |
| expect(config.openai.baseUrl).toBe("http://localhost:8880/v1"); |
| }); |
| }); |
| }); |
|
|
| describe("textToSpeechTelephony – openai instructions", () => { |
| const withMockedTelephonyFetch = async ( |
| run: (fetchMock: ReturnType<typeof vi.fn>) => Promise<void>, |
| ) => { |
| const originalFetch = globalThis.fetch; |
| const fetchMock = vi.fn(async () => ({ |
| ok: true, |
| arrayBuffer: async () => new ArrayBuffer(2), |
| })); |
| globalThis.fetch = fetchMock as unknown as typeof fetch; |
| try { |
| await run(fetchMock); |
| } finally { |
| globalThis.fetch = originalFetch; |
| } |
| }; |
|
|
| async function expectTelephonyInstructions( |
| model: "tts-1" | "gpt-4o-mini-tts", |
| expectedInstructions: string | undefined, |
| ) { |
| await withMockedTelephonyFetch(async (fetchMock) => { |
| const result = await tts.textToSpeechTelephony({ |
| text: "Hello there, friendly caller.", |
| cfg: createOpenAiTelephonyCfg(model), |
| }); |
|
|
| expect(result.success).toBe(true); |
| expect(fetchMock).toHaveBeenCalledTimes(1); |
| const [, init] = fetchMock.mock.calls[0] as [string, RequestInit]; |
| expect(typeof init.body).toBe("string"); |
| const body = JSON.parse(init.body as string) as Record<string, unknown>; |
| expect(body.instructions).toBe(expectedInstructions); |
| }); |
| } |
|
|
| it("omits instructions for unsupported speech models", async () => { |
| await expectTelephonyInstructions("tts-1", undefined); |
| }); |
|
|
| it("includes instructions for gpt-4o-mini-tts", async () => { |
| await expectTelephonyInstructions("gpt-4o-mini-tts", "Speak warmly"); |
| }); |
| }); |
|
|
| describe("maybeApplyTtsToPayload", () => { |
| const baseCfg: OpenClawConfig = { |
| agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } }, |
| messages: { |
| tts: { |
| auto: "inbound", |
| provider: "openai", |
| openai: { apiKey: "test-key", model: "gpt-4o-mini-tts", voice: "alloy" }, |
| }, |
| }, |
| }; |
|
|
| const withMockedAutoTtsFetch = async ( |
| run: (fetchMock: ReturnType<typeof vi.fn>) => Promise<void>, |
| ) => { |
| const prevPrefs = process.env.OPENCLAW_TTS_PREFS; |
| process.env.OPENCLAW_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`; |
| const originalFetch = globalThis.fetch; |
| const fetchMock = vi.fn(async () => ({ |
| ok: true, |
| arrayBuffer: async () => new ArrayBuffer(1), |
| })); |
| globalThis.fetch = fetchMock as unknown as typeof fetch; |
| try { |
| await run(fetchMock); |
| } finally { |
| globalThis.fetch = originalFetch; |
| process.env.OPENCLAW_TTS_PREFS = prevPrefs; |
| } |
| }; |
|
|
| const taggedCfg: OpenClawConfig = { |
| ...baseCfg, |
| messages: { |
| ...baseCfg.messages!, |
| tts: { ...baseCfg.messages!.tts, auto: "tagged" }, |
| }, |
| }; |
|
|
| it("applies inbound auto-TTS gating by audio status and cleaned text length", async () => { |
| const cases = [ |
| { |
| name: "inbound gating blocks non-audio", |
| payload: { text: "Hello world" }, |
| inboundAudio: false, |
| expectedFetchCalls: 0, |
| expectSamePayload: true, |
| }, |
| { |
| name: "inbound gating blocks too-short cleaned text", |
| payload: { text: "### **bold**" }, |
| inboundAudio: true, |
| expectedFetchCalls: 0, |
| expectSamePayload: true, |
| }, |
| { |
| name: "inbound gating allows audio with real text", |
| payload: { text: "Hello world" }, |
| inboundAudio: true, |
| expectedFetchCalls: 1, |
| expectSamePayload: false, |
| }, |
| ] as const; |
|
|
| for (const testCase of cases) { |
| await withMockedAutoTtsFetch(async (fetchMock) => { |
| const result = await maybeApplyTtsToPayload({ |
| payload: testCase.payload, |
| cfg: baseCfg, |
| kind: "final", |
| inboundAudio: testCase.inboundAudio, |
| }); |
| expect(fetchMock, testCase.name).toHaveBeenCalledTimes(testCase.expectedFetchCalls); |
| if (testCase.expectSamePayload) { |
| expect(result, testCase.name).toBe(testCase.payload); |
| } else { |
| expect(result.mediaUrl, testCase.name).toBeDefined(); |
| } |
| }); |
| } |
| }); |
|
|
| it("skips auto-TTS in tagged mode unless a tts tag is present", async () => { |
| await withMockedAutoTtsFetch(async (fetchMock) => { |
| const payload = { text: "Hello world" }; |
| const result = await maybeApplyTtsToPayload({ |
| payload, |
| cfg: taggedCfg, |
| kind: "final", |
| }); |
|
|
| expect(result).toBe(payload); |
| expect(fetchMock).not.toHaveBeenCalled(); |
| }); |
| }); |
|
|
| it("runs auto-TTS in tagged mode when tags are present", async () => { |
| await withMockedAutoTtsFetch(async (fetchMock) => { |
| const result = await maybeApplyTtsToPayload({ |
| payload: { text: "[[tts:text]]Hello world[[/tts:text]]" }, |
| cfg: taggedCfg, |
| kind: "final", |
| }); |
|
|
| expect(result.mediaUrl).toBeDefined(); |
| expect(fetchMock).toHaveBeenCalledTimes(1); |
| }); |
| }); |
|
|
| it("can suppress synthesis while still stripping visible TTS residue", async () => { |
| await withMockedAutoTtsFetch(async (fetchMock) => { |
| const result = await maybeApplyTtsToPayload({ |
| payload: { text: "[[tts:widowmaker]] Hola [[/tts:text]]" }, |
| cfg: taggedCfg, |
| kind: "final", |
| suppressSynthesis: true, |
| }); |
|
|
| expect(result.text).toBe("Hola"); |
| expect(result.mediaUrl).toBeUndefined(); |
| expect(fetchMock).not.toHaveBeenCalled(); |
| }); |
| }); |
| }); |
| }); |
|
|