| import fs from "node:fs/promises"; |
| import path from "node:path"; |
| import { afterAll, beforeAll, beforeEach, describe, expect, it, vi } from "vitest"; |
| import type { MsgContext } from "../auto-reply/templating.js"; |
| import type { OpenClawConfig } from "../config/config.js"; |
| import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js"; |
| import { createSafeAudioFixtureBuffer } from "./runner.test-utils.js"; |
|
|
| |
| |
| |
|
|
// Stub out provider authentication: every lookup yields a fixed test API key.
vi.mock("../agents/model-auth.js", () => {
  // Builds a fresh auth record per call, as each stubbed resolver returns its own object.
  const buildStubAuth = () => ({ apiKey: "test-key", source: "test", mode: "api-key" });

  return {
    resolveApiKeyForProvider: vi.fn(async () => buildStubAuth()),
    // Plain function (not a spy): returns the key, or throws when none was resolved.
    requireApiKey: (auth: { apiKey?: string; mode?: string }, provider: string) => {
      if (!auth?.apiKey) {
        throw new Error(
          `No API key resolved for provider "${provider}" (auth mode: ${auth?.mode}).`,
        );
      }
      return auth.apiKey;
    },
    resolveAwsSdkEnvVarName: vi.fn(() => undefined),
    resolveEnvApiKey: vi.fn(() => null),
    resolveModelAuthMode: vi.fn(() => "api-key"),
    getApiKeyForModel: vi.fn(async () => buildStubAuth()),
    getCustomProviderApiKey: vi.fn(() => undefined),
    ensureAuthProfileStore: vi.fn(async () => ({})),
    resolveAuthProfileOrder: vi.fn(() => []),
  };
});
|
|
// Declared through vi.hoisted so the class already exists when the hoisted
// vi.mock factory for "../media/fetch.js" references it.
const { MediaFetchErrorMock } = vi.hoisted(() => ({
  /** Error stand-in that reports itself as "MediaFetchError" and carries a `code`. */
  MediaFetchErrorMock: class MediaFetchErrorMock extends Error {
    code: string;

    constructor(message: string, code: string) {
      super(message);
      this.code = code;
      this.name = "MediaFetchError";
    }
  },
}));
|
|
// Replace remote media fetching with a bare spy and expose the hoisted error class.
vi.mock("../media/fetch.js", () => {
  const fetchRemoteMedia = vi.fn();
  return { fetchRemoteMedia, MediaFetchError: MediaFetchErrorMock };
});
|
|
// Both process helpers become bare spies, so nothing is actually executed.
vi.mock("../process/exec.js", () => {
  const runExec = vi.fn();
  const runCommandWithTimeout = vi.fn();
  return { runExec, runCommandWithTimeout };
});
|
|
// Spy that records every outbound delivery attempted by the code under test.
const mockDeliverOutboundPayloads = vi.fn();

vi.mock("../infra/outbound/deliver.js", () => {
  // Forward lazily: the spy is looked up when the function is called, not when
  // this factory runs, so the factory never touches the binding before it is initialised.
  const deliverOutboundPayloads = (...args: unknown[]) => mockDeliverOutboundPayloads(...args);
  return { deliverOutboundPayloads };
});
|
|
| |
| |
| |
|
|
// Both bindings are assigned in beforeAll via dynamic import().
let applyMediaUnderstanding: (typeof import("./apply.js"))["applyMediaUnderstanding"];
let clearMediaUnderstandingBinaryCacheForTests: () => void;

// Directory-name prefix of the suite's temporary media root.
const TEMP_MEDIA_PREFIX = "openclaw-echo-transcript-test-";
// Empty until beforeAll creates the directory; reset to "" again by afterAll.
let suiteTempMediaRootDir = "";
|
|
/**
 * Writes a 2048-byte audio fixture as `note.ogg` into a fresh `case-*`
 * directory under the suite temp root.
 *
 * @returns Absolute path of the file that was written.
 */
async function createTempAudioFile(): Promise<string> {
  const caseDirPrefix = path.join(suiteTempMediaRootDir, "case-");
  const caseDir = await fs.mkdtemp(caseDirPrefix);
  const audioPath = path.join(caseDir, "note.ogg");
  const fixtureBytes = createSafeAudioFixtureBuffer(2048);
  await fs.writeFile(audioPath, fixtureBytes);
  return audioPath;
}
|
|
/**
 * Builds an inbound audio message context for a WhatsApp sender.
 *
 * @param mediaPath Path stored in `MediaPath`.
 * @param extra Optional fields that override or extend the defaults.
 * @returns The default context with `extra` merged on top.
 */
function createAudioCtxWithProvider(mediaPath: string, extra?: Partial<MsgContext>): MsgContext {
  const defaults: MsgContext = {
    Body: "<media:audio>",
    MediaPath: mediaPath,
    MediaType: "audio/ogg",
    Provider: "whatsapp",
    From: "+10000000001",
    AccountId: "acc1",
  };
  // Spreading `undefined` adds nothing, so a missing `extra` leaves the defaults as-is.
  return { ...defaults, ...extra };
}
|
|
/**
 * Builds an audio-understanding config plus a stub "groq" transcription provider.
 *
 * @param opts.echoTranscript Value for `echoTranscript`; `true` when omitted.
 * @param opts.echoFormat Written to the config only when it is not `undefined`.
 * @param opts.transcribedText Text the stub provider resolves with; "hello world" when omitted.
 * @returns The config and the provider map.
 */
function createAudioConfigWithEcho(opts?: {
  echoTranscript?: boolean;
  echoFormat?: string;
  transcribedText?: string;
}): {
  cfg: OpenClawConfig;
  providers: Record<string, { id: string; transcribeAudio: () => Promise<{ text: string }> }>;
} {
  // Leave the `echoFormat` key out entirely unless the caller supplied a value.
  const formatOverride = opts?.echoFormat !== undefined ? { echoFormat: opts.echoFormat } : {};
  const echoEnabled = opts?.echoTranscript ?? true;

  const cfg: OpenClawConfig = {
    tools: {
      media: {
        audio: {
          enabled: true,
          maxBytes: 1024 * 1024,
          models: [{ provider: "groq" }],
          echoTranscript: echoEnabled,
          ...formatOverride,
        },
      },
    },
  };

  // Stub provider: resolves with the configured text instead of transcribing anything.
  const groq = {
    id: "groq",
    transcribeAudio: async () => {
      const text = opts?.transcribedText ?? "hello world";
      return { text };
    },
  };

  return { cfg, providers: { groq } };
}
|
|
/**
 * Asserts that the delivery spy was called exactly once and returns the first
 * argument of that call, typed as the delivery request the tests inspect.
 */
function expectSingleEchoDeliveryCall() {
  type EchoDeliveryArgs = {
    to?: string;
    channel?: string;
    accountId?: string;
    payloads: Array<{ text?: string }>;
  };

  expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce();
  const [firstCall] = mockDeliverOutboundPayloads.mock.calls;
  const request = firstCall?.[0];
  expect(request).toBeDefined();
  return request as EchoDeliveryArgs;
}
|
|
/**
 * Same config as `createAudioConfigWithEcho()` with its defaults, but with the
 * `echoTranscript` key removed from the audio section.
 */
function createAudioConfigWithoutEchoFlag() {
  const base = createAudioConfigWithEcho();
  const audioSection = base.cfg.tools?.media?.audio as { echoTranscript?: boolean } | undefined;
  const hasEchoFlag = Boolean(audioSection) && "echoTranscript" in audioSection!;
  if (hasEchoFlag) {
    delete audioSection!.echoTranscript;
  }
  return { cfg: base.cfg, providers: base.providers };
}
|
|
| |
| |
| |
|
|
/**
 * Echo-transcript behaviour of `applyMediaUnderstanding`.
 *
 * Each case runs the function against a temp audio (or image) file with the
 * stub "groq" provider and inspects the outbound-delivery spy to see whether,
 * where, and with what text the transcript was echoed back.
 */
describe("applyMediaUnderstanding – echo transcript", () => {
  beforeAll(async () => {
    const baseDir = resolvePreferredOpenClawTmpDir();
    await fs.mkdir(baseDir, { recursive: true });
    suiteTempMediaRootDir = await fs.mkdtemp(path.join(baseDir, TEMP_MEDIA_PREFIX));
    // The modules under test are loaded here with dynamic import() rather than statically.
    const mod = await import("./apply.js");
    applyMediaUnderstanding = mod.applyMediaUnderstanding;
    const runner = await import("./runner.js");
    clearMediaUnderstandingBinaryCacheForTests = runner.clearMediaUnderstandingBinaryCacheForTests;
  });

  beforeEach(() => {
    // mockReset (not mockClear) also drops any queued `...Once` behaviour, so a
    // rejection queued by one test can never leak into the next one.
    mockDeliverOutboundPayloads.mockReset();
    mockDeliverOutboundPayloads.mockResolvedValue([{ channel: "whatsapp", messageId: "echo-1" }]);
    clearMediaUnderstandingBinaryCacheForTests?.();
  });

  afterAll(async () => {
    // Nothing to remove if beforeAll never got as far as creating the directory.
    if (!suiteTempMediaRootDir) {
      return;
    }
    await fs.rm(suiteTempMediaRootDir, { recursive: true, force: true });
    suiteTempMediaRootDir = "";
  });

  it("does NOT echo when echoTranscript is false (default)", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx = createAudioCtxWithProvider(mediaPath);
    const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: false });

    await applyMediaUnderstanding({ ctx, cfg, providers });

    // Guard against a vacuous pass: transcription itself must have happened.
    expect(ctx.Transcript).toBe("hello world");
    expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
  });

  it("does NOT echo when echoTranscript is absent (default)", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx = createAudioCtxWithProvider(mediaPath);
    const { cfg, providers } = createAudioConfigWithoutEchoFlag();

    await applyMediaUnderstanding({ ctx, cfg, providers });

    // Guard against a vacuous pass: transcription itself must have happened.
    expect(ctx.Transcript).toBe("hello world");
    expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
  });

  it("echoes transcript with default format when echoTranscript is true", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx = createAudioCtxWithProvider(mediaPath);
    const { cfg, providers } = createAudioConfigWithEcho({
      echoTranscript: true,
      transcribedText: "hello world",
    });

    await applyMediaUnderstanding({ ctx, cfg, providers });

    const callArgs = expectSingleEchoDeliveryCall();
    expect(callArgs.channel).toBe("whatsapp");
    expect(callArgs.to).toBe("+10000000001");
    expect(callArgs.accountId).toBe("acc1");
    expect(callArgs.payloads).toHaveLength(1);
    // NOTE(review): the emoji here was mis-encoded in the file; 📝 is assumed to be
    // the prefix of the production default echo format — confirm against the implementation.
    expect(callArgs.payloads[0]?.text).toBe('📝 "hello world"');
  });

  it("uses custom echoFormat when provided", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx = createAudioCtxWithProvider(mediaPath);
    const { cfg, providers } = createAudioConfigWithEcho({
      echoTranscript: true,
      echoFormat: "🎙️ Heard: {transcript}",
      transcribedText: "custom message",
    });

    await applyMediaUnderstanding({ ctx, cfg, providers });

    const callArgs = expectSingleEchoDeliveryCall();
    expect(callArgs.payloads).toHaveLength(1);
    expect(callArgs.payloads[0]?.text).toBe("🎙️ Heard: custom message");
  });

  it("does NOT echo when there are no audio attachments", async () => {
    // An image attachment only: a file named photo.jpg holding the four JPEG magic bytes.
    const dir = await fs.mkdtemp(path.join(suiteTempMediaRootDir, "img-"));
    const imgPath = path.join(dir, "photo.jpg");
    await fs.writeFile(imgPath, Buffer.from([0xff, 0xd8, 0xff, 0xe0]));

    const ctx: MsgContext = {
      Body: "<media:image>",
      MediaPath: imgPath,
      MediaType: "image/jpeg",
      Provider: "whatsapp",
      From: "+10000000001",
    };

    const { cfg, providers } = createAudioConfigWithEcho({
      echoTranscript: true,
      transcribedText: "should not appear",
    });
    // Image understanding is switched off for this case.
    const media = cfg.tools?.media;
    if (!media) {
      throw new Error("test config is missing tools.media");
    }
    media.image = { enabled: false };

    await applyMediaUnderstanding({ ctx, cfg, providers });

    // No audio means no transcript, and therefore nothing to echo.
    expect(ctx.Transcript).toBeUndefined();
    expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
  });

  it("does NOT echo when transcription fails", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx = createAudioCtxWithProvider(mediaPath);
    const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });
    const groq = providers.groq;
    if (!groq) {
      throw new Error("test providers are missing the groq stub");
    }
    groq.transcribeAudio = async () => {
      throw new Error("transcription provider failure");
    };

    // The provider failure must not surface as a rejection of the call itself.
    await applyMediaUnderstanding({ ctx, cfg, providers });

    expect(ctx.Transcript).toBeUndefined();
    expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
  });

  it("does NOT echo when channel is not deliverable", async () => {
    const mediaPath = await createTempAudioFile();
    // "internal-system" stands in for a provider that echo delivery cannot target.
    const ctx = createAudioCtxWithProvider(mediaPath, {
      Provider: "internal-system",
      From: "some-source",
    });
    const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });

    await applyMediaUnderstanding({ ctx, cfg, providers });

    // Transcription still succeeds ...
    expect(ctx.Transcript).toBe("hello world");
    // ... but no echo is attempted.
    expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
  });

  it("does NOT echo when ctx has no From or OriginatingTo", async () => {
    const mediaPath = await createTempAudioFile();
    // Deliberately built without From / OriginatingTo: there is no address to echo to.
    const ctx: MsgContext = {
      Body: "<media:audio>",
      MediaPath: mediaPath,
      MediaType: "audio/ogg",
      Provider: "whatsapp",
    };
    const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });

    await applyMediaUnderstanding({ ctx, cfg, providers });

    expect(ctx.Transcript).toBe("hello world");
    expect(mockDeliverOutboundPayloads).not.toHaveBeenCalled();
  });

  it("uses OriginatingTo when From is absent", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx: MsgContext = {
      Body: "<media:audio>",
      MediaPath: mediaPath,
      MediaType: "audio/ogg",
      Provider: "whatsapp",
      OriginatingTo: "+19999999999",
    };
    const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });

    await applyMediaUnderstanding({ ctx, cfg, providers });

    const callArgs = expectSingleEchoDeliveryCall();
    expect(callArgs.to).toBe("+19999999999");
  });

  it("echo delivery failure does not throw or break transcription", async () => {
    const mediaPath = await createTempAudioFile();
    const ctx = createAudioCtxWithProvider(mediaPath);
    const { cfg, providers } = createAudioConfigWithEcho({ echoTranscript: true });

    mockDeliverOutboundPayloads.mockRejectedValueOnce(new Error("delivery timeout"));

    // Must resolve even though the delivery spy rejects.
    const result = await applyMediaUnderstanding({ ctx, cfg, providers });

    // Transcription results are intact ...
    expect(result.appliedAudio).toBe(true);
    expect(ctx.Transcript).toBe("hello world");
    // ... and the echo was attempted exactly once.
    expect(mockDeliverOutboundPayloads).toHaveBeenCalledOnce();
  });
});
|
|