Transcribe UI (#2006)
* Add voice transcription feature to chat
Introduces voice recording and transcription in the chat window using Whisper models. Adds new components for voice recording and audio waveform visualization, updates feature flags and environment variables, and implements a server API endpoint for audio transcription.
* Fix frequencyData reactivity in VoiceRecorder
Replaces in-place mutation of frequencyData with assignment of a new Uint8Array to ensure Svelte reactivity in the voice visualization.
* Improve AudioWaveform visualization and responsiveness
Refactors AudioWaveform.svelte to use a timeline-based amplitude visualization with dynamic pill count based on container width, smoother animation, and responsive resizing. Updates VoiceRecorder.svelte to remove fixed pillCount and adjust maxHeight for better appearance. These changes provide a more accurate and visually appealing waveform display.
* Improve audio waveform and voice recorder UI
Reduced pill width and gap in AudioWaveform for a denser visualization, boosted amplitude for better visibility, and updated pill color for improved contrast. Adjusted padding in VoiceRecorder for a more compact layout. Fixed ChatWindow to trim transcribed text before appending to draft.
* Update voice recorder UI and loading icons
Improved styling for voice recording buttons and waveform container in ChatWindow and VoiceRecorder components. Replaced loading icon with EosIconsLoading for transcription state and adjusted class names for better visual consistency.
* Add touch device support for voice recording send
VoiceRecorder and ChatWindow now detect touch devices and adjust behavior: on touch devices, the confirm button sends the audio message immediately using a new onsend handler, with updated button styling and icon. This improves usability for mobile users by streamlining the voice message workflow.
* Add TRANSCRIPTION_MODEL env var to dev and prod configs
Introduces the TRANSCRIPTION_MODEL environment variable set to 'openai/whisper-large-v3-turbo' in both dev.yaml and prod.yaml to support transcription functionality.
* Update VoiceRecorder.svelte
* Add error notifications for transcription and recording
Introduces user-facing error messages for transcription and recording failures by updating the $error store. This improves feedback to users when audio processing encounters issues.
- .env +9 -0
- chart/env/dev.yaml +1 -0
- chart/env/prod.yaml +1 -0
- src/lib/components/chat/ChatWindow.svelte +101 -1
- src/lib/components/chat/VoiceRecorder.svelte +205 -0
- src/lib/components/voice/AudioWaveform.svelte +96 -0
- src/lib/server/api/routes/groups/misc.ts +2 -0
- src/routes/api/transcribe/+server.ts +96 -0
|
@@ -90,6 +90,15 @@ PUBLIC_LLM_ROUTER_LOGO_URL=
|
|
| 90 |
# Public alias id used for the virtual router model (Omni). Defaults to "omni".
|
| 91 |
PUBLIC_LLM_ROUTER_ALIAS_ID=omni
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
### Authentication ###
|
| 94 |
# Parameters to enable open id login
|
| 95 |
OPENID_CONFIG=
|
|
|
|
| 90 |
# Public alias id used for the virtual router model (Omni). Defaults to "omni".
|
| 91 |
PUBLIC_LLM_ROUTER_ALIAS_ID=omni
|
| 92 |
|
| 93 |
+
### Transcription ###
|
| 94 |
+
# Voice-to-text transcription using Whisper models
|
| 95 |
+
# If set, enables the microphone button in the chat input
|
| 96 |
+
# Example: openai/whisper-large-v3-turbo
|
| 97 |
+
TRANSCRIPTION_MODEL=
|
| 98 |
+
# Optional: Base URL for transcription API (defaults to HF inference)
|
| 99 |
+
# Default: https://router.huggingface.co/hf-inference/models
|
| 100 |
+
TRANSCRIPTION_BASE_URL=
|
| 101 |
+
|
| 102 |
### Authentication ###
|
| 103 |
# Parameters to enable open id login
|
| 104 |
OPENID_CONFIG=
|
|
@@ -70,6 +70,7 @@ envVars:
|
|
| 70 |
LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-235B-A22B-Instruct"
|
| 71 |
LLM_ROUTER_ENABLE_TOOLS: "true"
|
| 72 |
LLM_ROUTER_TOOLS_MODEL: "moonshotai/Kimi-K2-Instruct-0905"
|
|
|
|
| 73 |
MCP_SERVERS: >
|
| 74 |
[{"name": "Web Search (Exa)", "url": "https://mcp.exa.ai/mcp"}, {"name": "Hugging Face", "url": "https://hf.co/mcp?login"}]
|
| 75 |
PUBLIC_LLM_ROUTER_DISPLAY_NAME: "Omni"
|
|
|
|
| 70 |
LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-235B-A22B-Instruct"
|
| 71 |
LLM_ROUTER_ENABLE_TOOLS: "true"
|
| 72 |
LLM_ROUTER_TOOLS_MODEL: "moonshotai/Kimi-K2-Instruct-0905"
|
| 73 |
+
TRANSCRIPTION_MODEL: "openai/whisper-large-v3-turbo"
|
| 74 |
MCP_SERVERS: >
|
| 75 |
[{"name": "Web Search (Exa)", "url": "https://mcp.exa.ai/mcp"}, {"name": "Hugging Face", "url": "https://hf.co/mcp?login"}]
|
| 76 |
PUBLIC_LLM_ROUTER_DISPLAY_NAME: "Omni"
|
|
@@ -80,6 +80,7 @@ envVars:
|
|
| 80 |
LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-235B-A22B-Instruct"
|
| 81 |
LLM_ROUTER_ENABLE_TOOLS: "true"
|
| 82 |
LLM_ROUTER_TOOLS_MODEL: "moonshotai/Kimi-K2-Instruct-0905"
|
|
|
|
| 83 |
MCP_SERVERS: >
|
| 84 |
[{"name": "Web Search (Exa)", "url": "https://mcp.exa.ai/mcp"}, {"name": "Hugging Face", "url": "https://hf.co/mcp?login"}]
|
| 85 |
PUBLIC_LLM_ROUTER_DISPLAY_NAME: "Omni"
|
|
|
|
| 80 |
LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-235B-A22B-Instruct"
|
| 81 |
LLM_ROUTER_ENABLE_TOOLS: "true"
|
| 82 |
LLM_ROUTER_TOOLS_MODEL: "moonshotai/Kimi-K2-Instruct-0905"
|
| 83 |
+
TRANSCRIPTION_MODEL: "openai/whisper-large-v3-turbo"
|
| 84 |
MCP_SERVERS: >
|
| 85 |
[{"name": "Web Search (Exa)", "url": "https://mcp.exa.ai/mcp"}, {"name": "Hugging Face", "url": "https://hf.co/mcp?login"}]
|
| 86 |
PUBLIC_LLM_ROUTER_DISPLAY_NAME: "Omni"
|
|
@@ -6,8 +6,10 @@
|
|
| 6 |
import CarbonCaretDown from "~icons/carbon/caret-down";
|
| 7 |
import CarbonDirectionRight from "~icons/carbon/direction-right-01";
|
| 8 |
import IconArrowUp from "~icons/lucide/arrow-up";
|
|
|
|
| 9 |
|
| 10 |
import ChatInput from "./ChatInput.svelte";
|
|
|
|
| 11 |
import StopGeneratingBtn from "../StopGeneratingBtn.svelte";
|
| 12 |
import type { Model } from "$lib/types/Model";
|
| 13 |
import FileDropzone from "./FileDropzone.svelte";
|
|
@@ -24,6 +26,7 @@
|
|
| 24 |
import ChatIntroduction from "./ChatIntroduction.svelte";
|
| 25 |
import UploadedFile from "./UploadedFile.svelte";
|
| 26 |
import { useSettingsStore } from "$lib/stores/settings";
|
|
|
|
| 27 |
import ModelSwitch from "./ModelSwitch.svelte";
|
| 28 |
import { routerExamples } from "$lib/constants/routerExamples";
|
| 29 |
import { mcpExamples } from "$lib/constants/mcpExamples";
|
|
@@ -85,6 +88,14 @@
|
|
| 85 |
let editMsdgId: Message["id"] | null = $state(null);
|
| 86 |
let pastedLongContent = $state(false);
|
| 87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
const handleSubmit = () => {
|
| 89 |
if (requireAuthUser() || loading || !draft) return;
|
| 90 |
onmessage?.(draft);
|
|
@@ -374,6 +385,71 @@
|
|
| 374 |
function startFollowUp(followUp: RouterFollowUp) {
|
| 375 |
triggerPrompt(followUp.prompt);
|
| 376 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
</script>
|
| 378 |
|
| 379 |
<svelte:window
|
|
@@ -531,7 +607,18 @@
|
|
| 531 |
"max-sm:mb-4": focused && isVirtualKeyboard(),
|
| 532 |
}}
|
| 533 |
>
|
| 534 |
-
{#if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 535 |
<FileDropzone bind:files bind:onDrag mimeTypes={activeMimeTypes} />
|
| 536 |
{:else}
|
| 537 |
<div
|
|
@@ -563,6 +650,19 @@
|
|
| 563 |
classNames="absolute bottom-2 right-2 size-8 sm:size-7 self-end rounded-full border bg-white text-black shadow transition-none dark:border-transparent dark:bg-gray-600 dark:text-white"
|
| 564 |
/>
|
| 565 |
{:else}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 566 |
<button
|
| 567 |
class="btn absolute bottom-2 right-2 size-8 self-end rounded-full border bg-white text-black shadow transition-none enabled:hover:bg-white enabled:hover:shadow-inner dark:border-transparent dark:bg-gray-600 dark:text-white dark:hover:enabled:bg-black sm:size-7 {!draft ||
|
| 568 |
isReadOnly
|
|
|
|
| 6 |
import CarbonCaretDown from "~icons/carbon/caret-down";
|
| 7 |
import CarbonDirectionRight from "~icons/carbon/direction-right-01";
|
| 8 |
import IconArrowUp from "~icons/lucide/arrow-up";
|
| 9 |
+
import IconMic from "~icons/lucide/mic";
|
| 10 |
|
| 11 |
import ChatInput from "./ChatInput.svelte";
|
| 12 |
+
import VoiceRecorder from "./VoiceRecorder.svelte";
|
| 13 |
import StopGeneratingBtn from "../StopGeneratingBtn.svelte";
|
| 14 |
import type { Model } from "$lib/types/Model";
|
| 15 |
import FileDropzone from "./FileDropzone.svelte";
|
|
|
|
| 26 |
import ChatIntroduction from "./ChatIntroduction.svelte";
|
| 27 |
import UploadedFile from "./UploadedFile.svelte";
|
| 28 |
import { useSettingsStore } from "$lib/stores/settings";
|
| 29 |
+
import { error } from "$lib/stores/errors";
|
| 30 |
import ModelSwitch from "./ModelSwitch.svelte";
|
| 31 |
import { routerExamples } from "$lib/constants/routerExamples";
|
| 32 |
import { mcpExamples } from "$lib/constants/mcpExamples";
|
|
|
|
| 88 |
let editMsdgId: Message["id"] | null = $state(null);
|
| 89 |
let pastedLongContent = $state(false);
|
| 90 |
|
| 91 |
+
// Voice recording state
|
| 92 |
+
let isRecording = $state(false);
|
| 93 |
+
let isTranscribing = $state(false);
|
| 94 |
+
let transcriptionEnabled = $derived(
|
| 95 |
+
!!(page.data as { transcriptionEnabled?: boolean }).transcriptionEnabled
|
| 96 |
+
);
|
| 97 |
+
let isTouchDevice = $derived(browser && navigator.maxTouchPoints > 0);
|
| 98 |
+
|
| 99 |
const handleSubmit = () => {
|
| 100 |
if (requireAuthUser() || loading || !draft) return;
|
| 101 |
onmessage?.(draft);
|
|
|
|
| 385 |
function startFollowUp(followUp: RouterFollowUp) {
|
| 386 |
triggerPrompt(followUp.prompt);
|
| 387 |
}
|
| 388 |
+
|
| 389 |
+
async function handleRecordingConfirm(audioBlob: Blob) {
|
| 390 |
+
isRecording = false;
|
| 391 |
+
isTranscribing = true;
|
| 392 |
+
|
| 393 |
+
try {
|
| 394 |
+
const response = await fetch(`${base}/api/transcribe`, {
|
| 395 |
+
method: "POST",
|
| 396 |
+
headers: { "Content-Type": audioBlob.type },
|
| 397 |
+
body: audioBlob,
|
| 398 |
+
});
|
| 399 |
+
|
| 400 |
+
if (!response.ok) {
|
| 401 |
+
throw new Error(await response.text());
|
| 402 |
+
}
|
| 403 |
+
|
| 404 |
+
const { text } = await response.json();
|
| 405 |
+
const trimmedText = text?.trim();
|
| 406 |
+
if (trimmedText) {
|
| 407 |
+
// Append transcribed text to draft
|
| 408 |
+
draft = draft.trim() ? `${draft.trim()} ${trimmedText}` : trimmedText;
|
| 409 |
+
}
|
| 410 |
+
} catch (err) {
|
| 411 |
+
console.error("Transcription error:", err);
|
| 412 |
+
$error = "Transcription failed. Please try again.";
|
| 413 |
+
} finally {
|
| 414 |
+
isTranscribing = false;
|
| 415 |
+
}
|
| 416 |
+
}
|
| 417 |
+
|
| 418 |
+
async function handleRecordingSend(audioBlob: Blob) {
|
| 419 |
+
isRecording = false;
|
| 420 |
+
isTranscribing = true;
|
| 421 |
+
|
| 422 |
+
try {
|
| 423 |
+
const response = await fetch(`${base}/api/transcribe`, {
|
| 424 |
+
method: "POST",
|
| 425 |
+
headers: { "Content-Type": audioBlob.type },
|
| 426 |
+
body: audioBlob,
|
| 427 |
+
});
|
| 428 |
+
|
| 429 |
+
if (!response.ok) {
|
| 430 |
+
throw new Error(await response.text());
|
| 431 |
+
}
|
| 432 |
+
|
| 433 |
+
const { text } = await response.json();
|
| 434 |
+
const trimmedText = text?.trim();
|
| 435 |
+
if (trimmedText) {
|
| 436 |
+
// Set draft and send immediately
|
| 437 |
+
draft = draft.trim() ? `${draft.trim()} ${trimmedText}` : trimmedText;
|
| 438 |
+
handleSubmit();
|
| 439 |
+
}
|
| 440 |
+
} catch (err) {
|
| 441 |
+
console.error("Transcription error:", err);
|
| 442 |
+
$error = "Transcription failed. Please try again.";
|
| 443 |
+
} finally {
|
| 444 |
+
isTranscribing = false;
|
| 445 |
+
}
|
| 446 |
+
}
|
| 447 |
+
|
| 448 |
+
function handleRecordingError(message: string) {
|
| 449 |
+
console.error("Recording error:", message);
|
| 450 |
+
isRecording = false;
|
| 451 |
+
$error = message;
|
| 452 |
+
}
|
| 453 |
</script>
|
| 454 |
|
| 455 |
<svelte:window
|
|
|
|
| 607 |
"max-sm:mb-4": focused && isVirtualKeyboard(),
|
| 608 |
}}
|
| 609 |
>
|
| 610 |
+
{#if isRecording || isTranscribing}
|
| 611 |
+
<VoiceRecorder
|
| 612 |
+
{isTranscribing}
|
| 613 |
+
{isTouchDevice}
|
| 614 |
+
oncancel={() => {
|
| 615 |
+
isRecording = false;
|
| 616 |
+
}}
|
| 617 |
+
onconfirm={handleRecordingConfirm}
|
| 618 |
+
onsend={handleRecordingSend}
|
| 619 |
+
onerror={handleRecordingError}
|
| 620 |
+
/>
|
| 621 |
+
{:else if onDrag && isFileUploadEnabled}
|
| 622 |
<FileDropzone bind:files bind:onDrag mimeTypes={activeMimeTypes} />
|
| 623 |
{:else}
|
| 624 |
<div
|
|
|
|
| 650 |
classNames="absolute bottom-2 right-2 size-8 sm:size-7 self-end rounded-full border bg-white text-black shadow transition-none dark:border-transparent dark:bg-gray-600 dark:text-white"
|
| 651 |
/>
|
| 652 |
{:else}
|
| 653 |
+
{#if transcriptionEnabled}
|
| 654 |
+
<button
|
| 655 |
+
type="button"
|
| 656 |
+
class="btn absolute bottom-2 right-10 mr-1 size-8 self-end rounded-full border bg-white/50 text-gray-500 transition-none hover:bg-gray-50 hover:text-gray-700 dark:border-transparent dark:bg-gray-600 dark:text-gray-300 dark:hover:bg-gray-500 dark:hover:text-white sm:right-9 sm:size-7"
|
| 657 |
+
disabled={isReadOnly}
|
| 658 |
+
onclick={() => {
|
| 659 |
+
isRecording = true;
|
| 660 |
+
}}
|
| 661 |
+
aria-label="Start voice recording"
|
| 662 |
+
>
|
| 663 |
+
<IconMic class="size-4" />
|
| 664 |
+
</button>
|
| 665 |
+
{/if}
|
| 666 |
<button
|
| 667 |
class="btn absolute bottom-2 right-2 size-8 self-end rounded-full border bg-white text-black shadow transition-none enabled:hover:bg-white enabled:hover:shadow-inner dark:border-transparent dark:bg-gray-600 dark:text-white dark:hover:enabled:bg-black sm:size-7 {!draft ||
|
| 668 |
isReadOnly
|
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<script lang="ts">
|
| 2 |
+
import { onMount, onDestroy } from "svelte";
|
| 3 |
+
import CarbonClose from "~icons/carbon/close";
|
| 4 |
+
import CarbonCheckmark from "~icons/carbon/checkmark";
|
| 5 |
+
import IconArrowUp from "~icons/lucide/arrow-up";
|
| 6 |
+
import EosIconsLoading from "~icons/eos-icons/loading";
|
| 7 |
+
import IconLoading from "$lib/components/icons/IconLoading.svelte";
|
| 8 |
+
import AudioWaveform from "$lib/components/voice/AudioWaveform.svelte";
|
| 9 |
+
|
| 10 |
+
interface Props {
|
| 11 |
+
isTranscribing: boolean;
|
| 12 |
+
isTouchDevice: boolean;
|
| 13 |
+
oncancel: () => void;
|
| 14 |
+
onconfirm: (audioBlob: Blob) => void;
|
| 15 |
+
onsend: (audioBlob: Blob) => void;
|
| 16 |
+
onerror: (message: string) => void;
|
| 17 |
+
}
|
| 18 |
+
|
| 19 |
+
let { isTranscribing, isTouchDevice, oncancel, onconfirm, onsend, onerror }: Props = $props();
|
| 20 |
+
|
| 21 |
+
let mediaRecorder: MediaRecorder | null = $state(null);
|
| 22 |
+
let audioChunks: Blob[] = $state([]);
|
| 23 |
+
let analyser: AnalyserNode | null = $state(null);
|
| 24 |
+
let frequencyData: Uint8Array = $state(new Uint8Array(32));
|
| 25 |
+
let animationFrameId: number | null = $state(null);
|
| 26 |
+
let audioContext: AudioContext | null = $state(null);
|
| 27 |
+
let mediaStream: MediaStream | null = $state(null);
|
| 28 |
+
|
| 29 |
+
function startVisualization() {
|
| 30 |
+
function update() {
|
| 31 |
+
if (analyser) {
|
| 32 |
+
const data = new Uint8Array(analyser.frequencyBinCount);
|
| 33 |
+
analyser.getByteFrequencyData(data);
|
| 34 |
+
// Create new array to trigger Svelte reactivity
|
| 35 |
+
frequencyData = data;
|
| 36 |
+
}
|
| 37 |
+
animationFrameId = requestAnimationFrame(update);
|
| 38 |
+
}
|
| 39 |
+
update();
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
function stopVisualization() {
|
| 43 |
+
if (animationFrameId !== null) {
|
| 44 |
+
cancelAnimationFrame(animationFrameId);
|
| 45 |
+
animationFrameId = null;
|
| 46 |
+
}
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
async function startRecording() {
|
| 50 |
+
try {
|
| 51 |
+
const stream = await navigator.mediaDevices.getUserMedia({
|
| 52 |
+
audio: {
|
| 53 |
+
channelCount: 1,
|
| 54 |
+
sampleRate: 16000, // Whisper prefers 16kHz
|
| 55 |
+
echoCancellation: true,
|
| 56 |
+
noiseSuppression: true,
|
| 57 |
+
},
|
| 58 |
+
});
|
| 59 |
+
|
| 60 |
+
mediaStream = stream;
|
| 61 |
+
|
| 62 |
+
// Set up audio context for visualization
|
| 63 |
+
audioContext = new AudioContext();
|
| 64 |
+
const source = audioContext.createMediaStreamSource(stream);
|
| 65 |
+
analyser = audioContext.createAnalyser();
|
| 66 |
+
analyser.fftSize = 64; // Small for performance, gives 32 frequency bins
|
| 67 |
+
analyser.smoothingTimeConstant = 0.4;
|
| 68 |
+
source.connect(analyser);
|
| 69 |
+
frequencyData = new Uint8Array(analyser.frequencyBinCount);
|
| 70 |
+
|
| 71 |
+
// Start MediaRecorder
|
| 72 |
+
// Use webm/opus for broad browser support
|
| 73 |
+
const mimeType = MediaRecorder.isTypeSupported("audio/webm;codecs=opus")
|
| 74 |
+
? "audio/webm;codecs=opus"
|
| 75 |
+
: "audio/webm";
|
| 76 |
+
|
| 77 |
+
mediaRecorder = new MediaRecorder(stream, { mimeType });
|
| 78 |
+
audioChunks = [];
|
| 79 |
+
|
| 80 |
+
mediaRecorder.ondataavailable = (e) => {
|
| 81 |
+
if (e.data.size > 0) {
|
| 82 |
+
audioChunks = [...audioChunks, e.data];
|
| 83 |
+
}
|
| 84 |
+
};
|
| 85 |
+
|
| 86 |
+
mediaRecorder.start(100); // Collect data every 100ms
|
| 87 |
+
startVisualization();
|
| 88 |
+
} catch (err) {
|
| 89 |
+
if (err instanceof DOMException) {
|
| 90 |
+
if (err.name === "NotAllowedError") {
|
| 91 |
+
onerror("Microphone access denied. Please allow in browser settings.");
|
| 92 |
+
} else if (err.name === "NotFoundError") {
|
| 93 |
+
onerror("No microphone found.");
|
| 94 |
+
} else {
|
| 95 |
+
onerror(`Microphone error: ${err.message}`);
|
| 96 |
+
}
|
| 97 |
+
} else {
|
| 98 |
+
onerror("Could not access microphone.");
|
| 99 |
+
}
|
| 100 |
+
}
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
function stopRecording(): Blob | null {
|
| 104 |
+
stopVisualization();
|
| 105 |
+
|
| 106 |
+
if (mediaRecorder && mediaRecorder.state !== "inactive") {
|
| 107 |
+
mediaRecorder.stop();
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
// Stop all audio tracks
|
| 111 |
+
if (mediaStream) {
|
| 112 |
+
mediaStream.getTracks().forEach((track) => track.stop());
|
| 113 |
+
mediaStream = null;
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
// Close audio context
|
| 117 |
+
if (audioContext) {
|
| 118 |
+
audioContext.close();
|
| 119 |
+
audioContext = null;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
analyser = null;
|
| 123 |
+
mediaRecorder = null;
|
| 124 |
+
|
| 125 |
+
if (audioChunks.length === 0) {
|
| 126 |
+
return null;
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
// Create blob from chunks
|
| 130 |
+
const mimeType = audioChunks[0]?.type || "audio/webm";
|
| 131 |
+
return new Blob(audioChunks, { type: mimeType });
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
function handleCancel() {
|
| 135 |
+
stopRecording();
|
| 136 |
+
oncancel();
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
function handleConfirm() {
|
| 140 |
+
const audioBlob = stopRecording();
|
| 141 |
+
if (audioBlob && audioBlob.size > 0) {
|
| 142 |
+
if (isTouchDevice) {
|
| 143 |
+
onsend(audioBlob);
|
| 144 |
+
} else {
|
| 145 |
+
onconfirm(audioBlob);
|
| 146 |
+
}
|
| 147 |
+
} else {
|
| 148 |
+
onerror("No audio recorded. Please try again.");
|
| 149 |
+
}
|
| 150 |
+
}
|
| 151 |
+
|
| 152 |
+
onMount(() => {
|
| 153 |
+
startRecording();
|
| 154 |
+
});
|
| 155 |
+
|
| 156 |
+
onDestroy(() => {
|
| 157 |
+
stopRecording();
|
| 158 |
+
});
|
| 159 |
+
</script>
|
| 160 |
+
|
| 161 |
+
<div class="flex h-full w-full items-center justify-between px-3 py-1.5">
|
| 162 |
+
<!-- Cancel button -->
|
| 163 |
+
<button
|
| 164 |
+
type="button"
|
| 165 |
+
class="btn grid size-8 place-items-center rounded-full border bg-white text-black shadow transition-none hover:bg-gray-100 dark:border-transparent dark:bg-gray-600 dark:text-white dark:hover:bg-gray-500 sm:size-7"
|
| 166 |
+
onclick={handleCancel}
|
| 167 |
+
aria-label="Cancel recording"
|
| 168 |
+
>
|
| 169 |
+
<CarbonClose class="size-4" />
|
| 170 |
+
</button>
|
| 171 |
+
|
| 172 |
+
<!-- Waveform / Loading -->
|
| 173 |
+
<div class="flex h-12 flex-1 items-center overflow-hidden pl-2.5 pr-1.5">
|
| 174 |
+
{#if isTranscribing}
|
| 175 |
+
<div class="flex h-full w-full items-center justify-center">
|
| 176 |
+
<IconLoading classNames="text-gray-400" />
|
| 177 |
+
</div>
|
| 178 |
+
{:else}
|
| 179 |
+
<AudioWaveform {frequencyData} minHeight={4} maxHeight={40} />
|
| 180 |
+
{/if}
|
| 181 |
+
</div>
|
| 182 |
+
|
| 183 |
+
<!-- Confirm/Send button -->
|
| 184 |
+
<button
|
| 185 |
+
type="button"
|
| 186 |
+
class="btn grid size-8 place-items-center rounded-full border shadow transition-none disabled:opacity-50 sm:size-7 {isTouchDevice
|
| 187 |
+
? 'border-transparent bg-black text-white hover:bg-gray-800 dark:bg-white dark:text-black dark:hover:bg-gray-200'
|
| 188 |
+
: 'bg-white text-black hover:bg-gray-100 dark:border-transparent dark:bg-gray-600 dark:text-white dark:hover:bg-gray-500'}"
|
| 189 |
+
onclick={handleConfirm}
|
| 190 |
+
disabled={isTranscribing}
|
| 191 |
+
aria-label={isTranscribing
|
| 192 |
+
? "Transcribing..."
|
| 193 |
+
: isTouchDevice
|
| 194 |
+
? "Send message"
|
| 195 |
+
: "Confirm and transcribe"}
|
| 196 |
+
>
|
| 197 |
+
{#if isTranscribing}
|
| 198 |
+
<EosIconsLoading class="size-4" />
|
| 199 |
+
{:else if isTouchDevice}
|
| 200 |
+
<IconArrowUp class="size-4" />
|
| 201 |
+
{:else}
|
| 202 |
+
<CarbonCheckmark class="size-4" />
|
| 203 |
+
{/if}
|
| 204 |
+
</button>
|
| 205 |
+
</div>
|
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<script lang="ts">
	import { onMount, onDestroy } from "svelte";

	// Timeline-based amplitude waveform: samples the current microphone
	// amplitude at a fixed interval and renders a scrolling row of pills,
	// with the pill count derived from the container's width.
	interface Props {
		frequencyData: Uint8Array; // analyser frequency bins, byte values 0-255
		minHeight?: number; // pill height in px at silence
		maxHeight?: number; // pill height in px at full amplitude
	}

	let { frequencyData, minHeight = 4, maxHeight = 40 }: Props = $props();

	const PILL_WIDTH = 2; // w-0.5 = 2px
	const PILL_GAP = 2;
	const SAMPLE_INTERVAL_MS = 50; // Sample every 50ms (~20 samples/sec)

	let containerRef: HTMLDivElement | undefined = $state();
	let timeline: number[] = $state([]);
	let pillCount = $state(60); // Default, will be calculated from container width
	let intervalId: ReturnType<typeof setInterval> | undefined;
	let smoothedAmplitude = 0;

	// Calculate average amplitude from frequency data
	function getAmplitude(): number {
		if (!frequencyData.length) return 0;
		let sum = 0;
		for (let i = 0; i < frequencyData.length; i++) {
			sum += frequencyData[i];
		}
		return sum / frequencyData.length / 255; // Normalize to 0-1
	}

	// Take one amplitude sample, smooth and boost it, then append the
	// resulting pill height to the sliding timeline window.
	function addSample() {
		const rawAmplitude = getAmplitude();
		// Smooth the amplitude for less jittery visualization
		smoothedAmplitude = smoothedAmplitude * 0.3 + rawAmplitude * 0.7;

		// Boost amplitude by 1.5x and apply slight curve for better visibility
		const boostedAmplitude = Math.min(1, Math.pow(smoothedAmplitude * 1.5, 0.85));

		const height = minHeight + boostedAmplitude * (maxHeight - minHeight);

		// Push new sample, keep only pillCount samples (sliding window)
		timeline = [...timeline, height].slice(-pillCount);
	}

	// Fit as many pills as the container width allows (minimum of 20).
	function calculatePillCount() {
		if (containerRef) {
			const width = containerRef.clientWidth;
			pillCount = Math.max(20, Math.floor(width / (PILL_WIDTH + PILL_GAP)));
		}
	}

	onMount(() => {
		calculatePillCount();

		// Initialize timeline with minimum height dots
		timeline = Array(pillCount).fill(minHeight);

		// Start sampling at fixed intervals
		intervalId = setInterval(addSample, SAMPLE_INTERVAL_MS);

		// Handle resize
		const resizeObserver = new ResizeObserver(() => {
			const oldCount = pillCount;
			calculatePillCount();
			// Adjust timeline buffer if container size changed
			if (pillCount > oldCount) {
				// Pad with min height on the left
				timeline = [...Array(pillCount - oldCount).fill(minHeight), ...timeline];
			} else if (pillCount < oldCount) {
				timeline = timeline.slice(-pillCount);
			}
		});

		if (containerRef) {
			resizeObserver.observe(containerRef);
		}

		return () => {
			resizeObserver.disconnect();
		};
	});

	// The sampling interval is cleared separately from the mount cleanup.
	onDestroy(() => {
		if (intervalId) clearInterval(intervalId);
	});
</script>

<div bind:this={containerRef} class="flex h-12 w-full items-center justify-start gap-[2px]">
	{#each timeline as height, i (i)}
		<div
			class="w-0.5 shrink-0 rounded-full bg-gray-400 dark:bg-white/60"
			style="height: {Math.max(minHeight, Math.round(height))}px;"
		></div>
	{/each}
</div>
|
|
@@ -13,6 +13,7 @@ export interface FeatureFlags {
|
|
| 13 |
enableAssistants: boolean;
|
| 14 |
loginEnabled: boolean;
|
| 15 |
isAdmin: boolean;
|
|
|
|
| 16 |
}
|
| 17 |
|
| 18 |
export const misc = new Elysia()
|
|
@@ -23,6 +24,7 @@ export const misc = new Elysia()
|
|
| 23 |
enableAssistants: config.ENABLE_ASSISTANTS === "true",
|
| 24 |
loginEnabled, // login feature is on when OID is configured
|
| 25 |
isAdmin: locals.isAdmin,
|
|
|
|
| 26 |
} satisfies FeatureFlags;
|
| 27 |
})
|
| 28 |
.get("/export", async ({ locals }) => {
|
|
|
|
| 13 |
enableAssistants: boolean;
|
| 14 |
loginEnabled: boolean;
|
| 15 |
isAdmin: boolean;
|
| 16 |
+
transcriptionEnabled: boolean;
|
| 17 |
}
|
| 18 |
|
| 19 |
export const misc = new Elysia()
|
|
|
|
| 24 |
enableAssistants: config.ENABLE_ASSISTANTS === "true",
|
| 25 |
loginEnabled, // login feature is on when OID is configured
|
| 26 |
isAdmin: locals.isAdmin,
|
| 27 |
+
transcriptionEnabled: !!config.get("TRANSCRIPTION_MODEL"),
|
| 28 |
} satisfies FeatureFlags;
|
| 29 |
})
|
| 30 |
.get("/export", async ({ locals }) => {
|
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { error, json } from "@sveltejs/kit";
|
| 2 |
+
import { config } from "$lib/server/config";
|
| 3 |
+
import { getApiToken } from "$lib/server/apiToken";
|
| 4 |
+
import { logger } from "$lib/server/logger";
|
| 5 |
+
|
| 6 |
+
const MAX_AUDIO_SIZE = 25 * 1024 * 1024; // 25MB
|
| 7 |
+
const TRANSCRIPTION_TIMEOUT = 60000; // 60 seconds
|
| 8 |
+
|
| 9 |
+
const ALLOWED_CONTENT_TYPES = [
|
| 10 |
+
"audio/webm",
|
| 11 |
+
"audio/ogg",
|
| 12 |
+
"audio/wav",
|
| 13 |
+
"audio/flac",
|
| 14 |
+
"audio/mpeg",
|
| 15 |
+
"audio/mp4",
|
| 16 |
+
"audio/x-wav",
|
| 17 |
+
];
|
| 18 |
+
|
| 19 |
+
export async function POST({ request, locals }) {
|
| 20 |
+
const transcriptionModel = config.get("TRANSCRIPTION_MODEL");
|
| 21 |
+
|
| 22 |
+
if (!transcriptionModel) {
|
| 23 |
+
throw error(503, "Transcription is not configured");
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
const token = getApiToken(locals);
|
| 27 |
+
|
| 28 |
+
if (!token) {
|
| 29 |
+
throw error(401, "Authentication required");
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
const contentType = request.headers.get("content-type") || "";
|
| 33 |
+
const isAllowed = ALLOWED_CONTENT_TYPES.some((type) => contentType.includes(type));
|
| 34 |
+
|
| 35 |
+
if (!isAllowed) {
|
| 36 |
+
logger.warn({ contentType }, "Unsupported audio format for transcription");
|
| 37 |
+
throw error(400, `Unsupported audio format: ${contentType}`);
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
const contentLength = parseInt(request.headers.get("content-length") || "0");
|
| 41 |
+
if (contentLength > MAX_AUDIO_SIZE) {
|
| 42 |
+
throw error(413, "Audio file too large (max 25MB)");
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
try {
|
| 46 |
+
const audioBuffer = await request.arrayBuffer();
|
| 47 |
+
|
| 48 |
+
if (audioBuffer.byteLength > MAX_AUDIO_SIZE) {
|
| 49 |
+
throw error(413, "Audio file too large (max 25MB)");
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
const baseUrl =
|
| 53 |
+
config.get("TRANSCRIPTION_BASE_URL") || "https://router.huggingface.co/hf-inference/models";
|
| 54 |
+
const apiUrl = `${baseUrl}/${transcriptionModel}`;
|
| 55 |
+
|
| 56 |
+
const controller = new AbortController();
|
| 57 |
+
const timeoutId = setTimeout(() => controller.abort(), TRANSCRIPTION_TIMEOUT);
|
| 58 |
+
|
| 59 |
+
const response = await fetch(apiUrl, {
|
| 60 |
+
method: "POST",
|
| 61 |
+
headers: {
|
| 62 |
+
Authorization: `Bearer ${token}`,
|
| 63 |
+
"Content-Type": contentType,
|
| 64 |
+
},
|
| 65 |
+
body: audioBuffer,
|
| 66 |
+
signal: controller.signal,
|
| 67 |
+
}).finally(() => clearTimeout(timeoutId));
|
| 68 |
+
|
| 69 |
+
if (!response.ok) {
|
| 70 |
+
const errorText = await response.text();
|
| 71 |
+
logger.error(
|
| 72 |
+
{ status: response.status, error: errorText, model: transcriptionModel },
|
| 73 |
+
"Whisper API error"
|
| 74 |
+
);
|
| 75 |
+
throw error(response.status, `Transcription failed: ${errorText}`);
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
const result = await response.json();
|
| 79 |
+
|
| 80 |
+
// Whisper API returns { text: "transcribed text" }
|
| 81 |
+
return json({ text: result.text || "" });
|
| 82 |
+
} catch (err) {
|
| 83 |
+
if (err instanceof Error && err.name === "AbortError") {
|
| 84 |
+
logger.error({ model: transcriptionModel }, "Transcription timeout");
|
| 85 |
+
throw error(504, "Transcription took too long. Please try a shorter recording.");
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
// Re-throw SvelteKit errors
|
| 89 |
+
if (err && typeof err === "object" && "status" in err) {
|
| 90 |
+
throw err;
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
logger.error(err, "Transcription error");
|
| 94 |
+
throw error(500, "Failed to transcribe audio");
|
| 95 |
+
}
|
| 96 |
+
}
|