victor HF Staff commited on
Commit
64c473c
·
unverified ·
1 Parent(s): 72bda43

Transcribe UI (#2006)

Browse files

* Add voice transcription feature to chat

Introduces voice recording and transcription in the chat window using Whisper models. Adds new components for voice recording and audio waveform visualization, updates feature flags and environment variables, and implements a server API endpoint for audio transcription.

* Fix frequencyData reactivity in VoiceRecorder

Replaces in-place mutation of frequencyData with assignment of a new Uint8Array to ensure Svelte reactivity in the voice visualization.

* Improve AudioWaveform visualization and responsiveness

Refactors AudioWaveform.svelte to use a timeline-based amplitude visualization with dynamic pill count based on container width, smoother animation, and responsive resizing. Updates VoiceRecorder.svelte to remove fixed pillCount and adjust maxHeight for better appearance. These changes provide a more accurate and visually appealing waveform display.

* Improve audio waveform and voice recorder UI

Reduced pill width and gap in AudioWaveform for a denser visualization, boosted amplitude for better visibility, and updated pill color for improved contrast. Adjusted padding in VoiceRecorder for a more compact layout. Fixed ChatWindow to trim transcribed text before appending to draft.

* Update voice recorder UI and loading icons

Improved styling for voice recording buttons and waveform container in ChatWindow and VoiceRecorder components. Replaced loading icon with EosIconsLoading for transcription state and adjusted class names for better visual consistency.

* Add touch device support for voice recording send

VoiceRecorder and ChatWindow now detect touch devices and adjust behavior: on touch devices, the confirm button sends the audio message immediately using a new onsend handler, with updated button styling and icon. This improves usability for mobile users by streamlining the voice message workflow.

* Add TRANSCRIPTION_MODEL env var to dev and prod configs

Introduces the TRANSCRIPTION_MODEL environment variable set to 'openai/whisper-large-v3-turbo' in both dev.yaml and prod.yaml to support transcription functionality.

* Update VoiceRecorder.svelte

* Add error notifications for transcription and recording

Introduces user-facing error messages for transcription and recording failures by updating the $error store. This improves feedback to users when audio processing encounters issues.

.env CHANGED
@@ -90,6 +90,15 @@ PUBLIC_LLM_ROUTER_LOGO_URL=
90
  # Public alias id used for the virtual router model (Omni). Defaults to "omni".
91
  PUBLIC_LLM_ROUTER_ALIAS_ID=omni
92
 
 
 
 
 
 
 
 
 
 
93
  ### Authentication ###
94
  # Parameters to enable open id login
95
  OPENID_CONFIG=
 
90
  # Public alias id used for the virtual router model (Omni). Defaults to "omni".
91
  PUBLIC_LLM_ROUTER_ALIAS_ID=omni
92
 
93
+ ### Transcription ###
94
+ # Voice-to-text transcription using Whisper models
95
+ # If set, enables the microphone button in the chat input
96
+ # Example: openai/whisper-large-v3-turbo
97
+ TRANSCRIPTION_MODEL=
98
+ # Optional: Base URL for transcription API (defaults to HF inference)
99
+ # Default: https://router.huggingface.co/hf-inference/models
100
+ TRANSCRIPTION_BASE_URL=
101
+
102
  ### Authentication ###
103
  # Parameters to enable open id login
104
  OPENID_CONFIG=
chart/env/dev.yaml CHANGED
@@ -70,6 +70,7 @@ envVars:
70
  LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-235B-A22B-Instruct"
71
  LLM_ROUTER_ENABLE_TOOLS: "true"
72
  LLM_ROUTER_TOOLS_MODEL: "moonshotai/Kimi-K2-Instruct-0905"
 
73
  MCP_SERVERS: >
74
  [{"name": "Web Search (Exa)", "url": "https://mcp.exa.ai/mcp"}, {"name": "Hugging Face", "url": "https://hf.co/mcp?login"}]
75
  PUBLIC_LLM_ROUTER_DISPLAY_NAME: "Omni"
 
70
  LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-235B-A22B-Instruct"
71
  LLM_ROUTER_ENABLE_TOOLS: "true"
72
  LLM_ROUTER_TOOLS_MODEL: "moonshotai/Kimi-K2-Instruct-0905"
73
+ TRANSCRIPTION_MODEL: "openai/whisper-large-v3-turbo"
74
  MCP_SERVERS: >
75
  [{"name": "Web Search (Exa)", "url": "https://mcp.exa.ai/mcp"}, {"name": "Hugging Face", "url": "https://hf.co/mcp?login"}]
76
  PUBLIC_LLM_ROUTER_DISPLAY_NAME: "Omni"
chart/env/prod.yaml CHANGED
@@ -80,6 +80,7 @@ envVars:
80
  LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-235B-A22B-Instruct"
81
  LLM_ROUTER_ENABLE_TOOLS: "true"
82
  LLM_ROUTER_TOOLS_MODEL: "moonshotai/Kimi-K2-Instruct-0905"
 
83
  MCP_SERVERS: >
84
  [{"name": "Web Search (Exa)", "url": "https://mcp.exa.ai/mcp"}, {"name": "Hugging Face", "url": "https://hf.co/mcp?login"}]
85
  PUBLIC_LLM_ROUTER_DISPLAY_NAME: "Omni"
 
80
  LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-235B-A22B-Instruct"
81
  LLM_ROUTER_ENABLE_TOOLS: "true"
82
  LLM_ROUTER_TOOLS_MODEL: "moonshotai/Kimi-K2-Instruct-0905"
83
+ TRANSCRIPTION_MODEL: "openai/whisper-large-v3-turbo"
84
  MCP_SERVERS: >
85
  [{"name": "Web Search (Exa)", "url": "https://mcp.exa.ai/mcp"}, {"name": "Hugging Face", "url": "https://hf.co/mcp?login"}]
86
  PUBLIC_LLM_ROUTER_DISPLAY_NAME: "Omni"
src/lib/components/chat/ChatWindow.svelte CHANGED
@@ -6,8 +6,10 @@
6
  import CarbonCaretDown from "~icons/carbon/caret-down";
7
  import CarbonDirectionRight from "~icons/carbon/direction-right-01";
8
  import IconArrowUp from "~icons/lucide/arrow-up";
 
9
 
10
  import ChatInput from "./ChatInput.svelte";
 
11
  import StopGeneratingBtn from "../StopGeneratingBtn.svelte";
12
  import type { Model } from "$lib/types/Model";
13
  import FileDropzone from "./FileDropzone.svelte";
@@ -24,6 +26,7 @@
24
  import ChatIntroduction from "./ChatIntroduction.svelte";
25
  import UploadedFile from "./UploadedFile.svelte";
26
  import { useSettingsStore } from "$lib/stores/settings";
 
27
  import ModelSwitch from "./ModelSwitch.svelte";
28
  import { routerExamples } from "$lib/constants/routerExamples";
29
  import { mcpExamples } from "$lib/constants/mcpExamples";
@@ -85,6 +88,14 @@
85
  let editMsdgId: Message["id"] | null = $state(null);
86
  let pastedLongContent = $state(false);
87
 
 
 
 
 
 
 
 
 
88
  const handleSubmit = () => {
89
  if (requireAuthUser() || loading || !draft) return;
90
  onmessage?.(draft);
@@ -374,6 +385,71 @@
374
  function startFollowUp(followUp: RouterFollowUp) {
375
  triggerPrompt(followUp.prompt);
376
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
  </script>
378
 
379
  <svelte:window
@@ -531,7 +607,18 @@
531
  "max-sm:mb-4": focused && isVirtualKeyboard(),
532
  }}
533
  >
534
- {#if onDrag && isFileUploadEnabled}
 
 
 
 
 
 
 
 
 
 
 
535
  <FileDropzone bind:files bind:onDrag mimeTypes={activeMimeTypes} />
536
  {:else}
537
  <div
@@ -563,6 +650,19 @@
563
  classNames="absolute bottom-2 right-2 size-8 sm:size-7 self-end rounded-full border bg-white text-black shadow transition-none dark:border-transparent dark:bg-gray-600 dark:text-white"
564
  />
565
  {:else}
 
 
 
 
 
 
 
 
 
 
 
 
 
566
  <button
567
  class="btn absolute bottom-2 right-2 size-8 self-end rounded-full border bg-white text-black shadow transition-none enabled:hover:bg-white enabled:hover:shadow-inner dark:border-transparent dark:bg-gray-600 dark:text-white dark:hover:enabled:bg-black sm:size-7 {!draft ||
568
  isReadOnly
 
6
  import CarbonCaretDown from "~icons/carbon/caret-down";
7
  import CarbonDirectionRight from "~icons/carbon/direction-right-01";
8
  import IconArrowUp from "~icons/lucide/arrow-up";
9
+ import IconMic from "~icons/lucide/mic";
10
 
11
  import ChatInput from "./ChatInput.svelte";
12
+ import VoiceRecorder from "./VoiceRecorder.svelte";
13
  import StopGeneratingBtn from "../StopGeneratingBtn.svelte";
14
  import type { Model } from "$lib/types/Model";
15
  import FileDropzone from "./FileDropzone.svelte";
 
26
  import ChatIntroduction from "./ChatIntroduction.svelte";
27
  import UploadedFile from "./UploadedFile.svelte";
28
  import { useSettingsStore } from "$lib/stores/settings";
29
+ import { error } from "$lib/stores/errors";
30
  import ModelSwitch from "./ModelSwitch.svelte";
31
  import { routerExamples } from "$lib/constants/routerExamples";
32
  import { mcpExamples } from "$lib/constants/mcpExamples";
 
88
  let editMsdgId: Message["id"] | null = $state(null);
89
  let pastedLongContent = $state(false);
90
 
91
+ // Voice recording state
92
+ let isRecording = $state(false);
93
+ let isTranscribing = $state(false);
94
+ let transcriptionEnabled = $derived(
95
+ !!(page.data as { transcriptionEnabled?: boolean }).transcriptionEnabled
96
+ );
97
+ let isTouchDevice = $derived(browser && navigator.maxTouchPoints > 0);
98
+
99
  const handleSubmit = () => {
100
  if (requireAuthUser() || loading || !draft) return;
101
  onmessage?.(draft);
 
385
  function startFollowUp(followUp: RouterFollowUp) {
386
  triggerPrompt(followUp.prompt);
387
  }
388
+
389
+ async function handleRecordingConfirm(audioBlob: Blob) {
390
+ isRecording = false;
391
+ isTranscribing = true;
392
+
393
+ try {
394
+ const response = await fetch(`${base}/api/transcribe`, {
395
+ method: "POST",
396
+ headers: { "Content-Type": audioBlob.type },
397
+ body: audioBlob,
398
+ });
399
+
400
+ if (!response.ok) {
401
+ throw new Error(await response.text());
402
+ }
403
+
404
+ const { text } = await response.json();
405
+ const trimmedText = text?.trim();
406
+ if (trimmedText) {
407
+ // Append transcribed text to draft
408
+ draft = draft.trim() ? `${draft.trim()} ${trimmedText}` : trimmedText;
409
+ }
410
+ } catch (err) {
411
+ console.error("Transcription error:", err);
412
+ $error = "Transcription failed. Please try again.";
413
+ } finally {
414
+ isTranscribing = false;
415
+ }
416
+ }
417
+
418
+ async function handleRecordingSend(audioBlob: Blob) {
419
+ isRecording = false;
420
+ isTranscribing = true;
421
+
422
+ try {
423
+ const response = await fetch(`${base}/api/transcribe`, {
424
+ method: "POST",
425
+ headers: { "Content-Type": audioBlob.type },
426
+ body: audioBlob,
427
+ });
428
+
429
+ if (!response.ok) {
430
+ throw new Error(await response.text());
431
+ }
432
+
433
+ const { text } = await response.json();
434
+ const trimmedText = text?.trim();
435
+ if (trimmedText) {
436
+ // Set draft and send immediately
437
+ draft = draft.trim() ? `${draft.trim()} ${trimmedText}` : trimmedText;
438
+ handleSubmit();
439
+ }
440
+ } catch (err) {
441
+ console.error("Transcription error:", err);
442
+ $error = "Transcription failed. Please try again.";
443
+ } finally {
444
+ isTranscribing = false;
445
+ }
446
+ }
447
+
448
+ function handleRecordingError(message: string) {
449
+ console.error("Recording error:", message);
450
+ isRecording = false;
451
+ $error = message;
452
+ }
453
  </script>
454
 
455
  <svelte:window
 
607
  "max-sm:mb-4": focused && isVirtualKeyboard(),
608
  }}
609
  >
610
+ {#if isRecording || isTranscribing}
611
+ <VoiceRecorder
612
+ {isTranscribing}
613
+ {isTouchDevice}
614
+ oncancel={() => {
615
+ isRecording = false;
616
+ }}
617
+ onconfirm={handleRecordingConfirm}
618
+ onsend={handleRecordingSend}
619
+ onerror={handleRecordingError}
620
+ />
621
+ {:else if onDrag && isFileUploadEnabled}
622
  <FileDropzone bind:files bind:onDrag mimeTypes={activeMimeTypes} />
623
  {:else}
624
  <div
 
650
  classNames="absolute bottom-2 right-2 size-8 sm:size-7 self-end rounded-full border bg-white text-black shadow transition-none dark:border-transparent dark:bg-gray-600 dark:text-white"
651
  />
652
  {:else}
653
+ {#if transcriptionEnabled}
654
+ <button
655
+ type="button"
656
+ class="btn absolute bottom-2 right-10 mr-1 size-8 self-end rounded-full border bg-white/50 text-gray-500 transition-none hover:bg-gray-50 hover:text-gray-700 dark:border-transparent dark:bg-gray-600 dark:text-gray-300 dark:hover:bg-gray-500 dark:hover:text-white sm:right-9 sm:size-7"
657
+ disabled={isReadOnly}
658
+ onclick={() => {
659
+ isRecording = true;
660
+ }}
661
+ aria-label="Start voice recording"
662
+ >
663
+ <IconMic class="size-4" />
664
+ </button>
665
+ {/if}
666
  <button
667
  class="btn absolute bottom-2 right-2 size-8 self-end rounded-full border bg-white text-black shadow transition-none enabled:hover:bg-white enabled:hover:shadow-inner dark:border-transparent dark:bg-gray-600 dark:text-white dark:hover:enabled:bg-black sm:size-7 {!draft ||
668
  isReadOnly
src/lib/components/chat/VoiceRecorder.svelte ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<script lang="ts">
	import { onMount, onDestroy } from "svelte";
	import CarbonClose from "~icons/carbon/close";
	import CarbonCheckmark from "~icons/carbon/checkmark";
	import IconArrowUp from "~icons/lucide/arrow-up";
	import EosIconsLoading from "~icons/eos-icons/loading";
	import IconLoading from "$lib/components/icons/IconLoading.svelte";
	import AudioWaveform from "$lib/components/voice/AudioWaveform.svelte";

	// Inline voice-recording widget: starts capturing from the microphone on
	// mount, shows a live waveform, and hands the recorded audio Blob back to
	// the parent via onconfirm (desktop) or onsend (touch devices).
	interface Props {
		// True while the parent is uploading/transcribing the audio; disables
		// the confirm button and swaps the waveform for a loading spinner.
		isTranscribing: boolean;
		// On touch devices the confirm button sends immediately (onsend)
		// instead of just confirming (onconfirm).
		isTouchDevice: boolean;
		// Called when the user discards the recording.
		oncancel: () => void;
		// Called with the recorded audio when the user confirms (non-touch).
		onconfirm: (audioBlob: Blob) => void;
		// Called with the recorded audio when the user confirms (touch).
		onsend: (audioBlob: Blob) => void;
		// Called with a user-facing message on any recording failure.
		onerror: (message: string) => void;
	}

	let { isTranscribing, isTouchDevice, oncancel, onconfirm, onsend, onerror }: Props = $props();

	let mediaRecorder: MediaRecorder | null = $state(null);
	let audioChunks: Blob[] = $state([]);
	let analyser: AnalyserNode | null = $state(null);
	// Latest FFT frame fed to <AudioWaveform>; replaced wholesale each frame
	// (never mutated in place) so Svelte reactivity fires.
	let frequencyData: Uint8Array = $state(new Uint8Array(32));
	let animationFrameId: number | null = $state(null);
	let audioContext: AudioContext | null = $state(null);
	let mediaStream: MediaStream | null = $state(null);

	// Pumps analyser FFT data into `frequencyData` once per animation frame
	// until stopVisualization() cancels the loop.
	function startVisualization() {
		function update() {
			if (analyser) {
				const data = new Uint8Array(analyser.frequencyBinCount);
				analyser.getByteFrequencyData(data);
				// Create new array to trigger Svelte reactivity
				frequencyData = data;
			}
			animationFrameId = requestAnimationFrame(update);
		}
		update();
	}

	// Cancels the requestAnimationFrame loop started by startVisualization().
	function stopVisualization() {
		if (animationFrameId !== null) {
			cancelAnimationFrame(animationFrameId);
			animationFrameId = null;
		}
	}

	// Requests microphone access, wires up the analyser for the waveform, and
	// starts a MediaRecorder collecting 100ms webm/opus chunks. Reports
	// permission/device failures to the parent through onerror.
	async function startRecording() {
		try {
			const stream = await navigator.mediaDevices.getUserMedia({
				audio: {
					channelCount: 1,
					sampleRate: 16000, // Whisper prefers 16kHz
					echoCancellation: true,
					noiseSuppression: true,
				},
			});

			mediaStream = stream;

			// Set up audio context for visualization
			audioContext = new AudioContext();
			const source = audioContext.createMediaStreamSource(stream);
			analyser = audioContext.createAnalyser();
			analyser.fftSize = 64; // Small for performance, gives 32 frequency bins
			analyser.smoothingTimeConstant = 0.4;
			source.connect(analyser);
			frequencyData = new Uint8Array(analyser.frequencyBinCount);

			// Start MediaRecorder
			// Use webm/opus for broad browser support
			const mimeType = MediaRecorder.isTypeSupported("audio/webm;codecs=opus")
				? "audio/webm;codecs=opus"
				: "audio/webm";

			mediaRecorder = new MediaRecorder(stream, { mimeType });
			audioChunks = [];

			mediaRecorder.ondataavailable = (e) => {
				if (e.data.size > 0) {
					audioChunks = [...audioChunks, e.data];
				}
			};

			mediaRecorder.start(100); // Collect data every 100ms
			startVisualization();
		} catch (err) {
			// Map the common getUserMedia DOMException names to friendlier
			// user-facing messages; everything else gets a generic fallback.
			if (err instanceof DOMException) {
				if (err.name === "NotAllowedError") {
					onerror("Microphone access denied. Please allow in browser settings.");
				} else if (err.name === "NotFoundError") {
					onerror("No microphone found.");
				} else {
					onerror(`Microphone error: ${err.message}`);
				}
			} else {
				onerror("Could not access microphone.");
			}
		}
	}

	// Tears down recorder, stream, and audio context, and returns the audio
	// gathered so far as a single Blob (null if nothing was captured).
	// NOTE(review): the Blob is built synchronously right after
	// mediaRecorder.stop(), but the recorder's final `ondataavailable` event
	// fires asynchronously — up to the last ~100ms chunk may be missed.
	// Confirm whether that tail loss is acceptable here.
	function stopRecording(): Blob | null {
		stopVisualization();

		if (mediaRecorder && mediaRecorder.state !== "inactive") {
			mediaRecorder.stop();
		}

		// Stop all audio tracks
		if (mediaStream) {
			mediaStream.getTracks().forEach((track) => track.stop());
			mediaStream = null;
		}

		// Close audio context
		// NOTE(review): AudioContext.close() returns a Promise that is
		// intentionally not awaited; closure is best-effort cleanup.
		if (audioContext) {
			audioContext.close();
			audioContext = null;
		}

		analyser = null;
		mediaRecorder = null;

		if (audioChunks.length === 0) {
			return null;
		}

		// Create blob from chunks
		const mimeType = audioChunks[0]?.type || "audio/webm";
		return new Blob(audioChunks, { type: mimeType });
	}

	// Discard the recording and notify the parent.
	function handleCancel() {
		stopRecording();
		oncancel();
	}

	// Finalize the recording; route the Blob to onsend (touch) or onconfirm
	// (desktop), or report an error if nothing was captured.
	function handleConfirm() {
		const audioBlob = stopRecording();
		if (audioBlob && audioBlob.size > 0) {
			if (isTouchDevice) {
				onsend(audioBlob);
			} else {
				onconfirm(audioBlob);
			}
		} else {
			onerror("No audio recorded. Please try again.");
		}
	}

	// Recording starts as soon as the component is rendered...
	onMount(() => {
		startRecording();
	});

	// ...and all resources are released when it is removed, even if the
	// parent never calls back into us.
	onDestroy(() => {
		stopRecording();
	});
</script>

<div class="flex h-full w-full items-center justify-between px-3 py-1.5">
	<!-- Cancel button -->
	<button
		type="button"
		class="btn grid size-8 place-items-center rounded-full border bg-white text-black shadow transition-none hover:bg-gray-100 dark:border-transparent dark:bg-gray-600 dark:text-white dark:hover:bg-gray-500 sm:size-7"
		onclick={handleCancel}
		aria-label="Cancel recording"
	>
		<CarbonClose class="size-4" />
	</button>

	<!-- Waveform while recording / spinner while transcribing -->
	<div class="flex h-12 flex-1 items-center overflow-hidden pl-2.5 pr-1.5">
		{#if isTranscribing}
			<div class="flex h-full w-full items-center justify-center">
				<IconLoading classNames="text-gray-400" />
			</div>
		{:else}
			<AudioWaveform {frequencyData} minHeight={4} maxHeight={40} />
		{/if}
	</div>

	<!-- Confirm/Send button: styled as a primary "send" action on touch
	     devices, a neutral "confirm" action elsewhere -->
	<button
		type="button"
		class="btn grid size-8 place-items-center rounded-full border shadow transition-none disabled:opacity-50 sm:size-7 {isTouchDevice
			? 'border-transparent bg-black text-white hover:bg-gray-800 dark:bg-white dark:text-black dark:hover:bg-gray-200'
			: 'bg-white text-black hover:bg-gray-100 dark:border-transparent dark:bg-gray-600 dark:text-white dark:hover:bg-gray-500'}"
		onclick={handleConfirm}
		disabled={isTranscribing}
		aria-label={isTranscribing
			? "Transcribing..."
			: isTouchDevice
				? "Send message"
				: "Confirm and transcribe"}
	>
		{#if isTranscribing}
			<EosIconsLoading class="size-4" />
		{:else if isTouchDevice}
			<IconArrowUp class="size-4" />
		{:else}
			<CarbonCheckmark class="size-4" />
		{/if}
	</button>
</div>
src/lib/components/voice/AudioWaveform.svelte ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<script lang="ts">
	import { onMount, onDestroy } from "svelte";

	// Timeline-style amplitude visualization: samples the incoming FFT frame
	// at a fixed interval and renders a sliding window of vertical "pills",
	// newest on the right, sized to fill the container width.
	interface Props {
		// Latest byte FFT frame (0-255 per bin) from an AnalyserNode.
		frequencyData: Uint8Array;
		// Pill height in px when silent.
		minHeight?: number;
		// Pill height in px at full amplitude.
		maxHeight?: number;
	}

	let { frequencyData, minHeight = 4, maxHeight = 40 }: Props = $props();

	const PILL_WIDTH = 2; // w-0.5 = 2px
	const PILL_GAP = 2;
	const SAMPLE_INTERVAL_MS = 50; // Sample every 50ms (~20 samples/sec)

	let containerRef: HTMLDivElement | undefined = $state();
	// Sliding window of pill heights (px), oldest first.
	let timeline: number[] = $state([]);
	let pillCount = $state(60); // Default, will be calculated from container width
	let intervalId: ReturnType<typeof setInterval> | undefined;
	// Exponentially-smoothed amplitude carried between samples (not reactive
	// state — only read inside addSample).
	let smoothedAmplitude = 0;

	// Calculate average amplitude from frequency data, normalized to 0-1.
	function getAmplitude(): number {
		if (!frequencyData.length) return 0;
		let sum = 0;
		for (let i = 0; i < frequencyData.length; i++) {
			sum += frequencyData[i];
		}
		return sum / frequencyData.length / 255; // Normalize to 0-1
	}

	// Takes one amplitude sample, smooths/boosts it, converts it to a pill
	// height, and appends it to the sliding window.
	function addSample() {
		const rawAmplitude = getAmplitude();
		// Smooth the amplitude for less jittery visualization
		smoothedAmplitude = smoothedAmplitude * 0.3 + rawAmplitude * 0.7;

		// Boost amplitude by 1.5x and apply slight curve for better visibility
		const boostedAmplitude = Math.min(1, Math.pow(smoothedAmplitude * 1.5, 0.85));

		const height = minHeight + boostedAmplitude * (maxHeight - minHeight);

		// Push new sample, keep only pillCount samples (sliding window)
		timeline = [...timeline, height].slice(-pillCount);
	}

	// Derives how many pills fit in the container's current width
	// (minimum 20 so the waveform never collapses to nothing).
	function calculatePillCount() {
		if (containerRef) {
			const width = containerRef.clientWidth;
			pillCount = Math.max(20, Math.floor(width / (PILL_WIDTH + PILL_GAP)));
		}
	}

	onMount(() => {
		calculatePillCount();

		// Initialize timeline with minimum height dots
		timeline = Array(pillCount).fill(minHeight);

		// Start sampling at fixed intervals
		intervalId = setInterval(addSample, SAMPLE_INTERVAL_MS);

		// Handle resize: grow the window by padding old (left) side with
		// silence, or shrink it by dropping the oldest samples.
		const resizeObserver = new ResizeObserver(() => {
			const oldCount = pillCount;
			calculatePillCount();
			// Adjust timeline buffer if container size changed
			if (pillCount > oldCount) {
				// Pad with min height on the left
				timeline = [...Array(pillCount - oldCount).fill(minHeight), ...timeline];
			} else if (pillCount < oldCount) {
				timeline = timeline.slice(-pillCount);
			}
		});

		if (containerRef) {
			resizeObserver.observe(containerRef);
		}

		return () => {
			resizeObserver.disconnect();
		};
	});

	onDestroy(() => {
		if (intervalId) clearInterval(intervalId);
	});
</script>

<!-- Pills are keyed by index: the array shifts left as new samples arrive,
     so each slot simply re-renders with its new height. -->
<div bind:this={containerRef} class="flex h-12 w-full items-center justify-start gap-[2px]">
	{#each timeline as height, i (i)}
		<div
			class="w-0.5 shrink-0 rounded-full bg-gray-400 dark:bg-white/60"
			style="height: {Math.max(minHeight, Math.round(height))}px;"
		></div>
	{/each}
</div>
src/lib/server/api/routes/groups/misc.ts CHANGED
@@ -13,6 +13,7 @@ export interface FeatureFlags {
13
  enableAssistants: boolean;
14
  loginEnabled: boolean;
15
  isAdmin: boolean;
 
16
  }
17
 
18
  export const misc = new Elysia()
@@ -23,6 +24,7 @@ export const misc = new Elysia()
23
  enableAssistants: config.ENABLE_ASSISTANTS === "true",
24
  loginEnabled, // login feature is on when OID is configured
25
  isAdmin: locals.isAdmin,
 
26
  } satisfies FeatureFlags;
27
  })
28
  .get("/export", async ({ locals }) => {
 
13
  enableAssistants: boolean;
14
  loginEnabled: boolean;
15
  isAdmin: boolean;
16
+ transcriptionEnabled: boolean;
17
  }
18
 
19
  export const misc = new Elysia()
 
24
  enableAssistants: config.ENABLE_ASSISTANTS === "true",
25
  loginEnabled, // login feature is on when OID is configured
26
  isAdmin: locals.isAdmin,
27
+ transcriptionEnabled: !!config.get("TRANSCRIPTION_MODEL"),
28
  } satisfies FeatureFlags;
29
  })
30
  .get("/export", async ({ locals }) => {
src/routes/api/transcribe/+server.ts ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { error, json } from "@sveltejs/kit";
2
+ import { config } from "$lib/server/config";
3
+ import { getApiToken } from "$lib/server/apiToken";
4
+ import { logger } from "$lib/server/logger";
5
+
6
+ const MAX_AUDIO_SIZE = 25 * 1024 * 1024; // 25MB
7
+ const TRANSCRIPTION_TIMEOUT = 60000; // 60 seconds
8
+
9
+ const ALLOWED_CONTENT_TYPES = [
10
+ "audio/webm",
11
+ "audio/ogg",
12
+ "audio/wav",
13
+ "audio/flac",
14
+ "audio/mpeg",
15
+ "audio/mp4",
16
+ "audio/x-wav",
17
+ ];
18
+
19
+ export async function POST({ request, locals }) {
20
+ const transcriptionModel = config.get("TRANSCRIPTION_MODEL");
21
+
22
+ if (!transcriptionModel) {
23
+ throw error(503, "Transcription is not configured");
24
+ }
25
+
26
+ const token = getApiToken(locals);
27
+
28
+ if (!token) {
29
+ throw error(401, "Authentication required");
30
+ }
31
+
32
+ const contentType = request.headers.get("content-type") || "";
33
+ const isAllowed = ALLOWED_CONTENT_TYPES.some((type) => contentType.includes(type));
34
+
35
+ if (!isAllowed) {
36
+ logger.warn({ contentType }, "Unsupported audio format for transcription");
37
+ throw error(400, `Unsupported audio format: ${contentType}`);
38
+ }
39
+
40
+ const contentLength = parseInt(request.headers.get("content-length") || "0");
41
+ if (contentLength > MAX_AUDIO_SIZE) {
42
+ throw error(413, "Audio file too large (max 25MB)");
43
+ }
44
+
45
+ try {
46
+ const audioBuffer = await request.arrayBuffer();
47
+
48
+ if (audioBuffer.byteLength > MAX_AUDIO_SIZE) {
49
+ throw error(413, "Audio file too large (max 25MB)");
50
+ }
51
+
52
+ const baseUrl =
53
+ config.get("TRANSCRIPTION_BASE_URL") || "https://router.huggingface.co/hf-inference/models";
54
+ const apiUrl = `${baseUrl}/${transcriptionModel}`;
55
+
56
+ const controller = new AbortController();
57
+ const timeoutId = setTimeout(() => controller.abort(), TRANSCRIPTION_TIMEOUT);
58
+
59
+ const response = await fetch(apiUrl, {
60
+ method: "POST",
61
+ headers: {
62
+ Authorization: `Bearer ${token}`,
63
+ "Content-Type": contentType,
64
+ },
65
+ body: audioBuffer,
66
+ signal: controller.signal,
67
+ }).finally(() => clearTimeout(timeoutId));
68
+
69
+ if (!response.ok) {
70
+ const errorText = await response.text();
71
+ logger.error(
72
+ { status: response.status, error: errorText, model: transcriptionModel },
73
+ "Whisper API error"
74
+ );
75
+ throw error(response.status, `Transcription failed: ${errorText}`);
76
+ }
77
+
78
+ const result = await response.json();
79
+
80
+ // Whisper API returns { text: "transcribed text" }
81
+ return json({ text: result.text || "" });
82
+ } catch (err) {
83
+ if (err instanceof Error && err.name === "AbortError") {
84
+ logger.error({ model: transcriptionModel }, "Transcription timeout");
85
+ throw error(504, "Transcription took too long. Please try a shorter recording.");
86
+ }
87
+
88
+ // Re-throw SvelteKit errors
89
+ if (err && typeof err === "object" && "status" in err) {
90
+ throw err;
91
+ }
92
+
93
+ logger.error(err, "Transcription error");
94
+ throw error(500, "Failed to transcribe audio");
95
+ }
96
+ }