File size: 23,752 Bytes
9d18449
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
import React, { useState, useEffect, useRef } from 'react';
import { Mic, Square, Settings, Loader2, AlertCircle, Copy, CheckCircle2, ChevronDown, ChevronUp, Upload } from 'lucide-react';

// --- Feature Extraction: Log-Mel Spectrogram ---
// Converts a mono waveform into per-band standardized 80-bin log-mel features.
// The framing constants below are expressed in samples at 16 kHz.

/** Builds a symmetric Hann window with `size` coefficients. */
const buildHannWindow = (size) =>
  Float32Array.from({ length: size }, (_, k) => 0.5 - 0.5 * Math.cos((2 * Math.PI * k) / (size - 1)));

/**
 * Builds triangular mel filters, one Float32Array of per-FFT-bin weights per band.
 * Band edges are spaced evenly on the mel scale between `lowHz` and `highHz`.
 */
const buildMelFilterbank = (sampleRate, fftSize, bandCount, lowHz, highHz) => {
  const binCount = fftSize / 2 + 1;
  const melLow = 2595 * Math.log10(1 + lowHz / 700);
  const melHigh = 2595 * Math.log10(1 + highHz / 700);

  // bandCount + 2 edges: every band needs a lower edge, a peak and an upper edge.
  const edgesHz = [];
  for (let k = 0; k < bandCount + 2; k++) {
    const mel = melLow + k * (melHigh - melLow) / (bandCount + 1);
    edgesHz.push(700 * (Math.pow(10, mel / 2595) - 1));
  }

  const filters = [];
  for (let band = 0; band < bandCount; band++) {
    const lower = edgesHz[band];
    const peak = edgesHz[band + 1];
    const upper = edgesHz[band + 2];
    const weights = new Float32Array(binCount);
    for (let bin = 0; bin < binCount; bin++) {
      const freq = (bin * sampleRate) / fftSize;
      if (freq >= lower && freq <= peak) {
        weights[bin] = (freq - lower) / (peak - lower);
      } else if (freq >= peak && freq <= upper) {
        weights[bin] = (upper - freq) / (upper - peak);
      }
    }
    filters.push(weights);
  }
  return filters;
};

/** Radix-2 FFT performed in place on equal-length real/imaginary buffers (power-of-two length). */
const fftInPlace = (re, im) => {
  const size = re.length;

  // Bit-reversal permutation.
  let rev = 0;
  for (let idx = 0; idx < size - 1; idx++) {
    if (idx < rev) {
      const swapRe = re[idx];
      re[idx] = re[rev];
      re[rev] = swapRe;
      const swapIm = im[idx];
      im[idx] = im[rev];
      im[rev] = swapIm;
    }
    let bit = size >> 1;
    while (bit >= 1 && rev >= bit) {
      rev -= bit;
      bit >>= 1;
    }
    rev += bit;
  }

  // Butterfly stages; the twiddle factor is advanced by complex multiplication.
  for (let span = 2; span <= size; span <<= 1) {
    const half = span >> 1;
    const stepRe = Math.cos(Math.PI / half);
    const stepIm = -Math.sin(Math.PI / half);
    let twRe = 1.0;
    let twIm = 0.0;
    for (let offset = 0; offset < half; offset++) {
      for (let top = offset; top < size; top += span) {
        const bottom = top + half;
        const prodRe = twRe * re[bottom] - twIm * im[bottom];
        const prodIm = twRe * im[bottom] + twIm * re[bottom];
        re[bottom] = re[top] - prodRe;
        im[bottom] = im[top] - prodIm;
        re[top] += prodRe;
        im[top] += prodIm;
      }
      const nextRe = twRe * stepRe - twIm * stepIm;
      twIm = twRe * stepIm + twIm * stepRe;
      twRe = nextRe;
    }
  }
};

/**
 * Computes a standardized log-mel spectrogram.
 *
 * @param {Float32Array} audioData - Mono samples.
 * @returns {{ melSpec: Float32Array, numFrames: number }} `melSpec` is laid out
 *   band-major (`melSpec[band * numFrames + frame]`); both are empty/zero when the
 *   input is shorter than one 400-sample frame.
 */
const computeLogMelSpectrogram = (audioData) => {
  const sampleRate = 16000;
  const fftSize = 512;
  const frameLength = 400; // 25ms
  const frameShift = 160; // 10ms
  const melBands = 80;
  const preemphCoeff = 0.97;

  // 1. Pre-emphasis: y[i] = x[i] - 0.97 * x[i - 1]
  const emphasized = new Float32Array(audioData.length);
  emphasized[0] = audioData[0];
  for (let i = 1; i < audioData.length; i++) {
    emphasized[i] = audioData[i] - preemphCoeff * audioData[i - 1];
  }

  // 2. Analysis window and 3. mel filterbank (0 Hz .. 8 kHz)
  const hann = buildHannWindow(frameLength);
  const filters = buildMelFilterbank(sampleRate, fftSize, melBands, 0, 8000);

  // 4. Framing, FFT and log-mel energies
  const numFrames = Math.floor((emphasized.length - frameLength) / frameShift) + 1;
  if (numFrames <= 0) return { melSpec: new Float32Array(0), numFrames: 0 };

  const melSpec = new Float32Array(melBands * numFrames);
  const re = new Float32Array(fftSize);
  const im = new Float32Array(fftSize);
  const power = new Float64Array(fftSize / 2 + 1);

  for (let frame = 0; frame < numFrames; frame++) {
    const offset = frame * frameShift;

    // Fresh zero-padded frame: samples 400..511 stay zero.
    re.fill(0);
    im.fill(0);
    for (let k = 0; k < frameLength; k++) {
      re[k] = emphasized[offset + k] * hann[k];
    }

    fftInPlace(re, im);

    for (let bin = 0; bin < power.length; bin++) {
      power[bin] = re[bin] * re[bin] + im[bin] * im[bin];
    }

    filters.forEach((weights, band) => {
      let energy = 0;
      for (let bin = 0; bin < power.length; bin++) {
        energy += power[bin] * weights[bin];
      }
      // Floor the energy so silence does not produce -Infinity.
      melSpec[band * numFrames + frame] = Math.log(Math.max(energy, 1e-9));
    });
  }

  // 5. Per-band standardization over the frames of this utterance
  for (let band = 0; band < melBands; band++) {
    const row = melSpec.subarray(band * numFrames, (band + 1) * numFrames);

    let total = 0;
    for (const value of row) total += value;
    const mean = total / numFrames;

    let squared = 0;
    for (const value of row) {
      const deviation = value - mean;
      squared += deviation * deviation;
    }
    const std = Math.sqrt(squared / numFrames) + 1e-9;

    for (let f = 0; f < numFrames; f++) {
      row[f] = (row[f] - mean) / std;
    }
  }

  return { melSpec, numFrames };
};


export default function App() {
  // App State
  const [modelUrl, setModelUrl] = useState("https://huggingface.co/sulabhkatiyar/indicconformer-120m-onnx/resolve/main/ml/model.onnx");
  const [vocabUrl, setVocabUrl] = useState("https://huggingface.co/sulabhkatiyar/indicconformer-120m-onnx/resolve/main/ml/vocab.json");
  
  const [isOrtReady, setIsOrtReady] = useState(false);
  const [session, setSession] = useState(null);
  const [vocab, setVocab] = useState([]);
  const [isLoading, setIsLoading] = useState(false);
  
  const [isRecording, setIsRecording] = useState(false);
  const [status, setStatus] = useState("Please load the model to begin.");
  const [transcript, setTranscript] = useState("");
  const [copiedMessage, setCopiedMessage] = useState("");
  
  const [showSettings, setShowSettings] = useState(false);
  const [errorMessage, setErrorMessage] = useState("");
  
  // Refs for Audio Recording
  const mediaRecorderRef = useRef(null);
  const audioChunksRef = useRef([]);
  const fileInputRef = useRef(null);

  // Load the onnxruntime-web script dynamically.
  // The effect has an empty dependency list, so it is set up once per mount.
  // When the script finishes loading, `isOrtReady` is switched on; a load
  // failure is surfaced through `errorMessage`.
  useEffect(() => {
    // The runtime global already exists (e.g. the script was injected earlier): nothing to load.
    if (window.ort) {
      setIsOrtReady(true);
      return;
    }
    const script = document.createElement('script');
    // NOTE(review): this URL carries no version, so the CDN decides which release is served.
    script.src = "https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/ort.min.js";
    script.async = true;
    script.onload = () => setIsOrtReady(true);
    script.onerror = () => setErrorMessage("Failed to load onnxruntime-web library.");
    // The <script> element is appended to <body> and is not removed by this effect (no cleanup is returned).
    document.body.appendChild(script);
  }, []);

  /**
   * Downloads a vocabulary file and returns it as an array indexed by token id.
   *
   * Accepted formats:
   *  - JSON array of tokens: returned as-is;
   *  - JSON object of `{ "token": index }`: inverted into an array;
   *  - anything that is not valid JSON: treated as plain text with one token per
   *    line (lines are trimmed, empty lines dropped).
   *
   * @param {string} url - Location of the vocabulary file.
   * @returns {Promise<string[]>} Tokens positioned at their ids.
   * @throws {Error} When the request fails or the JSON is neither an array nor an object.
   */
  const loadVocab = async (url) => {
    const res = await fetch(url);
    if (!res.ok) throw new Error(`Failed to load vocab from ${url}`);

    // Read the body exactly once. A Response body is a one-shot stream, so
    // calling res.text() after a failed res.json() would always reject and the
    // plain-text fallback could never run.
    const text = await res.text();

    let data;
    try {
      data = JSON.parse(text);
    } catch {
      // Not JSON (e.g. vocab.txt): fall back to one token per line.
      return text
        .split('\n')
        .map((line) => line.trim())
        .filter((line) => line.length > 0);
    }

    if (Array.isArray(data)) {
      return data; // Simple array of tokens
    }

    // `typeof null` is also 'object', so null must be excluded explicitly.
    if (data !== null && typeof data === 'object') {
      // Handle format {"token": index}
      const vocabArray = [];
      for (const [token, index] of Object.entries(data)) {
        vocabArray[index] = token;
      }
      return vocabArray;
    }

    throw new Error("Invalid vocabulary format");
  };

  /**
   * Downloads the vocabulary and creates the ONNX inference session from the
   * URLs currently held in state, reporting progress through `status`.
   *
   * Vocabulary and session are committed to state together, only after both
   * loaded successfully, so a failed refresh leaves the previous working pair
   * untouched. Failures are reported via `errorMessage`; nothing is thrown.
   *
   * @returns {Promise<void>}
   */
  const initModel = async () => {
    if (!isOrtReady || !window.ort) {
      setErrorMessage("ONNX Runtime is not ready yet.");
      return;
    }

    setIsLoading(true);
    setErrorMessage("");
    setStatus("Downloading Vocabulary...");

    try {
      const loadedVocab = await loadVocab(vocabUrl);

      setStatus("Downloading ONNX Model (100MB+). This may take a while...");
      // Create Inference Session using the WASM execution provider
      const sess = await window.ort.InferenceSession.create(modelUrl, {
        executionProviders: ['wasm'],
      });

      // Commit both pieces at once so the decoder never pairs a new vocabulary
      // with an old model (or the other way round).
      const previousSession = session;
      setVocab(loadedVocab);
      setSession(sess);
      setStatus("Model Loaded & Ready. Press the microphone to speak.");

      // Free the replaced session; a model refresh would otherwise keep the
      // old model's memory alive.
      if (previousSession && typeof previousSession.release === 'function') {
        try {
          await previousSession.release();
        } catch (releaseErr) {
          console.warn("Failed to release previous session", releaseErr);
        }
      }
    } catch (err) {
      console.error(err);
      setErrorMessage(`Initialization Error: ${err.message}. Please check the URLs in Settings.`);
      setStatus("Failed to load model.");
    } finally {
      setIsLoading(false);
    }
  };

  /**
   * Requests microphone access and starts a MediaRecorder whose chunks are
   * collected in `audioChunksRef`. When the recorder stops, `processAndInfer`
   * runs on the collected audio.
   *
   * Any failure (permission denied, unsupported browser, recorder error) is
   * reported through `errorMessage`; nothing is thrown.
   *
   * @returns {Promise<void>}
   */
  const startRecording = async () => {
    let stream = null;
    try {
      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      const mediaRecorder = new MediaRecorder(stream);
      audioChunksRef.current = [];

      mediaRecorder.ondataavailable = (e) => {
        if (e.data.size > 0) audioChunksRef.current.push(e.data);
      };

      mediaRecorder.onstop = processAndInfer;
      mediaRecorderRef.current = mediaRecorder;
      mediaRecorder.start();

      setIsRecording(true);
      setStatus("Recording... Speak in Malayalam.");
      setErrorMessage("");
    } catch (err) {
      console.error(err);
      // If the stream was acquired but the recorder could not be created or
      // started, stop the tracks so the microphone is not left open.
      if (stream) {
        stream.getTracks().forEach((track) => track.stop());
      }
      setErrorMessage("Microphone permission denied or an error occurred.");
    }
  };

  /**
   * Stops the active recording, if any: halts the recorder, clears the
   * recording flag and stops every track of the recorder's stream.
   */
  const stopRecording = () => {
    const recorder = mediaRecorderRef.current;
    if (!recorder || !isRecording) return;

    recorder.stop();
    setIsRecording(false);

    // Stop every track of the recorder's stream.
    for (const track of recorder.stream.getTracks()) {
      track.stop();
    }
  };

  /**
   * Decodes the recorded chunks to 16 kHz audio (first channel only) and runs
   * inference on the result.
   *
   * `isLoading` is held for the duration, as the upload path does, so the
   * controls stay disabled and a second recording cannot overlap a running
   * inference. Errors are reported through `errorMessage`; nothing is thrown.
   *
   * @returns {Promise<void>}
   */
  const processAndInfer = async () => {
    setStatus("Processing Audio...");
    setIsLoading(true);
    let audioCtx = null;
    try {
      // Decode audio and resample to 16kHz Mono Float32
      const blob = new Blob(audioChunksRef.current);
      const arrayBuffer = await blob.arrayBuffer();
      audioCtx = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 16000 });
      const decodedData = await audioCtx.decodeAudioData(arrayBuffer);
      const float32Data = decodedData.getChannelData(0); // Mono channel

      setStatus("Running Inference...");
      await runInference(float32Data);
    } catch (err) {
      console.error(err);
      setErrorMessage(`Audio Processing Error: ${err.message}`);
      setStatus("Ready.");
    } finally {
      // One AudioContext is created per recording; close it so contexts do not
      // pile up across recordings.
      if (audioCtx) {
        try {
          await audioCtx.close();
        } catch (closeErr) {
          console.warn("Failed to close AudioContext", closeErr);
        }
      }
      setIsLoading(false);
    }
  };

  /**
   * Change handler for the hidden file input: decodes the chosen audio file to
   * 16 kHz (first channel only) and runs inference on it.
   *
   * Errors are reported through `errorMessage`; nothing is thrown. The input is
   * always reset afterwards so the same file can be selected again.
   *
   * @param {{ target: HTMLInputElement }} e - Change event of the file input.
   * @returns {Promise<void>}
   */
  const handleFileUpload = async (e) => {
    // Capture the element up front instead of reading it from the event after awaits.
    const input = e.target;
    const file = input.files[0];
    if (!file) return;

    setStatus("Processing Uploaded Audio...");
    setErrorMessage("");
    setIsLoading(true);

    let audioCtx = null;
    try {
      const arrayBuffer = await file.arrayBuffer();
      audioCtx = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 16000 });
      const decodedData = await audioCtx.decodeAudioData(arrayBuffer);
      const float32Data = decodedData.getChannelData(0); // Mono channel

      setStatus("Running Inference on File...");
      await runInference(float32Data);
    } catch (err) {
      console.error(err);
      setErrorMessage(`Audio Upload Error: ${err.message}`);
      setStatus("Ready.");
    } finally {
      // One AudioContext is created per upload; close it so contexts do not
      // pile up across uploads.
      if (audioCtx) {
        try {
          await audioCtx.close();
        } catch (closeErr) {
          console.warn("Failed to close AudioContext", closeErr);
        }
      }
      setIsLoading(false);
      input.value = ''; // Reset input to allow re-uploading the same file
    }
  };

  /**
   * Runs the loaded session on one utterance and appends the decoded text to
   * the transcript.
   *
   * The raw waveform is fed first as `audio_signal` with shape [1, samples].
   * If the run fails with a message containing "Expected: 3" or "Expected: 80",
   * the input is retried as an 80-band log-mel spectrogram of shape
   * [1, 80, frames]. A `length` input is supplied only when the model declares
   * one. Any error is reported through `errorMessage`; nothing is thrown.
   *
   * @param {Float32Array} float32Data - Mono samples.
   * @returns {Promise<void>}
   */
  const runInference = async (float32Data) => {
    // Wraps a single count as the int64 tensor used for the `length` input.
    const lengthTensor = (count) =>
      new window.ort.Tensor('int64', new BigInt64Array([BigInt(count)]), [1]);

    try {
      const { inputNames } = session;

      // Attempt 1: Raw Waveform tensor
      if (!inputNames.includes('audio_signal')) {
        throw new Error(`The model expects inputs: ${inputNames.join(', ')}.`);
      }
      const feeds = {
        audio_signal: new window.ort.Tensor('float32', float32Data, [1, float32Data.length]),
      };
      if (inputNames.includes('length')) {
        feeds.length = lengthTensor(float32Data.length);
      }

      let results;
      try {
        results = await session.run(feeds);
      } catch (runError) {
        // Attempt 2: only a shape complaint about the input triggers the retry
        // with extracted features; everything else is passed on unchanged.
        const message = runError.message;
        const isShapeMismatch =
          Boolean(message) && (message.includes("Expected: 3") || message.includes("Expected: 80"));
        if (!isShapeMismatch) {
          throw runError; // Unhandled error
        }

        console.warn("Raw audio tensor failed. Model likely lacks a feature extractor. Computing 80-bin Log-Mel Spectrogram natively...");

        const features = computeLogMelSpectrogram(float32Data);
        if (features.numFrames <= 0) throw new Error("Audio sample is too short to process.");

        feeds.audio_signal = new window.ort.Tensor('float32', features.melSpec, [1, 80, features.numFrames]);
        if (inputNames.includes('length')) {
          feeds.length = lengthTensor(features.numFrames);
        }

        results = await session.run(feeds);
      }

      // The first declared output is taken to hold the logprobs/logits.
      const [firstOutput] = session.outputNames;
      const scores = results[firstOutput];

      // Standardize dims to [batch, time, vocab]
      const shape = scores.dims.length === 2 ? [1, scores.dims[0], scores.dims[1]] : scores.dims;

      const text = decodeCTC(scores.data, shape, vocab);
      setTranscript((prev) => (prev ? prev + " " + text : prev + text));
      setStatus("Transcription Complete. Ready for next.");
    } catch (err) {
      console.error(err);
      setErrorMessage(`Inference Error: ${err.message}`);
      setStatus("Ready.");
    }
  };

  /**
   * Greedy CTC decoding of the first batch item.
   *
   * For every time step the highest-scoring id is taken; consecutive repeats
   * are collapsed and the blank id (assumed to be the last index, `V - 1`) is
   * dropped. Ids without a vocabulary entry and the special tokens
   * `<blank>`, `<pad>`, `<s>`, `</s>` are skipped. The SentencePiece word
   * marker "▁" (U+2581) and "_" are turned into spaces.
   *
   * @param {ArrayLike<number>} logits - Flat scores, read as `logits[t * V + v]`.
   * @param {number[]} dims - `[batch, time, vocab]`; only `dims[1]` and `dims[2]` are read.
   * @param {string[]} vocabList - Tokens indexed by id.
   * @returns {string} Decoded text with single spaces and no leading/trailing space.
   */
  const decodeCTC = (logits, dims, vocabList) => {
    const T = dims[1]; // Time frames
    const V = dims[2]; // Vocab size emitted by model

    // In typical NeMo models, the blank token is the last index
    const blankId = V - 1;
    const specialTokens = new Set(['<blank>', '<pad>', '<s>', '</s>']);

    const pieces = [];
    let prevId = -1;

    for (let t = 0; t < T; t++) {
      // Argmax over the vocabulary for this frame.
      let bestScore = -Infinity;
      let bestId = -1;
      for (let v = 0; v < V; v++) {
        const score = logits[t * V + v];
        if (score > bestScore) {
          bestScore = score;
          bestId = v;
        }
      }

      if (bestId !== prevId && bestId !== blankId) {
        const token = bestId < vocabList.length ? vocabList[bestId] : "";
        if (token && !specialTokens.has(token)) {
          pieces.push(token);
        }
      }
      // Updated on every frame (blank included) so "a, blank, a" yields two tokens.
      prevId = bestId;
    }

    // The previous pattern replaced a plain space with a plain space, so the
    // U+2581 word marker was never converted and leaked into the transcript.
    return pieces
      .join('')
      .replace(/[\u2581_]/g, ' ')
      .trim()
      .replace(/\s+/g, ' '); // Remove redundant spaces
  };

  /**
   * Copies the transcript to the clipboard through a temporary <textarea> and
   * `document.execCommand('copy')`, then shows a confirmation or failure
   * message for two seconds.
   *
   * `execCommand` signals an unsupported or refused copy by returning `false`
   * rather than throwing, so its result decides which message is shown.
   */
  const handleCopy = () => {
    const textArea = document.createElement("textarea");
    textArea.value = transcript;
    document.body.appendChild(textArea);

    let copied = false;
    try {
      textArea.select();
      copied = document.execCommand('copy');
    } catch (err) {
      console.warn("Copy failed", err);
    } finally {
      // Always remove the helper element, even when selecting or copying throws.
      document.body.removeChild(textArea);
    }

    setCopiedMessage(copied ? "Copied to clipboard!" : "Failed to copy");
    setTimeout(() => setCopiedMessage(""), 2000);
  };

  return (
    <div className="min-h-screen bg-neutral-50 dark:bg-neutral-900 text-neutral-900 dark:text-neutral-100 p-4 sm:p-8 font-sans selection:bg-blue-200 dark:selection:bg-blue-900">
      <div className="max-w-3xl mx-auto space-y-6">
        
        {/* Header */}
        <div className="text-center space-y-2">
          <h1 className="text-3xl sm:text-4xl font-extrabold tracking-tight bg-clip-text text-transparent bg-gradient-to-r from-blue-600 to-indigo-600 dark:from-blue-400 dark:to-indigo-400">
            Malayalam Speech-to-Text
          </h1>
          <p className="text-neutral-500 dark:text-neutral-400 text-sm sm:text-base">
            Powered by IndicConformer-120M & ONNX Runtime Web
          </p>
        </div>

        {/* Main Interface Card */}
        <div className="bg-white dark:bg-neutral-800 rounded-2xl shadow-xl border border-neutral-100 dark:border-neutral-700 overflow-hidden">
          
          {/* Status Bar */}
          <div className="bg-neutral-100 dark:bg-neutral-700/50 px-6 py-3 flex items-center justify-between">
            <div className="flex items-center space-x-2 text-sm font-medium text-neutral-600 dark:text-neutral-300">
              {isLoading ? (
                <Loader2 size={16} className="animate-spin text-blue-500" />
              ) : session ? (
                <CheckCircle2 size={16} className="text-emerald-500" />
              ) : (
                <AlertCircle size={16} className="text-amber-500" />
              )}
              <span>{status}</span>
            </div>
            
            <button 
              onClick={() => setShowSettings(!showSettings)}
              className="text-neutral-400 hover:text-neutral-600 dark:hover:text-neutral-200 transition-colors"
              title="Settings"
            >
              <Settings size={18} />
            </button>
          </div>

          {/* Settings Panel */}
          {showSettings && (
            <div className="px-6 py-4 bg-neutral-50 dark:bg-neutral-800/80 border-b border-neutral-100 dark:border-neutral-700 space-y-4">
              <h3 className="text-sm font-semibold uppercase tracking-wider text-neutral-500 dark:text-neutral-400">
                Model Configuration
              </h3>
              <div className="space-y-3 text-sm">
                <div>
                  <label className="block text-neutral-700 dark:text-neutral-300 mb-1 font-medium">ONNX Model URL</label>
                  <input 
                    type="text" 
                    value={modelUrl}
                    onChange={e => setModelUrl(e.target.value)}
                    className="w-full p-2.5 border border-neutral-300 dark:border-neutral-600 rounded-lg bg-white dark:bg-neutral-900 focus:ring-2 focus:ring-blue-500 focus:border-blue-500 outline-none transition-all"
                  />
                </div>
                <div>
                  <label className="block text-neutral-700 dark:text-neutral-300 mb-1 font-medium">Vocabulary URL (.txt)</label>
                  <input 
                    type="text" 
                    value={vocabUrl}
                    onChange={e => setVocabUrl(e.target.value)}
                    className="w-full p-2.5 border border-neutral-300 dark:border-neutral-600 rounded-lg bg-white dark:bg-neutral-900 focus:ring-2 focus:ring-blue-500 focus:border-blue-500 outline-none transition-all"
                  />
                </div>
                <div className="flex items-center justify-between pt-2">
                  <span className="text-xs text-neutral-500 dark:text-neutral-400 flex items-center">
                    <AlertCircle size={12} className="inline mr-1" /> Re-initialize model after changing URLs.
                  </span>
                  <button
                    onClick={initModel}
                    disabled={isLoading}
                    className="px-4 py-2 bg-neutral-200 dark:bg-neutral-700 hover:bg-neutral-300 dark:hover:bg-neutral-600 rounded-lg font-medium transition-colors text-sm"
                  >
                    Load / Refresh Model
                  </button>
                </div>
              </div>
            </div>
          )}

          {/* Action Area */}
          <div className="p-8 flex flex-col items-center justify-center space-y-6">
            
            {/* Error Message Display */}
            {errorMessage && (
              <div className="w-full p-4 bg-red-50 dark:bg-red-900/20 text-red-600 dark:text-red-400 rounded-xl text-sm border border-red-100 dark:border-red-900/50 flex items-start">
                <AlertCircle size={18} className="mr-2 flex-shrink-0 mt-0.5" />
                <span>{errorMessage}</span>
              </div>
            )}

            {!session && !isLoading && !errorMessage && (
              <button
                onClick={initModel}
                className="px-8 py-4 bg-blue-600 hover:bg-blue-700 text-white rounded-xl font-bold shadow-lg hover:shadow-blue-600/30 transition-all transform hover:scale-105 active:scale-95"
              >
                Initialize Model
              </button>
            )}

            {/* Input Controls */}
            <div className="flex items-center space-x-6">
              {/* Microphone Button */}
              <button
                onClick={isRecording ? stopRecording : startRecording}
                disabled={!session || isLoading}
                className={`p-8 rounded-full transition-all duration-300 group ${
                  !session || isLoading 
                    ? 'bg-neutral-200 dark:bg-neutral-800 text-neutral-400 dark:text-neutral-600 cursor-not-allowed'
                    : isRecording 
                      ? 'bg-red-500 hover:bg-red-600 animate-pulse text-white shadow-[0_0_40px_rgba(239,68,68,0.5)]' 
                      : 'bg-blue-600 hover:bg-blue-700 text-white shadow-lg hover:shadow-[0_0_30px_rgba(37,99,235,0.4)] transform hover:scale-105 active:scale-95'
                }`}
                title="Record Audio"
              >
                {isRecording ? <Square size={40} className="fill-current" /> : <Mic size={40} />}
              </button>

              {/* Upload Button */}
              <button
                onClick={() => fileInputRef.current?.click()}
                disabled={!session || isLoading || isRecording}
                className={`p-8 rounded-full transition-all duration-300 group ${
                  !session || isLoading || isRecording
                    ? 'bg-neutral-200 dark:bg-neutral-800 text-neutral-400 dark:text-neutral-600 cursor-not-allowed'
                    : 'bg-indigo-600 hover:bg-indigo-700 text-white shadow-lg hover:shadow-[0_0_30px_rgba(79,70,229,0.4)] transform hover:scale-105 active:scale-95'
                }`}
                title="Upload Audio File"
              >
                <Upload size={40} />
              </button>
              <input 
                type="file" 
                ref={fileInputRef} 
                onChange={handleFileUpload} 
                accept="audio/*" 
                className="hidden" 
              />
            </div>
            
            <p className="text-neutral-500 dark:text-neutral-400 font-medium text-center">
              {isRecording ? "Tap to Stop & Transcribe" : (session ? "Tap Mic to Record or Upload an Audio File" : "Model required to process audio")}
            </p>
          </div>

          {/* Transcript Area */}
          <div className="border-t border-neutral-100 dark:border-neutral-700 p-6 bg-neutral-50 dark:bg-neutral-800/50">
            <div className="flex items-center justify-between mb-3">
              <h3 className="font-semibold text-neutral-700 dark:text-neutral-300">Transcript</h3>
              
              {/* Copy Tools */}
              <div className="flex items-center space-x-3">
                {copiedMessage && <span className="text-xs text-green-500 font-medium animate-fade-in">{copiedMessage}</span>}
                <button 
                  onClick={handleCopy}
                  disabled={!transcript}
                  className="p-2 text-neutral-400 hover:text-blue-500 disabled:opacity-50 disabled:cursor-not-allowed transition-colors rounded-lg hover:bg-blue-50 dark:hover:bg-blue-900/20"
                  title="Copy Transcript"
                >
                  <Copy size={18} />
                </button>
                <button 
                  onClick={() => setTranscript("")}
                  disabled={!transcript}
                  className="text-xs font-medium px-3 py-1.5 rounded-lg text-neutral-500 hover:text-red-500 hover:bg-red-50 dark:hover:bg-red-900/20 transition-colors disabled:opacity-50 disabled:cursor-not-allowed"
                >
                  Clear
                </button>
              </div>
            </div>
            
            <div className="w-full min-h-[120px] p-4 bg-white dark:bg-neutral-900 border border-neutral-200 dark:border-neutral-700 rounded-xl text-neutral-800 dark:text-neutral-200 font-medium text-lg leading-relaxed whitespace-pre-wrap">
              {transcript || <span className="text-neutral-400 dark:text-neutral-600 italic">Transcription will appear here...</span>}
            </div>
          </div>

        </div>
      </div>
    </div>
  );
}