julianmack's picture
Upload demo files (#1)
4a5024e
import { useState, useRef, useCallback, useEffect } from "react";
import { useTranscriber } from "./transcriberContext.ts";
import Confetti, { type ConfettiHandle } from "./Confetti.tsx";
import { langToFlag } from "./utils.ts";
import {
CohereLogo,
UploadIcon,
MicrophoneIcon,
CopyIcon,
DownloadIcon,
CheckIcon,
FileIcon,
MicSmallIcon,
} from "./icons.tsx";
type Screen = "landing" | "loading" | "transcription";
type TranscriptionMode = "idle" | "file" | "microphone";
// ---- Constants ----
const SCREEN_TRANSITION_MS = 600; // must match .screen CSS transition duration in index.css
const COPY_FEEDBACK_MS = 2000;
const POST_LOAD_DELAY_MS = 500;
const AUDIO_SAMPLE_RATE = 16000;
const LANGUAGES: { code: string; label: string; native: string }[] = [
{ code: "en", label: "English", native: "English" },
{ code: "fr", label: "French", native: "Français" },
{ code: "de", label: "German", native: "Deutsch" },
{ code: "es", label: "Spanish", native: "Español" },
{ code: "it", label: "Italian", native: "Italiano" },
{ code: "pt", label: "Portuguese", native: "Português" },
{ code: "nl", label: "Dutch", native: "Nederlands" },
{ code: "pl", label: "Polish", native: "Polski" },
{ code: "el", label: "Greek", native: "Ελληνικά" },
{ code: "ar", label: "Arabic", native: "العربية" },
{ code: "ja", label: "Japanese", native: "日本語" },
{ code: "zh", label: "Chinese", native: "中文" },
{ code: "vi", label: "Vietnamese", native: "Tiếng Việt" },
{ code: "ko", label: "Korean", native: "한국어" },
];
// ---- Formatting helpers ----
function formatDuration(seconds: number): string {
if (seconds < 60) return `${seconds.toFixed(1)}s`;
const mins = Math.floor(seconds / 60);
const secs = seconds % 60;
return secs > 0 ? `${mins}m ${secs.toFixed(0)}s` : `${mins}m`;
}
// ---- Audio helpers ----
async function decodeAudio(arrayBuffer: ArrayBuffer): Promise<Float32Array> {
const audioCtx = new AudioContext({ sampleRate: AUDIO_SAMPLE_RATE });
const decoded = await audioCtx.decodeAudioData(arrayBuffer);
const float32 = decoded.getChannelData(0);
await audioCtx.close();
return float32;
}
// ---- Main App ----
function App() {
const [screen, setScreen] = useState<Screen>("landing");
const [prevScreen, setPrevScreen] = useState<Screen | null>(null);
const [mode, setMode] = useState<TranscriptionMode>("idle");
const [language, setLanguage] = useState("en");
const [transcriptionText, setTranscriptionText] = useState("");
const [streamedText, setStreamedText] = useState("");
const [isTranscribing, setIsTranscribing] = useState(false);
const [audioFileName, setAudioFileName] = useState<string | null>(null);
const [isRecording, setIsRecording] = useState(false);
const [copied, setCopied] = useState(false);
const [isDragging, setIsDragging] = useState(false);
const [stats, setStats] = useState<{
audioDuration: number;
elapsed: number;
} | null>(null);
const fileInputRef = useRef<HTMLInputElement>(null);
const videoRef = useRef<HTMLVideoElement>(null);
const mediaRecorderRef = useRef<MediaRecorder | null>(null);
const audioChunksRef = useRef<Blob[]>([]);
const outputRef = useRef<HTMLDivElement>(null);
const streamedTextRef = useRef("");
const confettiRef = useRef<ConfettiHandle>(null);
const transcriber = useTranscriber();
const displayText = isTranscribing ? streamedText : transcriptionText;
// ---- Screen transitions ----
const transitionTo = useCallback(
(next: Screen) => {
setPrevScreen(screen);
setScreen(next);
setTimeout(() => setPrevScreen(null), SCREEN_TRANSITION_MS);
},
[screen],
);
const getScreenClass = useCallback(
(s: Screen) => {
if (s === screen) return "screen screen-enter";
if (s === prevScreen) return "screen screen-exit";
return "screen screen-hidden";
},
[screen, prevScreen],
);
// ---- Video autoplay fallback ----
useEffect(() => {
if (screen === "landing" && videoRef.current) {
videoRef.current.play().catch(() => {});
}
}, [screen]);
// ---- Model loading: start when entering loading screen ----
useEffect(() => {
if (screen !== "loading") return;
transcriber.load().then(() => {
setTimeout(() => transitionTo("transcription"), POST_LOAD_DELAY_MS);
});
}, [screen, transcriber, transitionTo]);
// ---- Auto-scroll output during streaming ----
useEffect(() => {
if (isTranscribing && outputRef.current) {
outputRef.current.scrollTop = outputRef.current.scrollHeight;
}
}, [streamedText, isTranscribing]);
// ---- Streaming callback ----
const onToken = useCallback((token: string) => {
streamedTextRef.current += token;
setStreamedText(streamedTextRef.current);
}, []);
// ---- Run transcription (shared by file + mic) ----
const runTranscription = useCallback(
async (audio: Float32Array) => {
setIsTranscribing(true);
setTranscriptionText("");
setStreamedText("");
setStats(null);
streamedTextRef.current = "";
const audioDuration = audio.length / AUDIO_SAMPLE_RATE;
const startTime = performance.now();
try {
const finalText = await transcriber.transcribe(
audio,
language,
onToken,
);
const elapsed = (performance.now() - startTime) / 1000;
setTranscriptionText(finalText);
setStats({ audioDuration, elapsed });
} catch (err) {
setTranscriptionText(
`Error: ${err instanceof Error ? err.message : "Transcription failed"}`,
);
} finally {
setIsTranscribing(false);
}
},
[transcriber, language, onToken],
);
// ---- File handling (shared by input + drag-and-drop) ----
const processFile = useCallback(
async (file: File) => {
setAudioFileName(file.name);
setMode("file");
const audioData = await decodeAudio(await file.arrayBuffer());
runTranscription(audioData);
},
[runTranscription],
);
const handleFileSelect = useCallback(
(e: React.ChangeEvent<HTMLInputElement>) => {
const file = e.target.files?.[0];
if (!file) return;
processFile(file);
},
[processFile],
);
// ---- Drag and drop ----
const dragCounter = useRef(0);
const handleDragEnter = useCallback(
(e: React.DragEvent) => {
e.preventDefault();
if (screen !== "transcription" || mode !== "idle") return;
dragCounter.current++;
if (dragCounter.current === 1) setIsDragging(true);
},
[screen, mode],
);
const handleDragLeave = useCallback((e: React.DragEvent) => {
e.preventDefault();
dragCounter.current--;
if (dragCounter.current === 0) setIsDragging(false);
}, []);
const handleDragOver = useCallback((e: React.DragEvent) => {
e.preventDefault();
}, []);
const handleDrop = useCallback(
(e: React.DragEvent) => {
e.preventDefault();
dragCounter.current = 0;
setIsDragging(false);
if (screen !== "transcription" || mode !== "idle") return;
const file = e.dataTransfer.files?.[0];
if (!file) return;
processFile(file);
},
[screen, mode, processFile],
);
// ---- Microphone ----
const startRecording = useCallback(async () => {
setMode("microphone");
setIsRecording(true);
setTranscriptionText("");
setStreamedText("");
audioChunksRef.current = [];
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const recorder = new MediaRecorder(stream);
mediaRecorderRef.current = recorder;
recorder.ondataavailable = (e) => {
if (e.data.size > 0) {
audioChunksRef.current.push(e.data);
}
};
recorder.onstop = async () => {
stream.getTracks().forEach((t) => t.stop());
setIsRecording(false);
try {
const blob = new Blob(audioChunksRef.current, { type: "audio/webm" });
const float32 = await decodeAudio(await blob.arrayBuffer());
runTranscription(float32);
} catch (err) {
setTranscriptionText(
`Error: ${err instanceof Error ? err.message : "Transcription failed"}`,
);
}
};
recorder.start();
} catch (err) {
setIsRecording(false);
setMode("idle");
console.error("Microphone access denied:", err);
}
}, [runTranscription]);
const stopRecording = useCallback(() => {
mediaRecorderRef.current?.stop();
}, []);
// ---- Copy to clipboard ----
const copyToClipboard = useCallback(() => {
navigator.clipboard.writeText(transcriptionText).then(() => {
setCopied(true);
setTimeout(() => setCopied(false), COPY_FEEDBACK_MS);
});
}, [transcriptionText]);
// ---- Download as .txt ----
const downloadText = useCallback(() => {
const blob = new Blob([transcriptionText], { type: "text/plain" });
const url = URL.createObjectURL(blob);
const a = document.createElement("a");
a.href = url;
a.download = "transcription.txt";
a.click();
URL.revokeObjectURL(url);
}, [transcriptionText]);
// ---- Reset ----
const resetTranscription = useCallback(() => {
setMode("idle");
setTranscriptionText("");
setStreamedText("");
streamedTextRef.current = "";
setIsTranscribing(false);
setAudioFileName(null);
setIsRecording(false);
setCopied(false);
setStats(null);
if (fileInputRef.current) fileInputRef.current.value = "";
}, []);
// ---- Render ----
const isDone = !isTranscribing && !isRecording && !!transcriptionText;
return (
<div
className="relative w-screen h-screen overflow-hidden bg-white"
onDragEnter={handleDragEnter}
onDragLeave={handleDragLeave}
onDragOver={handleDragOver}
onDrop={handleDrop}
>
{/* ==================== Screen 1: Landing ==================== */}
<div
className={`${getScreenClass("landing")} cursor-pointer`}
onClick={() => screen === "landing" && transitionTo("loading")}
>
{/* Video background */}
<video
ref={videoRef}
className="absolute inset-0 w-full h-full object-cover"
src="/video.mp4"
autoPlay
loop
muted
playsInline
/>
{/* Click hint */}
<div className="absolute bottom-12 left-0 right-0 text-center z-10">
<p className="text-xl text-white animate-pulse-glow">
Click anywhere to begin
</p>
</div>
</div>
{/* ==================== Screen 2: Loading ==================== */}
<div
className={`${getScreenClass("loading")} flex flex-col items-center justify-center bg-white`}
>
<div className="flex flex-col items-center gap-8 animate-fade-in-up">
{/* Spinner */}
<div className="relative w-16 h-16">
<div
className="absolute inset-0 rounded-full animate-spin-slow"
style={{
border: "2px solid var(--cohere-border)",
borderTopColor: "var(--cohere-purple)",
borderRightColor: "var(--cohere-cyan)",
}}
/>
<div className="absolute inset-3 flex items-center justify-center">
<CohereLogo size={24} />
</div>
</div>
{/* Status text */}
<div className="flex flex-col items-center gap-4">
<p className="text-xl text-[var(--cohere-text)]">
Loading model...
</p>
{/* Progress bar */}
<div className="w-80 h-1.5 bg-[var(--cohere-border)] rounded-full overflow-hidden">
<div
className="h-full rounded-full transition-all duration-300 ease-out"
style={{
width: `${transcriber.progress}%`,
background:
"linear-gradient(90deg, var(--cohere-deep-purple), var(--cohere-purple), var(--cohere-cyan))",
}}
/>
</div>
<p className="text-sm text-[var(--cohere-text-muted)]">
{transcriber.statusText}
</p>
</div>
</div>
{/* Footer */}
<p className="absolute bottom-8 text-xs text-[var(--cohere-text-muted)]">
Powered by Transformers.js
</p>
</div>
{/* ==================== Screen 3: Transcription ==================== */}
<div
className={`${getScreenClass("transcription")} flex flex-col bg-white`}
>
{/* Header */}
<header className="flex items-center px-8 py-5 border-b border-[var(--cohere-border)]">
<img src="/cohere.svg" alt="Cohere" className="h-6" />
</header>
{/* Main content */}
<div className="flex-1 flex items-center justify-center px-8 py-10">
{mode === "idle" ? (
/* ---- Mode Selection + Language ---- */
<div className="flex flex-col items-center gap-10 animate-fade-in-up">
{/* Upload / Record cards */}
<div className="flex flex-col sm:flex-row gap-8">
{/* Upload File Card */}
<button
onClick={() => fileInputRef.current?.click()}
className="group w-72 h-72 rounded-2xl border border-[var(--cohere-border)] bg-[var(--cohere-surface)] hover:border-[var(--cohere-purple)] transition-all duration-300 cursor-pointer flex flex-col items-center justify-center gap-5 hover:shadow-[0_0_60px_-15px_var(--cohere-purple)]"
>
<div className="text-[var(--cohere-text-muted)] group-hover:text-[var(--cohere-purple)] transition-colors duration-300">
<UploadIcon />
</div>
<span className="text-xl text-[var(--cohere-text)]">
Choose File
</span>
<span className="text-base text-[var(--cohere-text-muted)]">
Select audio/video file
</span>
</button>
{/* Record Audio Card */}
<button
onClick={startRecording}
className="group w-72 h-72 rounded-2xl border border-[var(--cohere-border)] bg-[var(--cohere-surface)] hover:border-[var(--cohere-purple)] transition-all duration-300 cursor-pointer flex flex-col items-center justify-center gap-5 hover:shadow-[0_0_60px_-15px_var(--cohere-purple)]"
>
<div className="text-[var(--cohere-text-muted)] group-hover:text-[var(--cohere-purple)] transition-colors duration-300">
<MicrophoneIcon />
</div>
<span className="text-xl text-[var(--cohere-text)]">
Record Audio
</span>
<span className="text-base text-[var(--cohere-text-muted)]">
Use your microphone
</span>
</button>
</div>
{/* Language selector */}
<div className="flex flex-col items-center gap-3">
<span className="text-sm text-[var(--cohere-text-muted)]">
Language
</span>
<div className="flex flex-wrap justify-center gap-2 max-w-xl">
{LANGUAGES.map((lang) => (
<button
key={lang.code}
onClick={(e) => {
setLanguage(lang.code);
const rect = e.currentTarget.getBoundingClientRect();
confettiRef.current?.burst(
rect.left + rect.width / 2,
rect.top + rect.height / 2,
langToFlag(lang.code),
);
}}
className={`px-4 py-2 rounded-full text-sm transition-colors duration-200 cursor-pointer border ${
language === lang.code
? "bg-[var(--cohere-purple)] text-white border-transparent"
: "bg-[var(--cohere-surface)] text-[var(--cohere-text-muted)] border-[var(--cohere-border)] hover:border-[var(--cohere-purple)] hover:text-[var(--cohere-purple)]"
}`}
>
{lang.label}
{lang.label !== lang.native && (
<span
className={`ml-1 ${
language === lang.code
? "text-white/60"
: "text-[var(--cohere-text-muted)]/50"
}`}
>
/ {lang.native}
</span>
)}
</button>
))}
</div>
</div>
</div>
) : (
/* ---- Transcription Area ---- */
<div className="w-full max-w-3xl flex flex-col gap-6 animate-fade-in-up">
{/* Source indicator + status */}
<div className="flex items-center gap-3">
<div className="text-[var(--cohere-purple)]">
{mode === "file" ? <FileIcon /> : <MicSmallIcon />}
</div>
<span className="text-[var(--cohere-text)] text-base">
{mode === "file" ? audioFileName : "Microphone recording"}
</span>
{/* Recording controls */}
{isRecording && (
<div className="flex items-center gap-3 ml-auto">
<span className="flex items-center gap-2 text-sm text-red-500">
<span className="w-2 h-2 bg-red-500 rounded-full animate-pulse" />
Recording...
</span>
<button
onClick={stopRecording}
className="px-4 py-1.5 text-sm bg-red-50 text-red-500 border border-red-200 rounded-lg hover:bg-red-100 transition-colors cursor-pointer"
>
Stop
</button>
</div>
)}
{/* Status badge */}
{isTranscribing && !isRecording && (
<span className="ml-auto flex items-center gap-2 text-sm text-[var(--cohere-text-muted)]">
<svg
className="animate-spin-slow"
width="14"
height="14"
viewBox="0 0 24 24"
fill="none"
stroke="currentColor"
strokeWidth="2"
>
<path d="M21 12a9 9 0 1 1-6.219-8.56" />
</svg>
Transcribing...
</span>
)}
{isDone && (
<span className="ml-auto flex items-center gap-2 text-sm text-emerald-600">
<CheckIcon />
{stats
? `Transcribed ${formatDuration(stats.audioDuration)} of audio in ${formatDuration(stats.elapsed)}`
: "Complete"}
</span>
)}
</div>
{/* Transcription output */}
<div
ref={outputRef}
className="bg-[var(--cohere-surface)] rounded-xl p-8 min-h-[280px] max-h-[500px] overflow-y-auto border border-[var(--cohere-border)]"
>
{displayText ? (
<p className="text-xl leading-relaxed text-[var(--cohere-text)] whitespace-pre-wrap">
{displayText.trim()}
</p>
) : isRecording ? (
<p className="text-[var(--cohere-text-muted)] italic">
Listening... Press stop when you're done speaking.
</p>
) : isTranscribing ? (
<div className="space-y-3">
<div className="h-4 w-full rounded animate-shimmer" />
<div className="h-4 w-5/6 rounded animate-shimmer" />
<div className="h-4 w-4/6 rounded animate-shimmer" />
</div>
) : null}
</div>
{/* Actions */}
<div className="flex items-center justify-center gap-4">
{isDone && (
<>
<button
onClick={copyToClipboard}
className="flex items-center gap-2 px-5 py-2.5 text-base border border-[var(--cohere-border)] text-[var(--cohere-text-muted)] rounded-lg hover:border-[var(--cohere-purple)] hover:text-[var(--cohere-purple)] transition-all duration-200 cursor-pointer"
>
{copied ? <CheckIcon /> : <CopyIcon />}
{copied ? "Copied" : "Copy"}
</button>
<button
onClick={downloadText}
className="flex items-center gap-2 px-5 py-2.5 text-base border border-[var(--cohere-border)] text-[var(--cohere-text-muted)] rounded-lg hover:border-[var(--cohere-purple)] hover:text-[var(--cohere-purple)] transition-all duration-200 cursor-pointer"
>
<DownloadIcon />
Download
</button>
<button
onClick={resetTranscription}
className="flex items-center gap-2 px-5 py-2.5 text-base border border-[var(--cohere-purple)] text-[var(--cohere-purple)] rounded-lg hover:bg-[var(--cohere-purple)] hover:text-white transition-all duration-200 cursor-pointer"
>
New Transcription
</button>
</>
)}
</div>
</div>
)}
</div>
{/* Footer */}
<p className="pb-4 text-center text-xs text-[var(--cohere-text-muted)]">
Runs 100% locally in your browser with WebGPU
</p>
</div>
{/* Drag overlay */}
{isDragging && (
<div className="fixed inset-0 z-40 flex items-center justify-center bg-white/80 backdrop-blur-sm pointer-events-none">
<div className="flex flex-col items-center gap-4 rounded-2xl border-2 border-dashed border-[var(--cohere-purple)] bg-[var(--cohere-surface)] px-16 py-12">
<UploadIcon />
<p className="text-lg text-[var(--cohere-text)]">
Drop audio/video file here
</p>
</div>
</div>
)}
{/* Confetti overlay */}
<Confetti ref={confettiRef} />
{/* Hidden file input */}
<input
ref={fileInputRef}
type="file"
accept="audio/*,video/*"
className="hidden"
onChange={handleFileSelect}
/>
</div>
);
}
export default App;