Ministral_3B_WebGPU / src /components /CaptioningView.tsx
Joffrey Thomas
Multiple sources
6709b20
Raw
History Blame Contribute Delete
9.41 kB
import { useState, useRef, useEffect, useCallback } from "react";
import WebcamCapture from "./WebcamCapture";
import PromptInput from "./PromptInput";
import LiveCaption, { type HistoryEntry } from "./LiveCaption";
import { useVLMContext } from "../context/useVLMContext";
import { PROMPTS, TIMING, THEME } from "../constants";
import type { VideoSource } from "../types";
/** Attributes shared by the root `<svg>` element of every source icon. */
const ICON_SVG_PROPS = {
  className: "w-4 h-4",
  fill: "none",
  viewBox: "0 0 24 24",
  stroke: "currentColor",
  strokeWidth: 2,
} as const;

/** Stroke attributes shared by every `<path>` inside a source icon. */
const ICON_STROKE_PROPS = {
  strokeLinecap: "round",
  strokeLinejoin: "round",
} as const;

/**
 * Outline icon for each kind of video source, keyed by the source's `type`
 * string (the same keys as `SOURCE_LABELS`). Icons take their color from the
 * surrounding text color via `stroke="currentColor"`.
 */
const SOURCE_ICONS: Record<string, React.ReactNode> = {
  webcam: (
    <svg {...ICON_SVG_PROPS}>
      <path
        {...ICON_STROKE_PROPS}
        d="M15.75 10.5l4.72-4.72a.75.75 0 011.28.53v11.38a.75.75 0 01-1.28.53l-4.72-4.72M4.5 18.75h9a2.25 2.25 0 002.25-2.25v-9a2.25 2.25 0 00-2.25-2.25h-9A2.25 2.25 0 002.25 7.5v9a2.25 2.25 0 002.25 2.25z"
      />
    </svg>
  ),
  screen: (
    <svg {...ICON_SVG_PROPS}>
      <path
        {...ICON_STROKE_PROPS}
        d="M9 17.25v1.007a3 3 0 01-.879 2.122L7.5 21h9l-.621-.621A3 3 0 0115 18.257V17.25m6-12V15a2.25 2.25 0 01-2.25 2.25H5.25A2.25 2.25 0 013 15V5.25m18 0A2.25 2.25 0 0018.75 3H5.25A2.25 2.25 0 003 5.25m18 0V12a2.25 2.25 0 01-2.25 2.25H5.25A2.25 2.25 0 013 12V5.25"
      />
    </svg>
  ),
  upload: (
    <svg {...ICON_SVG_PROPS}>
      <path
        {...ICON_STROKE_PROPS}
        d="M3 16.5v2.25A2.25 2.25 0 005.25 21h13.5A2.25 2.25 0 0021 18.75V16.5m-13.5-9L12 3m0 0l4.5 4.5M12 3v13.5"
      />
    </svg>
  ),
  example: (
    <svg {...ICON_SVG_PROPS}>
      <path
        {...ICON_STROKE_PROPS}
        d="M21 12a9 9 0 11-18 0 9 9 0 0118 0z"
      />
      <path
        {...ICON_STROKE_PROPS}
        d="M15.91 11.672a.375.375 0 010 .656l-5.603 3.113a.375.375 0 01-.557-.328V8.887c0-.286.307-.466.557-.327l5.603 3.112z"
      />
    </svg>
  ),
};
/**
 * Human-readable label for each kind of video source, keyed by the source's
 * `type` string. Used as the fallback text of the source indicator when the
 * source carries no name of its own.
 */
const SOURCE_LABELS: Record<string, string> = {
  webcam: "Webcam",
  screen: "Screen",
  upload: "Upload",
  example: "Example",
};
/** Props accepted by {@link CaptioningView}. */
interface CaptioningViewProps {
  /** Ref to the `<video>` element whose frames are sent to the model. */
  videoRef: React.RefObject<HTMLVideoElement | null>;
  /**
   * The active video source. Its `type` selects the indicator icon and
   * fallback label; its `name`, when set, is shown instead of the label.
   */
  videoSource?: VideoSource | null;
  /**
   * Called when the source indicator is clicked. The indicator is rendered
   * only when both `videoSource` and this callback are provided.
   */
  onChangeSource?: () => void;
}
/**
 * Whether `video` currently exposes a frame that can be captured: it has data
 * for the current playback position (`readyState >= 2`, i.e.
 * HAVE_CURRENT_DATA or later), is not paused, and reports a non-zero width.
 */
function isFrameReady(video: HTMLVideoElement): boolean {
  return video.readyState >= 2 && !video.paused && video.videoWidth > 0;
}

/**
 * Repeatedly runs inference on the current video frame while `isRunning` is
 * true and the VLM context reports `isLoaded`.
 *
 * Each iteration awaits one `runInference` call (skipped when no frame is
 * ready) and then waits `TIMING.FRAME_CAPTURE_DELAY` milliseconds. The loop is
 * cancelled through an `AbortController` when the effect is cleaned up; once
 * cancelled, no callback is invoked any more, including the progress
 * callbacks of an inference that is still in flight.
 *
 * @param videoRef Ref to the video element to caption; re-read on every
 *   iteration so an element attached after the loop started is picked up.
 * @param isRunning Whether the loop should be active.
 * @param promptRef Ref holding the prompt; read on every iteration so prompt
 *   changes never restart the loop.
 * @param onCaptionUpdate Receives caption text forwarded by `runInference`
 *   while it runs, and the final result once it resolves.
 * @param onError Receives the error message when an inference call throws.
 * @param onGenerationComplete Receives the final, non-empty result of each
 *   inference call.
 * @param onStatsUpdate Receives the stats objects reported by `runInference`.
 */
function useCaptioningLoop(
  videoRef: React.RefObject<HTMLVideoElement | null>,
  isRunning: boolean,
  promptRef: React.RefObject<string>,
  onCaptionUpdate: (caption: string) => void,
  onError: (error: string) => void,
  onGenerationComplete: (caption: string) => void,
  onStatsUpdate: (stats: { tps?: number; ttft?: number }) => void,
) {
  const { isLoaded, runInference } = useVLMContext();

  // Keep the latest callbacks in refs so that a new callback identity does
  // not tear down and restart the capture loop.
  const onCaptionUpdateRef = useRef(onCaptionUpdate);
  const onErrorRef = useRef(onError);
  const onGenerationCompleteRef = useRef(onGenerationComplete);
  const onStatsUpdateRef = useRef(onStatsUpdate);

  useEffect(() => {
    onCaptionUpdateRef.current = onCaptionUpdate;
    onErrorRef.current = onError;
    onGenerationCompleteRef.current = onGenerationComplete;
    onStatsUpdateRef.current = onStatsUpdate;
  }, [onCaptionUpdate, onError, onGenerationComplete, onStatsUpdate]);

  useEffect(() => {
    if (!isRunning || !isLoaded) return;

    // One controller per effect run. React always runs the cleanup below
    // before the next run of this effect, so no shared ref is needed to stop
    // the previous loop.
    const controller = new AbortController();
    const { signal } = controller;

    const captureLoop = async (): Promise<void> => {
      while (!signal.aborted) {
        // Read the ref on every pass rather than once up front, so a video
        // element that mounts (or is swapped) after the loop started is used.
        const video = videoRef.current;

        if (video && isFrameReady(video)) {
          try {
            const result = await runInference(
              video,
              promptRef.current ?? "",
              // Gate the progress callbacks on the signal: without this, an
              // inference still in flight after pause/restart would keep
              // overwriting the caption and stats.
              (partial: string) => {
                if (!signal.aborted) onCaptionUpdateRef.current(partial);
              },
              (stats) => {
                if (!signal.aborted) onStatsUpdateRef.current(stats);
              },
            );
            if (result && !signal.aborted) {
              onCaptionUpdateRef.current(result);
              onGenerationCompleteRef.current(result);
            }
          } catch (error) {
            if (!signal.aborted) {
              // Log before notifying so the error is recorded even if the
              // consumer's handler throws.
              console.error("Error processing frame:", error);
              const message =
                error instanceof Error ? error.message : String(error);
              onErrorRef.current(message);
            }
          }
        }

        if (signal.aborted) break;
        await new Promise((resolve) =>
          setTimeout(resolve, TIMING.FRAME_CAPTURE_DELAY),
        );
      }
    };

    // NB: Defer the start with a setTimeout so that the cleanup can cancel the
    // loop before it ever begins. This matters in React's strict mode, which
    // runs effects twice in development.
    const startTimer = setTimeout(() => {
      captureLoop().catch((error: unknown) => {
        console.error("Captioning loop stopped unexpectedly:", error);
      });
    }, 0);

    return () => {
      clearTimeout(startTimer);
      controller.abort();
    };
  }, [isRunning, isLoaded, runInference, promptRef, videoRef]);
}
/** Maximum number of completed captions kept in the history list. */
const MAX_HISTORY_ENTRIES = 50;

/**
 * Full-screen captioning view: drives the captioning loop for the given video
 * element and renders the capture controls, the source indicator, the prompt
 * input and the live caption panel (with history and stats).
 *
 * @param videoRef Ref to the `<video>` element whose frames are captioned.
 * @param videoSource Active video source; controls the indicator's icon/label.
 * @param onChangeSource Called when the source indicator is clicked.
 */
export default function CaptioningView({
  videoRef,
  videoSource,
  onChangeSource,
}: CaptioningViewProps) {
  const { imageSize, setImageSize } = useVLMContext();
  const [caption, setCaption] = useState<string>("");
  const [isLoopRunning, setIsLoopRunning] = useState<boolean>(true);
  const [currentPrompt, setCurrentPrompt] = useState<string>(PROMPTS.default);
  const [error, setError] = useState<string | null>(null);
  const [history, setHistory] = useState<HistoryEntry[]>([]);
  const [stats, setStats] = useState<{ tps?: number; ttft?: number }>({});

  // The loop reads the prompt through a ref so that editing the prompt does
  // not restart the loop.
  const promptRef = useRef<string>(currentPrompt);
  useEffect(() => {
    promptRef.current = currentPrompt;
  }, [currentPrompt]);

  // A fresh caption supersedes any previously shown error.
  const handleCaptionUpdate = useCallback((newCaption: string) => {
    setCaption(newCaption);
    setError(null);
  }, []);

  // Surface the error both as state and in the caption area.
  const handleError = useCallback((errorMessage: string) => {
    setError(errorMessage);
    setCaption(`Error: ${errorMessage}`);
  }, []);

  // Prepend the finished caption to the history, newest first, capped at
  // MAX_HISTORY_ENTRIES.
  const handleGenerationComplete = useCallback((text: string) => {
    // `hourCycle: "h23"` pins the clock to 00-23. `hour12: false` leaves the
    // choice between h23 and h24 to the engine's locale data, which can
    // render midnight as "24:mm:ss".
    const timestamp = new Date().toLocaleTimeString("en-US", {
      hourCycle: "h23",
      hour: "2-digit",
      minute: "2-digit",
      second: "2-digit",
    });
    setHistory((prev) =>
      [{ timestamp, text }, ...prev].slice(0, MAX_HISTORY_ENTRIES),
    );
  }, []);

  // Merge partial stats so a report carrying only one metric keeps the other.
  const handleStatsUpdate = useCallback(
    (newStats: { tps?: number; ttft?: number }) => {
      setStats((prev) => ({ ...prev, ...newStats }));
    },
    [],
  );

  useCaptioningLoop(
    videoRef,
    isLoopRunning,
    promptRef,
    handleCaptionUpdate,
    handleError,
    handleGenerationComplete,
    handleStatsUpdate,
  );

  const handlePromptChange = useCallback((prompt: string) => {
    setCurrentPrompt(prompt);
    setError(null);
  }, []);

  // Clearing unconditionally is equivalent to clearing only when an error is
  // set (setting `null` over `null` is a no-op) and keeps this callback's
  // identity stable across error changes.
  const handleToggleLoop = useCallback(() => {
    setIsLoopRunning((prev) => !prev);
    setError(null);
  }, []);

  // Text for the source indicator: the source's own name when it has one,
  // otherwise the generic label for its type, otherwise "Video".
  const getSourceLabel = (): string => {
    if (!videoSource) return "Video";
    if (videoSource.name) return videoSource.name;
    return SOURCE_LABELS[videoSource.type] || "Video";
  };

  return (
    <div className="absolute inset-0 text-white">
      <div className="relative w-full h-full">
        <WebcamCapture
          isRunning={isLoopRunning}
          onToggleRunning={handleToggleLoop}
          error={error}
          imageSize={imageSize}
          onImageSizeChange={setImageSize}
        />

        {/* Source Indicator - Top Left */}
        {videoSource && onChangeSource && (
          <div className="absolute top-5 left-5 z-30">
            <button
              type="button"
              onClick={onChangeSource}
              className="group flex items-center gap-3 px-4 py-2.5 bg-black/60 backdrop-blur-md border border-white/10 hover:border-white/30 transition-all hover:bg-black/70"
              title="Change video source"
            >
              <div
                className="flex items-center justify-center"
                style={{ color: THEME.mistralOrange }}
              >
                {SOURCE_ICONS[videoSource.type]}
              </div>
              <span className="text-sm font-medium text-white/90 max-w-[200px] truncate">
                {getSourceLabel()}
              </span>
              <svg
                className="w-4 h-4 text-white/50 group-hover:text-white/80 transition-colors"
                fill="none"
                viewBox="0 0 24 24"
                stroke="currentColor"
                strokeWidth={2}
              >
                <path
                  strokeLinecap="round"
                  strokeLinejoin="round"
                  d="M8.25 15L12 18.75 15.75 15m-7.5-6L12 5.25 15.75 9"
                />
              </svg>
            </button>
          </div>
        )}

        {/* Prompt Input - Bottom Left */}
        <div className="absolute bottom-5 left-5 z-30 w-[540px]">
          <PromptInput onPromptChange={handlePromptChange} />
        </div>

        {/* Live Caption - Bottom Right */}
        <div className="absolute bottom-5 right-5 z-30 w-[720px]">
          <LiveCaption
            caption={caption}
            isRunning={isLoopRunning}
            error={error}
            history={history}
            stats={stats}
          />
        </div>
      </div>
    </div>
  );
}