// LeLab/src/components/landing/InferenceModal.tsx
// GitHub CI
// Sync from leLab @ 8420275bdc324fa9e71046ce66c3ea3dd59e60e2
// 018306c
import React, { useEffect, useRef, useState } from "react";
import { Button } from "@/components/ui/button";
import { Input } from "@/components/ui/input";
import { Label } from "@/components/ui/label";
import {
Dialog,
DialogContent,
DialogHeader,
DialogTitle,
DialogDescription,
} from "@/components/ui/dialog";
import { Alert, AlertDescription } from "@/components/ui/alert";
import {
Select,
SelectContent,
SelectItem,
SelectTrigger,
SelectValue,
} from "@/components/ui/select";
import { AlertTriangle, CheckCircle, Loader2, Play, VideoOff } from "lucide-react";
import { RobotRecord } from "@/hooks/useRobots";
import { useApi } from "@/contexts/ApiContext";
import { useToast } from "@/hooks/use-toast";
import { useNavigate } from "react-router-dom";
import {
JobCheckpoint,
PolicyConfigSummary,
getCheckpointPolicyConfig,
listJobCheckpoints,
} from "@/lib/checkpointsApi";
import { startInference } from "@/lib/inferenceApi";
import CheckpointDropdown from "@/components/jobs/CheckpointDropdown";
/**
 * A camera listed by the backend `/available-cameras` endpoint, paired with
 * the browser video input it was label-matched to (if any).
 */
interface AvailableCamera {
/** Backend camera index; this is the value stored in camera bindings and sent as `camera_index`. */
index: number;
/** Backend-reported name, or `Camera <index>` when the backend supplied none. */
name: string;
/** Browser `deviceId` of the matched video input, or "" when no browser device matched. */
deviceId: string;
/** Availability flag copied from the backend entry. */
available: boolean;
}
/**
 * Small live thumbnail for one browser camera.
 *
 * Opens the camera identified by `deviceId` via `getUserMedia` and shows it in
 * a muted `<video>`. While `paused` is true, or when `deviceId` is empty, no
 * stream is opened and a placeholder is rendered instead. Any stream opened by
 * this component is stopped when `deviceId`/`paused` change or on unmount.
 *
 * @param deviceId Browser media `deviceId` to preview; "" means "nothing bound".
 * @param paused   When true the stream is released and "Released" is shown.
 */
const CameraPreview: React.FC<{ deviceId: string; paused: boolean }> = ({
  deviceId,
  paused,
}) => {
  const videoRef = useRef<HTMLVideoElement>(null);
  // True only when opening the *current* device failed.
  const [error, setError] = useState(false);

  useEffect(() => {
    // Nothing to open. The placeholder for these cases is derived from props
    // at render time, so no state needs to be touched here.
    if (paused || !deviceId) return;

    let cancelled = false;
    let stream: MediaStream | null = null;
    // Remember the element we attached to: by the time the cleanup runs on
    // unmount, `videoRef.current` has already been cleared by React.
    let attachedTo: HTMLVideoElement | null = null;
    setError(false);

    (async () => {
      try {
        const acquired = await navigator.mediaDevices.getUserMedia({
          video: { deviceId: { exact: deviceId } },
        });
        if (cancelled) {
          // The effect was torn down while the request was pending; release
          // the camera immediately instead of leaking the stream.
          acquired.getTracks().forEach((t) => t.stop());
          return;
        }
        stream = acquired;
        const video = videoRef.current;
        if (video) {
          video.srcObject = acquired;
          attachedTo = video;
          // Autoplay may be rejected by the browser; the preview is best-effort.
          await video.play().catch(() => {});
        }
      } catch {
        // Ignore failures of a superseded request: reporting them would flip
        // the error flag for whichever device is selected *now*.
        if (!cancelled) setError(true);
      }
    })();

    return () => {
      cancelled = true;
      if (stream) stream.getTracks().forEach((t) => t.stop());
      // Detach the stopped stream so the element does not keep referencing it.
      if (attachedTo) attachedTo.srcObject = null;
    };
  }, [deviceId, paused]);

  if (paused || error || !deviceId) {
    return (
      <div className="w-32 h-24 bg-gray-800 rounded border border-gray-700 flex flex-col items-center justify-center">
        <VideoOff className="w-5 h-5 text-gray-500 mb-1" />
        <span className="text-[10px] text-gray-500">
          {paused ? "Released" : "No preview"}
        </span>
      </div>
    );
  }

  return (
    <video
      ref={videoRef}
      autoPlay
      muted
      playsInline
      className="w-32 h-24 object-cover rounded border border-gray-700 bg-black"
    />
  );
};
/** Props accepted by `InferenceModal`. */
interface Props {
/** Whether the dialog is open; data loading effects only run while this is true. */
open: boolean;
/** Called to request an open-state change (forwarded to the dialog, and called with `false` on Cancel and after a successful start). */
onOpenChange: (open: boolean) => void;
/** Robot to run on; `null` shows a "select a robot" warning and disables starting. */
robot: RobotRecord | null;
/** Training job whose checkpoints and policy config are loaded. */
jobId: string;
/** Initial checkpoint step selection; when `null`, the last entry of the loaded checkpoint list is selected. */
initialStep: number | null;
}
/** Frame rate sent as `fps` for every camera entry in the start-inference request. */
const DEFAULT_FPS = 30;
/** Browser-side video input as reported by `enumerateDevices()`. */
interface BrowserVideoDevice {
  deviceId: string;
  label: string;
}

/** One entry of the `cameras` array returned by `/available-cameras`. */
interface BackendCamera {
  index: number;
  name?: string;
  available: boolean;
}

/** Lower-cases a label and collapses whitespace so labels can be compared. */
const normalizeLabel = (s: string): string =>
  s.toLowerCase().replace(/\s+/g, " ").trim();

/**
 * Best-effort list of the browser's video inputs. Never throws: when media
 * devices are unavailable (e.g. insecure context) or enumeration fails, an
 * empty list is returned so callers can still work with backend cameras.
 */
const listBrowserVideoDevices = async (): Promise<BrowserVideoDevice[]> => {
  try {
    // Need a permission grant before enumerateDevices() returns labels.
    try {
      const probe = await navigator.mediaDevices.getUserMedia({ video: true });
      probe.getTracks().forEach((t) => t.stop());
    } catch {
      // ignore — we'll still try to enumerate, just without labels
    }
    const devices = await navigator.mediaDevices.enumerateDevices();
    return devices
      .filter((d) => d.kind === "videoinput")
      .map((d) => ({ deviceId: d.deviceId, label: d.label }));
  } catch {
    return [];
  }
};

/**
 * Pairs each backend camera with at most one browser device by label:
 * exact match first, then browser-label-starts-with, then substring in either
 * direction. A browser device is never assigned to two backend cameras.
 */
const mergeCameras = (
  backend: BackendCamera[],
  browserDevices: BrowserVideoDevice[],
): AvailableCamera[] => {
  const used = new Set<string>();
  return backend.map((cam) => {
    const label = cam.name || `Camera ${cam.index}`;
    const target = normalizeLabel(label);
    const isFree = (d: BrowserVideoDevice) => !used.has(d.deviceId) && !!d.label;
    const candidate =
      browserDevices.find((d) => isFree(d) && normalizeLabel(d.label) === target) ||
      browserDevices.find(
        (d) => isFree(d) && normalizeLabel(d.label).startsWith(target),
      ) ||
      browserDevices.find(
        (d) =>
          isFree(d) &&
          (normalizeLabel(d.label).includes(target) ||
            target.includes(normalizeLabel(d.label))),
      );
    if (candidate) used.add(candidate.deviceId);
    return {
      index: cam.index,
      name: label,
      deviceId: candidate?.deviceId ?? "",
      available: cam.available,
    };
  });
};

/**
 * Dialog for configuring and launching a policy rollout ("inference") on a
 * robot: pick a checkpoint of `jobId`, set run parameters, and bind a physical
 * camera to every camera name the policy expects. On a successful start the
 * dialog closes and the app navigates to `/inference`.
 */
const InferenceModal: React.FC<Props> = ({
  open,
  onOpenChange,
  robot,
  jobId,
  initialStep,
}) => {
  const { baseUrl, fetchWithHeaders } = useApi();
  const { toast } = useToast();
  const navigate = useNavigate();
  const [checkpoints, setCheckpoints] = useState<JobCheckpoint[]>([]);
  const [selectedStep, setSelectedStep] = useState<number | null>(initialStep);
  const [task, setTask] = useState("");
  const [durationS, setDurationS] = useState(60);
  const [submitting, setSubmitting] = useState(false);
  // Guards against a second start request while one is still running; a ref
  // is used because state updates are not visible until the next render.
  const startInFlightRef = useRef(false);
  const [policyConfig, setPolicyConfig] = useState<PolicyConfigSummary | null>(null);
  const [policyConfigLoading, setPolicyConfigLoading] = useState(false);
  const [policyConfigError, setPolicyConfigError] = useState<string | null>(null);
  // Per expected camera name → user-selected physical camera index (or null).
  const [cameraBindings, setCameraBindings] = useState<Record<string, number | null>>({});
  const [availableCameras, setAvailableCameras] = useState<AvailableCamera[]>([]);

  // A successful start leaves `submitting` set (so previews stay released
  // while the dialog closes). Clear it when the dialog is opened again,
  // unless a start request is still running.
  useEffect(() => {
    if (open && !startInFlightRef.current) setSubmitting(false);
  }, [open]);

  // Load checkpoints when modal opens.
  useEffect(() => {
    if (!open) return;
    let cancelled = false;
    listJobCheckpoints(baseUrl, fetchWithHeaders, jobId)
      .then((cks) => {
        if (cancelled) return;
        setCheckpoints(cks);
        if (cks.length > 0) {
          const latest = cks[cks.length - 1].step;
          // Keep the current selection only if it actually exists in this
          // job's checkpoints; otherwise fall back to the last one.
          setSelectedStep((prev) =>
            prev != null && cks.some((c) => c.step === prev) ? prev : latest,
          );
        }
      })
      .catch(() => {
        if (cancelled) return;
        setCheckpoints([]);
      });
    return () => {
      cancelled = true;
    };
  }, [open, baseUrl, fetchWithHeaders, jobId]);

  // Load the user's available cameras when the modal opens, and merge each
  // backend cv2 index with the matching browser deviceId so we can render a
  // live preview alongside the bound dropdowns.
  useEffect(() => {
    if (!open) return;
    let cancelled = false;
    (async () => {
      try {
        // Browser enumeration is only needed for previews; it must not
        // prevent the backend camera list from being shown.
        const browserDevices = await listBrowserVideoDevices();
        const r = await fetchWithHeaders(`${baseUrl}/available-cameras`);
        if (!r.ok) {
          if (!cancelled) setAvailableCameras([]);
          return;
        }
        const body = await r.json();
        const backend: BackendCamera[] = body.cameras ?? [];
        if (!cancelled) setAvailableCameras(mergeCameras(backend, browserDevices));
      } catch {
        if (!cancelled) setAvailableCameras([]);
      }
    })();
    return () => {
      cancelled = true;
    };
  }, [open, baseUrl, fetchWithHeaders]);

  // Load policy config when step changes.
  useEffect(() => {
    if (!open || selectedStep == null) {
      setPolicyConfig(null);
      setPolicyConfigError(null);
      // A request cancelled mid-flight never clears the flag itself.
      setPolicyConfigLoading(false);
      return;
    }
    let cancelled = false;
    setPolicyConfigLoading(true);
    setPolicyConfigError(null);
    getCheckpointPolicyConfig(baseUrl, fetchWithHeaders, jobId, selectedStep)
      .then((cfg) => {
        if (cancelled) return;
        setPolicyConfig(cfg);
        // Reset camera bindings to one entry per expected camera name.
        // Preserve any prior selection that's still relevant.
        setCameraBindings((prev) => {
          const next: Record<string, number | null> = {};
          for (const name of Object.keys(cfg.image_features)) {
            next[name] = prev[name] ?? null;
          }
          return next;
        });
      })
      .catch((e) => {
        if (cancelled) return;
        setPolicyConfig(null);
        setPolicyConfigError(e instanceof Error ? e.message : String(e));
      })
      .finally(() => {
        if (!cancelled) setPolicyConfigLoading(false);
      });
    return () => {
      cancelled = true;
    };
  }, [open, baseUrl, fetchWithHeaders, jobId, selectedStep]);

  // If the selected robot has cameras whose names match a policy-expected
  // camera, auto-bind them. Prefer matching by browser device_id (stable
  // across cv2 index drift); fall back to the saved camera_index.
  useEffect(() => {
    if (!policyConfig) return;
    const robotCams = robot?.cameras ?? [];
    if (robotCams.length === 0 || availableCameras.length === 0) return;
    setCameraBindings((prev) => {
      let changed = false;
      const next = { ...prev };
      for (const policyName of Object.keys(policyConfig.image_features)) {
        // Never override a binding that already exists.
        if (next[policyName] != null) continue;
        const robotCam = robotCams.find(
          (c) => c.name.toLowerCase() === policyName.toLowerCase(),
        );
        if (!robotCam) continue;
        const live =
          (robotCam.device_id &&
            availableCameras.find((c) => c.deviceId === robotCam.device_id)) ||
          availableCameras.find((c) => c.index === robotCam.camera_index);
        if (live) {
          next[policyName] = live.index;
          changed = true;
        }
      }
      // Return the same object when nothing changed to avoid a re-render.
      return changed ? next : prev;
    });
  }, [policyConfig, robot, availableCameras]);

  const selectedRef =
    selectedStep != null
      ? checkpoints.find((c) => c.step === selectedStep)?.ref ?? null
      : null;
  const expectedCameraNames = policyConfig
    ? Object.keys(policyConfig.image_features)
    : [];
  const allCamerasBound = expectedCameraNames.every(
    (name) => cameraBindings[name] != null,
  );
  // The input declares min=1; enforce it here because the field can still
  // hold 0 (cleared) or a negative number.
  const durationValid = Number.isInteger(durationS) && durationS >= 1;
  const canStart =
    !!robot &&
    robot.is_clean &&
    selectedRef != null &&
    !!policyConfig &&
    allCamerasBound &&
    durationValid &&
    !submitting;

  const handleStart = async () => {
    if (!robot || selectedRef == null || !policyConfig || !durationValid) return;
    if (startInFlightRef.current) return;
    startInFlightRef.current = true;
    // Setting submitting=true makes every CameraPreview drop its
    // browser stream — required so the rollout subprocess can open the
    // same camera index via OpenCV without colliding on the device.
    setSubmitting(true);
    await new Promise<void>((resolve) => setTimeout(resolve, 300));
    const cameraDict: Record<string, {
      type: string; camera_index?: number; width: number; height: number; fps?: number;
    }> = {};
    for (const [name, dims] of Object.entries(policyConfig.image_features)) {
      const idx = cameraBindings[name];
      if (idx == null) continue;
      cameraDict[name] = {
        type: "opencv",
        camera_index: idx,
        width: dims.width,
        height: dims.height,
        fps: DEFAULT_FPS,
      };
    }
    try {
      await startInference(baseUrl, fetchWithHeaders, {
        follower_port: robot.follower_port,
        follower_config: robot.follower_config,
        policy_ref: selectedRef,
        task,
        cameras: cameraDict,
        duration_s: durationS,
      });
      onOpenChange(false);
      navigate("/inference");
    } catch (e) {
      toast({
        title: "Couldn't start inference",
        description: e instanceof Error ? e.message : String(e),
        variant: "destructive",
      });
      // Failure: bring the previews back so the user can adjust.
      setSubmitting(false);
    } finally {
      startInFlightRef.current = false;
    }
  };

  const onCameraBindingChange = (name: string, value: string) => {
    const idx = Number(value);
    setCameraBindings((prev) => ({ ...prev, [name]: idx }));
  };

  return (
    <Dialog open={open} onOpenChange={onOpenChange}>
      <DialogContent className="bg-gray-900 border-gray-800 text-white sm:max-w-[600px] p-8 max-h-[90vh] overflow-y-auto">
        <DialogHeader>
          <div className="flex justify-center items-center mb-4">
            <div className="w-8 h-8 bg-green-500 rounded-full flex items-center justify-center">
              <Play className="w-4 h-4 text-white" />
            </div>
          </div>
          <DialogTitle className="text-white text-center text-2xl font-bold">
            Configure Inference
          </DialogTitle>
        </DialogHeader>
        <div className="space-y-6 py-4">
          <DialogDescription className="text-gray-400 text-base leading-relaxed text-center">
            Pick a checkpoint and confirm hardware. The selected policy will
            drive the follower autonomously for the configured duration.
          </DialogDescription>
          <div className="space-y-4">
            <h3 className="text-lg font-semibold text-white border-b border-gray-700 pb-2">
              Robot Configuration
            </h3>
            {!robot ? (
              <Alert className="bg-amber-900/40 border-amber-700 text-amber-100">
                <AlertTriangle className="h-4 w-4" />
                <AlertDescription>
                  Select and configure a robot on the Landing page first.
                </AlertDescription>
              </Alert>
            ) : !robot.is_clean ? (
              <Alert className="bg-amber-900/40 border-amber-700 text-amber-100">
                <AlertTriangle className="h-4 w-4" />
                <AlertDescription>
                  <strong>{robot.name}</strong> is missing a calibration.
                  Configure it before running inference.
                </AlertDescription>
              </Alert>
            ) : (
              <div className="flex items-center gap-2 text-sm">
                <CheckCircle className="w-4 h-4 text-green-400" />
                <span className="text-slate-200">
                  Running on <strong>{robot.name}</strong>
                </span>
              </div>
            )}
          </div>
          <div className="space-y-4">
            <h3 className="text-lg font-semibold text-white border-b border-gray-700 pb-2">
              Checkpoint
            </h3>
            {checkpoints.length === 0 ? (
              <Alert className="bg-amber-900/40 border-amber-700 text-amber-100">
                <AlertTriangle className="h-4 w-4" />
                <AlertDescription>
                  No checkpoints available for this job yet.
                </AlertDescription>
              </Alert>
            ) : (
              <CheckpointDropdown
                checkpoints={checkpoints}
                selectedStep={selectedStep}
                onChange={setSelectedStep}
              />
            )}
          </div>
          <div className="space-y-4">
            <h3 className="text-lg font-semibold text-white border-b border-gray-700 pb-2">
              Run parameters
            </h3>
            {policyConfig?.requires_task ? (
              <div className="space-y-2">
                <Label htmlFor="task" className="text-sm font-medium text-gray-300">
                  Task description
                </Label>
                <Input
                  id="task"
                  value={task}
                  onChange={(e) => setTask(e.target.value)}
                  placeholder="e.g., pick up the red block"
                  className="bg-gray-800 border-gray-700 text-white"
                />
                <p className="text-xs text-gray-500">
                  This policy is language-conditioned ({policyConfig.policy_type}).
                </p>
              </div>
            ) : null}
            <div className="space-y-2">
              <Label htmlFor="durationS" className="text-sm font-medium text-gray-300">
                Max duration (seconds)
              </Label>
              <Input
                id="durationS"
                type="number"
                min={1}
                value={durationS}
                onChange={(e) => {
                  // Explicit radix; an empty or unparsable field becomes 0,
                  // which keeps the input controlled and fails validation.
                  const parsed = Number.parseInt(e.target.value, 10);
                  setDurationS(Number.isNaN(parsed) ? 0 : parsed);
                }}
                className="bg-gray-800 border-gray-700 text-white"
              />
              {!durationValid ? (
                <p className="text-xs text-red-400">
                  Duration must be at least 1 second.
                </p>
              ) : null}
            </div>
          </div>
          <div className="space-y-4">
            <h3 className="text-lg font-semibold text-white border-b border-gray-700 pb-2">
              Cameras
            </h3>
            {policyConfigLoading ? (
              <div className="flex items-center gap-2 text-sm text-slate-400">
                <Loader2 className="w-4 h-4 animate-spin" />
                Reading policy config…
              </div>
            ) : policyConfigError ? (
              <Alert className="bg-red-900/40 border-red-700 text-red-100">
                <AlertTriangle className="h-4 w-4" />
                <AlertDescription>
                  Couldn't load policy config: {policyConfigError}
                </AlertDescription>
              </Alert>
            ) : !policyConfig ? null : expectedCameraNames.length === 0 ? (
              <p className="text-xs text-gray-500">
                This policy doesn't use cameras.
              </p>
            ) : (
              <div className="space-y-3">
                <p className="text-xs text-gray-500">
                  Bind a physical camera to each name the policy was trained
                  with. Resolution comes from the checkpoint.
                </p>
                {expectedCameraNames.map((name) => {
                  const dims = policyConfig.image_features[name];
                  const value = cameraBindings[name];
                  // Resolve the bound backend index to its browser device
                  // so the preview can open it.
                  const bound =
                    value != null
                      ? availableCameras.find((c) => c.index === value)
                      : undefined;
                  return (
                    <div key={name} className="flex items-center gap-3">
                      <div className="flex-1">
                        <Label className="text-sm font-medium text-gray-200">
                          {name}
                        </Label>
                        <p className="text-xs text-gray-500">
                          {dims.width}×{dims.height}
                        </p>
                      </div>
                      <Select
                        value={value != null ? String(value) : undefined}
                        onValueChange={(v) => onCameraBindingChange(name, v)}
                      >
                        <SelectTrigger className="bg-gray-800 border-gray-700 text-white w-56">
                          <SelectValue placeholder="Select a camera" />
                        </SelectTrigger>
                        <SelectContent className="bg-gray-900 border-gray-700 text-white">
                          {availableCameras.length === 0 ? (
                            <div className="px-2 py-1.5 text-xs text-gray-500">
                              No cameras detected
                            </div>
                          ) : (
                            availableCameras.map((cam) => (
                              <SelectItem
                                key={cam.index}
                                value={String(cam.index)}
                              >
                                #{cam.index} — {cam.name}
                              </SelectItem>
                            ))
                          )}
                        </SelectContent>
                      </Select>
                      <CameraPreview deviceId={bound?.deviceId ?? ""} paused={submitting} />
                    </div>
                  );
                })}
              </div>
            )}
          </div>
          <div className="flex flex-col sm:flex-row gap-4 justify-center pt-4">
            <Button
              onClick={handleStart}
              disabled={!canStart}
              className="w-full sm:w-auto bg-green-500 hover:bg-green-600 text-white px-10 py-6 text-lg disabled:opacity-40 disabled:cursor-not-allowed"
            >
              <Play className="w-5 h-5 mr-2" />
              {submitting ? "Starting…" : "Start Inference"}
            </Button>
            <Button
              onClick={() => onOpenChange(false)}
              variant="outline"
              className="w-full sm:w-auto border-gray-500 hover:border-gray-200 px-10 py-6 text-lg text-zinc-500 bg-zinc-900 hover:bg-zinc-800"
            >
              Cancel
            </Button>
          </div>
        </div>
      </DialogContent>
    </Dialog>
  );
};
export default InferenceModal;