Spaces:

lerobot
/

LeLab

Running

LeLab / src /components /landing /InferenceModal.tsx

GitHub CI

Sync from leLab @ 8420275bdc324fa9e71046ce66c3ea3dd59e60e2

018306c 2 days ago

20.2 kB

	import React, { useEffect, useRef, useState } from "react";
	import { Button } from "@/components/ui/button";
	import { Input } from "@/components/ui/input";
	import { Label } from "@/components/ui/label";
	import {
	Dialog,
	DialogContent,
	DialogHeader,
	DialogTitle,
	DialogDescription,
	} from "@/components/ui/dialog";
	import { Alert, AlertDescription } from "@/components/ui/alert";
	import {
	Select,
	SelectContent,
	SelectItem,
	SelectTrigger,
	SelectValue,
	} from "@/components/ui/select";
	import { AlertTriangle, CheckCircle, Loader2, Play, VideoOff } from "lucide-react";
	import { RobotRecord } from "@/hooks/useRobots";
	import { useApi } from "@/contexts/ApiContext";
	import { useToast } from "@/hooks/use-toast";
	import { useNavigate } from "react-router-dom";
	import {
	JobCheckpoint,
	PolicyConfigSummary,
	getCheckpointPolicyConfig,
	listJobCheckpoints,
	} from "@/lib/checkpointsApi";
	import { startInference } from "@/lib/inferenceApi";
	import CheckpointDropdown from "@/components/jobs/CheckpointDropdown";

	/**
	 * One camera as shown in the binding dropdowns: an entry reported by the
	 * backend `/available-cameras` endpoint, paired (when a label match is
	 * found) with the browser media device used for the live preview.
	 */
	interface AvailableCamera {
	  /** Camera index reported by the backend; this is the value bound to a policy camera name. */
	  index: number;
	  /** Backend-reported name, or `Camera <index>` when the backend gives none. */
	  name: string;
	  /** Browser `deviceId` of the matched video input; empty string when no browser device matched. */
	  deviceId: string;
	  /** The backend's `available` flag, copied through unchanged. */
	  available: boolean;
	}

	/**
	 * Thumbnail-sized live preview of a single browser camera.
	 *
	 * While `paused` is false and `deviceId` is non-empty, the component opens
	 * that exact device with `getUserMedia` and plays it in a muted `<video>`.
	 * The stream's tracks are stopped whenever `deviceId`/`paused` change or the
	 * component unmounts, so setting `paused` releases the camera.
	 *
	 * A placeholder tile is rendered instead of the video when the preview is
	 * paused ("Released"), when there is no `deviceId`, or when the device could
	 * not be opened ("No preview").
	 */
	const CameraPreview: React.FC<{ deviceId: string; paused: boolean }> = ({
	  deviceId,
	  paused,
	}) => {
	  const videoRef = useRef<HTMLVideoElement>(null);
	  const [error, setError] = useState(false);

	  useEffect(() => {
	    if (paused || !deviceId) {
	      if (!deviceId) setError(true);
	      return;
	    }
	    // `cancelled` flips in the cleanup below; every async continuation checks
	    // it so a request started for a previous deviceId cannot touch state that
	    // now belongs to a newer one.
	    let cancelled = false;
	    let stream: MediaStream | null = null;
	    setError(false);
	    (async () => {
	      try {
	        stream = await navigator.mediaDevices.getUserMedia({
	          video: { deviceId: { exact: deviceId } },
	        });
	        if (cancelled) {
	          // The effect was torn down while the request was pending: release
	          // the device immediately instead of attaching it.
	          stream.getTracks().forEach((t) => t.stop());
	          return;
	        }
	        if (videoRef.current) {
	          videoRef.current.srcObject = stream;
	          // play() may reject (e.g. interrupted by a new load); the preview
	          // is best-effort, so that rejection is intentionally ignored.
	          await videoRef.current.play().catch(() => {});
	        }
	      } catch {
	        // Only report the failure if this request is still the current one.
	        // Otherwise a late rejection for an old deviceId would hide the
	        // preview of the camera selected afterwards (or set state after
	        // unmount).
	        if (!cancelled) setError(true);
	      }
	    })();
	    return () => {
	      cancelled = true;
	      if (stream) stream.getTracks().forEach((t) => t.stop());
	    };
	  }, [deviceId, paused]);

	  if (paused || error || !deviceId) {
	    return (
	      <div className="w-32 h-24 bg-gray-800 rounded border border-gray-700 flex flex-col items-center justify-center">
	        <VideoOff className="w-5 h-5 text-gray-500 mb-1" />
	        <span className="text-[10px] text-gray-500">
	          {paused ? "Released" : "No preview"}
	        </span>
	      </div>
	    );
	  }
	  return (
	    <video
	      ref={videoRef}
	      autoPlay
	      muted
	      playsInline
	      className="w-32 h-24 object-cover rounded border border-gray-700 bg-black"
	    />
	  );
	};

	/** Props accepted by `InferenceModal`. */
	interface Props {
	  /** Whether the dialog is currently shown. */
	  open: boolean;
	  /** Called with the requested open state (e.g. `false` on Cancel or after a successful start). */
	  onOpenChange: (open: boolean) => void;
	  /** Robot to run on; `null` shows a "select a robot first" warning and disables Start. */
	  robot: RobotRecord | null;
	  /** Training job whose checkpoints are listed and whose policy config is read. */
	  jobId: string;
	  /** Checkpoint step to preselect; when `null` the last listed checkpoint is chosen once the list loads. */
	  initialStep: number | null;
	}

	// Frame rate sent as `fps` for every camera entry in the start-inference request.
	const DEFAULT_FPS = 30;

	/**
	 * Dialog that configures and launches an inference run for one training job.
	 *
	 * When open it:
	 *  - lists the job's checkpoints and lets the user pick one,
	 *  - reads the selected checkpoint's policy config to learn which camera
	 *    names (and resolutions) the policy expects and whether it needs a task,
	 *  - lists the backend's cameras, pairs them with browser devices by label
	 *    for live previews, and lets the user bind one to each expected name
	 *    (auto-binding from the robot's saved cameras where names match),
	 *  - on Start, releases the previews, posts the request via `startInference`,
	 *    closes the dialog and navigates to `/inference`.
	 *
	 * Start is enabled only when a calibrated robot, a resolvable checkpoint, a
	 * loaded policy config, a binding for every expected camera and a duration
	 * of at least one second are all present.
	 */
	const InferenceModal: React.FC<Props> = ({
	  open,
	  onOpenChange,
	  robot,
	  jobId,
	  initialStep,
	}) => {
	  const { baseUrl, fetchWithHeaders } = useApi();
	  const { toast } = useToast();
	  const navigate = useNavigate();

	  const [checkpoints, setCheckpoints] = useState<JobCheckpoint[]>([]);
	  const [selectedStep, setSelectedStep] = useState<number | null>(initialStep);
	  const [task, setTask] = useState("");
	  const [durationS, setDurationS] = useState(60);
	  const [submitting, setSubmitting] = useState(false);

	  const [policyConfig, setPolicyConfig] = useState<PolicyConfigSummary | null>(null);
	  const [policyConfigLoading, setPolicyConfigLoading] = useState(false);
	  const [policyConfigError, setPolicyConfigError] = useState<string | null>(null);

	  // Per expected camera name → user-selected physical camera index (or null).
	  const [cameraBindings, setCameraBindings] = useState<Record<string, number | null>>({});
	  const [availableCameras, setAvailableCameras] = useState<AvailableCamera[]>([]);

	  // True while a start request is outstanding. Kept in a ref (not state) so
	  // the reopen logic and the double-click guard read the current value.
	  const startInFlightRef = useRef(false);
	  // Previous value of `open`, used to detect the closed → open transition.
	  const wasOpenRef = useRef(false);

	  // Re-sync per-open state each time the modal goes from closed to open.
	  // `useState(initialStep)` only reads the prop on first mount, so a modal
	  // that stays mounted would otherwise keep a stale step; likewise
	  // `submitting` is never cleared after a successful start, which would
	  // leave Start disabled and the previews released on the next open.
	  useEffect(() => {
	    if (open && !wasOpenRef.current) {
	      setSelectedStep(initialStep);
	      if (!startInFlightRef.current) setSubmitting(false);
	    }
	    wasOpenRef.current = open;
	  }, [open, initialStep]);

	  // Load checkpoints when modal opens.
	  useEffect(() => {
	    if (!open) return;
	    let cancelled = false;
	    listJobCheckpoints(baseUrl, fetchWithHeaders, jobId)
	      .then((cks) => {
	        if (cancelled) return;
	        setCheckpoints(cks);
	        if (cks.length > 0) {
	          // Default to the last listed checkpoint unless a step is already chosen.
	          const latest = cks[cks.length - 1].step;
	          setSelectedStep((prev) => (prev != null ? prev : latest));
	        }
	      })
	      .catch(() => {
	        if (cancelled) return;
	        setCheckpoints([]);
	      });
	    return () => {
	      cancelled = true;
	    };
	  }, [open, baseUrl, fetchWithHeaders, jobId]);

	  // Load the user's available cameras when the modal opens, and merge each
	  // backend cv2 index with the matching browser deviceId so we can render a
	  // live preview alongside the bound dropdowns.
	  useEffect(() => {
	    if (!open) return;
	    let cancelled = false;
	    (async () => {
	      try {
	        // Need a permission grant before enumerateDevices() returns labels.
	        try {
	          const probe = await navigator.mediaDevices.getUserMedia({ video: true });
	          probe.getTracks().forEach((t) => t.stop());
	        } catch {
	          // ignore — we'll still try to enumerate, just without labels
	        }
	        const browserDevices = (await navigator.mediaDevices.enumerateDevices())
	          .filter((d) => d.kind === "videoinput")
	          .map((d) => ({ deviceId: d.deviceId, label: d.label }));
	        const r = await fetchWithHeaders(`${baseUrl}/available-cameras`);
	        if (!r.ok) {
	          if (!cancelled) setAvailableCameras([]);
	          return;
	        }
	        const body = await r.json();
	        const backend: { index: number; name?: string; available: boolean }[] =
	          body.cameras ?? [];
	        // Case-insensitive, whitespace-collapsed form used for label matching.
	        const norm = (s: string) => s.toLowerCase().replace(/\s+/g, " ").trim();
	        // Browser devices already paired, so each one is matched at most once.
	        const used = new Set<string>();
	        const merged: AvailableCamera[] = backend.map((cam) => {
	          const label = cam.name || `Camera ${cam.index}`;
	          const target = norm(label);
	          // Match in decreasing strictness: exact label, then prefix, then
	          // either label containing the other.
	          const candidate =
	            browserDevices.find(
	              (d) => !used.has(d.deviceId) && d.label && norm(d.label) === target,
	            ) ||
	            browserDevices.find(
	              (d) =>
	                !used.has(d.deviceId) &&
	                d.label &&
	                norm(d.label).startsWith(target),
	            ) ||
	            browserDevices.find(
	              (d) =>
	                !used.has(d.deviceId) &&
	                d.label &&
	                (norm(d.label).includes(target) || target.includes(norm(d.label))),
	            );
	          if (candidate) used.add(candidate.deviceId);
	          return {
	            index: cam.index,
	            name: label,
	            deviceId: candidate?.deviceId ?? "",
	            available: cam.available,
	          };
	        });
	        if (!cancelled) setAvailableCameras(merged);
	      } catch {
	        if (!cancelled) setAvailableCameras([]);
	      }
	    })();
	    return () => {
	      cancelled = true;
	    };
	  }, [open, baseUrl, fetchWithHeaders]);

	  // Load policy config when step changes.
	  useEffect(() => {
	    if (!open || selectedStep == null) {
	      setPolicyConfig(null);
	      setPolicyConfigError(null);
	      // A fetch cancelled by the previous cleanup never clears the loading
	      // flag itself, so clear it here to avoid a spinner that never ends.
	      setPolicyConfigLoading(false);
	      return;
	    }
	    let cancelled = false;
	    setPolicyConfigLoading(true);
	    setPolicyConfigError(null);
	    getCheckpointPolicyConfig(baseUrl, fetchWithHeaders, jobId, selectedStep)
	      .then((cfg) => {
	        if (cancelled) return;
	        setPolicyConfig(cfg);
	        // Reset camera bindings to one entry per expected camera name.
	        // Preserve any prior selection that's still relevant.
	        setCameraBindings((prev) => {
	          const next: Record<string, number | null> = {};
	          for (const name of Object.keys(cfg.image_features)) {
	            next[name] = prev[name] ?? null;
	          }
	          return next;
	        });
	      })
	      .catch((e) => {
	        if (cancelled) return;
	        setPolicyConfig(null);
	        setPolicyConfigError(e instanceof Error ? e.message : String(e));
	      })
	      .finally(() => {
	        if (!cancelled) setPolicyConfigLoading(false);
	      });
	    return () => {
	      cancelled = true;
	    };
	  }, [open, baseUrl, fetchWithHeaders, jobId, selectedStep]);

	  // If the selected robot has cameras whose names match a policy-expected
	  // camera, auto-bind them. Prefer matching by browser device_id (stable
	  // across cv2 index drift); fall back to the saved camera_index.
	  useEffect(() => {
	    if (!policyConfig) return;
	    const robotCams = robot?.cameras ?? [];
	    if (robotCams.length === 0 || availableCameras.length === 0) return;
	    setCameraBindings((prev) => {
	      let changed = false;
	      const next = { ...prev };
	      for (const policyName of Object.keys(policyConfig.image_features)) {
	        // Never override a binding that already exists.
	        if (next[policyName] != null) continue;
	        const robotCam = robotCams.find(
	          (c) => c.name.toLowerCase() === policyName.toLowerCase(),
	        );
	        if (!robotCam) continue;
	        const live =
	          (robotCam.device_id &&
	            availableCameras.find((c) => c.deviceId === robotCam.device_id)) ||
	          availableCameras.find((c) => c.index === robotCam.camera_index);
	        if (live) {
	          next[policyName] = live.index;
	          changed = true;
	        }
	      }
	      // Return the same object when nothing changed to avoid a re-render.
	      return changed ? next : prev;
	    });
	  }, [policyConfig, robot, availableCameras]);

	  const selectedRef =
	    selectedStep != null
	      ? checkpoints.find((c) => c.step === selectedStep)?.ref ?? null
	      : null;

	  const expectedCameraNames = policyConfig
	    ? Object.keys(policyConfig.image_features)
	    : [];
	  const allCamerasBound = expectedCameraNames.every(
	    (name) => cameraBindings[name] != null,
	  );

	  // The number input's `min` attribute does not stop 0 or negative values
	  // from being typed, so the duration is validated before enabling Start.
	  const durationValid = Number.isFinite(durationS) && durationS >= 1;

	  const canStart =
	    !!robot &&
	    robot.is_clean &&
	    selectedRef != null &&
	    !!policyConfig &&
	    allCamerasBound &&
	    durationValid &&
	    !submitting;

	  // Builds the camera dictionary from the current bindings and posts the
	  // start request. On success the dialog closes and the app navigates to
	  // /inference; on failure a toast is shown and the previews are restored.
	  const handleStart = async () => {
	    if (!robot || selectedRef == null || !policyConfig) return;
	    // Guard against a second start while one is still outstanding.
	    if (startInFlightRef.current) return;

	    const cameraDict: Record<string, {
	      type: string; camera_index?: number; width: number; height: number; fps?: number;
	    }> = {};
	    for (const [name, dims] of Object.entries(policyConfig.image_features)) {
	      const idx = cameraBindings[name];
	      if (idx == null) continue;
	      cameraDict[name] = {
	        type: "opencv",
	        camera_index: idx,
	        width: dims.width,
	        height: dims.height,
	        fps: DEFAULT_FPS,
	      };
	    }

	    startInFlightRef.current = true;
	    // Setting submitting=true makes every CameraPreview drop its
	    // browser stream — required so the rollout subprocess can open the
	    // same camera index via OpenCV without colliding on the device.
	    setSubmitting(true);
	    try {
	      // Short pause so the previews have re-rendered and stopped their tracks.
	      await new Promise((r) => setTimeout(r, 300));
	      await startInference(baseUrl, fetchWithHeaders, {
	        follower_port: robot.follower_port,
	        follower_config: robot.follower_config,
	        policy_ref: selectedRef,
	        task,
	        cameras: cameraDict,
	        duration_s: durationS,
	      });
	      // `submitting` is deliberately left true here so the previews do not
	      // re-acquire the cameras; it is cleared the next time the modal opens.
	      onOpenChange(false);
	      navigate("/inference");
	    } catch (e) {
	      toast({
	        title: "Couldn't start inference",
	        description: e instanceof Error ? e.message : String(e),
	        variant: "destructive",
	      });
	      // Failure: bring the previews back so the user can adjust.
	      setSubmitting(false);
	    } finally {
	      startInFlightRef.current = false;
	    }
	  };

	  // Select values are strings; store the chosen camera index as a number.
	  const onCameraBindingChange = (name: string, value: string) => {
	    const idx = Number(value);
	    setCameraBindings((prev) => ({ ...prev, [name]: idx }));
	  };

	  return (
	    <Dialog open={open} onOpenChange={onOpenChange}>
	      <DialogContent className="bg-gray-900 border-gray-800 text-white sm:max-w-[600px] p-8 max-h-[90vh] overflow-y-auto">
	        <DialogHeader>
	          <div className="flex justify-center items-center mb-4">
	            <div className="w-8 h-8 bg-green-500 rounded-full flex items-center justify-center">
	              <Play className="w-4 h-4 text-white" />
	            </div>
	          </div>
	          <DialogTitle className="text-white text-center text-2xl font-bold">
	            Configure Inference
	          </DialogTitle>
	        </DialogHeader>

	        <div className="space-y-6 py-4">
	          <DialogDescription className="text-gray-400 text-base leading-relaxed text-center">
	            Pick a checkpoint and confirm hardware. The selected policy will
	            drive the follower autonomously for the configured duration.
	          </DialogDescription>

	          <div className="space-y-4">
	            <h3 className="text-lg font-semibold text-white border-b border-gray-700 pb-2">
	              Robot Configuration
	            </h3>
	            {!robot ? (
	              <Alert className="bg-amber-900/40 border-amber-700 text-amber-100">
	                <AlertTriangle className="h-4 w-4" />
	                <AlertDescription>
	                  Select and configure a robot on the Landing page first.
	                </AlertDescription>
	              </Alert>
	            ) : !robot.is_clean ? (
	              <Alert className="bg-amber-900/40 border-amber-700 text-amber-100">
	                <AlertTriangle className="h-4 w-4" />
	                <AlertDescription>
	                  <strong>{robot.name}</strong> is missing a calibration.
	                  Configure it before running inference.
	                </AlertDescription>
	              </Alert>
	            ) : (
	              <div className="flex items-center gap-2 text-sm">
	                <CheckCircle className="w-4 h-4 text-green-400" />
	                <span className="text-slate-200">
	                  Running on <strong>{robot.name}</strong>
	                </span>
	              </div>
	            )}
	          </div>

	          <div className="space-y-4">
	            <h3 className="text-lg font-semibold text-white border-b border-gray-700 pb-2">
	              Checkpoint
	            </h3>
	            {checkpoints.length === 0 ? (
	              <Alert className="bg-amber-900/40 border-amber-700 text-amber-100">
	                <AlertTriangle className="h-4 w-4" />
	                <AlertDescription>
	                  No checkpoints available for this job yet.
	                </AlertDescription>
	              </Alert>
	            ) : (
	              <CheckpointDropdown
	                checkpoints={checkpoints}
	                selectedStep={selectedStep}
	                onChange={setSelectedStep}
	              />
	            )}
	          </div>

	          <div className="space-y-4">
	            <h3 className="text-lg font-semibold text-white border-b border-gray-700 pb-2">
	              Run parameters
	            </h3>
	            {policyConfig?.requires_task ? (
	              <div className="space-y-2">
	                <Label htmlFor="task" className="text-sm font-medium text-gray-300">
	                  Task description
	                </Label>
	                <Input
	                  id="task"
	                  value={task}
	                  onChange={(e) => setTask(e.target.value)}
	                  placeholder="e.g., pick up the red block"
	                  className="bg-gray-800 border-gray-700 text-white"
	                />
	                <p className="text-xs text-gray-500">
	                  This policy is language-conditioned ({policyConfig.policy_type}).
	                </p>
	              </div>
	            ) : null}
	            <div className="space-y-2">
	              <Label htmlFor="durationS" className="text-sm font-medium text-gray-300">
	                Max duration (seconds)
	              </Label>
	              <Input
	                id="durationS"
	                type="number"
	                min={1}
	                value={durationS}
	                onChange={(e) => setDurationS(parseInt(e.target.value || "0", 10))}
	                className="bg-gray-800 border-gray-700 text-white"
	              />
	              {!durationValid ? (
	                <p className="text-xs text-red-400">
	                  Duration must be at least 1 second.
	                </p>
	              ) : null}
	            </div>
	          </div>

	          <div className="space-y-4">
	            <h3 className="text-lg font-semibold text-white border-b border-gray-700 pb-2">
	              Cameras
	            </h3>
	            {policyConfigLoading ? (
	              <div className="flex items-center gap-2 text-sm text-slate-400">
	                <Loader2 className="w-4 h-4 animate-spin" />
	                Reading policy config…
	              </div>
	            ) : policyConfigError ? (
	              <Alert className="bg-red-900/40 border-red-700 text-red-100">
	                <AlertTriangle className="h-4 w-4" />
	                <AlertDescription>
	                  Couldn't load policy config: {policyConfigError}
	                </AlertDescription>
	              </Alert>
	            ) : !policyConfig ? null : expectedCameraNames.length === 0 ? (
	              <p className="text-xs text-gray-500">
	                This policy doesn't use cameras.
	              </p>
	            ) : (
	              <div className="space-y-3">
	                <p className="text-xs text-gray-500">
	                  Bind a physical camera to each name the policy was trained
	                  with. Resolution comes from the checkpoint.
	                </p>
	                {expectedCameraNames.map((name) => {
	                  const dims = policyConfig.image_features[name];
	                  const value = cameraBindings[name];
	                  // Look up the bound camera so its browser deviceId can drive the preview.
	                  const bound =
	                    value != null
	                      ? availableCameras.find((c) => c.index === value)
	                      : undefined;
	                  return (
	                    <div key={name} className="flex items-center gap-3">
	                      <div className="flex-1">
	                        <Label className="text-sm font-medium text-gray-200">
	                          {name}
	                        </Label>
	                        <p className="text-xs text-gray-500">
	                          {dims.width}×{dims.height}
	                        </p>
	                      </div>
	                      <Select
	                        value={value != null ? String(value) : undefined}
	                        onValueChange={(v) => onCameraBindingChange(name, v)}
	                      >
	                        <SelectTrigger className="bg-gray-800 border-gray-700 text-white w-56">
	                          <SelectValue placeholder="Select a camera" />
	                        </SelectTrigger>
	                        <SelectContent className="bg-gray-900 border-gray-700 text-white">
	                          {availableCameras.length === 0 ? (
	                            <div className="px-2 py-1.5 text-xs text-gray-500">
	                              No cameras detected
	                            </div>
	                          ) : (
	                            availableCameras.map((cam) => (
	                              <SelectItem
	                                key={cam.index}
	                                value={String(cam.index)}
	                              >
	                                #{cam.index} — {cam.name}
	                              </SelectItem>
	                            ))
	                          )}
	                        </SelectContent>
	                      </Select>
	                      <CameraPreview deviceId={bound?.deviceId ?? ""} paused={submitting} />
	                    </div>
	                  );
	                })}
	              </div>
	            )}
	          </div>

	          <div className="flex flex-col sm:flex-row gap-4 justify-center pt-4">
	            <Button
	              onClick={handleStart}
	              disabled={!canStart}
	              className="w-full sm:w-auto bg-green-500 hover:bg-green-600 text-white px-10 py-6 text-lg disabled:opacity-40 disabled:cursor-not-allowed"
	            >
	              <Play className="w-5 h-5 mr-2" />
	              {submitting ? "Starting…" : "Start Inference"}
	            </Button>
	            <Button
	              onClick={() => onOpenChange(false)}
	              variant="outline"
	              className="w-full sm:w-auto border-gray-500 hover:border-gray-200 px-10 py-6 text-lg text-zinc-500 bg-zinc-900 hover:bg-zinc-800"
	            >
	              Cancel
	            </Button>
	          </div>
	        </div>
	      </DialogContent>
	    </Dialog>
	  );
	};

	export default InferenceModal;