feat: local dataset paths, API route, and versioned fetch routing

#4
by alexis779 - opened
.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ node_modules
2
+ .next
3
+ out
4
+ tsconfig.tsbuildinfo
5
+ next-env.d.ts
6
+ .env*.local
7
+ *.log
8
+ .DS_Store
.prettierignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .next
2
+ node_modules
3
+ tsconfig.tsbuildinfo
src/app/[org]/[dataset]/[episode]/episode-viewer.tsx CHANGED
@@ -34,6 +34,7 @@ import {
34
  type CrossEpisodeVarianceData,
35
  } from "./fetch-data";
36
  import { getDatasetVersionAndInfo } from "@/utils/versionUtils";
 
37
  import type { DatasetMetadata } from "@/utils/parquetUtils";
38
 
39
  const URDFViewer = lazy(() => import("@/components/urdf-viewer"));
@@ -614,14 +615,24 @@ function EpisodeViewerInner({
614
  </a>
615
 
616
  <div>
617
- <a
618
- href={`https://huggingface.co/datasets/${datasetInfo.repoId}`}
619
- target="_blank"
620
- >
621
- <p className="text-lg font-semibold">
622
- {datasetInfo.repoId}
623
  </p>
624
- </a>
 
 
 
 
 
 
 
 
 
 
625
 
626
  <p className="font-mono text-lg font-semibold">
627
  episode {episodeId}
@@ -701,6 +712,7 @@ function EpisodeViewerInner({
701
  <Suspense fallback={<Loading />}>
702
  <FilteringPanel
703
  repoId={datasetInfo.repoId}
 
704
  crossEpisodeData={crossEpData}
705
  crossEpisodeLoading={insightsLoading}
706
  episodeLengthStats={episodeLengthStats}
 
34
  type CrossEpisodeVarianceData,
35
  } from "./fetch-data";
36
  import { getDatasetVersionAndInfo } from "@/utils/versionUtils";
37
+ import { isLocalRepoId } from "@/utils/localDataset";
38
  import type { DatasetMetadata } from "@/utils/parquetUtils";
39
 
40
  const URDFViewer = lazy(() => import("@/components/urdf-viewer"));
 
615
  </a>
616
 
617
  <div>
618
+ {isLocalRepoId(datasetInfo.repoId) ? (
619
+ <p
620
+ className="text-lg font-semibold text-left max-w-md break-all"
621
+ title={datasetInfo.localDiskPath}
622
+ >
623
+ {datasetInfo.displayLabel ?? datasetInfo.repoId}
624
  </p>
625
+ ) : (
626
+ <a
627
+ href={`https://huggingface.co/datasets/${datasetInfo.repoId}`}
628
+ target="_blank"
629
+ rel="noopener noreferrer"
630
+ >
631
+ <p className="text-lg font-semibold">
632
+ {datasetInfo.displayLabel ?? datasetInfo.repoId}
633
+ </p>
634
+ </a>
635
+ )}
636
 
637
  <p className="font-mono text-lg font-semibold">
638
  episode {episodeId}
 
712
  <Suspense fallback={<Loading />}>
713
  <FilteringPanel
714
  repoId={datasetInfo.repoId}
715
+ localDiskPath={datasetInfo.localDiskPath}
716
  crossEpisodeData={crossEpData}
717
  crossEpisodeLoading={insightsLoading}
718
  episodeLengthStats={episodeLengthStats}
src/app/[org]/[dataset]/[episode]/fetch-data.ts CHANGED
@@ -20,6 +20,7 @@ import {
20
  buildV3EpisodesMetadataPath,
21
  } from "@/utils/stringFormatting";
22
  import { bigIntToNumber } from "@/utils/typeGuards";
 
23
  import type { VideoInfo, AdjacentEpisodeVideos } from "@/types";
24
 
25
  const SERIES_NAME_DELIMITER = CHART_CONFIG.SERIES_NAME_DELIMITER;
@@ -28,6 +29,10 @@ export type CameraInfo = { name: string; width: number; height: number };
28
 
29
  export type DatasetDisplayInfo = {
30
  repoId: string;
 
 
 
 
31
  total_frames: number;
32
  total_episodes: number;
33
  fps: number;
@@ -321,6 +326,9 @@ export async function getEpisodeData(
321
  .filter(([, f]) => f.dtype === "video" && f.shape.length >= 2)
322
  .map(([name, f]) => ({ name, height: f.shape[0], width: f.shape[1] }));
323
 
 
 
 
324
  result.datasetInfo = {
325
  ...result.datasetInfo,
326
  robot_type: rawInfo.robot_type ?? null,
@@ -333,6 +341,8 @@ export async function getEpisodeData(
333
  10,
334
  ) / 10,
335
  cameras,
 
 
336
  };
337
 
338
  const progressGroup = await loadEpisodeProgressGroup(
 
20
  buildV3EpisodesMetadataPath,
21
  } from "@/utils/stringFormatting";
22
  import { bigIntToNumber } from "@/utils/typeGuards";
23
+ import { LOCAL_ORG, decodeLocalDatasetSegment } from "@/utils/localDataset";
24
  import type { VideoInfo, AdjacentEpisodeVideos } from "@/types";
25
 
26
  const SERIES_NAME_DELIMITER = CHART_CONFIG.SERIES_NAME_DELIMITER;
 
29
 
30
  export type DatasetDisplayInfo = {
31
  repoId: string;
32
+ /** Shown in the UI for local datasets (includes absolute path). */
33
+ displayLabel?: string;
34
+ /** Decoded dataset root on disk when using a local dataset. */
35
+ localDiskPath?: string;
36
  total_frames: number;
37
  total_episodes: number;
38
  fps: number;
 
326
  .filter(([, f]) => f.dtype === "video" && f.shape.length >= 2)
327
  .map(([name, f]) => ({ name, height: f.shape[0], width: f.shape[1] }));
328
 
329
+ const decodedPath =
330
+ org === LOCAL_ORG ? decodeLocalDatasetSegment(dataset) : null;
331
+
332
  result.datasetInfo = {
333
  ...result.datasetInfo,
334
  robot_type: rawInfo.robot_type ?? null,
 
341
  10,
342
  ) / 10,
343
  cameras,
344
+ displayLabel: decodedPath !== null ? `Local: ${decodedPath}` : undefined,
345
+ localDiskPath: decodedPath ?? undefined,
346
  };
347
 
348
  const progressGroup = await loadEpisodeProgressGroup(
src/app/[org]/[dataset]/[episode]/page.tsx CHANGED
@@ -1,5 +1,6 @@
1
  import EpisodeViewer from "./episode-viewer";
2
  import { Suspense } from "react";
 
3
 
4
  export const dynamic = "force-dynamic";
5
 
@@ -9,8 +10,12 @@ export async function generateMetadata({
9
  params: Promise<{ org: string; dataset: string; episode: string }>;
10
  }) {
11
  const { org, dataset, episode } = await params;
 
 
 
 
12
  return {
13
- title: `${org}/${dataset} | episode ${episode}`,
14
  };
15
  }
16
 
 
1
  import EpisodeViewer from "./episode-viewer";
2
  import { Suspense } from "react";
3
+ import { LOCAL_ORG, decodeLocalDatasetSegment } from "@/utils/localDataset";
4
 
5
  export const dynamic = "force-dynamic";
6
 
 
10
  params: Promise<{ org: string; dataset: string; episode: string }>;
11
  }) {
12
  const { org, dataset, episode } = await params;
13
+ const label =
14
+ org === LOCAL_ORG
15
+ ? (decodeLocalDatasetSegment(dataset) ?? `${org}/${dataset}`)
16
+ : `${org}/${dataset}`;
17
  return {
18
+ title: `${label} | episode ${episode}`,
19
  };
20
  }
21
 
src/app/api/local-dataset/file/route.ts ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { readLocalDatasetFileResponse } from "@/lib/local-dataset-fs";
2
+
3
/**
 * Route handler that serves one file from a local dataset folder.
 *
 * Reads two query parameters from the request URL: `root` (the encoded
 * dataset root) and `rel` (the file path relative to that root). When either
 * is missing or empty the handler answers 400 with a JSON error; otherwise the
 * work is delegated to `readLocalDatasetFileResponse`.
 */
export async function GET(request: Request) {
  const query = new URL(request.url).searchParams;
  const rootEncoded = query.get("root");
  const rel = query.get("rel");

  if (rootEncoded && rel) {
    return readLocalDatasetFileResponse(rootEncoded, rel, request);
  }

  return Response.json(
    { error: "Missing root or rel query parameter" },
    { status: 400 },
  );
}
src/app/page.tsx CHANGED
@@ -3,6 +3,11 @@ import { useEffect, useRef, useState, useCallback, Suspense } from "react";
3
  import Link from "next/link";
4
  import { useRouter } from "next/navigation";
5
  import { useSearchParams } from "next/navigation";
 
 
 
 
 
6
 
7
  declare global {
8
  interface Window {
@@ -189,10 +194,16 @@ function HomeInner() {
189
 
190
  const handleSubmit = (e: { preventDefault: () => void }) => {
191
  e.preventDefault();
 
 
 
 
 
 
192
  const target =
193
  activeIndex >= 0 && suggestions[activeIndex]
194
  ? suggestions[activeIndex]
195
- : query.trim();
196
  if (target) navigate(target);
197
  };
198
 
@@ -261,7 +272,7 @@ function HomeInner() {
261
  onChange={(e) => setQuery(e.target.value)}
262
  onKeyDown={handleKeyDown}
263
  onFocus={() => query.trim() && setShowSuggestions(true)}
264
- placeholder="Enter dataset id (e.g. lerobot/pusht)"
265
  className="pl-10 pr-4 py-2.5 rounded-md text-base text-white bg-white/10 backdrop-blur-sm border border-white/30 focus:outline-none focus:border-sky-400 focus:bg-white/15 w-[380px] shadow-md placeholder:text-white/40 transition-colors"
266
  autoComplete="off"
267
  />
 
3
  import Link from "next/link";
4
  import { useRouter } from "next/navigation";
5
  import { useSearchParams } from "next/navigation";
6
+ import {
7
+ encodeLocalDatasetRoot,
8
+ looksLikeAbsoluteFilesystemPath,
9
+ normalizeUserLocalPathInput,
10
+ } from "@/utils/localDataset";
11
 
12
  declare global {
13
  interface Window {
 
194
 
195
  const handleSubmit = (e: { preventDefault: () => void }) => {
196
  e.preventDefault();
197
+ const typed = query.trim();
198
+ if (looksLikeAbsoluteFilesystemPath(typed)) {
199
+ const path = normalizeUserLocalPathInput(typed);
200
+ navigate(`/local/${encodeLocalDatasetRoot(path)}`);
201
+ return;
202
+ }
203
  const target =
204
  activeIndex >= 0 && suggestions[activeIndex]
205
  ? suggestions[activeIndex]
206
+ : typed;
207
  if (target) navigate(target);
208
  };
209
 
 
272
  onChange={(e) => setQuery(e.target.value)}
273
  onKeyDown={handleKeyDown}
274
  onFocus={() => query.trim() && setShowSuggestions(true)}
275
+ placeholder="Dataset id or local path (e.g. /tmp/my_dataset)"
276
  className="pl-10 pr-4 py-2.5 rounded-md text-base text-white bg-white/10 backdrop-blur-sm border border-white/30 focus:outline-none focus:border-sky-400 focus:bg-white/15 w-[380px] shadow-md placeholder:text-white/40 transition-colors"
277
  autoComplete="off"
278
  />
src/components/filtering-panel.tsx CHANGED
@@ -225,6 +225,8 @@ function EpisodeLengthFilter({ episodes }: { episodes: EpisodeLengthInfo[] }) {
225
 
226
  interface FilteringPanelProps {
227
  repoId: string;
 
 
228
  crossEpisodeData: CrossEpisodeVarianceData | null;
229
  crossEpisodeLoading: boolean;
230
  episodeLengthStats: EpisodeLengthStats | null;
@@ -234,9 +236,11 @@ interface FilteringPanelProps {
234
 
235
  function FlaggedIdsCopyBar({
236
  repoId,
 
237
  onViewEpisodes,
238
  }: {
239
  repoId: string;
 
240
  onViewEpisodes?: () => void;
241
  }) {
242
  const { flagged, count, clear } = useFlaggedEpisodes();
@@ -331,19 +335,32 @@ function FlaggedIdsCopyBar({
331
  </button>
332
  )}
333
  <div className="bg-slate-900/60 rounded-md px-3 py-2 border border-slate-700/60 space-y-2.5">
334
- <p className="text-xs text-slate-400">
335
- <a
336
- href="https://github.com/huggingface/lerobot"
337
- target="_blank"
338
- rel="noopener noreferrer"
339
- className="text-orange-400 underline"
340
- >
341
- LeRobot CLI
342
- </a>{" "}
343
- delete flagged episodes:
344
- </p>
345
- <pre className="text-xs text-slate-300 bg-slate-950/50 rounded px-2 py-1.5 overflow-x-auto select-all">{`# Delete episodes (modifies original dataset)\nlerobot-edit-dataset \\\n --repo_id ${repoId} \\\n --operation.type delete_episodes \\\n --operation.episode_indices "[${ids.join(", ")}]"`}</pre>
346
- <pre className="text-xs text-slate-300 bg-slate-950/50 rounded px-2 py-1.5 overflow-x-auto select-all">{`# Delete episodes and save to a new dataset (preserves original)\nlerobot-edit-dataset \\\n --repo_id ${repoId} \\\n --new_repo_id ${repoId}_filtered \\\n --operation.type delete_episodes \\\n --operation.episode_indices "[${ids.join(", ")}]"`}</pre>
 
 
 
 
 
 
 
 
 
 
 
 
 
347
  </div>
348
  </div>
349
  );
@@ -351,6 +368,7 @@ function FlaggedIdsCopyBar({
351
 
352
  function FilteringPanel({
353
  repoId,
 
354
  crossEpisodeData,
355
  crossEpisodeLoading,
356
  episodeLengthStats,
@@ -369,6 +387,7 @@ function FilteringPanel({
369
 
370
  <FlaggedIdsCopyBar
371
  repoId={repoId}
 
372
  onViewEpisodes={onViewFlaggedEpisodes}
373
  />
374
 
 
225
 
226
  interface FilteringPanelProps {
227
  repoId: string;
228
+ /** When set, Hugging Face CLI snippets are hidden (local folder dataset). */
229
+ localDiskPath?: string;
230
  crossEpisodeData: CrossEpisodeVarianceData | null;
231
  crossEpisodeLoading: boolean;
232
  episodeLengthStats: EpisodeLengthStats | null;
 
236
 
237
  function FlaggedIdsCopyBar({
238
  repoId,
239
+ localDiskPath,
240
  onViewEpisodes,
241
  }: {
242
  repoId: string;
243
+ localDiskPath?: string;
244
  onViewEpisodes?: () => void;
245
  }) {
246
  const { flagged, count, clear } = useFlaggedEpisodes();
 
335
  </button>
336
  )}
337
  <div className="bg-slate-900/60 rounded-md px-3 py-2 border border-slate-700/60 space-y-2.5">
338
+ {localDiskPath ? (
339
+ <p className="text-xs text-slate-400 leading-relaxed">
340
+ This is a local folder dataset (not a Hugging Face repo). Flagged
341
+ indices are listed above; edit or remove episodes on disk under{" "}
342
+ <span className="text-slate-200 font-mono break-all">
343
+ {localDiskPath}
344
+ </span>
345
+ .
346
+ </p>
347
+ ) : (
348
+ <>
349
+ <p className="text-xs text-slate-400">
350
+ <a
351
+ href="https://github.com/huggingface/lerobot"
352
+ target="_blank"
353
+ rel="noopener noreferrer"
354
+ className="text-orange-400 underline"
355
+ >
356
+ LeRobot CLI
357
+ </a>{" "}
358
+ — delete flagged episodes:
359
+ </p>
360
+ <pre className="text-xs text-slate-300 bg-slate-950/50 rounded px-2 py-1.5 overflow-x-auto select-all">{`# Delete episodes (modifies original dataset)\nlerobot-edit-dataset \\\n --repo_id ${repoId} \\\n --operation.type delete_episodes \\\n --operation.episode_indices "[${ids.join(", ")}]"`}</pre>
361
+ <pre className="text-xs text-slate-300 bg-slate-950/50 rounded px-2 py-1.5 overflow-x-auto select-all">{`# Delete episodes and save to a new dataset (preserves original)\nlerobot-edit-dataset \\\n --repo_id ${repoId} \\\n --new_repo_id ${repoId}_filtered \\\n --operation.type delete_episodes \\\n --operation.episode_indices "[${ids.join(", ")}]"`}</pre>
362
+ </>
363
+ )}
364
  </div>
365
  </div>
366
  );
 
368
 
369
  function FilteringPanel({
370
  repoId,
371
+ localDiskPath,
372
  crossEpisodeData,
373
  crossEpisodeLoading,
374
  episodeLengthStats,
 
387
 
388
  <FlaggedIdsCopyBar
389
  repoId={repoId}
390
+ localDiskPath={localDiskPath}
391
  onViewEpisodes={onViewFlaggedEpisodes}
392
  />
393
 
src/components/stats-panel.tsx CHANGED
@@ -123,8 +123,8 @@ function StatsPanel({
123
  <div>
124
  <h2 className="text-xl text-slate-100">
125
  <span className="font-bold">Dataset Statistics:</span>{" "}
126
- <span className="font-normal text-slate-400">
127
- {datasetInfo.repoId}
128
  </span>
129
  </h2>
130
  </div>
 
123
  <div>
124
  <h2 className="text-xl text-slate-100">
125
  <span className="font-bold">Dataset Statistics:</span>{" "}
126
+ <span className="font-normal text-slate-400 break-all">
127
+ {datasetInfo.displayLabel ?? datasetInfo.repoId}
128
  </span>
129
  </h2>
130
  </div>
src/lib/local-dataset-fs.ts ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fs from "fs/promises";
2
+ import path from "path";
3
+ import { decodeLocalDatasetSegment } from "@/utils/localDataset";
4
+
5
/**
 * Decodes an encoded dataset-root segment and resolves it to an absolute path.
 *
 * @param rootEncoded Encoded root as produced for local dataset URLs.
 * @returns The resolved path, or `null` when decoding fails or yields an
 *   empty string.
 */
export function resolveLocalDatasetRoot(rootEncoded: string): string | null {
  const decodedRoot = decodeLocalDatasetSegment(rootEncoded);
  return decodedRoot ? path.resolve(decodedRoot) : null;
}
10
+
11
/**
 * Resolves `rel` beneath `root`, refusing anything that could escape `root`.
 *
 * Both `/` and `\` are accepted as separators in `rel`; empty segments are
 * ignored. The call is rejected (returns `null`) when `rel` is empty, contains
 * a NUL character, has no non-empty segment, contains a `..` segment, or when
 * the resolved result is neither `root` itself nor located under it.
 *
 * @param root Directory the result must stay inside.
 * @param rel  Path relative to `root`.
 * @returns The resolved absolute path, or `null` when rejected.
 */
export function safePathUnderRoot(root: string, rel: string): string | null {
  if (!rel || rel.indexOf("\0") !== -1) return null;

  // Collect the usable segments in one pass, bailing out on any "..".
  const parts: string[] = [];
  for (const part of rel.split(/[\\/]/)) {
    if (part.length === 0) continue;
    if (part === "..") return null;
    parts.push(part);
  }
  if (parts.length === 0) return null;

  const base = path.resolve(root);
  const candidate = path.resolve(base, ...parts);
  if (candidate === base) return candidate;

  // Compare against "base + separator" so that "/data/rootX" is not mistaken
  // for a child of "/data/root".
  const alreadyTerminated = base.endsWith(path.sep) || base.endsWith("/");
  const basePrefix = alreadyTerminated ? base : base + path.sep;
  return candidate.startsWith(basePrefix) ? candidate : null;
}
33
+
34
/**
 * Picks a Content-Type from the file extension (case-insensitive).
 *
 * Known extensions: `.json`, `.jsonl`, `.mp4`, `.webm`. Everything else,
 * including `.parquet`, is reported as `application/octet-stream`.
 */
function guessContentType(filePath: string): string {
  const typeByExtension = new Map<string, string>([
    [".json", "application/json"],
    [".jsonl", "application/x-ndjson"],
    [".mp4", "video/mp4"],
    [".webm", "video/webm"],
    [".parquet", "application/octet-stream"],
  ]);
  const extension = path.extname(filePath).toLowerCase();
  return typeByExtension.get(extension) ?? "application/octet-stream";
}
51
+
52
/**
 * Parses an HTTP `Range` header of the form `bytes=<start>-<end>`,
 * `bytes=<start>-` or `bytes=-<suffixLength>` against a file of `size` bytes.
 *
 * Only the first range of a comma-separated list is considered. The end
 * position is clamped to the last byte of the file.
 *
 * @param rangeHeader Raw header value, or `null` when the header is absent.
 * @param size        Total size of the file in bytes.
 * @returns Inclusive `{ start, end }` byte positions, or `null` when the
 *   header is absent, uses another unit, is malformed, or cannot be satisfied
 *   (including any range on an empty file).
 */
function parseBytesRange(
  rangeHeader: string | null,
  size: number,
): { start: number; end: number } | null {
  const unitPrefix = "bytes=";
  if (!rangeHeader || !rangeHeader.startsWith(unitPrefix)) return null;

  // An empty file has no addressable byte, so no range can be satisfied.
  // Without this guard a suffix range would yield { start: 0, end: -1 }.
  if (size <= 0) return null;

  const spec = rangeHeader.slice(unitPrefix.length).split(",")[0]?.trim();
  if (!spec) return null;

  // Digits only on both sides: rejects inputs such as "0-1x" or "-5-10" that
  // a lenient parseInt would silently accept.
  const match = /^(\d*)-(\d*)$/.exec(spec);
  if (!match) return null;
  const startStr = match[1] ?? "";
  const endStr = match[2] ?? "";

  if (startStr === "") {
    // Suffix form: the last N bytes of the file.
    if (endStr === "") return null;
    const suffix = Number(endStr);
    if (suffix <= 0) return null;
    return { start: Math.max(0, size - suffix), end: size - 1 };
  }

  const start = Number(startStr);
  const end = endStr === "" ? size - 1 : Math.min(Number(endStr), size - 1);
  if (start > end || start >= size) return null;

  return { start, end };
}
82
+
83
/**
 * Builds the HTTP response for one file of a local dataset.
 *
 * Steps:
 *  1. Decode and resolve the dataset root (400 "Invalid root" on failure).
 *  2. Resolve `rel` beneath the root (400 "Invalid path" when rejected).
 *  3. `stat` the target (404 when that fails, 400 when it is not a regular file).
 *  4. If the request carries a usable `Range` header, answer 206 with just the
 *     requested bytes; otherwise answer 200 with the whole file.
 *
 * Range requests are honoured for every content type, in line with the
 * `Accept-Ranges: bytes` header this function sends, so a ranged read of a
 * large non-video file no longer loads and returns the entire file.
 *
 * @param rootEncoded Encoded dataset root taken from the request.
 * @param rel         File path relative to the dataset root.
 * @param request     Incoming request; only its `range` header is read.
 * @returns The response to send to the client.
 */
export async function readLocalDatasetFileResponse(
  rootEncoded: string,
  rel: string,
  request: Request,
): Promise<Response> {
  const root = resolveLocalDatasetRoot(rootEncoded);
  if (!root) {
    return Response.json({ error: "Invalid root" }, { status: 400 });
  }

  const fullPath = safePathUnderRoot(root, rel);
  if (!fullPath) {
    return Response.json({ error: "Invalid path" }, { status: 400 });
  }

  let stat;
  try {
    stat = await fs.stat(fullPath);
  } catch {
    return Response.json({ error: "Not found" }, { status: 404 });
  }

  if (!stat.isFile()) {
    return Response.json({ error: "Not a file" }, { status: 400 });
  }

  const size = stat.size;
  const contentType = guessContentType(fullPath);
  const range = parseBytesRange(request.headers.get("range"), size);

  if (range) {
    const { start, end } = range;
    const length = end - start + 1;
    const buf = Buffer.alloc(length);

    // A single read() may return fewer bytes than requested, so keep reading
    // until the buffer is full or the file ends.
    let filled = 0;
    const fh = await fs.open(fullPath, "r");
    try {
      while (filled < length) {
        const { bytesRead } = await fh.read(
          buf,
          filled,
          length - filled,
          start + filled,
        );
        if (bytesRead === 0) break;
        filled += bytesRead;
      }
    } finally {
      await fh.close();
    }

    if (filled < length) {
      // The file is shorter than stat() reported (it changed underneath us);
      // do not send a body that disagrees with Content-Length.
      return Response.json(
        { error: "Range not satisfiable" },
        { status: 416, headers: { "Content-Range": `bytes */${size}` } },
      );
    }

    return new Response(new Uint8Array(buf), {
      status: 206,
      headers: {
        "Content-Type": contentType,
        "Content-Length": String(length),
        "Content-Range": `bytes ${start}-${end}/${size}`,
        "Accept-Ranges": "bytes",
      },
    });
  }

  const body = await fs.readFile(fullPath);
  return new Response(new Uint8Array(body), {
    status: 200,
    headers: {
      "Content-Type": contentType,
      "Content-Length": String(size),
      "Accept-Ranges": "bytes",
    },
  });
}
src/utils/__tests__/localDataset.test.ts ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { describe, expect, test } from "bun:test";
2
+ import {
3
+ decodeLocalDatasetSegment,
4
+ encodeLocalDatasetRoot,
5
+ isLocalRepoId,
6
+ localDatasetFileApiPath,
7
+ localRepoIdFromAbsolutePath,
8
+ looksLikeAbsoluteFilesystemPath,
9
+ normalizeUserLocalPathInput,
10
+ tryParseLocalDatasetFileApiUrl,
11
+ } from "@/utils/localDataset";
12
+
13
// Unit tests for the local-dataset URL helpers in "@/utils/localDataset".
describe("localDataset", () => {
  // Encoding a path and decoding the result must give back the same path.
  test("encode/decode round trip for unix path", () => {
    const unixPath = "/tmp/aic_lerobot";
    const encoded = encodeLocalDatasetRoot(unixPath);
    const decoded = decodeLocalDatasetSegment(encoded);
    expect(decoded).toBe(unixPath);
  });

  // Ids built from a path are recognised as local; Hub-style ids are not.
  test("localRepoIdFromAbsolutePath and isLocalRepoId", () => {
    const localId = localRepoIdFromAbsolutePath("/data/foo");
    expect(isLocalRepoId(localId)).toBe(true);
    expect(isLocalRepoId("lerobot/pusht")).toBe(false);
  });

  test("looksLikeAbsoluteFilesystemPath", () => {
    const expectations: Array<[string, boolean]> = [
      ["/tmp/x", true],
      ["file:///tmp/x", true],
      ["C:\\data\\x", true],
      ["lerobot/foo", false],
    ];
    for (const [input, expected] of expectations) {
      expect(looksLikeAbsoluteFilesystemPath(input)).toBe(expected);
    }
  });

  test("normalizeUserLocalPathInput strips file://", () => {
    const normalized = normalizeUserLocalPathInput("file:///tmp/a");
    expect(normalized).toBe("/tmp/a");
  });

  test("localDatasetFileApiPath builds query string", () => {
    const apiPath = localDatasetFileApiPath("abc", "meta/info.json");
    for (const fragment of ["/api/local-dataset/file?", "root=abc", "rel="]) {
      expect(apiPath).toContain(fragment);
    }
  });

  // Both absolute and app-relative forms of the API URL must parse.
  test("tryParseLocalDatasetFileApiUrl", () => {
    const absoluteUrl =
      "http://127.0.0.1:3000/api/local-dataset/file?root=x&rel=meta%2Finfo.json";
    const relativeUrl = "/api/local-dataset/file?root=y&rel=data%2F0.parquet";

    expect(tryParseLocalDatasetFileApiUrl(absoluteUrl)).toEqual({
      rootEncoded: "x",
      relPath: "meta/info.json",
    });
    expect(tryParseLocalDatasetFileApiUrl(relativeUrl)).toEqual({
      rootEncoded: "y",
      relPath: "data/0.parquet",
    });
  });
});
src/utils/__tests__/versionUtils.test.ts CHANGED
@@ -55,6 +55,14 @@ describe("buildVersionedUrl", () => {
55
  "https://huggingface.co/datasets/myorg/mydataset/resolve/main/meta/info.json",
56
  );
57
  });
 
 
 
 
 
 
 
 
58
  });
59
 
60
  // ---------------------------------------------------------------------------
 
55
  "https://huggingface.co/datasets/myorg/mydataset/resolve/main/meta/info.json",
56
  );
57
  });
58
+
59
+ test("builds local API URL for dataset files", () => {
60
+ const url = buildVersionedUrl("local/abc123", "v3.0", "meta/info.json");
61
+ expect(url.startsWith("/api/local-dataset/file?")).toBe(true);
62
+ const params = new URLSearchParams(url.split("?")[1]!);
63
+ expect(params.get("root")).toBe("abc123");
64
+ expect(params.get("rel")).toBe("meta/info.json");
65
+ });
66
  });
67
 
68
  // ---------------------------------------------------------------------------
src/utils/internalFetch.ts ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Server-side code (e.g. server actions) cannot use relative fetch URLs;
 * prefix app-relative URLs with the app origin.
 *
 * - In the browser (`window` defined) the URL is returned untouched.
 * - URLs that do not start with `/` (absolute `http(s)://` URLs included) are
 *   returned untouched.
 * - Otherwise the origin is `NEXT_PUBLIC_APP_URL` with trailing slashes
 *   removed, falling back to `http://127.0.0.1:<PORT>` (port 3000 by default).
 *
 * Environment variables that are set but blank are treated as unset, so an
 * empty `NEXT_PUBLIC_APP_URL=` or `PORT=` entry cannot produce a relative or
 * port-less URL.
 *
 * @param url URL about to be passed to `fetch`.
 * @returns A URL usable from the current runtime.
 */
export function resolveInternalFetchUrl(url: string): string {
  if (typeof window !== "undefined") return url;
  // Absolute http(s) URLs never start with "/", so this one check leaves both
  // absolute URLs and non-rooted strings as they are.
  if (!url.startsWith("/")) return url;

  const configuredOrigin = process.env.NEXT_PUBLIC_APP_URL?.trim().replace(
    /\/+$/,
    "",
  );
  const port = process.env.PORT?.trim() || "3000";
  // `||` rather than `??`: an empty string must fall through to the default.
  const base = configuredOrigin || `http://127.0.0.1:${port}`;
  return `${base}${url}`;
}
src/utils/localDataset.ts ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Local LeRobot dataset roots (absolute filesystem paths) are encoded in URLs as
3
+ * `/local/<base64url(utf8 path)>/episode_0`. File bytes are served via
4
+ * `/api/local-dataset/file?root=...&rel=...`.
5
+ */
6
+
7
/** "org" path segment that marks a dataset URL as a local-folder dataset (`/local/<encoded root>/...`). */
+ export const LOCAL_ORG = "local";
8
/** Prefix carried by local repo ids (`local/`); the encoded dataset root follows it directly. */
+ export const LOCAL_REPO_PREFIX = `${LOCAL_ORG}/`;
9
+
10
/**
 * Encodes a string as unpadded base64url of its UTF-8 bytes.
 *
 * Uses `btoa` when the runtime provides it and `Buffer` otherwise, then maps
 * `+` to `-`, `/` to `_` and drops trailing `=` padding.
 */
function utf8ToBase64Url(s: string): string {
  const utf8 = new TextEncoder().encode(s);

  let standard: string;
  if (typeof btoa !== "undefined") {
    // btoa expects a "binary string": one character per byte.
    const binary = Array.from(utf8, (byte) => String.fromCharCode(byte)).join(
      "",
    );
    standard = btoa(binary);
  } else {
    standard = Buffer.from(utf8).toString("base64");
  }

  const unpadded = standard.replace(/=+$/, "");
  return unpadded.replace(/\+/g, "-").replace(/\//g, "_");
}
22
+
23
/**
 * Decodes unpadded base64url text back into a string, interpreting the
 * decoded bytes as UTF-8.
 *
 * Padding is restored and the URL-safe alphabet is mapped back to standard
 * base64 before decoding with `atob` (or `Buffer` when `atob` is missing).
 */
function base64UrlToUtf8(s: string): string {
  const missingPadding = (4 - (s.length % 4)) % 4;
  const standard =
    s.replace(/-/g, "+").replace(/_/g, "/") + "=".repeat(missingPadding);

  const binary =
    typeof atob !== "undefined"
      ? atob(standard)
      : Buffer.from(standard, "base64").toString("binary");

  // Each character of the binary string holds exactly one byte value.
  const decodedBytes = Uint8Array.from(binary, (ch) => ch.charCodeAt(0));
  return new TextDecoder().decode(decodedBytes);
}
38
+
39
/**
 * Normalizes what a user typed as a local dataset location.
 *
 * The input is trimmed. A `file://` URL is converted to its percent-decoded
 * path; for Windows drive paths the leading slash that URL parsing leaves in
 * front of the drive letter is removed (`file:///C:/data` becomes `C:/data`).
 * Anything else, and any `file://` value that fails to parse or decode, is
 * returned trimmed but otherwise unchanged.
 *
 * @param input Raw text from the user.
 * @returns The normalized path-like string.
 */
export function normalizeUserLocalPathInput(input: string): string {
  const trimmed = input.trim();
  if (!trimmed.startsWith("file://")) return trimmed;

  try {
    const pathname = decodeURIComponent(new URL(trimmed).pathname);
    // URL pathnames always start with "/", so a drive path arrives as
    // "/C:/..."; drop that slash so the result is a usable Windows path.
    return /^\/[A-Za-z]:[\\/]/.test(pathname) ? pathname.slice(1) : pathname;
  } catch {
    // Unparseable URL or malformed percent-escape: hand back the raw text.
    return trimmed;
  }
}
51
+
52
/**
 * Tells whether user input looks like an absolute filesystem path.
 *
 * The input is first passed through `normalizeUserLocalPathInput`. The result
 * counts as absolute when it starts with `/` or with a drive letter followed
 * by `:` and a slash or backslash. Empty input is never absolute.
 */
export function looksLikeAbsoluteFilesystemPath(input: string): boolean {
  const candidate = normalizeUserLocalPathInput(input);
  if (candidate.length === 0) return false;
  return candidate.charAt(0) === "/" || /^[A-Za-z]:[\\/]/.test(candidate);
}
58
+
59
/**
 * Encodes a dataset root path as the base64url segment used in local
 * dataset URLs.
 */
export function encodeLocalDatasetRoot(absolutePath: string): string {
  const segment = utf8ToBase64Url(absolutePath);
  return segment;
}
62
+
63
/**
 * Decodes a base64url URL segment back into the dataset root path.
 *
 * @returns The decoded string, or `null` when the segment is empty or the
 *   decoder throws.
 */
export function decodeLocalDatasetSegment(
  encodedSegment: string,
): string | null {
  if (!encodedSegment) return null;

  let decoded: string | null;
  try {
    decoded = base64UrlToUtf8(encodedSegment);
  } catch {
    decoded = null;
  }
  return decoded;
}
73
+
74
/**
 * Builds the repo id for a local dataset: the local prefix followed by the
 * encoded root path.
 */
export function localRepoIdFromAbsolutePath(absolutePath: string): string {
  const encodedRoot = encodeLocalDatasetRoot(absolutePath);
  return LOCAL_REPO_PREFIX + encodedRoot;
}
77
+
78
/**
 * Tells whether a repo id refers to a local dataset: it must start with the
 * local prefix and have at least one character after it.
 */
export function isLocalRepoId(repoId: string): boolean {
  if (!repoId.startsWith(LOCAL_REPO_PREFIX)) return false;
  return repoId.length > LOCAL_REPO_PREFIX.length;
}
84
+
85
/**
 * Builds the app-relative URL of the local file API for one dataset file.
 *
 * @param rootEncoded Encoded dataset root, sent as the `root` parameter.
 * @param relPath     File path relative to the root, sent as `rel`.
 * @returns `/api/local-dataset/file?root=...&rel=...` with both values
 *   URL-encoded.
 */
export function localDatasetFileApiPath(
  rootEncoded: string,
  relPath: string,
): string {
  const query = new URLSearchParams();
  query.set("root", rootEncoded);
  query.set("rel", relPath);
  return "/api/local-dataset/file?" + query.toString();
}
95
+
96
/**
 * Extracts `root` and `rel` from a local file API URL, absolute or relative.
 *
 * The URL must contain `/api/local-dataset/file` followed (anywhere later in
 * the string) by a `?`; the text after that `?` is parsed as a query string.
 *
 * @returns `{ rootEncoded, relPath }`, or `null` when the route marker or the
 *   query is missing, or when either parameter is absent or empty.
 */
export function tryParseLocalDatasetFileApiUrl(
  url: string,
): { rootEncoded: string; relPath: string } | null {
  const route = "/api/local-dataset/file";

  const routeAt = url.indexOf(route);
  if (routeAt < 0) return null;

  const tail = url.substring(routeAt + route.length);
  const questionMarkAt = tail.indexOf("?");
  if (questionMarkAt < 0) return null;

  try {
    const query = new URLSearchParams(tail.substring(questionMarkAt + 1));
    const rootEncoded = query.get("root");
    const relPath = query.get("rel");
    return rootEncoded && relPath ? { rootEncoded, relPath } : null;
  } catch {
    return null;
  }
}
src/utils/parquetUtils.ts CHANGED
@@ -5,6 +5,7 @@ import {
5
  parquetReadObjects,
6
  type AsyncBuffer,
7
  } from "hyparquet";
 
8
 
9
  export interface DatasetMetadata {
10
  codebase_version: string;
@@ -56,8 +57,9 @@ export async function fetchParquetFile(url: string): Promise<ParquetFile> {
56
  const cached = parquetFileCache.get(url);
57
  if (cached) return cached;
58
 
 
59
  const file = await asyncBufferFromUrl({
60
- url,
61
  requestInit: { cache: "no-store" },
62
  });
63
  const wrapped = cachedAsyncBuffer(file);
 
5
  parquetReadObjects,
6
  type AsyncBuffer,
7
  } from "hyparquet";
8
+ import { resolveInternalFetchUrl } from "@/utils/internalFetch";
9
 
10
  export interface DatasetMetadata {
11
  codebase_version: string;
 
57
  const cached = parquetFileCache.get(url);
58
  if (cached) return cached;
59
 
60
+ const fetchUrl = resolveInternalFetchUrl(url);
61
  const file = await asyncBufferFromUrl({
62
+ url: fetchUrl,
63
  requestInit: { cache: "no-store" },
64
  });
65
  const wrapped = cachedAsyncBuffer(file);
src/utils/versionUtils.ts CHANGED
@@ -2,6 +2,13 @@
2
  * Utility functions for checking dataset version compatibility
3
  */
4
 
 
 
 
 
 
 
 
5
  const DATASET_URL =
6
  process.env.DATASET_URL || "https://huggingface.co/datasets";
7
 
@@ -73,16 +80,26 @@ export async function getDatasetInfo(repoId: string): Promise<DatasetInfo> {
73
  console.log(`[perf] getDatasetInfo cache MISS for ${repoId} — fetching`);
74
 
75
  try {
76
- const testUrl = `${DATASET_URL}/${repoId}/resolve/main/meta/info.json`;
77
-
78
  const controller = new AbortController();
79
  const timeoutId = setTimeout(() => controller.abort(), 10000);
80
 
81
- const response = await fetch(testUrl, {
82
- method: "GET",
83
- cache: "no-store",
84
- signal: controller.signal,
85
- });
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
  clearTimeout(timeoutId);
88
 
@@ -149,5 +166,9 @@ export function buildVersionedUrl(
149
  version: string,
150
  path: string,
151
  ): string {
 
 
 
 
152
  return `${DATASET_URL}/${repoId}/resolve/main/${path}`;
153
  }
 
2
  * Utility functions for checking dataset version compatibility
3
  */
4
 
5
+ import {
6
+ isLocalRepoId,
7
+ localDatasetFileApiPath,
8
+ LOCAL_REPO_PREFIX,
9
+ } from "@/utils/localDataset";
10
+ import { resolveInternalFetchUrl } from "@/utils/internalFetch";
11
+
12
  const DATASET_URL =
13
  process.env.DATASET_URL || "https://huggingface.co/datasets";
14
 
 
80
  console.log(`[perf] getDatasetInfo cache MISS for ${repoId} — fetching`);
81
 
82
  try {
 
 
83
  const controller = new AbortController();
84
  const timeoutId = setTimeout(() => controller.abort(), 10000);
85
 
86
+ let response: Response;
87
+ if (isLocalRepoId(repoId)) {
88
+ const rootEncoded = repoId.slice(LOCAL_REPO_PREFIX.length);
89
+ const localUrl = localDatasetFileApiPath(rootEncoded, "meta/info.json");
90
+ response = await fetch(resolveInternalFetchUrl(localUrl), {
91
+ method: "GET",
92
+ cache: "no-store",
93
+ signal: controller.signal,
94
+ });
95
+ } else {
96
+ const testUrl = `${DATASET_URL}/${repoId}/resolve/main/meta/info.json`;
97
+ response = await fetch(testUrl, {
98
+ method: "GET",
99
+ cache: "no-store",
100
+ signal: controller.signal,
101
+ });
102
+ }
103
 
104
  clearTimeout(timeoutId);
105
 
 
166
  version: string,
167
  path: string,
168
  ): string {
169
+ if (isLocalRepoId(repoId)) {
170
+ const rootEncoded = repoId.slice(LOCAL_REPO_PREFIX.length);
171
+ return localDatasetFileApiPath(rootEncoded, path);
172
+ }
173
  return `${DATASET_URL}/${repoId}/resolve/main/${path}`;
174
  }