// Hugging Face Dataset Server API Service

import type {
  DatasetRowsResponse,
  DatasetFirstRowsResponse,
  DatasetSplitsResponse,
  DatasetError,
} from "./types";
import yaml from "js-yaml";

const BASE_URL = "https://datasets-server.huggingface.co";

/** Build request headers, attaching a bearer token only when one is supplied. */
function buildAuthHeaders(token?: string): Record<string, string> {
  const headers: Record<string, string> = {};
  if (token) {
    headers["Authorization"] = `Bearer ${token}`;
  }
  return headers;
}

/** Build a dataset-server URL with every query value URI-encoded. */
function buildUrl(
  endpoint: string,
  params: Record<string, string | number>
): string {
  const query = Object.entries(params)
    .map(([key, value]) => `${key}=${encodeURIComponent(String(value))}`)
    .join("&");
  return `${BASE_URL}/${endpoint}?${query}`;
}

/**
 * Extract the `error` message from a failed response, falling back to
 * `<fallbackPrefix>: <statusText>` when the body is missing, is not JSON
 * (e.g. an HTML gateway page), or carries no usable message.
 */
async function readErrorMessage(
  response: Response,
  fallbackPrefix: string
): Promise<string> {
  const fallback = `${fallbackPrefix}: ${response.statusText}`;
  try {
    const body: unknown = await response.json();
    if (typeof body === "object" && body !== null && "error" in body) {
      const message = (body as Partial<DatasetError>).error;
      if (typeof message === "string" && message) {
        return message;
      }
    }
  } catch {
    // Body was not valid JSON; report the HTTP status text instead.
  }
  return fallback;
}

/**
 * GET a JSON document, throwing an `Error` on any non-2xx status.
 * The payload is asserted to be `T`; it is not validated at runtime.
 */
async function getJson<T>(
  url: string,
  token: string | undefined,
  failurePrefix: string
): Promise<T> {
  const response = await fetch(url, { headers: buildAuthHeaders(token) });
  if (!response.ok) {
    throw new Error(await readErrorMessage(response, failurePrefix));
  }
  return (await response.json()) as T;
}

/**
 * Fetch the first 100 rows of a dataset
 * @param dataset Dataset identifier
 * @param config Configuration name (defaults to "default")
 * @param split Split name (defaults to "train")
 * @param token Optional bearer token sent in the Authorization header
 * @throws Error when the server answers with a non-2xx status
 */
export async function fetchFirstRows(
  dataset: string,
  config: string = "default",
  split: string = "train",
  token?: string
): Promise<DatasetFirstRowsResponse> {
  const url = buildUrl("first-rows", { dataset, config, split });
  return getJson<DatasetFirstRowsResponse>(
    url,
    token,
    "Failed to fetch dataset"
  );
}

/**
 * Fetch paginated rows from a dataset
 * @param dataset Dataset identifier
 * @param config Configuration name (defaults to "default")
 * @param split Split name (defaults to "train")
 * @param offset Starting row index (0-based)
 * @param length Number of rows to fetch (max 100)
 * @param token Optional bearer token sent in the Authorization header
 * @throws Error when the server answers with a non-2xx status
 */
export async function fetchRows(
  dataset: string,
  config: string = "default",
  split: string = "train",
  offset: number = 0,
  length: number = 100,
  token?: string
): Promise<DatasetRowsResponse> {
  const url = buildUrl("rows", { dataset, config, split, offset, length });
  return getJson<DatasetRowsResponse>(url, token, "Failed to fetch dataset");
}

/**
 * List available splits for a dataset
 * @param dataset Dataset identifier
 * @param token Optional bearer token sent in the Authorization header
 * @throws Error when the server answers with a non-2xx status
 */
export async function fetchSplits(
  dataset: string,
  token?: string
): Promise<DatasetSplitsResponse> {
  const url = buildUrl("splits", { dataset });
  return getJson<DatasetSplitsResponse>(url, token, "Failed to fetch splits");
}

/**
 * Check if a dataset is valid and accessible
 *
 * Resolves to `false` (rather than throwing) for any non-2xx status.
 * Network failures and non-JSON bodies on a 2xx response still reject.
 * @param dataset Dataset identifier
 * @param token Optional bearer token sent in the Authorization header
 */
export async function checkDatasetValid(
  dataset: string,
  token?: string
): Promise<boolean> {
  const url = buildUrl("is-valid", { dataset });
  const response = await fetch(url, { headers: buildAuthHeaders(token) });
  if (!response.ok) {
    return false;
  }

  const data: unknown = await response.json();
  if (typeof data !== "object" || data === null) {
    return false;
  }
  const flags = data as Record<string, unknown>;
  // `valid` is the flag this client has always read.
  // NOTE(review): the current /is-valid documentation describes per-feature
  // flags (`viewer`, `preview`, ...) instead; accept those as well so the
  // check does not silently report every dataset as invalid - confirm
  // against the deployed API.
  return (
    flags["valid"] === true ||
    flags["viewer"] === true ||
    flags["preview"] === true
  );
}

/**
 * Fetch all rows from a jurisdiction config (for term/definition data)
 *
 * Pages through `fetchRows` in batches of 100 until `num_rows_total` rows
 * have been collected. Requests are sequential because each offset depends
 * on how many rows the previous page actually returned.
 * @throws Error when any page request fails
 */
export async function fetchAllRows(
  dataset: string,
  config: string,
  split: string = "train",
  token?: string
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- row shape is dataset-specific
): Promise<any[]> {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- row shape is dataset-specific
  const allRows: any[] = [];
  const batchSize = 100;
  let offset = 0;

  for (;;) {
    const response = await fetchRows(
      dataset,
      config,
      split,
      offset,
      batchSize,
      token
    );
    const rows = response.rows.map((r) => r.row);
    allRows.push(...rows);
    offset += rows.length;

    // An empty page means no further progress is possible; stop instead of
    // re-requesting the same offset forever when the reported total is
    // larger than what the server actually serves.
    if (rows.length === 0 || offset >= response.num_rows_total) {
      break;
    }
  }

  return allRows;
}

/**
 * Download the raw text of a file stored in a dataset repository.
 *
 * `dataset` and `filePath` are interpolated into the URL path as-is (their
 * slashes are path separators); only `branch` is URI-encoded.
 * @param dataset Repository id, e.g. "owner/dataset-name"
 * @param filePath Path inside the repository, e.g. "config.yaml"
 * @param branch Branch to read from (defaults to "main")
 * @param token Optional bearer token sent in the Authorization header
 * @throws Error when the server answers with a non-2xx status
 */
export async function fetchDatasetFileRaw(
  dataset: string, // "owner/dataset-name"
  filePath: string, // "config.yaml"
  branch = "main",
  token?: string
): Promise<string> {
  const url = `https://huggingface.co/datasets/${dataset}/raw/${encodeURIComponent(
    branch
  )}/${filePath}`;

  const res = await fetch(url, { headers: buildAuthHeaders(token) });
  if (!res.ok) {
    throw new Error(`Failed to fetch file: ${res.status} ${res.statusText}`);
  }
  return res.text();
}

/**
 * Shape that `fetchDatasetConfigYaml` returns for a dataset's YAML config.
 * Individual fields are not validated at runtime.
 */
export interface DatasetConfigYaml {
  scoring_model: string;
  scoring_provider: string;
  synthesis_model: string;
  synthesis_provider: string;
  jurisdiction1: string;
  jurisdiction2: string;
  description: string;
  grading_templates_1_uid: string;
  grading_templates_2_uid: string;
  generation_date?: string;
}

/**
 * Fetch and parse a dataset's YAML configuration file.
 * @param dataset Repository id, e.g. "owner/dataset-name"
 * @param filePath Path of the YAML file (defaults to "config.yaml")
 * @param branch Branch to read from (defaults to "main")
 * @param token Optional bearer token sent in the Authorization header
 * @throws Error when the download fails or the document is not a YAML mapping
 */
export async function fetchDatasetConfigYaml(
  dataset: string,
  filePath = "config.yaml",
  branch = "main",
  token?: string
): Promise<DatasetConfigYaml> {
  const text = await fetchDatasetFileRaw(dataset, filePath, branch, token);

  // SECURITY NOTE(review): this parses remotely hosted content. js-yaml 4.x
  // `load` uses a safe default schema; if the project pins js-yaml 3.x this
  // should be `safeLoad` instead - confirm the installed version.
  const parsed: unknown = yaml.load(text);

  // An empty file yields undefined and a scalar/list yields a non-mapping;
  // neither can satisfy DatasetConfigYaml, so fail loudly here rather than
  // at the first property access in a caller.
  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    throw new Error(
      `Invalid dataset config: expected a YAML mapping in ${filePath}`
    );
  }
  return parsed as DatasetConfigYaml;
}