open-navigator / frontend /src /utils /huggingface.ts
jcbowyer's picture
Clean HuggingFace deployment without binary files
61d29fc
/**
* HuggingFace Datasets Server API Client
*
* Query datasets hosted on HuggingFace Hub using the free Datasets Server API.
* No authentication required for public datasets!
*
* API Docs: https://huggingface.co/docs/datasets-server
*/
const DATASETS_SERVER_URL = 'https://datasets-server.huggingface.co';
export interface HFDatasetConfig {
dataset: string; // e.g., "CommunityOne/oral-health-nonprofits"
config?: string; // Default: "default"
split: string; // e.g., "organizations", "financials"
}
export interface HFRow {
row_idx: number;
row: Record<string, any>;
truncated_cells: string[];
}
export interface HFRowsResponse {
features: Array<{
feature_idx: number;
name: string;
type: any;
}>;
rows: HFRow[];
num_rows_total: number;
num_rows_per_page: number;
partial: boolean;
}
export interface HFSearchResponse {
features: Array<{
feature_idx: number;
name: string;
type: any;
}>;
rows: HFRow[];
num_rows_total: number;
num_rows_per_page: number;
partial: boolean;
truncated: boolean;
}
/**
* Fetch rows from a HuggingFace dataset
*
* @param config - Dataset configuration
* @param offset - Starting row (default: 0)
* @param length - Number of rows to fetch (max: 100)
* @returns Promise with dataset rows
*
* @example
* const nonprofits = await fetchHFRows({
* dataset: "CommunityOne/oral-health-nonprofits",
* split: "organizations"
* }, 0, 100);
*/
export async function fetchHFRows(
config: HFDatasetConfig,
offset = 0,
length = 100
): Promise<HFRowsResponse> {
const params = new URLSearchParams({
dataset: config.dataset,
config: config.config || 'default',
split: config.split,
offset: offset.toString(),
length: Math.min(length, 100).toString() // API max is 100
});
const url = `${DATASETS_SERVER_URL}/rows?${params}`;
const response = await fetch(url);
if (!response.ok) {
throw new Error(`HuggingFace API error: ${response.statusText}`);
}
return response.json();
}
/**
* Search within a HuggingFace dataset
*
* @param config - Dataset configuration
* @param query - Search query string
* @param offset - Starting row (default: 0)
* @param length - Number of results (max: 100)
* @returns Promise with search results
*
* @example
* const dentalOrgs = await searchHFDataset({
* dataset: "CommunityOne/oral-health-nonprofits",
* split: "organizations"
* }, "dental");
*/
export async function searchHFDataset(
config: HFDatasetConfig,
query: string,
offset = 0,
length = 100
): Promise<HFSearchResponse> {
const params = new URLSearchParams({
dataset: config.dataset,
config: config.config || 'default',
split: config.split,
query: query,
offset: offset.toString(),
length: Math.min(length, 100).toString()
});
const url = `${DATASETS_SERVER_URL}/search?${params}`;
const response = await fetch(url);
if (!response.ok) {
throw new Error(`HuggingFace search error: ${response.statusText}`);
}
return response.json();
}
/**
* Get dataset size (number of rows)
*
* @param config - Dataset configuration
* @returns Promise with dataset size info
*/
export async function getHFDatasetSize(config: HFDatasetConfig): Promise<{
dataset: string;
config: string;
split: string;
num_rows_total: number;
num_rows_per_page: number;
partial: boolean;
}> {
const params = new URLSearchParams({
dataset: config.dataset,
config: config.config || 'default',
split: config.split
});
const url = `${DATASETS_SERVER_URL}/size?${params}`;
const response = await fetch(url);
if (!response.ok) {
throw new Error(`HuggingFace API error: ${response.statusText}`);
}
return response.json();
}
/**
* Fetch all nonprofits (paginated helper)
*
* @param dataset - HuggingFace dataset name
* @param split - Dataset split (default: "organizations")
* @param maxRows - Maximum total rows to fetch
* @returns Promise with all rows combined
*
* @example
* const allNonprofits = await fetchAllNonprofits(
* "CommunityOne/oral-health-nonprofits",
* "organizations",
* 1000
* );
*/
export async function fetchAllNonprofits(
dataset: string,
split = 'organizations',
maxRows = 1000
): Promise<any[]> {
const allRows: any[] = [];
let offset = 0;
const batchSize = 100; // API max per request
while (offset < maxRows) {
const response = await fetchHFRows(
{ dataset, split },
offset,
Math.min(batchSize, maxRows - offset)
);
const rows = response.rows.map(r => r.row);
allRows.push(...rows);
// Stop if we've fetched all available rows
if (rows.length < batchSize) {
break;
}
offset += batchSize;
}
return allRows;
}
/**
* Filter nonprofits by state
*
* @param dataset - HuggingFace dataset name
* @param stateCode - Two-letter state code (e.g., "AL", "CA")
* @param maxRows - Maximum rows to fetch
* @returns Promise with filtered nonprofits
*
* @example
* const alabamaNonprofits = await fetchNonprofitsByState(
* "CommunityOne/one-nonprofits-organizations",
* "AL",
* 5000
* );
*/
export async function fetchNonprofitsByState(
dataset: string,
stateCode: string,
maxRows = 5000
): Promise<any[]> {
const allRows = await fetchAllNonprofits(dataset, 'organizations', maxRows);
return allRows.filter(row => row.state === stateCode);
}
/**
* Filter nonprofits by NTEE code
*
* @param dataset - HuggingFace dataset name
* @param nteePrefix - NTEE code prefix (e.g., "E" for health, "X" for religion)
* @param maxRows - Maximum rows to fetch
* @returns Promise with filtered nonprofits
*
* @example
* const healthOrgs = await fetchNonprofitsByNTEE(
* "CommunityOne/one-nonprofits-organizations",
* "E",
* 10000
* );
*/
export async function fetchNonprofitsByNTEE(
dataset: string,
nteePrefix: string,
maxRows = 10000
): Promise<any[]> {
const allRows = await fetchAllNonprofits(dataset, 'organizations', maxRows);
return allRows.filter(row => row.ntee_code?.startsWith(nteePrefix));
}
/**
* Nonprofit search with filters
*
* @param options - Search options
* @returns Promise with search results
*
* @example
* const results = await searchNonprofits({
* dataset: "CommunityOne/one-nonprofits-organizations",
* query: "dental",
* state: "CA",
* nteeCode: "E",
* limit: 100
* });
*/
export async function searchNonprofits(options: {
dataset: string;
query?: string;
state?: string;
nteeCode?: string;
limit?: number;
}): Promise<any[]> {
const { dataset, query, state, nteeCode, limit = 100 } = options;
let results: any[];
if (query) {
// Use HuggingFace search API
const response = await searchHFDataset(
{ dataset, split: 'organizations' },
query,
0,
limit
);
results = response.rows.map(r => r.row);
} else {
// Fetch and filter
results = await fetchAllNonprofits(dataset, 'organizations', limit);
}
// Apply filters
if (state) {
results = results.filter(row => row.state === state);
}
if (nteeCode) {
results = results.filter(row => row.ntee_code?.startsWith(nteeCode));
}
return results.slice(0, limit);
}