pappitti's picture
upgrade for v3 of dataset, adding family and provider filters, improving responsiveness. Still needs polishing
f263c36
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import duckdb from 'duckdb';
const ROOT_DIR = process.cwd(); // cwd() = Current Working Directory
const DB_PATH = path.join(ROOT_DIR, 'database.duckdb');
export const DATA_SOURCES = {
models : 'https://huggingface.co/datasets/PITTI/speechmap-models/resolve/main/all_models.parquet',
questions: 'https://huggingface.co/datasets/PITTI/speechmap-questions/resolve/main/consolidated_questions.parquet',
responses: 'https://huggingface.co/datasets/PITTI/speechmap-responses-v3/resolve/main/consolidated_responses.parquet',
assessments: 'https://huggingface.co/datasets/PITTI/speechmap-assessments-v3/resolve/main/consolidated_assessments.parquet',
// reviewed : './data/reviewed_assessments.parquet' // Local file for reviewed assessments
};
function query(db: duckdb.Database, sql: string): Promise<any[]> {
return new Promise((resolve, reject) => {
db.all(sql, (err, res) => {
if (err) return reject(err);
resolve(res);
});
});
}
async function rebuildDatabase() {
console.log('--- Starting full database rebuild with DuckDB ---');
if (fs.existsSync(DB_PATH)) {
fs.unlinkSync(DB_PATH);
console.log('Deleted old database file.');
}
const db = new duckdb.Database(DB_PATH);
console.log('DuckDB database created at:', DB_PATH);
try {
console.log('Installing and loading DuckDB extensions (httpfs, json)...');
await query(db, 'INSTALL httpfs; LOAD httpfs;');
await query(db, 'INSTALL json; LOAD json;');
console.log('Creating database schema...');
await query(db, `
CREATE TABLE models (name VARCHAR PRIMARY KEY, family VARCHAR);
CREATE TABLE themes (slug VARCHAR PRIMARY KEY, name VARCHAR);
CREATE TABLE questions (uuid VARCHAR PRIMARY KEY, id VARCHAR, category VARCHAR, domain VARCHAR, question VARCHAR, theme VARCHAR);
CREATE TABLE responses (uuid VARCHAR PRIMARY KEY, q_uuid VARCHAR, model VARCHAR, timestamp VARCHAR, api_provider VARCHAR, provider VARCHAR, content VARCHAR, matched BOOLEAN, origin VARCHAR);
CREATE TABLE assessments (uuid VARCHAR PRIMARY KEY, q_uuid VARCHAR, r_uuid VARCHAR, judge VARCHAR, judge_type VARCHAR, judge_analysis VARCHAR, compliance VARCHAR, pitti_compliance VARCHAR, origin VARCHAR);
`);
console.log('Schema created.');
console.log('Creating indexes for faster queries...');
await query(db, `CREATE INDEX idx_assessments_r_uuid ON assessments (r_uuid);`);
await query(db, `CREATE INDEX idx_assessments_judge_judge_type ON assessments (judge, judge_type);`);
await query(db, `CREATE INDEX idx_assessments_judge_compliance ON assessments (judge, compliance);`);
await query(db, `CREATE INDEX idx_assessments_pitti_compliance ON assessments (judge, pitti_compliance);`);
await query(db, `CREATE INDEX idx_responses_q_uuid ON responses (q_uuid);`);
await query(db, `CREATE INDEX idx_questions_theme ON questions (theme);`);
console.log('Indexes created.');
console.log('Ingesting models...');
await query(db, `
INSERT INTO models (name, family)
SELECT name, family FROM read_parquet('${DATA_SOURCES.models}');
`);
console.log('Ingesting themes and questions...');
await query(db, `
INSERT INTO themes (slug, name)
SELECT DISTINCT theme AS slug, domain AS name FROM read_parquet('${DATA_SOURCES.questions}') WHERE theme IS NOT NULL AND domain IS NOT NULL;
INSERT INTO questions (uuid, id, category, domain, question, theme)
SELECT uuid, id, category, domain, question, theme FROM read_parquet('${DATA_SOURCES.questions}');
`);
console.log('Ingesting responses from Parquet...');
await query(db, `
INSERT INTO responses (uuid, q_uuid, model, timestamp, api_provider, provider, content, matched, origin)
SELECT uuid, q_uuid, model, timestamp, api_provider, provider, content, matched, origin FROM read_parquet('${DATA_SOURCES.responses}');
`);
console.log('Ingesting assessments...');
await query(db, `
INSERT INTO assessments (uuid, q_uuid, r_uuid, judge, judge_type, judge_analysis, compliance, pitti_compliance, origin)
SELECT uuid, q_uuid, r_uuid, judge, judge_type, judge_analysis, compliance, pitti_compliance, origin FROM read_parquet('${DATA_SOURCES.assessments}');
`);
// EXAMPLE: Ingest reviewed assessments from local Parquet file (if exists)
// console.log('Updating reviewed assessments from local parquet file...');
// const reviewedDataPath = path.resolve(ROOT_DIR, DATA_SOURCES.reviewed);
// if (fs.existsSync(reviewedDataPath)) {
// await query(db, `
// INSERT INTO assessments (uuid, q_uuid, r_uuid, judge, judge_type, judge_analysis, compliance, pitti_compliance, origin)
// SELECT CAST(uuid AS VARCHAR) || 'n' AS uuid, q_uuid, r_uuid, judge, judge_type, judge_analysis, compliance, pitti_compliance, origin FROM read_parquet('${reviewedDataPath}')
// `);
// } else {
// console.warn(`Reviewed assessments file not found at ${reviewedDataPath}, skipping...`);
// }
console.log('✅ Data ingestion complete!');
} catch (error) {
console.error('An error occurred during the rebuild:', error);
db.close();
throw error;
}
db.close();
console.log('--- Database rebuild finished successfully ---');
}
// --- ESM-compatible way to check if the script is run directly ---
const entryPoint = process.argv[1];
const currentFile = fileURLToPath(import.meta.url);
if (entryPoint === currentFile) {
rebuildDatabase().catch(err => {
console.error('Database rebuild failed:', err);
process.exit(1);
});
}