import fs from 'fs'; import path from 'path'; import { fileURLToPath } from 'url'; import duckdb from 'duckdb'; const ROOT_DIR = process.cwd(); // cwd() = Current Working Directory const DB_PATH = path.join(ROOT_DIR, 'database.duckdb'); export const DATA_SOURCES = { models : 'https://huggingface.co/datasets/PITTI/speechmap-models/resolve/main/all_models.parquet', questions: 'https://huggingface.co/datasets/PITTI/speechmap-questions/resolve/main/consolidated_questions.parquet', responses: 'https://huggingface.co/datasets/PITTI/speechmap-responses-v3/resolve/main/consolidated_responses.parquet', assessments: 'https://huggingface.co/datasets/PITTI/speechmap-assessments-v3/resolve/main/consolidated_assessments.parquet', // reviewed : './data/reviewed_assessments.parquet' // Local file for reviewed assessments }; function query(db: duckdb.Database, sql: string): Promise { return new Promise((resolve, reject) => { db.all(sql, (err, res) => { if (err) return reject(err); resolve(res); }); }); } async function rebuildDatabase() { console.log('--- Starting full database rebuild with DuckDB ---'); if (fs.existsSync(DB_PATH)) { fs.unlinkSync(DB_PATH); console.log('Deleted old database file.'); } const db = new duckdb.Database(DB_PATH); console.log('DuckDB database created at:', DB_PATH); try { console.log('Installing and loading DuckDB extensions (httpfs, json)...'); await query(db, 'INSTALL httpfs; LOAD httpfs;'); await query(db, 'INSTALL json; LOAD json;'); console.log('Creating database schema...'); await query(db, ` CREATE TABLE models (name VARCHAR PRIMARY KEY, family VARCHAR); CREATE TABLE themes (slug VARCHAR PRIMARY KEY, name VARCHAR); CREATE TABLE questions (uuid VARCHAR PRIMARY KEY, id VARCHAR, category VARCHAR, domain VARCHAR, question VARCHAR, theme VARCHAR); CREATE TABLE responses (uuid VARCHAR PRIMARY KEY, q_uuid VARCHAR, model VARCHAR, timestamp VARCHAR, api_provider VARCHAR, provider VARCHAR, content VARCHAR, matched BOOLEAN, origin VARCHAR); CREATE TABLE assessments (uuid VARCHAR PRIMARY KEY, q_uuid VARCHAR, r_uuid VARCHAR, judge VARCHAR, judge_type VARCHAR, judge_analysis VARCHAR, compliance VARCHAR, pitti_compliance VARCHAR, origin VARCHAR); `); console.log('Schema created.'); console.log('Creating indexes for faster queries...'); await query(db, `CREATE INDEX idx_assessments_r_uuid ON assessments (r_uuid);`); await query(db, `CREATE INDEX idx_assessments_judge_judge_type ON assessments (judge, judge_type);`); await query(db, `CREATE INDEX idx_assessments_judge_compliance ON assessments (judge, compliance);`); await query(db, `CREATE INDEX idx_assessments_pitti_compliance ON assessments (judge, pitti_compliance);`); await query(db, `CREATE INDEX idx_responses_q_uuid ON responses (q_uuid);`); await query(db, `CREATE INDEX idx_questions_theme ON questions (theme);`); console.log('Indexes created.'); console.log('Ingesting models...'); await query(db, ` INSERT INTO models (name, family) SELECT name, family FROM read_parquet('${DATA_SOURCES.models}'); `); console.log('Ingesting themes and questions...'); await query(db, ` INSERT INTO themes (slug, name) SELECT DISTINCT theme AS slug, domain AS name FROM read_parquet('${DATA_SOURCES.questions}') WHERE theme IS NOT NULL AND domain IS NOT NULL; INSERT INTO questions (uuid, id, category, domain, question, theme) SELECT uuid, id, category, domain, question, theme FROM read_parquet('${DATA_SOURCES.questions}'); `); console.log('Ingesting responses from Parquet...'); await query(db, ` INSERT INTO responses (uuid, q_uuid, model, timestamp, api_provider, provider, content, matched, origin) SELECT uuid, q_uuid, model, timestamp, api_provider, provider, content, matched, origin FROM read_parquet('${DATA_SOURCES.responses}'); `); console.log('Ingesting assessments...'); await query(db, ` INSERT INTO assessments (uuid, q_uuid, r_uuid, judge, judge_type, judge_analysis, compliance, pitti_compliance, origin) SELECT uuid, q_uuid, r_uuid, judge, judge_type, judge_analysis, compliance, pitti_compliance, origin FROM read_parquet('${DATA_SOURCES.assessments}'); `); // EXAMPLE: Ingest reviewed assessments from local Parquet file (if exists) // console.log('Updating reviewed assessments from local parquet file...'); // const reviewedDataPath = path.resolve(ROOT_DIR, DATA_SOURCES.reviewed); // if (fs.existsSync(reviewedDataPath)) { // await query(db, ` // INSERT INTO assessments (uuid, q_uuid, r_uuid, judge, judge_type, judge_analysis, compliance, pitti_compliance, origin) // SELECT CAST(uuid AS VARCHAR) || 'n' AS uuid, q_uuid, r_uuid, judge, judge_type, judge_analysis, compliance, pitti_compliance, origin FROM read_parquet('${reviewedDataPath}') // `); // } else { // console.warn(`Reviewed assessments file not found at ${reviewedDataPath}, skipping...`); // } console.log('✅ Data ingestion complete!'); } catch (error) { console.error('An error occurred during the rebuild:', error); db.close(); throw error; } db.close(); console.log('--- Database rebuild finished successfully ---'); } // --- ESM-compatible way to check if the script is run directly --- const entryPoint = process.argv[1]; const currentFile = fileURLToPath(import.meta.url); if (entryPoint === currentFile) { rebuildDatabase().catch(err => { console.error('Database rebuild failed:', err); process.exit(1); }); }