File size: 5,725 Bytes
d26f541
 
5b91336
d26f541
 
 
 
 
 
 
f263c36
d26f541
f263c36
 
8614f30
d26f541
 
5b91336
d26f541
 
 
 
 
 
 
 
 
5b91336
d26f541
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f263c36
 
 
 
 
 
d26f541
 
 
cbd716e
 
d35e9ec
 
 
cbd716e
 
 
 
f263c36
 
 
 
 
 
d26f541
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d35e9ec
 
d26f541
 
f263c36
8614f30
 
 
 
 
 
 
 
 
 
 
d26f541
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url'; 
import duckdb from 'duckdb';


const ROOT_DIR = process.cwd(); // cwd() = Current Working Directory
const DB_PATH = path.join(ROOT_DIR, 'database.duckdb');

export const DATA_SOURCES = {
  models : 'https://huggingface.co/datasets/PITTI/speechmap-models/resolve/main/all_models.parquet',
  questions: 'https://huggingface.co/datasets/PITTI/speechmap-questions/resolve/main/consolidated_questions.parquet',
  responses: 'https://huggingface.co/datasets/PITTI/speechmap-responses-v3/resolve/main/consolidated_responses.parquet',
  assessments: 'https://huggingface.co/datasets/PITTI/speechmap-assessments-v3/resolve/main/consolidated_assessments.parquet',
  // reviewed : './data/reviewed_assessments.parquet' // Local file for reviewed assessments
};


function query(db: duckdb.Database, sql: string): Promise<any[]> {
  return new Promise((resolve, reject) => {
    db.all(sql, (err, res) => {
      if (err) return reject(err);
      resolve(res);
    });
  });
}


async function rebuildDatabase() {
  console.log('--- Starting full database rebuild with DuckDB ---');

  if (fs.existsSync(DB_PATH)) {
    fs.unlinkSync(DB_PATH);
    console.log('Deleted old database file.');
  }

  const db = new duckdb.Database(DB_PATH);
  console.log('DuckDB database created at:', DB_PATH);

  try {
    console.log('Installing and loading DuckDB extensions (httpfs, json)...');
    await query(db, 'INSTALL httpfs; LOAD httpfs;');
    await query(db, 'INSTALL json; LOAD json;');

    console.log('Creating database schema...');
    await query(db, `
      CREATE TABLE models (name VARCHAR PRIMARY KEY, family VARCHAR);
      CREATE TABLE themes (slug VARCHAR PRIMARY KEY, name VARCHAR);
      CREATE TABLE questions (uuid VARCHAR PRIMARY KEY, id VARCHAR, category VARCHAR, domain VARCHAR, question VARCHAR, theme VARCHAR);
      CREATE TABLE responses (uuid VARCHAR PRIMARY KEY, q_uuid VARCHAR, model VARCHAR, timestamp VARCHAR, api_provider VARCHAR, provider VARCHAR, content VARCHAR, matched BOOLEAN, origin VARCHAR);
      CREATE TABLE assessments (uuid VARCHAR PRIMARY KEY, q_uuid VARCHAR, r_uuid VARCHAR, judge VARCHAR, judge_type VARCHAR, judge_analysis VARCHAR, compliance VARCHAR, pitti_compliance VARCHAR, origin VARCHAR);
        
    `);
    console.log('Schema created.');

    console.log('Creating indexes for faster queries...');
    await query(db, `CREATE INDEX idx_assessments_r_uuid ON assessments (r_uuid);`);
    await query(db, `CREATE INDEX idx_assessments_judge_judge_type ON assessments (judge, judge_type);`);
    await query(db, `CREATE INDEX idx_assessments_judge_compliance ON assessments (judge, compliance);`);
    await query(db, `CREATE INDEX idx_assessments_pitti_compliance ON assessments (judge, pitti_compliance);`);
    await query(db, `CREATE INDEX idx_responses_q_uuid ON responses (q_uuid);`);
    await query(db, `CREATE INDEX idx_questions_theme ON questions (theme);`);
    console.log('Indexes created.');

    console.log('Ingesting models...');
    await query(db, `
        INSERT INTO models (name, family)
        SELECT name, family FROM read_parquet('${DATA_SOURCES.models}');
    `);

    console.log('Ingesting themes and questions...');
    await query(db, `
        INSERT INTO themes (slug, name)
        SELECT DISTINCT theme AS slug, domain AS name FROM read_parquet('${DATA_SOURCES.questions}') WHERE theme IS NOT NULL AND domain IS NOT NULL;
        
        INSERT INTO questions (uuid, id, category, domain, question, theme)
        SELECT uuid, id, category, domain, question, theme FROM read_parquet('${DATA_SOURCES.questions}');
    `);
    
    console.log('Ingesting responses from Parquet...');
    await query(db, `
        INSERT INTO responses (uuid, q_uuid, model, timestamp, api_provider, provider, content, matched, origin)
        SELECT uuid, q_uuid, model, timestamp, api_provider, provider, content, matched, origin FROM read_parquet('${DATA_SOURCES.responses}');
    `);

    console.log('Ingesting assessments...');
    await query(db, `
        INSERT INTO assessments (uuid, q_uuid, r_uuid, judge, judge_type, judge_analysis, compliance, pitti_compliance, origin)
        SELECT uuid, q_uuid, r_uuid, judge, judge_type, judge_analysis, compliance, pitti_compliance, origin FROM read_parquet('${DATA_SOURCES.assessments}');
    `);

    // EXAMPLE: Ingest reviewed assessments from local Parquet file (if exists)
    // console.log('Updating reviewed assessments from local parquet file...');
    // const reviewedDataPath = path.resolve(ROOT_DIR, DATA_SOURCES.reviewed);
    // if (fs.existsSync(reviewedDataPath)) {
    //   await query(db, `
    //     INSERT INTO assessments (uuid, q_uuid, r_uuid, judge, judge_type, judge_analysis, compliance, pitti_compliance, origin)
    //     SELECT CAST(uuid AS VARCHAR) || 'n' AS uuid, q_uuid, r_uuid, judge, judge_type, judge_analysis, compliance, pitti_compliance, origin FROM read_parquet('${reviewedDataPath}')
    //   `);
    // } else {
    //   console.warn(`Reviewed assessments file not found at ${reviewedDataPath}, skipping...`);
    // }

    console.log('✅ Data ingestion complete!');
  } catch (error) {
    console.error('An error occurred during the rebuild:', error);
    db.close();
    throw error;
  }
  
  db.close();
  console.log('--- Database rebuild finished successfully ---');
}


// --- ESM-compatible way to check if the script is run directly ---
const entryPoint = process.argv[1];
const currentFile = fileURLToPath(import.meta.url);

if (entryPoint === currentFile) {
  rebuildDatabase().catch(err => {
    console.error('Database rebuild failed:', err);
    process.exit(1);
  });
}