File size: 3,115 Bytes
f39c319
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import DatabaseConstructor, { Database } from 'better-sqlite3';
import path from 'path';
import fs from 'fs';

// Directory that holds the SQLite file, resolved against the process's
// current working directory.
const dbDir = path.join(process.cwd(), 'data');

// With `recursive: true`, mkdirSync does nothing when the directory already
// exists, so no separate existsSync check is needed (and the
// check-then-create race between the two calls goes away).
fs.mkdirSync(dbDir, { recursive: true });

// Shared connection to <cwd>/data/crawler.db, opened when this module is
// first loaded.
const db: Database = new DatabaseConstructor(path.join(dbDir, 'crawler.db'));

/**
 * Creates the crawler's tables if they are missing.
 *
 * Every statement is `CREATE TABLE IF NOT EXISTS`, so repeated calls leave an
 * existing table untouched (it is not altered to match this definition).
 * The statements run inside a single transaction so that a failure part-way
 * through rolls back instead of leaving some tables created and others not.
 *
 * Tables and their declared foreign keys:
 * - `source_registry` (primary key `source_id`)
 * - `crawl_job` -> `source_registry`
 * - `raw_snapshot` -> `source_registry`, `crawl_job`
 * - `normalized_document` -> `source_registry`, `raw_snapshot`
 * - `clause_chunk` -> `normalized_document`
 * - `diff_event` -> `source_registry` (`from_doc_id` / `to_doc_id` carry no
 *   foreign-key constraint)
 *
 * @throws Propagates any error raised by `better-sqlite3` while executing
 *   the schema statements.
 */
export function initDB(): void {
  const schemaSql = `
    CREATE TABLE IF NOT EXISTS source_registry (
      source_id VARCHAR(64) PRIMARY KEY,
      source_name VARCHAR(255),
      source_type VARCHAR(32),
      domain VARCHAR(255),
      entry_url TEXT,
      url_pattern TEXT,
      parser_type VARCHAR(64),
      crawl_frequency VARCHAR(32),
      priority VARCHAR(16),
      enabled BOOLEAN,
      topic_tags TEXT,
      created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
      updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
    );

    CREATE TABLE IF NOT EXISTS crawl_job (
      job_id VARCHAR(64) PRIMARY KEY,
      source_id VARCHAR(64),
      trigger_type VARCHAR(32),
      status VARCHAR(32),
      started_at DATETIME,
      ended_at DATETIME,
      error_code VARCHAR(32),
      error_message TEXT,
      retry_count INTEGER DEFAULT 0,
      FOREIGN KEY(source_id) REFERENCES source_registry(source_id)
    );

    CREATE TABLE IF NOT EXISTS raw_snapshot (
      snapshot_id VARCHAR(64) PRIMARY KEY,
      source_id VARCHAR(64),
      job_id VARCHAR(64),
      fetched_at DATETIME,
      content_type VARCHAR(64),
      raw_body TEXT,
      raw_hash VARCHAR(128),
      http_status INTEGER,
      final_url TEXT,
      FOREIGN KEY(source_id) REFERENCES source_registry(source_id),
      FOREIGN KEY(job_id) REFERENCES crawl_job(job_id)
    );

    CREATE TABLE IF NOT EXISTS normalized_document (
      doc_id VARCHAR(64) PRIMARY KEY,
      source_id VARCHAR(64),
      snapshot_id VARCHAR(64),
      title VARCHAR(500),
      version_date DATE,
      effective_date DATE,
      normalized_text TEXT,
      normalized_hash VARCHAR(128),
      doc_status VARCHAR(32),
      created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
      FOREIGN KEY(source_id) REFERENCES source_registry(source_id),
      FOREIGN KEY(snapshot_id) REFERENCES raw_snapshot(snapshot_id)
    );

    CREATE TABLE IF NOT EXISTS clause_chunk (
      chunk_id VARCHAR(64) PRIMARY KEY,
      doc_id VARCHAR(64),
      section_path VARCHAR(500),
      section_title VARCHAR(255),
      clause_text TEXT,
      topic_tags TEXT,
      embedding_status VARCHAR(32),
      chunk_order INTEGER,
      created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
      FOREIGN KEY(doc_id) REFERENCES normalized_document(doc_id)
    );

    CREATE TABLE IF NOT EXISTS diff_event (
      event_id VARCHAR(64) PRIMARY KEY,
      source_id VARCHAR(64),
      from_doc_id VARCHAR(64),
      to_doc_id VARCHAR(64),
      change_type VARCHAR(32),
      section_title VARCHAR(255),
      old_excerpt TEXT,
      new_excerpt TEXT,
      topic_tags TEXT,
      impact_level VARCHAR(16),
      detected_at DATETIME DEFAULT CURRENT_TIMESTAMP,
      FOREIGN KEY(source_id) REFERENCES source_registry(source_id)
    );
  `;

  // db.transaction() returns a function that runs its callback between
  // BEGIN and COMMIT, rolling back if the callback throws.
  const createSchema = db.transaction(() => {
    db.exec(schemaSql);
  });
  createSchema();
}

// The module's default export is the shared `db` connection declared above.
export { db as default };