| import DatabaseConstructor, { Database } from 'better-sqlite3'; |
| import path from 'path'; |
| import fs from 'fs'; |
|
|
| const dbDir = path.join(process.cwd(), 'data'); |
| if (!fs.existsSync(dbDir)) { |
| fs.mkdirSync(dbDir, { recursive: true }); |
| } |
|
|
| const db: Database = new DatabaseConstructor(path.join(dbDir, 'crawler.db')); |
|
|
| |
| export function initDB() { |
| db.exec(` |
| CREATE TABLE IF NOT EXISTS source_registry ( |
| source_id VARCHAR(64) PRIMARY KEY, |
| source_name VARCHAR(255), |
| source_type VARCHAR(32), |
| domain VARCHAR(255), |
| entry_url TEXT, |
| url_pattern TEXT, |
| parser_type VARCHAR(64), |
| crawl_frequency VARCHAR(32), |
| priority VARCHAR(16), |
| enabled BOOLEAN, |
| topic_tags TEXT, |
| created_at DATETIME DEFAULT CURRENT_TIMESTAMP, |
| updated_at DATETIME DEFAULT CURRENT_TIMESTAMP |
| ); |
| |
| CREATE TABLE IF NOT EXISTS crawl_job ( |
| job_id VARCHAR(64) PRIMARY KEY, |
| source_id VARCHAR(64), |
| trigger_type VARCHAR(32), |
| status VARCHAR(32), |
| started_at DATETIME, |
| ended_at DATETIME, |
| error_code VARCHAR(32), |
| error_message TEXT, |
| retry_count INTEGER DEFAULT 0, |
| FOREIGN KEY(source_id) REFERENCES source_registry(source_id) |
| ); |
| |
| CREATE TABLE IF NOT EXISTS raw_snapshot ( |
| snapshot_id VARCHAR(64) PRIMARY KEY, |
| source_id VARCHAR(64), |
| job_id VARCHAR(64), |
| fetched_at DATETIME, |
| content_type VARCHAR(64), |
| raw_body TEXT, |
| raw_hash VARCHAR(128), |
| http_status INTEGER, |
| final_url TEXT, |
| FOREIGN KEY(source_id) REFERENCES source_registry(source_id), |
| FOREIGN KEY(job_id) REFERENCES crawl_job(job_id) |
| ); |
| |
| CREATE TABLE IF NOT EXISTS normalized_document ( |
| doc_id VARCHAR(64) PRIMARY KEY, |
| source_id VARCHAR(64), |
| snapshot_id VARCHAR(64), |
| title VARCHAR(500), |
| version_date DATE, |
| effective_date DATE, |
| normalized_text TEXT, |
| normalized_hash VARCHAR(128), |
| doc_status VARCHAR(32), |
| created_at DATETIME DEFAULT CURRENT_TIMESTAMP, |
| FOREIGN KEY(source_id) REFERENCES source_registry(source_id), |
| FOREIGN KEY(snapshot_id) REFERENCES raw_snapshot(snapshot_id) |
| ); |
| |
| CREATE TABLE IF NOT EXISTS clause_chunk ( |
| chunk_id VARCHAR(64) PRIMARY KEY, |
| doc_id VARCHAR(64), |
| section_path VARCHAR(500), |
| section_title VARCHAR(255), |
| clause_text TEXT, |
| topic_tags TEXT, |
| embedding_status VARCHAR(32), |
| chunk_order INTEGER, |
| created_at DATETIME DEFAULT CURRENT_TIMESTAMP, |
| FOREIGN KEY(doc_id) REFERENCES normalized_document(doc_id) |
| ); |
| |
| CREATE TABLE IF NOT EXISTS diff_event ( |
| event_id VARCHAR(64) PRIMARY KEY, |
| source_id VARCHAR(64), |
| from_doc_id VARCHAR(64), |
| to_doc_id VARCHAR(64), |
| change_type VARCHAR(32), |
| section_title VARCHAR(255), |
| old_excerpt TEXT, |
| new_excerpt TEXT, |
| topic_tags TEXT, |
| impact_level VARCHAR(16), |
| detected_at DATETIME DEFAULT CURRENT_TIMESTAMP, |
| FOREIGN KEY(source_id) REFERENCES source_registry(source_id) |
| ); |
| `); |
| } |
|
|
| export default db; |
|
|