import db, { initDB } from './src/crawler/db';
import { fetchPage } from './src/crawler/fetcher';
import { extractAndNormalize } from './src/crawler/extractor';
import { sliceAndDiff } from './src/crawler/differ';

/** Row shape produced by the `SELECT count(*) as c ...` statistics queries below. */
type CountRow = { c: number };

/**
 * Manual end-to-end smoke test of the crawler pipeline.
 *
 * Steps, in order:
 *  1. Initialise the database via `initDB()`.
 *  2. Seed one `source_registry` row and one `crawl_job` row. Both statements
 *     use `INSERT OR IGNORE`, so re-running the script does not fail on rows
 *     left behind by an earlier run.
 *  3. Fetch a snapshot for the seeded job/source with `fetchPage`.
 *  4. Pass the snapshot to `extractAndNormalize`; when it yields a document,
 *     hand that document to `sliceAndDiff`, otherwise log "No new version.".
 *  5. Print the total row counts of `clause_chunk` and `diff_event`.
 *
 * @returns A promise that resolves once the statistics have been printed and
 *          rejects if any step throws.
 */
async function test() {
  initDB();

  const sourceId = 'test_source';
  const jobId = 'test_job';

  // The source id is the only bound parameter; every other column is a fixed literal.
  db.prepare(` INSERT OR IGNORE INTO source_registry ( source_id, source_name, source_type, domain, entry_url, url_pattern, parser_type, crawl_frequency, priority, enabled, topic_tags ) VALUES ( ?, '测试官网', 'peer_bank', 'httpbin.org', 'http://httpbin.org/html', '', 'html_main_content', 'daily', 'high', 1, '[]' ) `).run(sourceId);

  // The job is recorded as a manually triggered, queued job stamped with the current time.
  db.prepare(` INSERT OR IGNORE INTO crawl_job ( job_id, source_id, trigger_type, status, started_at ) VALUES ( ?, ?, 'manual', 'queued', ? ) `).run(jobId, sourceId, new Date().toISOString());

  console.log('Fetching...');
  const snapshot = await fetchPage(jobId, sourceId);
  console.log('Fetched snapshot:', snapshot.snapshot_id);

  console.log('Extracting...');
  const doc = extractAndNormalize(snapshot);
  if (doc) {
    console.log('Normalized document:', doc.doc_id);
    console.log('Diffing...');
    sliceAndDiff(doc);
    console.log('Diff done.');
  } else {
    // A falsy result from the extractor is reported as "nothing new" rather than as an error.
    console.log('No new version.');
  }

  // Print stats. The rows are narrowed to the single aliased column instead of `any`
  // so that a typo in the property name is caught by the compiler.
  const chunks = db.prepare('SELECT count(*) as c FROM clause_chunk').get() as CountRow;
  const events = db.prepare('SELECT count(*) as c FROM diff_event').get() as CountRow;
  console.log('Chunks:', chunks.c);
  console.log('Events:', events.c);
}

test().catch((err: unknown) => {
  console.error(err);
  // Report failure to the caller (shell, CI, npm script); logging alone would
  // leave the exit code at 0 and make a failed run look successful.
  process.exitCode = 1;
});