| import db, { initDB } from './src/crawler/db'; |
| import { fetchPage } from './src/crawler/fetcher'; |
| import { extractAndNormalize } from './src/crawler/extractor'; |
| import { sliceAndDiff } from './src/crawler/differ'; |
|
|
/**
 * Manual end-to-end smoke test of the crawler pipeline:
 * fetch -> extract/normalize -> slice/diff.
 *
 * Seeds one `source_registry` row and one `crawl_job` row, runs the pipeline
 * for that job, then prints the total row counts of `clause_chunk` and
 * `diff_event`.
 *
 * @returns A promise that settles once the run has finished; it rejects if
 *   any awaited or synchronous step throws.
 */
async function test(): Promise<void> {
  /** Row shape produced by the `SELECT count(*) as c` queries below. */
  type CountRow = { c: number };

  initDB();

  const sourceId = 'test_source';
  const jobId = 'test_job';

  // INSERT OR IGNORE: a row left over from a previous run is kept, not replaced.
  db.prepare(`
    INSERT OR IGNORE INTO source_registry (
      source_id, source_name, source_type, domain, entry_url, url_pattern,
      parser_type, crawl_frequency, priority, enabled, topic_tags
    ) VALUES (
      ?, '测试官网', 'peer_bank', 'httpbin.org', 'http://httpbin.org/html', '',
      'html_main_content', 'daily', 'high', 1, '[]'
    )
  `).run(sourceId);

  // Same INSERT OR IGNORE approach for the job row; started_at is only
  // written when the row does not exist yet.
  db.prepare(`
    INSERT OR IGNORE INTO crawl_job (
      job_id, source_id, trigger_type, status, started_at
    ) VALUES (
      ?, ?, 'manual', 'queued', ?
    )
  `).run(jobId, sourceId, new Date().toISOString());

  console.log('Fetching...');
  const snapshot = await fetchPage(jobId, sourceId);
  console.log('Fetched snapshot:', snapshot.snapshot_id);

  console.log('Extracting...');
  const doc = extractAndNormalize(snapshot);
  if (doc) {
    console.log('Normalized document:', doc.doc_id);
    console.log('Diffing...');
    sliceAndDiff(doc);
    console.log('Diff done.');
  } else {
    // A falsy result from extractAndNormalize is reported as "no new version".
    console.log('No new version.');
  }

  // Table-wide totals (no WHERE clause), so rows from earlier runs are included.
  // A narrow row type replaces the former `as any` so `.c` stays type-checked.
  const chunks = db.prepare('SELECT count(*) as c FROM clause_chunk').get() as CountRow;
  const events = db.prepare('SELECT count(*) as c FROM diff_event').get() as CountRow;
  console.log('Chunks:', chunks.c);
  console.log('Events:', events.c);
}
|
|
// Run the smoke test. A rejection is still logged, and it is also recorded as
// a non-zero exit code so that a failed run is visible to shells and CI.
test().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|
|