// File size: 1,629 Bytes | f39c319
import db, { initDB } from './src/crawler/db';
import { fetchPage } from './src/crawler/fetcher';
import { extractAndNormalize } from './src/crawler/extractor';
import { sliceAndDiff } from './src/crawler/differ';
/**
 * Manual end-to-end smoke run of the crawler pipeline against one fixed test source.
 *
 * Steps, in order:
 *  1. Initialise the database through `initDB()`.
 *  2. Seed one `source_registry` row and one `crawl_job` row. Both statements use
 *     `INSERT OR IGNORE`, so re-running the script does not fail on rows that
 *     already exist.
 *  3. Fetch the page, extract/normalize it and, when a document comes back,
 *     run `sliceAndDiff` on it.
 *  4. Print the total row counts of `clause_chunk` and `diff_event`.
 *
 * @returns A promise that resolves once the counts have been printed. There is no
 *          internal error handling, so any failure in the steps above rejects it.
 */
async function test(): Promise<void> {
  // Shape of the single row produced by the `SELECT count(*) as c ...` queries below.
  type CountRow = { c: number };

  initDB();

  const sourceId = 'test_source';
  const jobId = 'test_job';

  // The SQL text is kept flush-left so the statement strings stay byte-identical.
  db.prepare(`
INSERT OR IGNORE INTO source_registry (
source_id, source_name, source_type, domain, entry_url, url_pattern,
parser_type, crawl_frequency, priority, enabled, topic_tags
) VALUES (
?, '测试官网', 'peer_bank', 'httpbin.org', 'http://httpbin.org/html', '',
'html_main_content', 'daily', 'high', 1, '[]'
)
`).run(sourceId);

  db.prepare(`
INSERT OR IGNORE INTO crawl_job (
job_id, source_id, trigger_type, status, started_at
) VALUES (
?, ?, 'manual', 'queued', ?
)
`).run(jobId, sourceId, new Date().toISOString());

  console.log('Fetching...');
  const snapshot = await fetchPage(jobId, sourceId);
  console.log('Fetched snapshot:', snapshot.snapshot_id);

  console.log('Extracting...');
  const doc = extractAndNormalize(snapshot);
  if (doc) {
    console.log('Normalized document:', doc.doc_id);
    console.log('Diffing...');
    sliceAndDiff(doc);
    console.log('Diff done.');
  } else {
    // extractAndNormalize returned a falsy value; the script reports that as "no new version".
    console.log('No new version.');
  }

  // Print stats. Both queries run before either count is logged.
  // A narrow row type replaces the previous `as any`, so `.c` stays type-checked.
  const chunks = db.prepare('SELECT count(*) as c FROM clause_chunk').get() as CountRow;
  const events = db.prepare('SELECT count(*) as c FROM diff_event').get() as CountRow;
  console.log('Chunks:', chunks.c);
  console.log('Events:', events.c);
}
// Run the smoke test. On failure, log the error and mark the process as failed so
// that shells and CI do not see a zero exit status for a broken run.
test().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|