Spaces:

luoleyuan
/

agent01

Sleeping

agent01 / test_crawler.ts

Auto Deployer

Deploy compliance agent services

f39c319 26 days ago

1.63 kB

	import db, { initDB } from './src/crawler/db';
	import { fetchPage } from './src/crawler/fetcher';
	import { extractAndNormalize } from './src/crawler/extractor';
	import { sliceAndDiff } from './src/crawler/differ';

	/**
	 * Manual end-to-end smoke run of the crawler pipeline against a fixed test source.
	 *
	 * Steps, in order:
	 *  1. `initDB()` prepares the database.
	 *  2. A `source_registry` row and a `crawl_job` row are inserted with
	 *     `INSERT OR IGNORE` (presumably so re-running the script does not fail
	 *     on rows left by a previous run — confirm against the table constraints).
	 *  3. `fetchPage` is awaited for the job/source pair and the snapshot id is logged.
	 *  4. `extractAndNormalize` is called on the snapshot; when it returns a truthy
	 *     document, `sliceAndDiff` is run on it, otherwise "No new version." is logged.
	 *  5. The total row counts of `clause_chunk` and `diff_event` are printed.
	 *
	 * @returns A promise that resolves once the stats have been printed; it rejects
	 *          if any awaited or synchronous step throws.
	 */
	async function test(): Promise<void> {
		/** Shape of the single row produced by the `count(*) as c` queries below. */
		type CountRow = { c: number };

		initDB();

		const source_id = 'test_source';
		const job_id = 'test_job';

		// Register the test source (httpbin's static HTML page).
		db.prepare(`
	INSERT OR IGNORE INTO source_registry (
	source_id, source_name, source_type, domain, entry_url, url_pattern,
	parser_type, crawl_frequency, priority, enabled, topic_tags
	) VALUES (
	?, '测试官网', 'peer_bank', 'httpbin.org', 'http://httpbin.org/html', '',
	'html_main_content', 'daily', 'high', 1, '[]'
	)
	`).run(source_id);

		// Create the crawl job row that the fetch step is invoked with.
		db.prepare(`
	INSERT OR IGNORE INTO crawl_job (
	job_id, source_id, trigger_type, status, started_at
	) VALUES (
	?, ?, 'manual', 'queued', ?
	)
	`).run(job_id, source_id, new Date().toISOString());

		console.log('Fetching...');
		const snapshot = await fetchPage(job_id, source_id);
		console.log('Fetched snapshot:', snapshot.snapshot_id);

		console.log('Extracting...');
		const doc = extractAndNormalize(snapshot);
		if (doc) {
			console.log('Normalized document:', doc.doc_id);
			console.log('Diffing...');
			sliceAndDiff(doc);
			console.log('Diff done.');
		} else {
			// A falsy result from the extractor is reported as "nothing new".
			console.log('No new version.');
		}

		// Print table-wide stats. The rows are narrowed to CountRow instead of
		// `any` so that access to `.c` stays type-checked.
		const chunks = db.prepare('SELECT count(*) as c FROM clause_chunk').get() as CountRow;
		const events = db.prepare('SELECT count(*) as c FROM diff_event').get() as CountRow;
		console.log('Chunks:', chunks.c);
		console.log('Events:', events.c);
	}

	// Run the smoke test. On failure, log the error and flag a non-zero exit
	// status so a shell or CI job does not treat a failed run as successful.
	test().catch((err: unknown) => {
		console.error(err);
		process.exitCode = 1;
	});