// agent01 / verify_crawler.ts
// Auto Deployer
// Deploy compliance agent services
// f39c319
import axios from 'axios';
import express from 'express';
import { Server } from 'http';
import app from './src/index';
import db, { initDB } from './src/crawler/db';
// Local port the app under test listens on, and the matching base URL
// that every request in this script is sent to.
const PORT = 3005;
const BASE_URL = 'http://localhost:' + String(PORT);
// Replace axios.get with a canned responder so the run never touches the
// external network. The real implementation is kept in `originalGet`.
const originalGet = axios.get;

// Ordered [url fragment, page text] pairs; the first fragment contained in
// the requested URL decides which text is served.
const cannedPages: ReadonlyArray<readonly [string, string]> = [
  ['icbc', '工行隐私政策正文。包含SDK和敏感权限说明。'],
  ['boc', '中行隐私政策正文。收集您的定位权限。'],
  ['ccb', '建行隐私政策。未成年人保护条款。'],
  ['cac.gov.cn', '网信办通报。关于15款App和16款SDK个人信息收集使用问题。'],
  ['nfra.gov.cn', '金监总局数据安全管理办法。数据出境要求。'],
  ['jiguang', '极光SDK合规指引。第三方共享数据说明。'],
];

// Served when the URL matches none of the fragments above.
const fallbackPage = '默认正文内容。包含SDK和权限。';

(axios as any).get = async (url: string, config?: any) => {
  const match = cannedPages.find(([fragment]) => url.includes(fragment));
  const content = match ? match[1] : fallbackPage;

  // Wrap the text in a minimal HTML page for the extractor to parse.
  const data = `<html><body><h1>标题</h1><p>${content}</p></body></html>`;
  return {
    status: 200,
    headers: { 'content-type': 'text/html' },
    data,
  };
};
// Whitelisted seed sources registered at the start of the run.
// Each row holds only the fields that differ between sources:
// [source_name, source_type, domain, entry_url, url_pattern, crawl_frequency, topic_tag]
const seedRows: Array<[string, string, string, string, string, string, string]> = [
  [
    "中国工商银行(工银融e行个人信息保护政策)",
    "peer_bank",
    "m.icbc.com.cn",
    "https://m.icbc.com.cn/ICBC/disclaimer/2.htm",
    "disclaimer",
    "daily",
    "privacy_policy",
  ],
  [
    "中国银行(手机银行隐私政策)",
    "peer_bank",
    "ebsnew.boc.cn",
    "https://ebsnew.boc.cn/bocphone/VuePhone/tools/privacyPolicy/privacyPolicyA.html",
    "privacyPolicy",
    "daily",
    "privacy_policy",
  ],
  [
    "中国建设银行(隐私协议)",
    "peer_bank",
    "ccb.com",
    "https://ccb.com/chn/mycom/register_xy_secret.shtml",
    "register_xy_secret",
    "daily",
    "privacy_policy",
  ],
  [
    "关于15款App和16款SDK个人信息收集使用问题的通报",
    "regulator",
    "www.cac.gov.cn",
    "https://www.cac.gov.cn/2025-05/06/c_1748239411359045.htm",
    "cac",
    "4h",
    "regulatory_update",
  ],
  [
    "银行保险机构数据安全管理办法",
    "regulator",
    "www.nfra.gov.cn",
    "https://www.nfra.gov.cn/cn/view/pages/ItemDetail.html?docId=1192308&generaltype=0&itemId=926",
    "nfra",
    "4h",
    "regulatory_update",
  ],
  [
    "极光 SDK 产品合规指引说明",
    "sdk_vendor",
    "docs.jiguang.cn",
    "https://docs.jiguang.cn/compliance_guide/sdk_compliance_guide/sdk_compliance_guide",
    "jiguang",
    "daily",
    "sdk_compliance",
  ],
];

// Expand each row into the full registration payload. Property order is
// kept fixed so the serialized JSON is the same for every source.
const seedSources = seedRows.map(
  ([source_name, source_type, domain, entry_url, url_pattern, crawl_frequency, topicTag]) => ({
    source_name,
    source_type,
    domain,
    entry_url,
    url_pattern,
    parser_type: "html_main_content",
    crawl_frequency,
    priority: "high",
    enabled: true,
    topic_tags: [topicTag],
  })
);
/**
 * Sends `data` as a JSON body in a POST request and returns the parsed JSON
 * response.
 *
 * The response is read as text and parsed afterwards, so a non-JSON error
 * body (for example an HTML 404 page) is reported together with its HTTP
 * status instead of surfacing as a bare JSON `SyntaxError`.
 *
 * @param url - Absolute URL of the endpoint.
 * @param data - Payload, serialized with `JSON.stringify`.
 * @returns The parsed JSON response body.
 * @throws Error if the response status is not OK, or if a successful
 *   response does not contain valid JSON.
 */
async function post(url: string, data: any): Promise<any> {
  const res = await fetch(url, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(data)
  });

  const raw = await res.text();
  let parsed: any;
  let isJson = true;
  try {
    parsed = JSON.parse(raw);
  } catch {
    isJson = false;
  }

  if (!res.ok) {
    // Prefer the structured error payload; fall back to the raw body text.
    const detail = isJson ? JSON.stringify(parsed) : raw;
    throw new Error(`POST ${url} failed (HTTP ${res.status}): ${detail}`);
  }
  if (!isJson) {
    throw new Error(`POST ${url} returned a non-JSON body (HTTP ${res.status}): ${raw}`);
  }
  return parsed;
}
/**
 * Issues a GET request and returns the parsed JSON response.
 *
 * The response is read as text and parsed afterwards, so a non-JSON error
 * body (for example an HTML 404 page) is reported together with its HTTP
 * status instead of surfacing as a bare JSON `SyntaxError`.
 *
 * @param url - Absolute URL to fetch.
 * @returns The parsed JSON response body.
 * @throws Error if the response status is not OK, or if a successful
 *   response does not contain valid JSON.
 */
async function get(url: string): Promise<any> {
  const res = await fetch(url);

  const raw = await res.text();
  let parsed: any;
  let isJson = true;
  try {
    parsed = JSON.parse(raw);
  } catch {
    isJson = false;
  }

  if (!res.ok) {
    // Prefer the structured error payload; fall back to the raw body text.
    const detail = isJson ? JSON.stringify(parsed) : raw;
    throw new Error(`GET ${url} failed (HTTP ${res.status}): ${detail}`);
  }
  if (!isJson) {
    throw new Error(`GET ${url} returned a non-JSON body (HTTP ${res.status}): ${raw}`);
  }
  return parsed;
}
/**
 * Returns a promise that settles after roughly `ms` milliseconds.
 *
 * @param ms - Time to wait, in milliseconds.
 */
function delay(ms: number) {
  return new Promise(wake => {
    setTimeout(wake, ms);
  });
}
// HTTP server hosting the app under test; assigned once listening succeeds.
let server: Server | undefined;

// Tables emptied before each run, in the order the deletes are issued.
// NOTE(review): the order presumably removes dependent rows first — confirm
// against the schema before reordering.
const TABLES_TO_CLEAR = [
  'diff_event',
  'clause_chunk',
  'normalized_document',
  'raw_snapshot',
  'crawl_job',
  'source_registry',
];

/**
 * Starts the app on `PORT` and resolves with the server once it is listening.
 * Rejects if the server reports an error, either through the listen callback
 * or through an `'error'` event.
 */
function startServer(): Promise<Server> {
  return new Promise<Server>((resolve, reject) => {
    const listening: Server = app.listen(PORT, (err?: Error) => {
      if (err) {
        reject(err);
      } else {
        resolve(listening);
      }
    });
    listening.once('error', reject);
  });
}

/**
 * End-to-end check of the crawler HTTP API:
 *   1. reset the database and start the app,
 *   2. register the seed sources,
 *   3. trigger a manual crawl,
 *   4. wait, then print job statuses and the structured updates,
 *   5. trigger a second crawl and verify each source still has one document
 *      version.
 *
 * Any thrown error or failed check sets a non-zero exit code. The process is
 * always terminated at the end via `process.exit()`.
 */
async function run(): Promise<void> {
  try {
    console.log("=== 1. 启动 Express app ===");
    initDB(); // ensure DB is initialized

    // Clear tables for a clean test
    for (const table of TABLES_TO_CLEAR) {
      db.prepare(`DELETE FROM ${table}`).run();
    }

    server = await startServer();
    console.log(`Server started on ${PORT}`);

    console.log("\n=== 2. 插入白名单种子来源 ===");
    const sourceIds: string[] = [];
    for (const src of seedSources) {
      const res = await post(`${BASE_URL}/api/crawler/sources`, src);
      sourceIds.push(res.source_id);
      console.log(`Inserted ${src.source_name} -> ${res.source_id}`);
    }

    console.log("\n=== 3. 手动触发抓取任务 ===");
    const jobRes = await post(`${BASE_URL}/api/crawler/jobs`, {
      source_ids: sourceIds,
      trigger_type: "manual"
    });
    console.log(`Triggered jobs:`, jobRes.job_ids);

    console.log("\n=== 4. 等待抓取完成,查询结构化更新 ===");
    // NOTE(review): fixed wait; jobs slower than this will not be finished
    // when the checks below run — consider polling the job status instead.
    await delay(2000);

    // Report job status
    for (const jid of jobRes.job_ids) {
      const j = await get(`${BASE_URL}/api/crawler/jobs/${jid}`);
      console.log(`Job ${jid} status: ${j.status}`);
    }

    const updates = await get(`${BASE_URL}/api/crawler/updates`);
    console.log("\n[结构化更新输出]");
    console.log(JSON.stringify(updates, null, 2));

    const peerCount = updates.peer_updates?.length ?? 0;
    const regCount = updates.regulatory_updates?.length ?? 0;
    const sdkCount = updates.sdk_updates?.length ?? 0;

    if (peerCount > 0 || regCount > 0 || sdkCount > 0) {
      console.log(`✅ 成功输出结构化更新: ${peerCount} 同业, ${regCount} 监管, ${sdkCount} SDK`);
    } else {
      console.error(`❌ 未输出任何结构化更新`);
      // A failed check must be visible to whatever invoked the script.
      process.exitCode = 1;
    }

    console.log("\n=== 5. 再次触发抓取,验证去重机制 ===");
    const jobRes2 = await post(`${BASE_URL}/api/crawler/jobs`, {
      source_ids: sourceIds,
      trigger_type: "manual"
    });
    console.log(`Triggered jobs (2nd time):`, jobRes2.job_ids);

    await delay(2000);

    // Each source should still have exactly one document version.
    console.log("\n[验证版本去重]");
    let dedupSuccess = true;
    for (const sid of sourceIds) {
      const docs = await get(`${BASE_URL}/api/crawler/documents?source_id=${sid}`);
      const versionCount = docs.items?.length ?? 0;
      if (versionCount !== 1) {
        console.error(`❌ Source ${sid} has ${versionCount} versions, expected 1`);
        dedupSuccess = false;
      }
    }

    if (dedupSuccess) {
      console.log(`✅ 去重验证通过,没有生成新版本`);
    } else {
      process.exitCode = 1;
    }

    console.log("\n=== 测试完成 ===");
  } catch (err) {
    console.error("Test failed:", err);
    process.exitCode = 1;
  } finally {
    server?.close();
    // With no argument, process.exit() uses process.exitCode, so a failure
    // recorded above is preserved.
    process.exit();
  }
}

// run() handles its own errors and always exits the process.
void run();