import axios from 'axios';
import express from 'express';
import { Server } from 'http';
import app from './src/index';
import db, { initDB } from './src/crawler/db';
/** TCP port the Express app under test listens on for this script. */
const PORT = 3005;

/** Origin used to build every request URL sent to the locally started app. */
const BASE_URL = 'http://localhost:' + PORT;
// Replace axios.get with a canned responder so the test is stable and never
// touches the external network. The real implementation is kept in originalGet.
const originalGet = axios.get;

// Ordered (URL fragment -> page text) pairs; the first fragment contained in
// the requested URL decides the text that is served.
const MOCK_PAGE_BODIES: ReadonlyArray<readonly [string, string]> = [
  ['icbc', '工行隐私政策正文。包含SDK和敏感权限说明。'],
  ['boc', '中行隐私政策正文。收集您的定位权限。'],
  ['ccb', '建行隐私政策。未成年人保护条款。'],
  ['cac.gov.cn', '网信办通报。关于15款App和16款SDK个人信息收集使用问题。'],
  ['nfra.gov.cn', '金监总局数据安全管理办法。数据出境要求。'],
  ['jiguang', '极光SDK合规指引。第三方共享数据说明。']
];

// Text served when the URL contains none of the known fragments.
const DEFAULT_MOCK_BODY = '默认正文内容。包含SDK和权限。';

(axios as any).get = async (url: string, config?: any) => {
  const matched = MOCK_PAGE_BODIES.find(([fragment]) => url.includes(fragment));
  const content = matched === undefined ? DEFAULT_MOCK_BODY : matched[1];

  // Serve a minimal HTML page (heading + one paragraph) for the extractor to parse.
  const html = '<html><body><h1>标题</h1><p>' + content + '</p></body></html>';
  return {
    status: 200,
    headers: { 'content-type': 'text/html' },
    data: html
  };
};
/** The per-source fields that actually differ between the seed entries. */
interface SeedSpec {
  source_name: string;
  source_type: string;
  domain: string;
  entry_url: string;
  url_pattern: string;
  crawl_frequency: string;
  topic_tags: string[];
}

// Whitelisted sources registered at the start of the test: three peer banks,
// two regulator pages and one SDK vendor page.
const seedSpecs: SeedSpec[] = [
  {
    source_name: "中国工商银行(工银融e行个人信息保护政策)",
    source_type: "peer_bank",
    domain: "m.icbc.com.cn",
    entry_url: "https://m.icbc.com.cn/ICBC/disclaimer/2.htm",
    url_pattern: "disclaimer",
    crawl_frequency: "daily",
    topic_tags: ["privacy_policy"]
  },
  {
    source_name: "中国银行(手机银行隐私政策)",
    source_type: "peer_bank",
    domain: "ebsnew.boc.cn",
    entry_url: "https://ebsnew.boc.cn/bocphone/VuePhone/tools/privacyPolicy/privacyPolicyA.html",
    url_pattern: "privacyPolicy",
    crawl_frequency: "daily",
    topic_tags: ["privacy_policy"]
  },
  {
    source_name: "中国建设银行(隐私协议)",
    source_type: "peer_bank",
    domain: "ccb.com",
    entry_url: "https://ccb.com/chn/mycom/register_xy_secret.shtml",
    url_pattern: "register_xy_secret",
    crawl_frequency: "daily",
    topic_tags: ["privacy_policy"]
  },
  {
    source_name: "关于15款App和16款SDK个人信息收集使用问题的通报",
    source_type: "regulator",
    domain: "www.cac.gov.cn",
    entry_url: "https://www.cac.gov.cn/2025-05/06/c_1748239411359045.htm",
    url_pattern: "cac",
    crawl_frequency: "4h",
    topic_tags: ["regulatory_update"]
  },
  {
    source_name: "银行保险机构数据安全管理办法",
    source_type: "regulator",
    domain: "www.nfra.gov.cn",
    entry_url: "https://www.nfra.gov.cn/cn/view/pages/ItemDetail.html?docId=1192308&generaltype=0&itemId=926",
    url_pattern: "nfra",
    crawl_frequency: "4h",
    topic_tags: ["regulatory_update"]
  },
  {
    source_name: "极光 SDK 产品合规指引说明",
    source_type: "sdk_vendor",
    domain: "docs.jiguang.cn",
    entry_url: "https://docs.jiguang.cn/compliance_guide/sdk_compliance_guide/sdk_compliance_guide",
    url_pattern: "jiguang",
    crawl_frequency: "daily",
    topic_tags: ["sdk_compliance"]
  }
];

// Expand each spec into the full payload. Every seed shares the same parser,
// priority and enabled flag; keys are emitted in the payload's original order.
const seedSources = seedSpecs.map((spec) => ({
  source_name: spec.source_name,
  source_type: spec.source_type,
  domain: spec.domain,
  entry_url: spec.entry_url,
  url_pattern: spec.url_pattern,
  parser_type: "html_main_content",
  crawl_frequency: spec.crawl_frequency,
  priority: "high",
  enabled: true,
  topic_tags: spec.topic_tags
}));
/**
 * Sends `data` as a JSON body in a POST request and returns the decoded JSON
 * response.
 *
 * The response body is read as text before decoding, so an error response that
 * is not JSON (for example an HTML error page) is reported with its HTTP
 * status and raw body instead of surfacing as an opaque JSON SyntaxError.
 *
 * @param url Absolute URL to send the request to.
 * @param data Value serialized with JSON.stringify as the request body.
 * @returns The parsed JSON response body (untyped; callers read fields directly).
 * @throws Error if the response status is not OK, or if an OK response does
 *         not contain valid JSON.
 */
async function post(url: string, data: unknown): Promise<any> {
  const res = await fetch(url, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(data)
  });
  const raw = await res.text();
  // Check the status first: error bodies are not guaranteed to be JSON.
  if (!res.ok) {
    throw new Error(`POST ${url} failed (HTTP ${res.status}): ${raw}`);
  }
  try {
    return JSON.parse(raw);
  } catch {
    throw new Error(`POST ${url} returned a non-JSON body (HTTP ${res.status}): ${raw}`);
  }
}
/**
 * Issues a GET request and returns the decoded JSON response.
 *
 * The response body is read as text before decoding, so an error response that
 * is not JSON (for example an HTML error page) is reported with its HTTP
 * status and raw body instead of surfacing as an opaque JSON SyntaxError.
 *
 * @param url Absolute URL to fetch.
 * @returns The parsed JSON response body (untyped; callers read fields directly).
 * @throws Error if the response status is not OK, or if an OK response does
 *         not contain valid JSON.
 */
async function get(url: string): Promise<any> {
  const res = await fetch(url);
  const raw = await res.text();
  // Check the status first: error bodies are not guaranteed to be JSON.
  if (!res.ok) {
    throw new Error(`GET ${url} failed (HTTP ${res.status}): ${raw}`);
  }
  try {
    return JSON.parse(raw);
  } catch {
    throw new Error(`GET ${url} returned a non-JSON body (HTTP ${res.status}): ${raw}`);
  }
}
/**
 * Returns a promise that settles after roughly `ms` milliseconds.
 *
 * @param ms Time to wait, in milliseconds, before the promise resolves.
 */
function delay(ms: number) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/** Listening HTTP server for the app under test; assigned by run() and closed when the scenario ends. */
let server: Server;

/**
 * End-to-end scenario for the crawler API, run against the app on PORT:
 *   1. initialise the DB, clear the crawler tables and start the server;
 *   2. register every entry of seedSources;
 *   3. trigger a manual crawl for the registered sources;
 *   4. wait, print each job's status and the structured updates;
 *   5. trigger the same crawl again and check every source still has exactly
 *      one document (deduplication).
 *
 * Any thrown error or failed check sets a non-zero process exit code. The
 * process is terminated explicitly once the scenario has finished.
 */
async function run(): Promise<void> {
  console.log("=== 1. 启动 Express app ===");
  initDB(); // ensure DB is initialized

  // Clear tables for a clean test. The deletion order is kept from the original
  // script (presumably dependent tables first — TODO confirm against the schema).
  const tablesToClear = [
    'diff_event',
    'clause_chunk',
    'normalized_document',
    'raw_snapshot',
    'crawl_job',
    'source_registry'
  ];
  for (const table of tablesToClear) {
    db.prepare(`DELETE FROM ${table}`).run();
  }

  server = app.listen(PORT, async () => {
    console.log(`Server started on ${PORT}`);
    try {
      console.log("\n=== 2. 插入白名单种子来源 ===");
      const sourceIds: string[] = [];
      for (const src of seedSources) {
        const res = await post(`${BASE_URL}/api/crawler/sources`, src);
        sourceIds.push(res.source_id);
        console.log(`Inserted ${src.source_name} -> ${res.source_id}`);
      }

      console.log("\n=== 3. 手动触发抓取任务 ===");
      const jobRes = await post(`${BASE_URL}/api/crawler/jobs`, {
        source_ids: sourceIds,
        trigger_type: "manual"
      });
      console.log(`Triggered jobs:`, jobRes.job_ids);

      console.log("\n=== 4. 等待抓取完成,查询结构化更新 ===");
      // Jobs run asynchronously on the server; give them a fixed window to finish.
      await delay(2000);
      // Report each job's status (printed only, not asserted).
      for (const jid of jobRes.job_ids) {
        const j = await get(`${BASE_URL}/api/crawler/jobs/${jid}`);
        console.log(`Job ${jid} status: ${j.status}`);
      }

      const updates = await get(`${BASE_URL}/api/crawler/updates`);
      console.log("\n[结构化更新输出]");
      console.log(JSON.stringify(updates, null, 2));
      const peerCount = updates.peer_updates?.length ?? 0;
      const regCount = updates.regulatory_updates?.length ?? 0;
      const sdkCount = updates.sdk_updates?.length ?? 0;
      if (peerCount > 0 || regCount > 0 || sdkCount > 0) {
        console.log(`✅ 成功输出结构化更新: ${peerCount} 同业, ${regCount} 监管, ${sdkCount} SDK`);
      } else {
        console.error(`❌ 未输出任何结构化更新`);
        // A failed check must fail the run, not just log.
        process.exitCode = 1;
      }

      console.log("\n=== 5. 再次触发抓取,验证去重机制 ===");
      const jobRes2 = await post(`${BASE_URL}/api/crawler/jobs`, {
        source_ids: sourceIds,
        trigger_type: "manual"
      });
      console.log(`Triggered jobs (2nd time):`, jobRes2.job_ids);
      await delay(2000);

      // The second crawl serves identical content, so each source must still
      // have exactly one document version.
      console.log("\n[验证版本去重]");
      let dedupSuccess = true;
      for (const sid of sourceIds) {
        const docs = await get(`${BASE_URL}/api/crawler/documents?source_id=${sid}`);
        if (docs.items.length !== 1) {
          console.error(`❌ Source ${sid} has ${docs.items.length} versions, expected 1`);
          dedupSuccess = false;
        }
      }
      if (dedupSuccess) {
        console.log(`✅ 去重验证通过,没有生成新版本`);
      } else {
        process.exitCode = 1;
      }
      console.log("\n=== 测试完成 ===");
    } catch (err) {
      console.error("Test failed:", err);
      process.exitCode = 1;
    } finally {
      // Put the real axios.get back now that no more crawls will be triggered.
      (axios as any).get = originalGet;
      server.close();
      // Exit explicitly instead of waiting for remaining handles to drain;
      // with no argument, process.exit() uses process.exitCode when it is set.
      process.exit();
    }
  });
}

// Setup failures (DB initialisation, table clearing, listen) reject run()
// itself; report them the same way as in-scenario failures.
run().catch((err: unknown) => {
  console.error("Test failed:", err);
  process.exit(1);
});