import axios from 'axios';
import express from 'express';
import { Server } from 'http';
import app from './src/index';
import db, { initDB } from './src/crawler/db';

/**
 * End-to-end smoke test for the crawler HTTP API.
 *
 * Flow: reset the crawler tables, start the Express app, register the seed
 * sources, trigger a crawl, print the structured updates, trigger a second
 * crawl and verify that no additional document versions were created.
 *
 * The process exit code is non-zero when any step throws or any check fails.
 */

const PORT = 3005;
const BASE_URL = `http://localhost:${PORT}`;

// Mock axios.get to ensure stable tests without relying on the external network.
// The original implementation is kept so it can be restored when the run ends.
const originalGet = axios.get;

/**
 * URL substring -> mocked page text. Entries are checked in order and the
 * first match wins, mirroring the original if/else chain.
 */
const MOCK_PAGES: ReadonlyArray<readonly [string, string]> = [
  ['icbc', '工行隐私政策正文。包含SDK和敏感权限说明。'],
  ['boc', '中行隐私政策正文。收集您的定位权限。'],
  ['ccb', '建行隐私政策。未成年人保护条款。'],
  ['cac.gov.cn', '网信办通报。关于15款App和16款SDK个人信息收集使用问题。'],
  ['nfra.gov.cn', '金监总局数据安全管理办法。数据出境要求。'],
  ['jiguang', '极光SDK合规指引。第三方共享数据说明。'],
];

/** Page text returned for URLs that match none of the MOCK_PAGES keys. */
const DEFAULT_MOCK_CONTENT = '默认正文内容。包含SDK和权限。';

// `any` is used deliberately: the stub replaces a library method with a
// simplified response object that does not satisfy the full axios typings.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(axios as any).get = async (url: string, _config?: unknown) => {
  const match = MOCK_PAGES.find(([needle]) => url.includes(needle));
  const content = match ? match[1] : DEFAULT_MOCK_CONTENT;

  // Return a dummy document to be parsed by the extractor.
  // NOTE(review): this copy of the file has no HTML tags around the content
  // even though the content type is text/html — confirm the intended markup.
  return {
    status: 200,
    headers: { 'content-type': 'text/html' },
    data: `\n${content}\n`,
  };
};

/** Whitelisted sources registered through the API before crawling. */
const seedSources = [
  {
    source_name: "中国工商银行(工银融e行个人信息保护政策)",
    source_type: "peer_bank",
    domain: "m.icbc.com.cn",
    entry_url: "https://m.icbc.com.cn/ICBC/disclaimer/2.htm",
    url_pattern: "disclaimer",
    parser_type: "html_main_content",
    crawl_frequency: "daily",
    priority: "high",
    enabled: true,
    topic_tags: ["privacy_policy"]
  },
  {
    source_name: "中国银行(手机银行隐私政策)",
    source_type: "peer_bank",
    domain: "ebsnew.boc.cn",
    entry_url: "https://ebsnew.boc.cn/bocphone/VuePhone/tools/privacyPolicy/privacyPolicyA.html",
    url_pattern: "privacyPolicy",
    parser_type: "html_main_content",
    crawl_frequency: "daily",
    priority: "high",
    enabled: true,
    topic_tags: ["privacy_policy"]
  },
  {
    source_name: "中国建设银行(隐私协议)",
    source_type: "peer_bank",
    domain: "ccb.com",
    entry_url: "https://ccb.com/chn/mycom/register_xy_secret.shtml",
    url_pattern: "register_xy_secret",
    parser_type: "html_main_content",
    crawl_frequency: "daily",
    priority: "high",
    enabled: true,
    topic_tags: ["privacy_policy"]
  },
  {
    source_name: "关于15款App和16款SDK个人信息收集使用问题的通报",
    source_type: "regulator",
    domain: "www.cac.gov.cn",
    entry_url: "https://www.cac.gov.cn/2025-05/06/c_1748239411359045.htm",
    url_pattern: "cac",
    parser_type: "html_main_content",
    crawl_frequency: "4h",
    priority: "high",
    enabled: true,
    topic_tags: ["regulatory_update"]
  },
  {
    source_name: "银行保险机构数据安全管理办法",
    source_type: "regulator",
    domain: "www.nfra.gov.cn",
    entry_url: "https://www.nfra.gov.cn/cn/view/pages/ItemDetail.html?docId=1192308&generaltype=0&itemId=926",
    url_pattern: "nfra",
    parser_type: "html_main_content",
    crawl_frequency: "4h",
    priority: "high",
    enabled: true,
    topic_tags: ["regulatory_update"]
  },
  {
    source_name: "极光 SDK 产品合规指引说明",
    source_type: "sdk_vendor",
    domain: "docs.jiguang.cn",
    entry_url: "https://docs.jiguang.cn/compliance_guide/sdk_compliance_guide/sdk_compliance_guide",
    url_pattern: "jiguang",
    parser_type: "html_main_content",
    crawl_frequency: "daily",
    priority: "high",
    enabled: true,
    topic_tags: ["sdk_compliance"]
  }
];

/**
 * Tables cleared before the scenario runs. The order is the one used by the
 * original script (presumably child tables first — verify against the schema).
 */
const TABLES_TO_CLEAR = [
  'diff_event',
  'clause_chunk',
  'normalized_document',
  'raw_snapshot',
  'crawl_job',
  'source_registry',
] as const;

/**
 * Performs a request with the global `fetch` and returns the decoded JSON body.
 *
 * The body is read as text first so that a non-JSON error page cannot mask
 * the HTTP failure with a JSON parse error.
 *
 * @param method HTTP verb; POST sends `data` as a JSON body.
 * @param url Absolute URL to call.
 * @param data Payload for POST requests; ignored for GET.
 * @returns The parsed JSON body (untyped — callers read fields ad hoc).
 * @throws Error when the response status is not ok, or when a successful
 *         response does not contain valid JSON.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
async function requestJson(method: 'GET' | 'POST', url: string, data?: unknown): Promise<any> {
  const res =
    method === 'POST'
      ? await fetch(url, {
          method: 'POST',
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify(data)
        })
      : await fetch(url);

  const text = await res.text();
  let payload: unknown;
  let isJson = true;
  try {
    payload = JSON.parse(text);
  } catch {
    // Keep the raw text so it can still be reported in the error below.
    payload = text;
    isJson = false;
  }

  if (!res.ok) throw new Error(`${method} ${url} failed: ${JSON.stringify(payload)}`);
  if (!isJson) throw new Error(`${method} ${url} returned a non-JSON body: ${JSON.stringify(text)}`);
  return payload;
}

/**
 * POSTs `data` as JSON to `url` and returns the parsed JSON response.
 *
 * @throws Error when the response status is not ok or the body is not JSON.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
async function post(url: string, data: any) {
  return requestJson('POST', url, data);
}

/**
 * GETs `url` and returns the parsed JSON response.
 *
 * @throws Error when the response status is not ok or the body is not JSON.
 */
async function get(url: string) {
  return requestJson('GET', url);
}

/** Resolves after `ms` milliseconds. */
function delay(ms: number) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

let server: Server;

/**
 * Runs steps 2-5 of the scenario against the already-listening server.
 *
 * @returns `true` when every check passed, `false` when at least one check
 *          reported a failure. Request errors propagate as exceptions.
 */
async function runScenario(): Promise<boolean> {
  let passed = true;

  console.log("\n=== 2. 插入白名单种子来源 ===");
  const sourceIds: string[] = [];
  for (const src of seedSources) {
    const res = await post(`${BASE_URL}/api/crawler/sources`, src);
    sourceIds.push(res.source_id);
    console.log(`Inserted ${src.source_name} -> ${res.source_id}`);
  }

  console.log("\n=== 3. 手动触发抓取任务 ===");
  const jobRes = await post(`${BASE_URL}/api/crawler/jobs`, {
    source_ids: sourceIds,
    trigger_type: "manual"
  });
  console.log(`Triggered jobs:`, jobRes.job_ids);

  console.log("\n=== 4. 等待抓取完成,查询结构化更新 ===");
  // Wait for async jobs to complete (fixed grace period, not a status poll).
  await delay(2000);

  // Job statuses are reported for diagnostics only; they are not asserted.
  for (const jid of jobRes.job_ids) {
    const j = await get(`${BASE_URL}/api/crawler/jobs/${jid}`);
    console.log(`Job ${jid} status: ${j.status}`);
  }

  const updates = await get(`${BASE_URL}/api/crawler/updates`);
  console.log("\n[结构化更新输出]");
  console.log(JSON.stringify(updates, null, 2));

  const peerCount = updates.peer_updates?.length ?? 0;
  const regCount = updates.regulatory_updates?.length ?? 0;
  const sdkCount = updates.sdk_updates?.length ?? 0;
  if (peerCount > 0 || regCount > 0 || sdkCount > 0) {
    console.log(`✅ 成功输出结构化更新: ${peerCount} 同业, ${regCount} 监管, ${sdkCount} SDK`);
  } else {
    console.error(`❌ 未输出任何结构化更新`);
    passed = false;
  }

  console.log("\n=== 5. 再次触发抓取,验证去重机制 ===");
  const jobRes2 = await post(`${BASE_URL}/api/crawler/jobs`, {
    source_ids: sourceIds,
    trigger_type: "manual"
  });
  console.log(`Triggered jobs (2nd time):`, jobRes2.job_ids);
  await delay(2000);

  // Each source must still have exactly one document version after the
  // second crawl, because the mocked content did not change.
  console.log("\n[验证版本去重]");
  let dedupSuccess = true;
  for (const sid of sourceIds) {
    const docs = await get(`${BASE_URL}/api/crawler/documents?source_id=${sid}`);
    if (docs.items.length !== 1) {
      console.error(`❌ Source ${sid} has ${docs.items.length} versions, expected 1`);
      dedupSuccess = false;
    }
  }
  if (dedupSuccess) {
    console.log(`✅ 去重验证通过,没有生成新版本`);
  } else {
    passed = false;
  }

  console.log("\n=== 测试完成 ===");
  return passed;
}

/**
 * Entry point: resets the database, starts the server on PORT and runs the
 * scenario once the server is listening. Always closes the server and exits
 * the process afterwards; the exit code is 1 on any failure.
 */
async function run() {
  console.log("=== 1. 启动 Express app ===");
  initDB(); // ensure DB is initialized

  // Clear tables for a clean test.
  for (const table of TABLES_TO_CLEAR) {
    db.prepare(`DELETE FROM ${table}`).run();
  }

  server = app.listen(PORT, async () => {
    console.log(`Server started on ${PORT}`);
    try {
      const passed = await runScenario();
      // A failed check must fail the process, not only print a ❌ line.
      if (!passed) process.exitCode = 1;
    } catch (err) {
      console.error("Test failed:", err);
      process.exitCode = 1;
    } finally {
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      (axios as any).get = originalGet;
      server.close();
      // With no argument, process.exit() uses process.exitCode set above.
      process.exit();
    }
  });
}

run().catch((err: unknown) => {
  // Failures before the server starts listening (DB init / cleanup) land here.
  console.error("Test failed:", err);
  process.exit(1);
});