| import axios from 'axios'; |
| import express from 'express'; |
| import { Server } from 'http'; |
| import app from './src/index'; |
| import db, { initDB } from './src/crawler/db'; |
|
|
/** TCP port on which this script starts the Express app. */
const PORT = 3005;

/** Origin prepended to every API path requested by this script. */
const BASE_URL = 'http://localhost:' + PORT;
|
|
| |
/** Reference to the real `axios.get`, captured before it is replaced below. */
const originalGet = axios.get;

/**
 * Replaces `axios.get` with an offline stub.
 *
 * The stub ignores its second argument and always resolves with a 200
 * `text/html` response whose body is a tiny HTML page. The paragraph text is
 * chosen by the first marker (in the order listed) that occurs in the URL;
 * URLs matching no marker get a default paragraph.
 *
 * NOTE(review): presumably the crawler fetches pages through `axios.get`, so
 * this keeps the run off the network — confirm against the crawler code.
 */
(axios as any).get = async (url: string, _config?: any) => {
  // Ordered [url marker, paragraph text] pairs; the first match wins.
  const cannedBodies: ReadonlyArray<readonly [string, string]> = [
    ['icbc', '工行隐私政策正文。包含SDK和敏感权限说明。'],
    ['boc', '中行隐私政策正文。收集您的定位权限。'],
    ['ccb', '建行隐私政策。未成年人保护条款。'],
    ['cac.gov.cn', '网信办通报。关于15款App和16款SDK个人信息收集使用问题。'],
    ['nfra.gov.cn', '金监总局数据安全管理办法。数据出境要求。'],
    ['jiguang', '极光SDK合规指引。第三方共享数据说明。'],
  ];

  const match = cannedBodies.find(([marker]) => url.includes(marker));
  const content = match === undefined ? '默认正文内容。包含SDK和权限。' : match[1];

  return {
    status: 200,
    headers: { 'content-type': 'text/html' },
    data: `<html><body><h1>标题</h1><p>${content}</p></body></html>`,
  };
};
|
|
/** The fields that differ from one seed source to the next. */
interface SeedSpec {
  name: string;
  type: string;
  domain: string;
  entryUrl: string;
  urlPattern: string;
  frequency: string;
  tag: string;
}

/**
 * Expands a spec into the full record posted to the sources endpoint.
 * Every seed shares the same parser type, priority and enabled flag, and
 * carries exactly one topic tag.
 */
function buildSeed(spec: SeedSpec) {
  return {
    source_name: spec.name,
    source_type: spec.type,
    domain: spec.domain,
    entry_url: spec.entryUrl,
    url_pattern: spec.urlPattern,
    parser_type: "html_main_content",
    crawl_frequency: spec.frequency,
    priority: "high",
    enabled: true,
    topic_tags: [spec.tag],
  };
}

const seedSpecs: SeedSpec[] = [
  {
    name: "中国工商银行(工银融e行个人信息保护政策)",
    type: "peer_bank",
    domain: "m.icbc.com.cn",
    entryUrl: "https://m.icbc.com.cn/ICBC/disclaimer/2.htm",
    urlPattern: "disclaimer",
    frequency: "daily",
    tag: "privacy_policy",
  },
  {
    name: "中国银行(手机银行隐私政策)",
    type: "peer_bank",
    domain: "ebsnew.boc.cn",
    entryUrl: "https://ebsnew.boc.cn/bocphone/VuePhone/tools/privacyPolicy/privacyPolicyA.html",
    urlPattern: "privacyPolicy",
    frequency: "daily",
    tag: "privacy_policy",
  },
  {
    name: "中国建设银行(隐私协议)",
    type: "peer_bank",
    domain: "ccb.com",
    entryUrl: "https://ccb.com/chn/mycom/register_xy_secret.shtml",
    urlPattern: "register_xy_secret",
    frequency: "daily",
    tag: "privacy_policy",
  },
  {
    name: "关于15款App和16款SDK个人信息收集使用问题的通报",
    type: "regulator",
    domain: "www.cac.gov.cn",
    entryUrl: "https://www.cac.gov.cn/2025-05/06/c_1748239411359045.htm",
    urlPattern: "cac",
    frequency: "4h",
    tag: "regulatory_update",
  },
  {
    name: "银行保险机构数据安全管理办法",
    type: "regulator",
    domain: "www.nfra.gov.cn",
    entryUrl: "https://www.nfra.gov.cn/cn/view/pages/ItemDetail.html?docId=1192308&generaltype=0&itemId=926",
    urlPattern: "nfra",
    frequency: "4h",
    tag: "regulatory_update",
  },
  {
    name: "极光 SDK 产品合规指引说明",
    type: "sdk_vendor",
    domain: "docs.jiguang.cn",
    entryUrl: "https://docs.jiguang.cn/compliance_guide/sdk_compliance_guide/sdk_compliance_guide",
    urlPattern: "jiguang",
    frequency: "daily",
    tag: "sdk_compliance",
  },
];

/**
 * Seed sources registered at the start of the run: three bank privacy
 * policies, two regulator pages and one SDK vendor compliance guide.
 */
const seedSources = seedSpecs.map(buildSeed);
|
|
/**
 * Sends `data` as a JSON body in a POST request and returns the parsed reply.
 *
 * @param url  URL to post to.
 * @param data Value serialised with `JSON.stringify` into the request body.
 * @returns The response body parsed as JSON.
 * @throws Error when the response status is not OK. The message contains the
 *   re-serialised JSON body, or the HTTP status and raw text when the body is
 *   not valid JSON.
 * @throws SyntaxError when an OK response does not contain valid JSON.
 */
async function post(url: string, data: any) {
  const res = await fetch(url, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(data)
  });

  // Read the body as text first: parsing before the status check would let a
  // non-JSON error page (e.g. an HTML 500) surface as a JSON parse error and
  // hide the real HTTP failure.
  const text = await res.text();

  if (!res.ok) {
    let detail: string;
    try {
      detail = JSON.stringify(JSON.parse(text));
    } catch {
      detail = `HTTP ${res.status} ${text}`;
    }
    throw new Error(`POST ${url} failed: ${detail}`);
  }

  return JSON.parse(text);
}
|
|
/**
 * Issues a GET request and returns the parsed JSON reply.
 *
 * @param url URL to fetch.
 * @returns The response body parsed as JSON.
 * @throws Error when the response status is not OK. The message contains the
 *   re-serialised JSON body, or the HTTP status and raw text when the body is
 *   not valid JSON.
 * @throws SyntaxError when an OK response does not contain valid JSON.
 */
async function get(url: string) {
  const res = await fetch(url);

  // Read the body as text first: parsing before the status check would let a
  // non-JSON error page (e.g. an HTML 404) surface as a JSON parse error and
  // hide the real HTTP failure.
  const text = await res.text();

  if (!res.ok) {
    let detail: string;
    try {
      detail = JSON.stringify(JSON.parse(text));
    } catch {
      detail = `HTTP ${res.status} ${text}`;
    }
    throw new Error(`GET ${url} failed: ${detail}`);
  }

  return JSON.parse(text);
}
|
|
/**
 * Returns a promise that settles once a timer of `ms` milliseconds fires.
 *
 * @param ms Timer duration in milliseconds.
 * @returns A promise that resolves with no value.
 */
function delay(ms: number) {
  return new Promise(wake => {
    setTimeout(wake, ms);
  });
}
|
|
/** HTTP server started by `run`; closed again when the scenario finishes. */
let server: Server;

/**
 * End-to-end smoke run against the crawler HTTP API.
 *
 * Steps:
 *  1. Initialise the database, empty the crawler tables and start the app.
 *  2. Register every entry of `seedSources`.
 *  3. Trigger a manual crawl for all registered sources.
 *  4. Print each job's status and the structured updates; fail if none exist.
 *  5. Trigger a second crawl and check each source still has exactly one
 *     document version.
 *
 * The process always exits when the scenario ends. The exit code is non-zero
 * if a request throws or if either verification fails.
 */
async function run() {
  console.log("=== 1. 启动 Express app ===");
  initDB();

  // Start from empty tables so the per-source version counts checked below
  // are exact.
  // NOTE(review): the deletion order is kept from the original; presumably
  // dependent tables are cleared first — confirm against the schema.
  db.prepare('DELETE FROM diff_event').run();
  db.prepare('DELETE FROM clause_chunk').run();
  db.prepare('DELETE FROM normalized_document').run();
  db.prepare('DELETE FROM raw_snapshot').run();
  db.prepare('DELETE FROM crawl_job').run();
  db.prepare('DELETE FROM source_registry').run();

  server = app.listen(PORT, async () => {
    console.log(`Server started on ${PORT}`);

    try {
      console.log("\n=== 2. 插入白名单种子来源 ===");
      const sourceIds: string[] = [];
      for (const src of seedSources) {
        const res = await post(`${BASE_URL}/api/crawler/sources`, src);
        sourceIds.push(res.source_id);
        console.log(`Inserted ${src.source_name} -> ${res.source_id}`);
      }

      console.log("\n=== 3. 手动触发抓取任务 ===");
      const jobRes = await post(`${BASE_URL}/api/crawler/jobs`, {
        source_ids: sourceIds,
        trigger_type: "manual"
      });
      console.log(`Triggered jobs:`, jobRes.job_ids);

      console.log("\n=== 4. 等待抓取完成,查询结构化更新 ===");

      // Fixed pause before reading results.
      // NOTE(review): presumably jobs keep running after the POST returns;
      // polling the job status would be less timing-sensitive — confirm.
      await delay(2000);

      for (const jid of jobRes.job_ids) {
        const j = await get(`${BASE_URL}/api/crawler/jobs/${jid}`);
        console.log(`Job ${jid} status: ${j.status}`);
      }

      const updates = await get(`${BASE_URL}/api/crawler/updates`);
      console.log("\n[结构化更新输出]");
      console.log(JSON.stringify(updates, null, 2));

      const peerCount = updates.peer_updates?.length ?? 0;
      const regCount = updates.regulatory_updates?.length ?? 0;
      const sdkCount = updates.sdk_updates?.length ?? 0;

      if (peerCount > 0 || regCount > 0 || sdkCount > 0) {
        console.log(`✅ 成功输出结构化更新: ${peerCount} 同业, ${regCount} 监管, ${sdkCount} SDK`);
      } else {
        console.error(`❌ 未输出任何结构化更新`);
        // A failed check must be visible in the exit status, not only the log.
        process.exitCode = 1;
      }

      console.log("\n=== 5. 再次触发抓取,验证去重机制 ===");
      const jobRes2 = await post(`${BASE_URL}/api/crawler/jobs`, {
        source_ids: sourceIds,
        trigger_type: "manual"
      });
      console.log(`Triggered jobs (2nd time):`, jobRes2.job_ids);

      await delay(2000);

      // After two crawls each source is expected to hold exactly one version.
      console.log("\n[验证版本去重]");
      let dedupSuccess = true;
      for (const sid of sourceIds) {
        const docs = await get(`${BASE_URL}/api/crawler/documents?source_id=${sid}`);
        if (docs.items.length !== 1) {
          console.error(`❌ Source ${sid} has ${docs.items.length} versions, expected 1`);
          dedupSuccess = false;
        }
      }

      if (dedupSuccess) {
        console.log(`✅ 去重验证通过,没有生成新版本`);
      } else {
        process.exitCode = 1;
      }

      console.log("\n=== 测试完成 ===");
    } catch (err) {
      console.error("Test failed:", err);
      process.exitCode = 1;
    } finally {
      server.close();
      // With no argument, process.exit() uses process.exitCode when it is set.
      process.exit();
    }
  });
}
|
|
// Entry point. `run` performs synchronous setup (database init, table wipes,
// listen) inside an async function, so a setup failure arrives as a rejected
// promise; report it and exit non-zero instead of leaving the promise floating.
void run().catch((err: unknown) => {
  console.error("Test failed:", err);
  process.exit(1);
});
|
|