File size: 7,593 Bytes
f39c319
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
import axios from 'axios';
import express from 'express';
import { Server } from 'http';
import app from './src/index';
import db, { initDB } from './src/crawler/db';

// Port the app under test is started on by this script.
const PORT = 3005;
// Base URL that every HTTP request in this script is sent to.
const BASE_URL = `http://localhost:${PORT}`;

// Stub axios.get so the test is stable and does not rely on the external network.
// A reference to the real implementation is kept in originalGet.
const originalGet = axios.get;

// Ordered [URL fragment, page text] pairs; the first fragment contained in the URL wins.
const cannedPages: ReadonlyArray<readonly [string, string]> = [
  ['icbc', '工行隐私政策正文。包含SDK和敏感权限说明。'],
  ['boc', '中行隐私政策正文。收集您的定位权限。'],
  ['ccb', '建行隐私政策。未成年人保护条款。'],
  ['cac.gov.cn', '网信办通报。关于15款App和16款SDK个人信息收集使用问题。'],
  ['nfra.gov.cn', '金监总局数据安全管理办法。数据出境要求。'],
  ['jiguang', '极光SDK合规指引。第三方共享数据说明。'],
];
// Text served for any URL that matches none of the fragments above.
const fallbackPage = '默认正文内容。包含SDK和权限。';

(axios as any).get = async (url: string, config?: any) => {
  const matched = cannedPages.find(([fragment]) => url.includes(fragment));
  const content = matched === undefined ? fallbackPage : matched[1];

  // Wrap the text in a minimal HTML page, shaped like an axios response.
  return {
    status: 200,
    headers: { 'content-type': 'text/html' },
    data: `<html><body><h1>标题</h1><p>${content}</p></body></html>`
  };
};

// Sources registered through the API at the start of the test. Each entry lists only the
// fields that differ between sources; parser_type, priority and enabled are the same for
// all of them and are filled in by the map step.
const seedSources = [
  {
    source_name: "中国工商银行(工银融e行个人信息保护政策)",
    source_type: "peer_bank",
    domain: "m.icbc.com.cn",
    entry_url: "https://m.icbc.com.cn/ICBC/disclaimer/2.htm",
    url_pattern: "disclaimer",
    crawl_frequency: "daily",
    topic_tags: ["privacy_policy"]
  },
  {
    source_name: "中国银行(手机银行隐私政策)",
    source_type: "peer_bank",
    domain: "ebsnew.boc.cn",
    entry_url: "https://ebsnew.boc.cn/bocphone/VuePhone/tools/privacyPolicy/privacyPolicyA.html",
    url_pattern: "privacyPolicy",
    crawl_frequency: "daily",
    topic_tags: ["privacy_policy"]
  },
  {
    source_name: "中国建设银行(隐私协议)",
    source_type: "peer_bank",
    domain: "ccb.com",
    entry_url: "https://ccb.com/chn/mycom/register_xy_secret.shtml",
    url_pattern: "register_xy_secret",
    crawl_frequency: "daily",
    topic_tags: ["privacy_policy"]
  },
  {
    source_name: "关于15款App和16款SDK个人信息收集使用问题的通报",
    source_type: "regulator",
    domain: "www.cac.gov.cn",
    entry_url: "https://www.cac.gov.cn/2025-05/06/c_1748239411359045.htm",
    url_pattern: "cac",
    crawl_frequency: "4h",
    topic_tags: ["regulatory_update"]
  },
  {
    source_name: "银行保险机构数据安全管理办法",
    source_type: "regulator",
    domain: "www.nfra.gov.cn",
    entry_url: "https://www.nfra.gov.cn/cn/view/pages/ItemDetail.html?docId=1192308&generaltype=0&itemId=926",
    url_pattern: "nfra",
    crawl_frequency: "4h",
    topic_tags: ["regulatory_update"]
  },
  {
    source_name: "极光 SDK 产品合规指引说明",
    source_type: "sdk_vendor",
    domain: "docs.jiguang.cn",
    entry_url: "https://docs.jiguang.cn/compliance_guide/sdk_compliance_guide/sdk_compliance_guide",
    url_pattern: "jiguang",
    crawl_frequency: "daily",
    topic_tags: ["sdk_compliance"]
  }
].map(({ source_name, source_type, domain, entry_url, url_pattern, crawl_frequency, topic_tags }) => ({
  // Key order is deliberate: it fixes the field order of the JSON body sent to the API.
  source_name,
  source_type,
  domain,
  entry_url,
  url_pattern,
  parser_type: "html_main_content",
  crawl_frequency,
  priority: "high",
  enabled: true,
  topic_tags
}));

/**
 * Sends `data` as a JSON body in a POST request and returns the parsed JSON response.
 *
 * The body is read as text first and parsed afterwards, so a non-JSON response
 * (for example an HTML error page or an empty body) is reported with the URL and
 * HTTP status instead of surfacing as a bare `SyntaxError`.
 *
 * @param url  Absolute URL to post to.
 * @param data Value serialized with `JSON.stringify` as the request body.
 * @returns The parsed JSON response body.
 * @throws Error when the response status is not 2xx, or when a 2xx body is not valid JSON.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- callers read ad-hoc fields off the untyped API response
async function post(url: string, data: unknown): Promise<any> {
  const res = await fetch(url, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(data)
  });

  const raw = await res.text();
  let parsed: unknown;
  let isJson = true;
  try {
    parsed = JSON.parse(raw);
  } catch {
    isJson = false;
  }

  if (!res.ok) {
    // Prefer the structured error payload; fall back to the (truncated) raw body.
    const detail = isJson ? JSON.stringify(parsed) : raw.slice(0, 500);
    throw new Error(`POST ${url} failed (HTTP ${res.status}): ${detail}`);
  }
  if (!isJson) {
    throw new Error(`POST ${url} returned a non-JSON body (HTTP ${res.status}): ${raw.slice(0, 500)}`);
  }
  return parsed;
}

/**
 * Issues a GET request and returns the parsed JSON response.
 *
 * The body is read as text first and parsed afterwards, so a non-JSON response
 * (for example an HTML error page or an empty body) is reported with the URL and
 * HTTP status instead of surfacing as a bare `SyntaxError`.
 *
 * @param url Absolute URL to fetch.
 * @returns The parsed JSON response body.
 * @throws Error when the response status is not 2xx, or when a 2xx body is not valid JSON.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- callers read ad-hoc fields off the untyped API response
async function get(url: string): Promise<any> {
  const res = await fetch(url);

  const raw = await res.text();
  let parsed: unknown;
  let isJson = true;
  try {
    parsed = JSON.parse(raw);
  } catch {
    isJson = false;
  }

  if (!res.ok) {
    // Prefer the structured error payload; fall back to the (truncated) raw body.
    const detail = isJson ? JSON.stringify(parsed) : raw.slice(0, 500);
    throw new Error(`GET ${url} failed (HTTP ${res.status}): ${detail}`);
  }
  if (!isJson) {
    throw new Error(`GET ${url} returned a non-JSON body (HTTP ${res.status}): ${raw.slice(0, 500)}`);
  }
  return parsed;
}

/**
 * Returns a promise that settles once a `setTimeout` of `ms` milliseconds fires.
 *
 * @param ms Timeout passed to `setTimeout`, in milliseconds.
 */
function delay(ms: number) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}

/** HTTP server hosting the app under test; assigned by `run` when it calls `app.listen`. */
let server: Server | undefined;

/**
 * End-to-end smoke test of the crawler HTTP API.
 *
 * Steps:
 *  1. initialise the DB, empty the crawler tables and start the app on `PORT`;
 *  2. register every entry of `seedSources`;
 *  3. trigger a manual crawl for all registered sources;
 *  4. wait, print each job's status and the structured updates;
 *  5. trigger a second crawl and check that each source still has exactly one document.
 *
 * Any thrown error or failed check sets `process.exitCode` to 1. The function always
 * ends by closing the server and calling `process.exit()`, which exits with
 * `process.exitCode` when no explicit code is passed.
 */
async function run(): Promise<void> {
  try {
    console.log("=== 1. 启动 Express app ===");
    initDB(); // ensure DB is initialized

    // Clear tables for clean test. The deletion order is significant and kept as-is
    // (NOTE(review): presumably child tables before parents because of foreign keys; confirm against the schema).
    const tablesToClear = [
      'diff_event',
      'clause_chunk',
      'normalized_document',
      'raw_snapshot',
      'crawl_job',
      'source_registry',
    ];
    for (const table of tablesToClear) {
      db.prepare(`DELETE FROM ${table}`).run();
    }

    // Wait until the port is bound; a bind failure (e.g. the port is already in use)
    // rejects here and is reported as a test failure instead of an uncaught 'error' event.
    await new Promise<void>((resolve, reject) => {
      const httpServer = app.listen(PORT, () => resolve());
      httpServer.once('error', reject);
      server = httpServer;
    });
    console.log(`Server started on ${PORT}`);

    console.log("\n=== 2. 插入白名单种子来源 ===");
    const sourceIds: string[] = [];
    for (const src of seedSources) {
      const res = await post(`${BASE_URL}/api/crawler/sources`, src);
      sourceIds.push(res.source_id);
      console.log(`Inserted ${src.source_name} -> ${res.source_id}`);
    }

    console.log("\n=== 3. 手动触发抓取任务 ===");
    const jobRes = await post(`${BASE_URL}/api/crawler/jobs`, {
      source_ids: sourceIds,
      trigger_type: "manual"
    });
    console.log(`Triggered jobs:`, jobRes.job_ids);

    console.log("\n=== 4. 等待抓取完成,查询结构化更新 ===");
    // Wait for async jobs to complete
    await delay(2000);

    // Check job status
    for (const jid of jobRes.job_ids) {
      const j = await get(`${BASE_URL}/api/crawler/jobs/${jid}`);
      console.log(`Job ${jid} status: ${j.status}`);
    }

    const updates = await get(`${BASE_URL}/api/crawler/updates`);
    console.log("\n[结构化更新输出]");
    console.log(JSON.stringify(updates, null, 2));

    const peerCount = updates.peer_updates?.length ?? 0;
    const regCount = updates.regulatory_updates?.length ?? 0;
    const sdkCount = updates.sdk_updates?.length ?? 0;

    if (peerCount > 0 || regCount > 0 || sdkCount > 0) {
      console.log(`✅ 成功输出结构化更新: ${peerCount} 同业, ${regCount} 监管, ${sdkCount} SDK`);
    } else {
      console.error(`❌ 未输出任何结构化更新`);
      // A failed check must fail the run, not just be logged.
      process.exitCode = 1;
    }

    console.log("\n=== 5. 再次触发抓取,验证去重机制 ===");
    const jobRes2 = await post(`${BASE_URL}/api/crawler/jobs`, {
      source_ids: sourceIds,
      trigger_type: "manual"
    });
    console.log(`Triggered jobs (2nd time):`, jobRes2.job_ids);

    await delay(2000);

    // Check how many documents exist per source
    console.log("\n[验证版本去重]");
    let dedupSuccess = true;
    for (const sid of sourceIds) {
      const docs = await get(`${BASE_URL}/api/crawler/documents?source_id=${sid}`);
      if (docs.items.length !== 1) {
        console.error(`❌ Source ${sid} has ${docs.items.length} versions, expected 1`);
        dedupSuccess = false;
      }
    }

    if (dedupSuccess) {
      console.log(`✅ 去重验证通过,没有生成新版本`);
    } else {
      process.exitCode = 1;
    }

    console.log("\n=== 测试完成 ===");
  } catch (err) {
    console.error("Test failed:", err);
    process.exitCode = 1;
  } finally {
    // The server is undefined when setup failed before app.listen was reached.
    server?.close();
    // Exit explicitly so lingering handles cannot keep the script alive.
    process.exit();
  }
}

// run() handles its own errors and never rejects; `void` marks the promise as intentionally not awaited.
void run();