File size: 2,023 Bytes
f39c319
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import axios from 'axios';
import crypto from 'crypto';
import { v4 as uuidv4 } from 'uuid';
import db from './db';
import { RawSnapshot } from './models';

/**
 * Fetches the entry page of a registered source and stores it as a raw snapshot.
 *
 * Looks up `source_id` in `source_registry`, issues a GET to the row's
 * `entry_url` (10 s timeout, desktop-browser User-Agent), hashes the body with
 * SHA-256, inserts one row into `raw_snapshot`, and returns the inserted record.
 *
 * @param job_id - Identifier of the job; stored on the snapshot unchanged.
 * @param source_id - Key of the `source_registry` row to fetch.
 * @returns The snapshot record that was written to `raw_snapshot`.
 * @throws Error `Source <id> not found` when no registry row matches.
 * @throws Error `Source <id> has no entry_url` when the row has no usable URL.
 * @throws Error `Fetch failed: <reason>` when the request fails; the underlying
 *   error is attached as `cause`. Nothing is persisted in that case.
 */
export async function fetchPage(job_id: string, source_id: string): Promise<RawSnapshot> {
  const source = db
    .prepare('SELECT * FROM source_registry WHERE source_id = ?')
    .get(source_id) as { entry_url?: unknown } | undefined;
  if (!source) {
    throw new Error(`Source ${source_id} not found`);
  }

  const url = source.entry_url;
  if (typeof url !== 'string' || url.length === 0) {
    // Fail with a precise message instead of letting the HTTP client choke on it.
    throw new Error(`Source ${source_id} has no entry_url`);
  }

  let raw_body: string;
  let content_type: string;
  let http_status: number;
  let final_url: string;

  try {
    const response = await axios.get(url, {
      timeout: 10000,
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
      }
    });

    // NOTE(review): when axios has already parsed the payload (e.g. JSON), the
    // stored body is a re-serialisation, so raw_hash may not match the bytes
    // the server actually sent. Confirm whether byte-exact capture is required.
    raw_body = typeof response.data === 'string' ? response.data : JSON.stringify(response.data);

    const ct = response.headers['content-type'] as string | string[] | undefined;
    // `||` (not `??`) on purpose: an empty header value also falls back.
    content_type = (Array.isArray(ct) ? ct[0] : ct) || 'text/html';
    http_status = response.status;
    // Use the post-redirect URL when the underlying response object exposes one.
    final_url = response.request?.res?.responseUrl || url;
  } catch (error: unknown) {
    // With axios' default `validateStatus`, non-2xx responses reject and land here.
    const message = error instanceof Error ? error.message : String(error);
    const wrapped: Error & { cause?: unknown } = new Error(`Fetch failed: ${message}`);
    // Keep the original error (status code, stack) reachable for callers.
    wrapped.cause = error;
    throw wrapped;
  }

  const raw_hash = crypto.createHash('sha256').update(raw_body).digest('hex');
  // "snap_" + first 16 hex characters of a dash-stripped v4 UUID.
  const snapshot_id = `snap_${uuidv4().replace(/-/g, '').substring(0, 16)}`;
  const fetched_at = new Date().toISOString();

  const snapshot: RawSnapshot = {
    snapshot_id,
    source_id,
    job_id,
    fetched_at,
    content_type,
    raw_body,
    raw_hash,
    http_status,
    final_url
  };

  // Named parameters (@field) are bound from the snapshot object's properties.
  const stmt = db.prepare(`
    INSERT INTO raw_snapshot (
      snapshot_id, source_id, job_id, fetched_at, content_type,
      raw_body, raw_hash, http_status, final_url
    ) VALUES (
      @snapshot_id, @source_id, @job_id, @fetched_at, @content_type,
      @raw_body, @raw_hash, @http_status, @final_url
    )
  `);

  stmt.run(snapshot);

  return snapshot;
}