import axios from 'axios';
import crypto from 'crypto';
import { v4 as uuidv4 } from 'uuid';
import db from './db';
import { RawSnapshot } from './models';

/** Request timeout applied to every source fetch, in milliseconds. */
const FETCH_TIMEOUT_MS = 10000;

/** Browser-like User-Agent header sent with every source fetch. */
const FETCH_USER_AGENT =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';

/**
 * Fetches the entry URL registered for a source, stores the response as a row
 * in `raw_snapshot`, and returns the stored snapshot.
 *
 * The body is saved as text: string responses are kept as-is, anything else
 * (e.g. a payload axios has already parsed as JSON) is re-serialised with
 * `JSON.stringify`, so `raw_hash` is the SHA-256 of that text rather than of
 * the bytes on the wire.
 *
 * @param job_id - Identifier of the job this fetch belongs to; stored on the snapshot.
 * @param source_id - Key into `source_registry` used to look up `entry_url`.
 * @returns The snapshot that was inserted into `raw_snapshot`.
 * @throws Error `Source <id> not found` when no registry row matches `source_id`.
 * @throws Error `Fetch failed: <reason>` when the request (or reading its
 *   response) fails; nothing is written to the database in that case.
 */
export async function fetchPage(job_id: string, source_id: string): Promise<RawSnapshot> {
  // Only `entry_url` is read from the row, so that is the only field typed here.
  const source = db
    .prepare('SELECT * FROM source_registry WHERE source_id = ?')
    .get(source_id) as { entry_url: string } | undefined;

  if (!source) {
    throw new Error(`Source ${source_id} not found`);
  }

  const url = source.entry_url;
  let raw_body = '';
  let content_type = '';
  let http_status = 0;
  let final_url = url;

  try {
    const response = await axios.get(url, {
      timeout: FETCH_TIMEOUT_MS,
      headers: { 'User-Agent': FETCH_USER_AGENT },
    });

    // JSON.stringify(undefined) evaluates to undefined; fall back to an empty
    // body so the hash step below always receives a string.
    raw_body =
      typeof response.data === 'string'
        ? response.data
        : (JSON.stringify(response.data) ?? '');

    const ct = response.headers['content-type'] as string | string[] | undefined;
    // A missing or empty header is recorded as 'text/html'.
    content_type = (Array.isArray(ct) ? ct[0] : ct) || 'text/html';
    http_status = response.status;
    // NOTE(review): `responseUrl` is presumably the post-redirect URL exposed by
    // the Node HTTP adapter; when absent the requested URL is recorded instead.
    final_url = response.request?.res?.responseUrl || url;
  } catch (error: unknown) {
    // No snapshot is written for a failed fetch, so there is no status to
    // record here; surface the failure reason to the caller.
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Fetch failed: ${reason}`);
  }

  const raw_hash = crypto.createHash('sha256').update(raw_body).digest('hex');
  // Snapshot ids are `snap_` followed by the first 16 hex characters of a v4 UUID.
  const snapshot_id = `snap_${uuidv4().replace(/-/g, '').substring(0, 16)}`;
  const fetched_at = new Date().toISOString();

  const snapshot: RawSnapshot = {
    snapshot_id,
    source_id,
    job_id,
    fetched_at,
    content_type,
    raw_body,
    raw_hash,
    http_status,
    final_url,
  };

  // Named parameters (@field) are bound from the snapshot object's properties.
  const stmt = db.prepare(`
    INSERT INTO raw_snapshot (
      snapshot_id, source_id, job_id, fetched_at, content_type,
      raw_body, raw_hash, http_status, final_url
    ) VALUES (
      @snapshot_id, @source_id, @job_id, @fetched_at, @content_type,
      @raw_body, @raw_hash, @http_status, @final_url
    )
  `);
  stmt.run(snapshot);

  return snapshot;
}