| import axios from 'axios'; |
| import crypto from 'crypto'; |
| import { v4 as uuidv4 } from 'uuid'; |
| import db from './db'; |
| import { RawSnapshot } from './models'; |
|
|
/**
 * Fetches the entry URL registered for a source and stores the response as a raw snapshot.
 *
 * Looks the source up in `source_registry`, issues a GET request (10 s timeout,
 * browser-like User-Agent), hashes the response body with SHA-256, inserts one
 * row into `raw_snapshot`, and returns the inserted record.
 *
 * @param job_id - Job identifier, stored on the snapshot row as-is.
 * @param source_id - Key used to look up the source in `source_registry`.
 * @returns The snapshot record that was inserted.
 * @throws Error when the source row does not exist, when it has no usable
 *   `entry_url`, or when the request fails. Fetch failures keep the
 *   `Fetch failed: …` message and additionally expose `http_status` and
 *   `cause` properties on the thrown error.
 */
export async function fetchPage(job_id: string, source_id: string): Promise<RawSnapshot> {
  const source = db
    .prepare('SELECT * FROM source_registry WHERE source_id = ?')
    .get(source_id) as { entry_url?: unknown } | undefined;
  if (!source) {
    throw new Error(`Source ${source_id} not found`);
  }

  // Fail early with a clear message instead of handing a null/empty URL to axios.
  const url = source.entry_url;
  if (typeof url !== 'string' || url.length === 0) {
    throw new Error(`Source ${source_id} has no entry_url`);
  }

  let raw_body = '';
  let content_type = '';
  let http_status = 0;
  let final_url = url;

  try {
    const response = await axios.get(url, {
      timeout: 10000,
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
      }
    });

    // Non-string payloads are re-serialised so the stored body is always text.
    // JSON.stringify yields undefined for an undefined payload, which would make
    // the hash step throw, so fall back to an empty body.
    const data: unknown = response.data;
    raw_body = typeof data === 'string' ? data : (JSON.stringify(data) ?? '');

    const ct = response.headers['content-type'] as string | string[] | undefined;
    content_type = (Array.isArray(ct) ? ct[0] : ct) || 'text/html';
    http_status = response.status;
    // Use the post-redirect URL when the underlying response object exposes one.
    final_url = response.request?.res?.responseUrl || url;
  } catch (error: unknown) {
    // Narrow manually: anything can be thrown, and only some errors carry a response.
    const status =
      typeof error === 'object' && error !== null
        ? (error as { response?: { status?: unknown } }).response?.status
        : undefined;
    // 500 is the fallback when no response status is available (e.g. timeout).
    const failed_status = typeof status === 'number' && status !== 0 ? status : 500;
    const message = error instanceof Error ? error.message : String(error);

    // Message text is unchanged; status and original error ride along as properties
    // so callers can inspect them instead of the status being silently discarded.
    throw Object.assign(new Error(`Fetch failed: ${message}`), {
      http_status: failed_status,
      cause: error
    });
  }

  const raw_hash = crypto.createHash('sha256').update(raw_body).digest('hex');
  // Snapshot id: "snap_" followed by the first 16 hex characters of a v4 UUID.
  const snapshot_id = `snap_${uuidv4().replace(/-/g, '').substring(0, 16)}`;
  const fetched_at = new Date().toISOString();

  const snapshot: RawSnapshot = {
    snapshot_id,
    source_id,
    job_id,
    fetched_at,
    content_type,
    raw_body,
    raw_hash,
    http_status,
    final_url
  };

  // Named parameters (@name) are bound from the snapshot object's properties.
  const stmt = db.prepare(`
    INSERT INTO raw_snapshot (
      snapshot_id, source_id, job_id, fetched_at, content_type,
      raw_body, raw_hash, http_status, final_url
    ) VALUES (
      @snapshot_id, @source_id, @job_id, @fetched_at, @content_type,
      @raw_body, @raw_hash, @http_status, @final_url
    )
  `);

  stmt.run(snapshot);

  return snapshot;
}
|
|