// agent01/src/crawler/fetcher.ts
// Auto Deployer — Deploy compliance agent services (f39c319)
import axios from 'axios';
import crypto from 'crypto';
import { v4 as uuidv4 } from 'uuid';
import db from './db';
import { RawSnapshot } from './models';
/** Request timeout for a single page download, in milliseconds. */
const FETCH_TIMEOUT_MS = 10000;

/** User-Agent header sent with every page download. */
const FETCH_USER_AGENT =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';

/** Response fields captured from a successful download. */
interface DownloadedPage {
  raw_body: string;
  content_type: string;
  http_status: number;
  final_url: string;
}

/**
 * Issues the GET request for `url` and extracts the fields stored on a snapshot.
 *
 * @throws Error `Fetch failed: <reason>` for any failure raised while requesting
 *   or reading the response.
 */
async function downloadPage(url: string): Promise<DownloadedPage> {
  try {
    const response = await axios.get(url, {
      timeout: FETCH_TIMEOUT_MS,
      // Ask for the body as text and bypass axios' default response transform,
      // which JSON-parses bodies. Without this, raw_body (and therefore raw_hash)
      // would be built from a re-serialised copy instead of the body served.
      responseType: 'text',
      transformResponse: [(body: unknown) => body],
      headers: { 'User-Agent': FETCH_USER_AGENT }
    });

    const data: unknown = response.data;
    // Defensive fallback: a non-string payload is serialised; `?? ''` covers the
    // case where JSON.stringify yields undefined at runtime.
    const raw_body = typeof data === 'string' ? data : JSON.stringify(data) ?? '';

    const ct = response.headers['content-type'] as string | string[] | undefined;

    return {
      raw_body,
      // Missing or empty Content-Type falls back to text/html.
      content_type: (Array.isArray(ct) ? ct[0] : ct) || 'text/html',
      http_status: response.status,
      // After redirects the underlying Node response carries the final URL;
      // fall back to the requested URL when it is unavailable.
      final_url: response.request?.res?.responseUrl || url
    };
  } catch (error: unknown) {
    // Narrow before reading `.message`; rejections are not guaranteed to be Errors.
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Fetch failed: ${reason}`);
  }
}

/**
 * Fetches the entry URL registered for a source and persists the response as a
 * raw snapshot.
 *
 * Looks up `source_id` in `source_registry`, downloads its `entry_url`, hashes
 * the body with SHA-256, inserts a row into `raw_snapshot`, and returns the
 * inserted record. Nothing is written when the lookup or the download fails.
 *
 * @param job_id - Identifier of the job this fetch belongs to; stored on the snapshot as-is.
 * @param source_id - Key of the `source_registry` row whose `entry_url` is fetched.
 * @returns The snapshot that was written to `raw_snapshot`.
 * @throws Error `Source <id> not found` when no registry row matches.
 * @throws Error `Source <id> has no entry_url` when the row lacks a usable URL.
 * @throws Error `Fetch failed: <reason>` when the request fails (network error,
 *   timeout, or a status axios rejects — non-2xx by default).
 *   Errors raised by the database calls propagate unchanged.
 */
export async function fetchPage(job_id: string, source_id: string): Promise<RawSnapshot> {
  const source = db
    .prepare('SELECT * FROM source_registry WHERE source_id = ?')
    .get(source_id) as { entry_url?: unknown } | undefined;
  if (!source) {
    throw new Error(`Source ${source_id} not found`);
  }

  const url = source.entry_url;
  if (typeof url !== 'string' || url.length === 0) {
    throw new Error(`Source ${source_id} has no entry_url`);
  }

  const page = await downloadPage(url);

  const snapshot: RawSnapshot = {
    // 16 hex characters taken from a v4 UUID with the dashes removed.
    snapshot_id: `snap_${uuidv4().replace(/-/g, '').substring(0, 16)}`,
    source_id,
    job_id,
    fetched_at: new Date().toISOString(),
    content_type: page.content_type,
    raw_body: page.raw_body,
    raw_hash: crypto.createHash('sha256').update(page.raw_body).digest('hex'),
    http_status: page.http_status,
    final_url: page.final_url
  };

  db.prepare(`
    INSERT INTO raw_snapshot (
      snapshot_id, source_id, job_id, fetched_at, content_type,
      raw_body, raw_hash, http_status, final_url
    ) VALUES (
      @snapshot_id, @source_id, @job_id, @fetched_at, @content_type,
      @raw_body, @raw_hash, @http_status, @final_url
    )
  `).run(snapshot);

  return snapshot;
}