import * as cheerio from 'cheerio';
import crypto from 'crypto';
import { v4 as uuidv4 } from 'uuid';
import db from './db';
import { RawSnapshot, NormalizedDocument } from './models';

/**
 * Distance between a full-width ASCII variant (U+FF01–U+FF5E) and its
 * half-width counterpart (U+0021–U+007E).
 */
const FULL_WIDTH_OFFSET = 0xfee0;

/**
 * Canonicalizes extracted page text so that cosmetic differences do not
 * change the content hash.
 *
 * 1. Every run of whitespace is folded into a single ASCII space.
 * 2. Full-width ASCII variants are mapped to their half-width forms.
 * 3. Leading and trailing whitespace is trimmed.
 */
function normalizeText(rawText: string): string {
  return rawText
    .replace(/\s+/g, ' ')
    .replace(/[\uFF01-\uFF5E]/g, (char: string) =>
      String.fromCharCode(char.charCodeAt(0) - FULL_WIDTH_OFFSET),
    )
    .trim();
}

/**
 * Extracts the title and main text from a raw HTML snapshot, normalizes the
 * text, and stores it as a new `normalized_document` version when its hash
 * differs from the source's current active document.
 *
 * @param snapshot - Raw snapshot to process; `source_id`, `snapshot_id` and
 *   `raw_body` are read.
 * @returns The newly inserted document, or `null` when the normalized content
 *   is identical to the latest active document for the same source.
 * @throws Error when `snapshot.source_id` has no row in `source_registry`.
 */
export function extractAndNormalize(snapshot: RawSnapshot): NormalizedDocument | null {
  // Only existence matters here, so avoid fetching the whole registry row.
  const source = db
    .prepare('SELECT 1 AS found FROM source_registry WHERE source_id = ?')
    .get(snapshot.source_id);
  if (!source) {
    throw new Error(`Source ${snapshot.source_id} not found`);
  }

  // Basic HTML extraction using cheerio.
  const $ = cheerio.load(snapshot.raw_body);

  // Remove navigation, boilerplate and non-content elements.
  $('nav, footer, header, script, style, .share, .comments, .recommend').remove();

  // Take only the FIRST <title>: `.text()` on the full match set concatenates
  // every <title> in the document, including those inside inline SVGs.
  let title = $('title').first().text().trim();
  if (!title) {
    title = $('h1').first().text().trim() || 'Untitled Document';
  }

  // Extract main text (simplified version) and canonicalize it.
  const normalizedText = normalizeText($('body').text());
  const normalizedHash = crypto.createHash('sha256').update(normalizedText).digest('hex');

  // Deduplication: compare against the most recent active document of this
  // source. Only the id and hash are needed, so the stored text is not loaded.
  const lastDoc = db
    .prepare(`
      SELECT doc_id, normalized_hash FROM normalized_document
      WHERE source_id = ? AND doc_status = 'active'
      ORDER BY created_at DESC LIMIT 1
    `)
    .get(snapshot.source_id) as
    | Pick<NormalizedDocument, 'doc_id' | 'normalized_hash'>
    | undefined;

  if (lastDoc && lastDoc.normalized_hash === normalizedHash) {
    console.log(`Document for source ${snapshot.source_id} is unchanged.`);
    return null; // No new version
  }

  // Read the clock once so created_at and the date fields cannot disagree
  // when the call straddles a UTC midnight.
  const now = new Date().toISOString();
  // Simplified date extraction, defaulting to today (YYYY-MM-DD, UTC).
  const versionDate = now.slice(0, 10);

  const newDoc: NormalizedDocument = {
    doc_id: `doc_${uuidv4().replace(/-/g, '').substring(0, 16)}`,
    source_id: snapshot.source_id,
    snapshot_id: snapshot.snapshot_id,
    title,
    version_date: versionDate,
    effective_date: versionDate,
    normalized_text: normalizedText,
    normalized_hash: normalizedHash,
    doc_status: 'active',
    created_at: now,
  };

  // Archive the previous version and insert the new one atomically, so a
  // failed INSERT cannot leave the source without any active document.
  // NOTE(review): assumes `db` is a better-sqlite3 Database (the
  // prepare/get/run usage matches that API) — confirm against ./db.
  const persistNewVersion = db.transaction(() => {
    if (lastDoc) {
      db.prepare(`UPDATE normalized_document SET doc_status = 'archived' WHERE doc_id = ?`).run(
        lastDoc.doc_id,
      );
    }

    db.prepare(`
      INSERT INTO normalized_document (
        doc_id, source_id, snapshot_id, title, version_date, effective_date,
        normalized_text, normalized_hash, doc_status, created_at
      ) VALUES (
        @doc_id, @source_id, @snapshot_id, @title, @version_date, @effective_date,
        @normalized_text, @normalized_hash, @doc_status, @created_at
      )
    `).run(newDoc);
  });
  persistNewVersion();

  return newDoc;
}