| import * as cheerio from 'cheerio'; |
| import crypto from 'crypto'; |
| import { v4 as uuidv4 } from 'uuid'; |
| import db from './db'; |
| import { RawSnapshot, NormalizedDocument } from './models'; |
|
|
/**
 * Extracts the visible text of a raw HTML snapshot, normalizes it, and stores
 * it as the new active `normalized_document` row when its content differs
 * from the source's latest active document.
 *
 * Steps:
 *  1. Verify the snapshot's source exists in `source_registry`.
 *  2. Strip navigation/boilerplate elements and take the `<body>` text.
 *  3. Collapse whitespace and fold full-width ASCII variants to ASCII.
 *  4. Hash the normalized text (SHA-256) and compare it with the latest
 *     active document of the same source.
 *  5. If changed, archive the previous document and insert the new one in a
 *     single transaction.
 *
 * @param snapshot - Raw snapshot whose `raw_body` holds the HTML to process.
 * @returns The newly inserted document, or `null` when the normalized content
 *          is identical to the latest active document.
 * @throws Error if `snapshot.source_id` is not present in `source_registry`.
 */
export function extractAndNormalize(snapshot: RawSnapshot): NormalizedDocument | null {
  // Only existence matters here, so avoid loading the whole registry row.
  const source = db
    .prepare('SELECT 1 FROM source_registry WHERE source_id = ?')
    .get(snapshot.source_id);
  if (!source) {
    throw new Error(`Source ${snapshot.source_id} not found`);
  }

  const $ = cheerio.load(snapshot.raw_body);

  // Drop page chrome and non-content elements before extracting any text.
  $('nav, footer, header, script, style, .share, .comments, .recommend').remove();

  // `.first()` keeps additional <title> elements (e.g. inside inline SVG)
  // from being concatenated into the document title.
  const title =
    $('title').first().text().trim() ||
    $('h1').first().text().trim() ||
    'Untitled Document';

  // NOTE: the hash below is derived from this exact pipeline. Changing any
  // step alters the hash of unchanged pages and makes them look modified.
  const normalizedText = $('body')
    .text()
    .replace(/\s+/g, ' ')
    // Map full-width forms U+FF01..U+FF5E onto their ASCII counterparts.
    .replace(/[\uFF01-\uFF5E]/g, (char: string) =>
      String.fromCharCode(char.charCodeAt(0) - 0xfee0),
    )
    .trim();

  const normalizedHash = crypto.createHash('sha256').update(normalizedText).digest('hex');

  // Only the id and hash are needed; skip the potentially large text column.
  const lastDoc = db
    .prepare(
      `
      SELECT doc_id, normalized_hash FROM normalized_document
      WHERE source_id = ? AND doc_status = 'active'
      ORDER BY created_at DESC LIMIT 1
    `,
    )
    .get(snapshot.source_id) as
    | Pick<NormalizedDocument, 'doc_id' | 'normalized_hash'>
    | undefined;

  if (lastDoc && lastDoc.normalized_hash === normalizedHash) {
    console.log(`Document for source ${snapshot.source_id} is unchanged.`);
    return null;
  }

  const doc_id = `doc_${uuidv4().replace(/-/g, '').substring(0, 16)}`;

  // Derive both values from one timestamp so created_at and version_date can
  // never land on different calendar days.
  const now = new Date().toISOString();
  const versionDate = now.slice(0, 10); // YYYY-MM-DD part of the ISO string

  const newDoc: NormalizedDocument = {
    doc_id,
    source_id: snapshot.source_id,
    snapshot_id: snapshot.snapshot_id,
    title,
    version_date: versionDate,
    effective_date: versionDate,
    normalized_text: normalizedText,
    normalized_hash: normalizedHash,
    doc_status: 'active',
    created_at: now,
  };

  // Archive + insert must succeed or fail together; otherwise a failed insert
  // would leave the source without any active document.
  const persist = db.transaction(() => {
    if (lastDoc) {
      db.prepare(`UPDATE normalized_document SET doc_status = 'archived' WHERE doc_id = ?`).run(
        lastDoc.doc_id,
      );
    }

    db.prepare(
      `
      INSERT INTO normalized_document (
        doc_id, source_id, snapshot_id, title, version_date, effective_date,
        normalized_text, normalized_hash, doc_status, created_at
      ) VALUES (
        @doc_id, @source_id, @snapshot_id, @title, @version_date, @effective_date,
        @normalized_text, @normalized_hash, @doc_status, @created_at
      )
    `,
    ).run(newDoc);
  });
  persist();

  return newDoc;
}
|
|