import * as cheerio from 'cheerio';
import crypto from 'crypto';
import { v4 as uuidv4 } from 'uuid';
import db from './db';
import { RawSnapshot, NormalizedDocument } from './models';
/** Distance between a full-width ASCII variant (U+FF01–U+FF5E) and its half-width form. */
const FULL_WIDTH_OFFSET = 0xfee0;

/**
 * Canonicalises extracted page text so cosmetic differences do not alter its hash:
 * every whitespace run becomes a single space, full-width ASCII variants are
 * mapped to their half-width counterparts, and the result is trimmed.
 */
function normalizeText(raw: string): string {
  return raw
    .replace(/\s+/g, ' ')
    .replace(/[\uFF01-\uFF5E]/g, (char: string) =>
      String.fromCharCode(char.charCodeAt(0) - FULL_WIDTH_OFFSET),
    )
    .trim();
}

/**
 * Extracts the title and body text from a raw HTML snapshot, normalises the
 * text, and stores it as a new `normalized_document` row when its content hash
 * differs from the source's most recent active document.
 *
 * When a new version is stored, the previously active document (if any) is
 * marked `archived` in the same transaction, so a failed insert cannot leave
 * the source without an active document.
 *
 * @param snapshot - Raw snapshot; `source_id`, `snapshot_id` and `raw_body` are read.
 * @returns The newly inserted document, or `null` when the normalised content
 *   is unchanged compared with the latest active document.
 * @throws Error when `snapshot.source_id` has no row in `source_registry`.
 */
export function extractAndNormalize(snapshot: RawSnapshot): NormalizedDocument | null {
  // Only existence matters here, so avoid fetching the whole registry row.
  const source: unknown = db
    .prepare('SELECT 1 FROM source_registry WHERE source_id = ?')
    .get(snapshot.source_id);
  if (!source) {
    throw new Error(`Source ${snapshot.source_id} not found`);
  }

  const $ = cheerio.load(snapshot.raw_body);

  // Strip page chrome and non-content elements before reading any text.
  $('nav, footer, header, script, style, .share, .comments, .recommend').remove();

  // `.first()` matters: inline SVGs may carry their own <title> elements, and
  // `.text()` on the whole match set would concatenate all of them.
  const title =
    $('title').first().text().trim() ||
    $('h1').first().text().trim() ||
    'Untitled Document';

  // Simplified main-text extraction: everything left in <body>.
  const normalizedText = normalizeText($('body').text());
  const normalizedHash = crypto.createHash('sha256').update(normalizedText).digest('hex');

  // Deduplicate against the latest active document of the same source.
  const lastDoc = db
    .prepare(
      `
    SELECT doc_id, normalized_hash FROM normalized_document
    WHERE source_id = ? AND doc_status = 'active'
    ORDER BY created_at DESC LIMIT 1
  `,
    )
    .get(snapshot.source_id) as
    | Pick<NormalizedDocument, 'doc_id' | 'normalized_hash'>
    | undefined;

  if (lastDoc && lastDoc.normalized_hash === normalizedHash) {
    console.log(`Document for source ${snapshot.source_id} is unchanged.`);
    return null; // No new version
  }

  // A single clock reading keeps created_at and version_date consistent even
  // when the call straddles a UTC day boundary.
  const now = new Date().toISOString();
  // Simplified date extraction: the date part (YYYY-MM-DD) of the current timestamp.
  const versionDate = now.slice(0, 10);

  const newDoc: NormalizedDocument = {
    doc_id: `doc_${uuidv4().replace(/-/g, '').substring(0, 16)}`,
    source_id: snapshot.source_id,
    snapshot_id: snapshot.snapshot_id,
    title,
    version_date: versionDate,
    effective_date: versionDate,
    normalized_text: normalizedText,
    normalized_hash: normalizedHash,
    doc_status: 'active',
    created_at: now,
  };

  // Archive the old version and insert the new one atomically; if either
  // statement throws, the transaction is rolled back and the error propagates.
  const persistNewVersion = db.transaction(() => {
    if (lastDoc) {
      db.prepare(`UPDATE normalized_document SET doc_status = 'archived' WHERE doc_id = ?`).run(
        lastDoc.doc_id,
      );
    }
    db.prepare(
      `
    INSERT INTO normalized_document (
      doc_id, source_id, snapshot_id, title, version_date, effective_date,
      normalized_text, normalized_hash, doc_status, created_at
    ) VALUES (
      @doc_id, @source_id, @snapshot_id, @title, @version_date, @effective_date,
      @normalized_text, @normalized_hash, @doc_status, @created_at
    )
  `,
    ).run(newDoc);
  });
  persistNewVersion();

  return newDoc;
}
|