File size: 3,032 Bytes
f39c319
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a273844
f39c319
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import * as cheerio from 'cheerio';
import crypto from 'crypto';
import { v4 as uuidv4 } from 'uuid';
import db from './db';
import { RawSnapshot, NormalizedDocument } from './models';

/**
 * Extracts the text of a raw HTML snapshot, normalizes it, and persists it as a
 * new `normalized_document` row when it differs from the source's most recent
 * active document.
 *
 * Normalization folds every whitespace run into a single space, maps the
 * full-width ASCII range (U+FF01–U+FF5E) to its half-width equivalent, and trims
 * the result. The SHA-256 hex digest of that text is used for change detection.
 *
 * @param snapshot - Raw snapshot whose `raw_body` is parsed as HTML.
 * @returns The newly inserted document, or `null` when the normalized text hashes
 *   to the same value as the latest active document for the source.
 * @throws Error when `snapshot.source_id` has no row in `source_registry`.
 */
export function extractAndNormalize(snapshot: RawSnapshot): NormalizedDocument | null {
  const source = db
    .prepare('SELECT * FROM source_registry WHERE source_id = ?')
    .get(snapshot.source_id);
  if (!source) {
    throw new Error(`Source ${snapshot.source_id} not found`);
  }

  const $ = cheerio.load(snapshot.raw_body);

  // Drop page chrome and non-content elements before reading any text.
  $('nav, footer, header, script, style, .share, .comments, .recommend').remove();

  // `.first()` matters: `.text()` on a multi-element selection concatenates the
  // text of every match, so an inline SVG <title> would otherwise be appended to
  // the page title.
  let title = $('title').first().text().trim();
  if (!title) {
    title = $('h1').first().text().trim() || 'Untitled Document';
  }

  // Simplified main-text extraction: everything left in <body>.
  const normalizedText = $('body')
    .text()
    .replace(/\s+/g, ' ')
    .replace(/[\uFF01-\uFF5E]/g, (char: string) => String.fromCharCode(char.charCodeAt(0) - 0xFEE0))
    .trim();

  const normalizedHash = crypto.createHash('sha256').update(normalizedText).digest('hex');

  // Deduplicate against the newest active document of the same source.
  const lastDoc = db.prepare(`
    SELECT * FROM normalized_document 
    WHERE source_id = ? AND doc_status = 'active' 
    ORDER BY created_at DESC LIMIT 1
  `).get(snapshot.source_id) as { doc_id: string; normalized_hash: string } | undefined;

  if (lastDoc && lastDoc.normalized_hash === normalizedHash) {
    console.log(`Document for source ${snapshot.source_id} is unchanged.`);
    return null; // No new version
  }

  const doc_id = `doc_${uuidv4().replace(/-/g, '').substring(0, 16)}`;

  // One timestamp feeds both fields so `created_at` and the date columns cannot
  // disagree when the call straddles midnight (UTC).
  const now = new Date().toISOString();
  // Simplified date extraction: defaults to today's date (YYYY-MM-DD).
  const versionDate = now.slice(0, 10);

  const newDoc: NormalizedDocument = {
    doc_id,
    source_id: snapshot.source_id,
    snapshot_id: snapshot.snapshot_id,
    title,
    version_date: versionDate,
    effective_date: versionDate,
    normalized_text: normalizedText,
    normalized_hash: normalizedHash,
    doc_status: 'active',
    created_at: now
  };

  const archiveStmt = db.prepare(
    `UPDATE normalized_document SET doc_status = 'archived' WHERE doc_id = ?`
  );
  const insertStmt = db.prepare(`
    INSERT INTO normalized_document (
      doc_id, source_id, snapshot_id, title, version_date, effective_date,
      normalized_text, normalized_hash, doc_status, created_at
    ) VALUES (
      @doc_id, @source_id, @snapshot_id, @title, @version_date, @effective_date,
      @normalized_text, @normalized_hash, @doc_status, @created_at
    )
  `);

  // Archive the previous version and insert the new one atomically: if the insert
  // fails, the archive is rolled back and the source keeps its active document.
  const persistNewVersion = db.transaction(() => {
    if (lastDoc) {
      archiveStmt.run(lastDoc.doc_id);
    }
    insertStmt.run(newDoc);
  });
  persistNewVersion();

  return newDoc;
}