File size: 1,588 Bytes
34dcea4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
"""
processing.cleaner — Text normalization and deduplication hashing.

Handles:
  - Unicode normalization (NFC)
  - HTML tag stripping
  - Control character removal
  - Excessive whitespace collapse
  - Minimum length filtering
  - SHA-256 content hashing for deduplication
"""

import hashlib
import re
import unicodedata
from typing import Optional


# Matches real markup only: element tags must begin with a letter (optionally
# preceded by "/"), while declarations/comments begin with "!" and processing
# instructions with "?". A bare "<" followed by a space or digit (as in
# "x < 10 and y > 5") is deliberately NOT treated as a tag, so comparison
# expressions in prose or code snippets survive cleaning.
_HTML_TAG_RE = re.compile(r"</?[A-Za-z][^>]*>|<[!?][^>]*>")
# C0/C1 control characters, excluding tab (\x09), LF (\x0a) and CR (\x0d).
_CONTROL_CHARS_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]")
_EXCESS_NEWLINES_RE = re.compile(r"\n{3,}")
_EXCESS_SPACES_RE = re.compile(r" {3,}")


def clean_text(text: str, *, min_length: int = 30) -> Optional[str]:
    """
    Normalize and clean text for embedding.

    Steps, in order: NFC Unicode normalization, HTML tag stripping (each tag
    becomes a single space), control character removal, collapsing runs of
    3+ newlines to 2 and runs of 3+ spaces to 2, then trimming surrounding
    whitespace.

    Args:
        text: Raw input text. Empty or otherwise falsy input yields None.
        min_length: Minimum number of characters the cleaned text must have
            to be considered meaningful. Defaults to 30.

    Returns:
        The cleaned text, or None if the result is empty or shorter than
        ``min_length`` characters after cleaning.
    """
    if not text:
        return None

    # 1. Unicode normalize to NFC (compose characters)
    text = unicodedata.normalize("NFC", text)

    # 2. Strip residual HTML tags; replace with a space so adjacent words
    #    separated only by a tag do not get glued together.
    text = _HTML_TAG_RE.sub(" ", text)

    # 3. Remove control characters (tabs, newlines and carriage returns stay)
    text = _CONTROL_CHARS_RE.sub("", text)

    # 4. Collapse excessive newlines (3+ -> 2)
    text = _EXCESS_NEWLINES_RE.sub("\n\n", text)

    # 5. Collapse excessive spaces (3+ -> 2)
    text = _EXCESS_SPACES_RE.sub("  ", text)

    # 6. Strip leading/trailing whitespace
    text = text.strip()

    # 7. Minimum length gate; an empty result is never meaningful, even when
    #    the caller passes min_length=0.
    if not text or len(text) < min_length:
        return None

    return text


def compute_hash(content: str) -> str:
    """
    Return the SHA-256 hex digest of ``content`` encoded as UTF-8.

    The result is deterministic: identical strings always map to the same
    64-character lowercase hex string, which makes it usable as a
    deduplication key for content chunks.
    """
    hasher = hashlib.sha256()
    hasher.update(content.encode("utf-8"))
    return hasher.hexdigest()