| """ |
| processing.cleaner — Text normalization and deduplication hashing. |
| |
| Handles: |
| - Unicode normalization (NFC) |
| - HTML tag stripping |
| - Control character removal |
| - Excessive whitespace collapse |
| - Minimum length filtering |
| - SHA-256 content hashing for deduplication |
| """ |
|
|
| import hashlib |
| import re |
| import unicodedata |
| from typing import Optional |
|
|
|
|
# Precompiled patterns used by clean_text, applied in the order listed.
_HTML_TAG_RE = re.compile(r"<[^>]+>")
# Control characters except tab (\x09), newline (\x0a) and carriage
# return (\x0d), which are deliberately left in place.
_CONTROL_CHAR_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]")
_EXCESS_NEWLINES_RE = re.compile(r"\n{3,}")
_EXCESS_SPACES_RE = re.compile(r" {3,}")


def clean_text(text: str, *, min_length: int = 30) -> Optional[str]:
    """
    Normalize and clean text for embedding.

    Steps, in order:
      1. Unicode NFC normalization.
      2. Replace every ``<...>`` span (anything between angle brackets)
         with a single space.
      3. Remove control characters, keeping tab, newline and carriage
         return.
      4. Collapse runs of three or more newlines to two newlines.
      5. Collapse runs of three or more spaces to one space.
      6. Strip leading and trailing whitespace.

    Args:
        text: Raw input text. Falsy input (``None`` or ``""``) yields None.
        min_length: Minimum number of characters the cleaned text must
            have to be returned. Defaults to 30.

    Returns:
        The cleaned text, or None if the input is empty or the cleaned
        result is shorter than ``min_length`` characters.
    """
    if not text:
        return None

    cleaned = unicodedata.normalize("NFC", text)

    # Tags become a space (not an empty string) so that words separated
    # only by markup do not get glued together.
    cleaned = _HTML_TAG_RE.sub(" ", cleaned)
    cleaned = _CONTROL_CHAR_RE.sub("", cleaned)
    cleaned = _EXCESS_NEWLINES_RE.sub("\n\n", cleaned)
    cleaned = _EXCESS_SPACES_RE.sub(" ", cleaned)
    cleaned = cleaned.strip()

    # The length check runs on the fully cleaned text, not the raw input.
    if len(cleaned) < min_length:
        return None

    return cleaned
|
|
|
|
def compute_hash(content: str) -> str:
    """
    Return the SHA-256 hex digest of ``content`` for deduplication.

    The string is encoded as UTF-8 before hashing, so identical content
    always maps to the same 64-character lowercase hexadecimal digest.
    """
    hasher = hashlib.sha256()
    hasher.update(content.encode("utf-8"))
    return hasher.hexdigest()
|
|