Spaces:
Sleeping
Sleeping
File size: 810 Bytes
e63c592 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
import re
from hashlib import sha256
from typing import Optional
# Default minimum length, in characters, that is_valid_document requires
# when the caller does not pass min_chars explicitly.
_MIN_DOC_CHARS = 200
def normalize_text(text: str) -> str:
    """Collapse every whitespace run to one space and trim both ends.

    A falsy ``text`` (``None`` or an empty string) yields an empty string.
    """
    raw = text if text else ""
    # Any run of whitespace, newlines and tabs included, becomes one space.
    collapsed = re.sub(r"\s+", " ", raw)
    return collapsed.strip()
def is_valid_document(
    text: Optional[str], min_chars: int = _MIN_DOC_CHARS
) -> bool:
    """Return True if the text is long enough to be considered useful.

    Args:
        text: Document text. ``None`` is counted as zero characters, the
            same way ``normalize_text`` treats a missing value as empty.
        min_chars: Minimum number of characters required.

    Returns:
        ``True`` when ``text`` contains at least ``min_chars`` characters,
        otherwise ``False``.
    """
    # len(None) raises TypeError; fall back to "" so a missing document is
    # measured as empty instead of crashing the caller.
    return len(text or "") >= min_chars
def make_doc_id(source: str, title: str, url: Optional[str] = None) -> str:
    """Build a deterministic document id from source, title, and URL.

    The three fields are joined with ``|`` (a falsy ``url`` contributes an
    empty string), encoded as UTF-8 and hashed with SHA-256. The hex digest
    of that hash is returned.
    """
    hasher = sha256()
    hasher.update(f"{source}|{title}|{url or ''}".encode("utf-8"))
    return hasher.hexdigest()