Spaces:
Sleeping
Sleeping
| import re | |
| from hashlib import sha256 | |
| from typing import Optional | |
| _MIN_DOC_CHARS = 200 | |
| def normalize_text(text: str) -> str: | |
| """Normalize text by collapsing whitespace and stripping edges.""" | |
| # Replace multiple whitespace (including newlines) with a single space | |
| normalized = re.sub(r"\s+", " ", text or "").strip() | |
| return normalized | |
| def is_valid_document(text: str, min_chars: int = _MIN_DOC_CHARS) -> bool: | |
| """Return True if the text is long enough to be considered useful.""" | |
| return len(text) >= min_chars | |
| def make_doc_id(source: str, title: str, url: Optional[str] = None) -> str: | |
| """Create a stable SHA256-based document id from source, title, and URL.""" | |
| base = f"{source}|{title}|{url or ''}" | |
| digest = sha256(base.encode("utf-8")).hexdigest() | |
| return digest |