Spaces:
Sleeping
Sleeping
| import re | |
| import unicodedata | |
def clean_text(text: str) -> str:
    """Clean and normalize text for LLM ingestion.

    The text is NFKC-normalized, every control character is replaced by a
    space, each run of whitespace is collapsed to a single space, and
    leading/trailing whitespace is removed.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned, single-line text. An empty string is returned when
        ``text`` is not a ``str`` (for example ``None``).
    """
    # Best-effort contract: non-string input yields "" rather than raising.
    if not isinstance(text, str):
        return ""

    # Compatibility normalization folds ligatures, full-width forms,
    # non-breaking spaces, etc. into their plain equivalents.
    text = unicodedata.normalize("NFKC", text)

    # Replace control characters with a space (not nothing) so the words on
    # either side do not get glued together. The class covers the C0 range,
    # DEL, and the C1 range (U+0080-U+009F); NFKC leaves C1 controls intact,
    # so they must be handled here explicitly.
    text = re.sub(r"[\x00-\x1F\x7F-\x9F]", " ", text)

    # Collapse every run of whitespace (including the spaces introduced
    # above) into a single space.
    text = re.sub(r"\s+", " ", text)

    return text.strip()