Spaces:
Sleeping
Sleeping
| """ | |
| Text Cleaner - Cleans and normalizes scraped text. | |
| Removes noise, normalizes encoding, standardizes formatting. | |
| """ | |
| import re | |
| import logging | |
| import unicodedata | |
| logger = logging.getLogger("Cleaner") | |
class TextCleaner:
    """Cleans raw scraped text for dataset preparation.

    The main entry point is :meth:`clean_text`, which repairs encoding
    damage, removes boilerplate noise and URLs, drops navigation-like short
    lines and finally normalizes whitespace. :meth:`is_quality_content` is an
    independent check for whether a text is worth keeping.
    """

    # Patterns to remove. Compiled in __init__ with IGNORECASE | DOTALL.
    # NOTE(review): 'Advertisement' and 'Reklama' carry no word boundaries, so
    # they are also removed from inside longer words - confirm this is wanted.
    NOISE_PATTERNS = [
        r'Cookie[s]?\s*(Policy|Settings|Consent).*?(?:\n|$)',
        r'(Pranoj|Refuzoj)\s*cookie.*?(?:\n|$)',
        r'Të gjitha të drejtat.*?(?:\n|$)',
        r'All rights reserved.*?(?:\n|$)',
        r'©\s*\d{4}.*?(?:\n|$)',
        r'Share\s*(on)?\s*(Facebook|Twitter|LinkedIn|Email).*?(?:\n|$)',
        r'(Ndaj|Shpërndaj)\s*(në)?\s*(Facebook|Twitter).*?(?:\n|$)',
        r'(Loading|Duke\s*ngarkuar)\.{2,}',
        r'(Subscribe|Abonohu).*?(newsletter|buletinin).*?(?:\n|$)',
        r'\[.*?banner.*?\]',
        r'Advertisement',
        r'Reklama',
        r'<script.*?</script>',
        r'<style.*?</style>',
        r'{\s*[\w-]+\s*:.*?}',  # CSS remnants
    ]

    # UTF-8 text that was mis-decoded as Windows-1252 ("mojibake"), mapped
    # back to the intended character. Written with escapes so the table
    # cannot itself be damaged by a re-encoding of this source file.
    # Dashes are restored; quotes are folded to their ASCII form.
    _MOJIBAKE_REPLACEMENTS = (
        ("\u00c3\u00ab", "\u00eb"),        # e with diaeresis (lowercase)
        ("\u00c3\u2039", "\u00cb"),        # E with diaeresis (uppercase)
        ("\u00c3\u00a7", "\u00e7"),        # c with cedilla (lowercase)
        ("\u00c3\u2021", "\u00c7"),        # C with cedilla (uppercase)
        ("\u00e2\u20ac\u201c", "\u2013"),  # en dash
        ("\u00e2\u20ac\u201d", "\u2014"),  # em dash
        ("\u00e2\u20ac\u0153", '"'),       # left double quotation mark
        ("\u00e2\u20ac\x9d", '"'),         # right double quotation mark
        ("\u00e2\u20ac\u02dc", "'"),       # left single quotation mark
        ("\u00e2\u20ac\u2122", "'"),       # right single quotation mark
    )

    # Typographic quotes folded to ASCII. Must run AFTER the mojibake table:
    # the damaged dash sequences contain U+201C / U+201D as their last
    # character and would otherwise be corrupted before they can be repaired.
    _QUOTE_REPLACEMENTS = (
        ("\u201c", '"'),
        ("\u201d", '"'),
        ("\u2018", "'"),
        ("\u2019", "'"),
    )

    _MULTI_SPACE_RE = re.compile(r'[ ]{2,}')
    _MULTI_NEWLINE_RE = re.compile(r'\n{3,}')
    _HEADING_RE = re.compile(r'^[#\d\.\-\*•]')
    _URL_RE = re.compile(r'https?://\S+')
    _WORD_RE = re.compile(r'[a-zA-ZëçÇË]+')

    def __init__(self):
        """Compile every entry of NOISE_PATTERNS once for reuse."""
        self.compiled_patterns = [
            re.compile(pattern, re.IGNORECASE | re.DOTALL)
            for pattern in self.NOISE_PATTERNS
        ]

    def clean_text(self, text: str) -> str:
        """Run the full cleaning pipeline on ``text``.

        Order matters:

        1. Unicode/encoding repair comes first so the Albanian noise
           patterns can match text that arrived with damaged characters.
        2. Noise, URLs and short lines are removed next.
        3. Whitespace is normalized last, so the gaps left behind by the
           removals (double spaces, runs of blank lines) are cleaned up too.

        Args:
            text: Raw scraped text. Falsy input (``""`` or ``None``) is
                accepted.

        Returns:
            The cleaned text with leading/trailing whitespace stripped, or
            ``""`` for falsy input.
        """
        if not text:
            return ""
        text = self._normalize_unicode(text)
        text = self._fix_albanian_chars(text)
        text = self._remove_noise(text)
        text = self._strip_urls_in_text(text)
        text = self._remove_short_lines(text)
        text = self._normalize_whitespace(text)
        return text.strip()

    def _normalize_unicode(self, text: str) -> str:
        """Normalize Unicode to NFC form."""
        return unicodedata.normalize("NFC", text)

    def _remove_noise(self, text: str) -> str:
        """Delete every match of every compiled noise pattern."""
        for pattern in self.compiled_patterns:
            text = pattern.sub("", text)
        return text

    def _normalize_whitespace(self, text: str) -> str:
        """Normalize whitespace while preserving paragraph structure.

        Tabs become spaces, runs of spaces collapse to one, trailing
        whitespace is stripped from each line, and runs of three or more
        newlines collapse to a single blank line.
        """
        text = text.replace("\t", " ")
        text = self._MULTI_SPACE_RE.sub(" ", text)
        # Strip line ends BEFORE collapsing newlines: otherwise lines that
        # hold only spaces hide consecutive blank lines from the collapse.
        text = "\n".join(line.rstrip() for line in text.split("\n"))
        return self._MULTI_NEWLINE_RE.sub("\n\n", text)

    def _fix_albanian_chars(self, text: str) -> str:
        """Repair common encoding damage and fold typographic quotes.

        Sequences produced by decoding UTF-8 as Windows-1252 are mapped back
        to the intended Albanian letters and dashes; typographic quotes
        (damaged or intact) become plain ASCII quotes. Correctly encoded
        Albanian letters are left untouched.
        """
        for damaged, repaired in self._MOJIBAKE_REPLACEMENTS:
            text = text.replace(damaged, repaired)
        for fancy, plain in self._QUOTE_REPLACEMENTS:
            text = text.replace(fancy, plain)
        return text

    def _remove_short_lines(self, text: str, min_words: int = 3) -> str:
        """Remove very short lines that are likely navigation or noise.

        Args:
            text: Text to filter, processed line by line.
            min_words: Minimum whitespace-separated word count for a line
                to be kept unconditionally.

        Returns:
            The text without short lines. Empty lines are kept as paragraph
            separators, and short lines starting with ``#``, a digit, ``.``,
            ``-``, ``*`` or a bullet are kept as probable headings or list
            items.
        """
        kept = []
        for line in text.split("\n"):
            stripped = line.strip()
            if not stripped:
                # Preserve paragraph separators.
                kept.append("")
            elif (len(stripped.split()) >= min_words
                  or self._HEADING_RE.match(stripped)):
                kept.append(line)
        return "\n".join(kept)

    def _strip_urls_in_text(self, text: str) -> str:
        """Remove inline http/https URLs from the text body."""
        return self._URL_RE.sub('', text)

    def is_quality_content(self, text: str, min_length: int = 200,
                           min_words: int = 20) -> bool:
        """Check if text meets the quality threshold.

        Args:
            text: Text to evaluate.
            min_length: Minimum length of the stripped text, in characters.
            min_words: Minimum number of alphabetic (Albanian/English
                letter) runs the text must contain.

        Returns:
            ``True`` when the text is long enough and contains enough real
            words (not just numbers or symbols), ``False`` otherwise.
        """
        if not text or len(text.strip()) < min_length:
            return False
        return len(self._WORD_RE.findall(text)) >= min_words