""" Text Cleaner - Cleans and normalizes scraped text. Removes noise, normalizes encoding, standardizes formatting. """ import re import logging import unicodedata logger = logging.getLogger("Cleaner") class TextCleaner: """Cleans raw scraped text for dataset preparation.""" # Patterns to remove NOISE_PATTERNS = [ r'Cookie[s]?\s*(Policy|Settings|Consent).*?(?:\n|$)', r'(Pranoj|Refuzoj)\s*cookie.*?(?:\n|$)', r'Të gjitha të drejtat.*?(?:\n|$)', r'All rights reserved.*?(?:\n|$)', r'©\s*\d{4}.*?(?:\n|$)', r'Share\s*(on)?\s*(Facebook|Twitter|LinkedIn|Email).*?(?:\n|$)', r'(Ndaj|Shpërndaj)\s*(në)?\s*(Facebook|Twitter).*?(?:\n|$)', r'(Loading|Duke\s*ngarkuar)\.{2,}', r'(Subscribe|Abonohu).*?(newsletter|buletinin).*?(?:\n|$)', r'\[.*?banner.*?\]', r'Advertisement', r'Reklama', r'', r'', r'{\s*[\w-]+\s*:.*?}', # CSS remnants ] def __init__(self): self.compiled_patterns = [ re.compile(pattern, re.IGNORECASE | re.DOTALL) for pattern in self.NOISE_PATTERNS ] def clean_text(self, text: str) -> str: """Full cleaning pipeline.""" if not text: return "" text = self._normalize_unicode(text) text = self._remove_noise(text) text = self._normalize_whitespace(text) text = self._fix_albanian_chars(text) text = self._remove_short_lines(text) text = self._strip_urls_in_text(text) return text.strip() def _normalize_unicode(self, text: str) -> str: """Normalize Unicode to NFC form.""" return unicodedata.normalize("NFC", text) def _remove_noise(self, text: str) -> str: """Remove known noise patterns.""" for pattern in self.compiled_patterns: text = pattern.sub("", text) return text def _normalize_whitespace(self, text: str) -> str: """Normalize whitespace while preserving paragraph structure.""" # Replace tabs with spaces text = text.replace("\t", " ") # Replace multiple spaces with single space text = re.sub(r'[ ]{2,}', ' ', text) # Replace 3+ newlines with double newline text = re.sub(r'\n{3,}', '\n\n', text) # Strip trailing spaces from lines text = "\n".join(line.rstrip() for line in text.split("\n")) return text def _fix_albanian_chars(self, text: str) -> str: """Ensure Albanian special characters are correct.""" # Fix common encoding issues with Albanian chars replacements = { "ë": "ë", "ç": "ç", "–": "–", "—": "—", "“": '"', "â€\x9d": '"', "‘": "'", "’": "'", "\u00eb": "ë", # ë "\u00c7": "Ç", # Ç "\u00e7": "ç", # ç } for old, new in replacements.items(): text = text.replace(old, new) return text def _remove_short_lines(self, text: str, min_words: int = 3) -> str: """Remove very short lines that are likely navigation or noise.""" lines = text.split("\n") cleaned = [] for line in lines: stripped = line.strip() # Keep empty lines (paragraph separators) if not stripped: cleaned.append("") continue # Keep lines with enough words word_count = len(stripped.split()) if word_count >= min_words: cleaned.append(line) # Keep lines that look like headings (start with #, numbers, etc.) elif re.match(r'^[#\d\.\-\*•]', stripped): cleaned.append(line) return "\n".join(cleaned) def _strip_urls_in_text(self, text: str) -> str: """Remove inline URLs from text body.""" url_pattern = r'https?://\S+' return re.sub(url_pattern, '', text) def is_quality_content(self, text: str, min_length: int = 200) -> bool: """Check if text meets quality threshold.""" if not text or len(text.strip()) < min_length: return False # Check for actual Albanian/English words (not just numbers/symbols) words = re.findall(r'[a-zA-ZëçÇË]+', text) if len(words) < 20: return False return True