# kiafa's picture
# Premium UI/UX Overhaul & Optimization Update
# b96f3a5 verified
"""
Text Cleaner - Cleans and normalizes scraped text.
Removes noise, normalizes encoding, standardizes formatting.
"""
import re
import logging
import unicodedata
# Module-level logger, registered under the fixed name "Cleaner" (not the
# module's __name__), so logging configuration must target that name.
logger = logging.getLogger("Cleaner")
class TextCleaner:
    """Clean raw scraped text for dataset preparation.

    The public entry points are :meth:`clean_text`, which runs the whole
    cleaning pipeline, and :meth:`is_quality_content`, which applies a simple
    length / word-count threshold to already cleaned text.
    """

    # Regular expressions describing boilerplate to delete. Every pattern is
    # compiled with IGNORECASE | DOTALL (DOTALL is needed so <script>/<style>
    # blocks and CSS rules can span several lines). Because of DOTALL, the
    # line-oriented patterns must not rely on an unbounded ``.*?`` between two
    # keywords, otherwise they can swallow unrelated paragraphs.
    NOISE_PATTERNS = [
        r'Cookie[s]?\s*(Policy|Settings|Consent).*?(?:\n|$)',
        r'(Pranoj|Refuzoj)\s*cookie.*?(?:\n|$)',
        r'Të gjitha të drejtat.*?(?:\n|$)',
        r'All rights reserved.*?(?:\n|$)',
        r'©\s*\d{4}.*?(?:\n|$)',
        r'Share\s*(on)?\s*(Facebook|Twitter|LinkedIn|Email).*?(?:\n|$)',
        r'(Ndaj|Shpërndaj)\s*(në)?\s*(Facebook|Twitter).*?(?:\n|$)',
        r'(Loading|Duke\s*ngarkuar)\.{2,}',
        # Both keywords must be on the same line.
        r'(Subscribe|Abonohu)[^\n]*?(newsletter|buletinin)[^\n]*?(?:\n|$)',
        # "banner" must sit inside a single [...] group on one line.
        r'\[[^\]\n]*?banner[^\]\n]*?\]',
        # Whole words only, so e.g. "reklamat" is not cut down to "t".
        r'\bAdvertisement\b',
        r'\bReklama\b',
        r'<script.*?</script>',
        r'<style.*?</style>',
        r'{\s*[\w-]+\s*:.*?}',  # CSS remnants
    ]

    # Ordered (broken, fixed) pairs applied by _fix_albanian_chars.
    # The first group repairs UTF-8 text that was decoded as Windows-1252
    # (mojibake). Keys are written as escapes so the table survives any
    # re-encoding of this source file. This group MUST come before the
    # typographic-quote group: the mojibake forms of the en and em dash
    # themselves contain U+201C / U+201D.
    _CHAR_REPLACEMENTS = (
        ("\u00c3\u00ab", "\u00eb"),        # e with diaeresis (lower case)
        ("\u00c3\u2039", "\u00cb"),        # E with diaeresis (upper case)
        ("\u00c3\u00a7", "\u00e7"),        # c with cedilla (lower case)
        ("\u00c3\u2021", "\u00c7"),        # C with cedilla (upper case)
        ("\u00e2\u20ac\u201c", "\u2013"),  # en dash
        ("\u00e2\u20ac\u201d", "\u2014"),  # em dash
        ("\u00e2\u20ac\u0153", '"'),       # left double quotation mark
        ("\u00e2\u20ac\u009d", '"'),       # right double quotation mark
        ("\u00e2\u20ac\u02dc", "'"),       # left single quotation mark
        ("\u00e2\u20ac\u2122", "'"),       # right single quotation mark
        # Typographic quotes are flattened to their ASCII equivalents.
        ("\u201c", '"'),
        ("\u201d", '"'),
        ("\u2018", "'"),
        ("\u2019", "'"),
    )

    def __init__(self):
        """Compile NOISE_PATTERNS once and store them in ``compiled_patterns``."""
        flags = re.IGNORECASE | re.DOTALL
        self.compiled_patterns = [
            re.compile(pattern, flags) for pattern in self.NOISE_PATTERNS
        ]

    def clean_text(self, text: str) -> str:
        """Run the full cleaning pipeline and return the cleaned text.

        Steps, in order:

        1. Unicode NFC normalization.
        2. Character repair, done early so the Albanian noise patterns can
           match text that arrived as mojibake.
        3. Noise removal, done before URL stripping so a URL inside a
           ``<script src=...>`` tag cannot swallow the closing tag.
        4. URL stripping.
        5. Short-line removal, which therefore sees lines without URLs.
        6. Whitespace normalization, done last so gaps left by the earlier
           steps are collapsed.

        Args:
            text: Raw scraped text. Falsy input (``""`` or ``None``) is
                accepted.

        Returns:
            The cleaned text with leading/trailing whitespace stripped, or
            ``""`` for falsy input.
        """
        if not text:
            return ""
        text = self._normalize_unicode(text)
        text = self._fix_albanian_chars(text)
        text = self._remove_noise(text)
        text = self._strip_urls_in_text(text)
        text = self._remove_short_lines(text)
        text = self._normalize_whitespace(text)
        return text.strip()

    def _normalize_unicode(self, text: str) -> str:
        """Return *text* normalized to Unicode NFC form."""
        return unicodedata.normalize("NFC", text)

    def _remove_noise(self, text: str) -> str:
        """Delete every match of every compiled noise pattern from *text*."""
        for pattern in self.compiled_patterns:
            text = pattern.sub("", text)
        return text

    def _normalize_whitespace(self, text: str) -> str:
        """Normalize whitespace while preserving paragraph structure.

        Tabs become spaces, runs of spaces collapse to one space, trailing
        whitespace is removed from each line, and runs of three or more
        newlines collapse to a single blank line.
        """
        text = text.replace("\t", " ")
        text = re.sub(r'[ ]{2,}', ' ', text)
        # Right-strip before collapsing newlines: a line holding only spaces
        # must count as blank, otherwise "a\n \n \nb" keeps two blank lines.
        text = "\n".join(line.rstrip() for line in text.split("\n"))
        text = re.sub(r'\n{3,}', '\n\n', text)
        return text

    def _fix_albanian_chars(self, text: str) -> str:
        """Repair common mis-encodings and flatten typographic quotes.

        Applies the ordered pairs in ``_CHAR_REPLACEMENTS``: mojibake
        sequences are mapped back to the intended character, then curly
        quotes are replaced by their ASCII equivalents.
        """
        for broken, fixed in self._CHAR_REPLACEMENTS:
            text = text.replace(broken, fixed)
        return text

    def _remove_short_lines(self, text: str, min_words: int = 3) -> str:
        """Remove very short lines that are likely navigation or noise.

        A line is kept when it is blank (paragraph separator, emitted as an
        empty string), has at least *min_words* whitespace-separated words,
        or starts with ``#``, a digit, ``.``, ``-``, ``*`` or a bullet, which
        is treated as a heading or list item.

        Args:
            text: Text to filter, split on ``"\\n"``.
            min_words: Minimum word count for an ordinary line to be kept.

        Returns:
            The remaining lines joined with ``"\\n"``.
        """
        kept = []
        for line in text.split("\n"):
            stripped = line.strip()
            if not stripped:
                kept.append("")
            elif len(stripped.split()) >= min_words:
                kept.append(line)
            elif re.match(r'^[#\d\.\-\*•]', stripped):
                kept.append(line)
        return "\n".join(kept)

    def _strip_urls_in_text(self, text: str) -> str:
        """Remove inline ``http://`` / ``https://`` URLs from *text*.

        A URL extends to the next whitespace character, so punctuation glued
        to the end of a URL is removed with it.
        """
        return re.sub(r'https?://\S+', '', text)

    def is_quality_content(
        self, text: str, min_length: int = 200, min_words: int = 20
    ) -> bool:
        """Check whether *text* meets the quality threshold.

        Args:
            text: Text to evaluate.
            min_length: Minimum length of the stripped text, in characters.
            min_words: Minimum number of alphabetic words (ASCII letters plus
                the Albanian letters with diaeresis / cedilla).

        Returns:
            ``True`` when both thresholds are met, ``False`` otherwise
            (including for empty or ``None`` input).
        """
        if not text or len(text.strip()) < min_length:
            return False
        # Count real words so pages made of numbers/symbols are rejected.
        words = re.findall(r'[a-zA-ZëçÇË]+', text)
        return len(words) >= min_words