"""
Text Cleaner - Cleans and normalizes scraped text.
Removes noise, normalizes encoding, standardizes formatting.
"""
import re
import logging
import unicodedata
# Module-level logger, registered under the fixed name "Cleaner".
logger = logging.getLogger("Cleaner")
class TextCleaner:
    """Clean raw scraped text for dataset preparation.

    The public entry points are :meth:`clean_text`, which runs the whole
    cleaning pipeline on one document, and :meth:`is_quality_content`, which
    applies a simple length / word-count filter to a document.
    """

    # Boilerplate to delete. Every entry is compiled in ``__init__`` with
    # ``re.IGNORECASE | re.DOTALL``. Because DOTALL lets ``.`` cross newlines,
    # patterns that need two anchors on the *same* line (subscribe/newsletter,
    # bracketed banners) use negated character classes instead of ``.``.
    NOISE_PATTERNS = [
        r'Cookie[s]?\s*(Policy|Settings|Consent).*?(?:\n|$)',
        r'(Pranoj|Refuzoj)\s*cookie.*?(?:\n|$)',
        r'Të gjitha të drejtat.*?(?:\n|$)',
        r'All rights reserved.*?(?:\n|$)',
        r'©\s*\d{4}.*?(?:\n|$)',
        r'Share\s*(on)?\s*(Facebook|Twitter|LinkedIn|Email).*?(?:\n|$)',
        r'(Ndaj|Shpërndaj)\s*(në)?\s*(Facebook|Twitter).*?(?:\n|$)',
        r'(Loading|Duke\s*ngarkuar)\.{2,}',
        # Both keywords must sit on one line; otherwise a "Subscribe" in the
        # header and a "newsletter" in the footer would delete the article.
        r'(Subscribe|Abonohu)[^\n]*?(newsletter|buletinin)[^\n]*?(?:\n|$)',
        # Stay inside a single pair of brackets.
        r'\[[^\[\]]*?banner[^\[\]]*?\]',
        # Whole words only, so e.g. "advertisements" is not cut down to "s".
        r'\bAdvertisement\b',
        r'\bReklama\b',
        r'<script.*?</script>',
        r'<style.*?</style>',
        r'{\s*[\w-]+\s*:.*?}',  # CSS remnants
    ]

    # Ordered (broken, fixed) pairs, written with escapes so the table
    # survives any re-encoding of this source file. The first group repairs
    # UTF-8 text that was decoded as cp1252 / latin-1; the second folds
    # typographic quotes to ASCII. The mojibake group MUST come first: the
    # en-dash mojibake itself contains U+201C, which the quote folding would
    # otherwise destroy.
    _CHAR_FIXES = (
        ("\u00c3\u00ab", "\u00eb"),        # e with diaeresis (lower)
        ("\u00c3\u00a7", "\u00e7"),        # c with cedilla (lower)
        ("\u00c3\u2039", "\u00cb"),        # E with diaeresis (cp1252 form)
        ("\u00c3\u008b", "\u00cb"),        # E with diaeresis (latin-1 form)
        ("\u00c3\u2021", "\u00c7"),        # C with cedilla (cp1252 form)
        ("\u00c3\u0087", "\u00c7"),        # C with cedilla (latin-1 form)
        ("\u00e2\u20ac\u201c", "\u2013"),  # en dash
        ("\u00e2\u20ac\u201d", "\u2014"),  # em dash
        ("\u00e2\u20ac\u0153", '"'),       # left double quote
        ("\u00e2\u20ac\u009d", '"'),       # right double quote
        ("\u00e2\u20ac\u02dc", "'"),       # left single quote
        ("\u00e2\u20ac\u2122", "'"),       # right single quote
        ("\u201c", '"'),
        ("\u201d", '"'),
        ("\u2018", "'"),
        ("\u2019", "'"),
    )

    _URL_RE = re.compile(r'https?://\S+')
    _HORIZONTAL_WS_RE = re.compile(r'[ \t\u00a0]+')
    _BLANK_RUN_RE = re.compile(r'\n{3,}')
    _HEADING_RE = re.compile(r'^[#\d\.\-\*•]')
    _WORD_RE = re.compile(r'[a-zA-ZëçÇË]+')

    def __init__(self):
        """Compile ``NOISE_PATTERNS`` once per instance."""
        self.compiled_patterns = [
            re.compile(pattern, re.IGNORECASE | re.DOTALL)
            for pattern in self.NOISE_PATTERNS
        ]

    def clean_text(self, text: str) -> str:
        """Run the full cleaning pipeline and return the cleaned text.

        Returns ``""`` for empty or ``None`` input. The step order matters:

        1. Unicode NFC normalization.
        2. Encoding repair, so the Albanian noise patterns can match text
           that arrived as mojibake.
        3. Noise-pattern removal.
        4. URL removal, before lines are measured so a URL is not counted
           as a word.
        5. Short-line removal.
        6. Whitespace normalization last, so the gaps left by steps 3-5
           (double spaces, runs of blank lines) are collapsed.
        """
        if not text:
            return ""
        text = self._normalize_unicode(text)
        text = self._fix_albanian_chars(text)
        text = self._remove_noise(text)
        text = self._strip_urls_in_text(text)
        text = self._remove_short_lines(text)
        text = self._normalize_whitespace(text)
        return text.strip()

    def _normalize_unicode(self, text: str) -> str:
        """Return *text* normalized to Unicode NFC form."""
        return unicodedata.normalize("NFC", text)

    def _remove_noise(self, text: str) -> str:
        """Delete every match of every compiled noise pattern from *text*."""
        for pattern in self.compiled_patterns:
            text = pattern.sub("", text)
        return text

    def _normalize_whitespace(self, text: str) -> str:
        """Normalize whitespace while preserving paragraph breaks.

        Converts CR/CRLF to LF, collapses runs of spaces, tabs and
        non-breaking spaces to one space, strips trailing whitespace from
        every line, and finally reduces 3+ consecutive newlines to two.
        """
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        text = self._HORIZONTAL_WS_RE.sub(" ", text)
        text = "\n".join(line.rstrip() for line in text.split("\n"))
        # Collapse only after trailing spaces are gone; otherwise lines that
        # contain nothing but spaces hide a run of blank lines.
        return self._BLANK_RUN_RE.sub("\n\n", text)

    def _fix_albanian_chars(self, text: str) -> str:
        """Repair common mojibake and fold typographic quotes to ASCII.

        Applies the ``_CHAR_FIXES`` table in order: mis-decoded UTF-8
        sequences for Albanian letters, dashes and quotes are restored,
        then curly quotes become straight quotes.
        """
        for broken, fixed in self._CHAR_FIXES:
            text = text.replace(broken, fixed)
        return text

    def _remove_short_lines(self, text: str, min_words: int = 3) -> str:
        """Drop very short lines that are likely navigation or noise.

        Blank lines are kept as paragraph separators. A non-blank line is
        kept when it has at least *min_words* whitespace-separated words, or
        when it starts with ``#``, a digit, ``.``, ``-``, ``*`` or a bullet
        (treated as a heading or list item). All other lines are removed.
        """
        kept = []
        for line in text.split("\n"):
            stripped = line.strip()
            if not stripped:
                kept.append("")
            elif len(stripped.split()) >= min_words:
                kept.append(line)
            elif self._HEADING_RE.match(stripped):
                kept.append(line)
        return "\n".join(kept)

    def _strip_urls_in_text(self, text: str) -> str:
        """Remove inline ``http://`` / ``https://`` URLs from *text*."""
        return self._URL_RE.sub("", text)

    def is_quality_content(self, text: str, min_length: int = 200,
                           min_words: int = 20) -> bool:
        """Return True when *text* passes the length and word-count checks.

        The stripped text must be at least *min_length* characters long and
        contain at least *min_words* runs of Latin/Albanian letters, so that
        text made mostly of numbers or symbols is rejected.
        """
        if not text or len(text.strip()) < min_length:
            return False
        return len(self._WORD_RE.findall(text)) >= min_words
|