Spaces:

kiafa
/

kia-command-center

Sleeping

App Files Files Community

kia-command-center / processing /cleaner.py

kiafa

Premium UI/UX Overhaul & Optimization Update

b96f3a5 verified about 2 months ago

raw

history blame contribute delete

4.52 kB

	"""
	Text Cleaner - Cleans and normalizes scraped text.
	Removes noise, normalizes encoding, standardizes formatting.
	"""
	import re
	import logging
	import unicodedata

	# Module-level logger, registered under the fixed name "Cleaner".
	# NOTE(review): nothing in the visible TextCleaner code calls it - confirm it is used elsewhere.
	logger = logging.getLogger("Cleaner")


	class TextCleaner:
	    """Cleans raw scraped text for dataset preparation.

	    ``clean_text`` runs the full pipeline. The ``_``-prefixed methods are the
	    individual stages, and ``is_quality_content`` is an independent
	    length / word-count gate.
	    """

	    # Boilerplate to strip from scraped pages (English and Albanian variants).
	    # Every pattern is compiled in ``__init__`` with IGNORECASE | DOTALL:
	    #   * DOTALL lets the <script>/<style>/CSS patterns span several lines.
	    #   * Line-oriented patterns end in a lazy ``.*?(?:\n|$)``, which stops at
	    #     the first newline (or end of text) and consumes that newline.
	    #   * Where two keywords must sit on the SAME line, ``[^\n]`` is used
	    #     instead of ``.`` so DOTALL cannot swallow unrelated paragraphs.
	    NOISE_PATTERNS = [
	        r'Cookie[s]?\s*(Policy|Settings|Consent).*?(?:\n|$)',
	        r'(Pranoj|Refuzoj)\s*cookie.*?(?:\n|$)',
	        r'Të gjitha të drejtat.*?(?:\n|$)',
	        r'All rights reserved.*?(?:\n|$)',
	        r'©\s*\d{4}.*?(?:\n|$)',
	        r'Share\s*(on)?\s*(Facebook|Twitter|LinkedIn|Email).*?(?:\n|$)',
	        r'(Ndaj|Shpërndaj)\s*(në)?\s*(Facebook|Twitter).*?(?:\n|$)',
	        r'(Loading|Duke\s*ngarkuar)\.{2,}',
	        r'(Subscribe|Abonohu)[^\n]*?(newsletter|buletinin).*?(?:\n|$)',
	        r'\[[^\]\n]*?banner[^\]\n]*?\]',
	        r'Advertisement',
	        r'Reklama',
	        r'<script.*?</script>',
	        r'<style.*?</style>',
	        r'\{\s*[\w-]+\s*:.*?\}',  # CSS remnants
	    ]

	    # UTF-8 text mis-decoded as Windows-1252 -> the intended character.
	    # Keys are spelled as escapes (UTF-8 bytes read through cp1252) so the
	    # table cannot be silently altered by editors or copy/paste.
	    _MOJIBAKE_REPLACEMENTS = {
	        "\u00c3\u00ab": "\u00eb",        # C3 AB    -> ë
	        "\u00c3\u00a7": "\u00e7",        # C3 A7    -> ç
	        "\u00c3\u2039": "\u00cb",        # C3 8B    -> Ë
	        "\u00c3\u2021": "\u00c7",        # C3 87    -> Ç
	        "\u00e2\u20ac\u201c": "\u2013",  # E2 80 93 -> en dash
	        "\u00e2\u20ac\u201d": "\u2014",  # E2 80 94 -> em dash
	        "\u00e2\u20ac\u0153": '"',       # E2 80 9C -> left double quote
	        "\u00e2\u20ac\x9d": '"',         # E2 80 9D -> right double quote (0x9D is undefined in cp1252)
	        "\u00e2\u20ac\u02dc": "'",       # E2 80 98 -> left single quote
	        "\u00e2\u20ac\u2122": "'",       # E2 80 99 -> right single quote
	    }

	    # Lines shorter than ``min_words`` survive if they start like a heading
	    # or list item: '#', a digit, '.', '-', '*' or a bullet.
	    _HEADING_RE = re.compile(r'^[#\d\.\-\*•]')
	    _URL_RE = re.compile(r'https?://\S+')
	    # Runs of ASCII letters plus the Albanian letters ë/ç (either case).
	    _WORD_RE = re.compile(r'[a-zA-ZëçÇË]+')

	    def __init__(self) -> None:
	        """Compile ``NOISE_PATTERNS`` (read via ``self`` so subclasses can override)."""
	        flags = re.IGNORECASE | re.DOTALL
	        self.compiled_patterns = [
	            re.compile(pattern, flags) for pattern in self.NOISE_PATTERNS
	        ]

	    def clean_text(self, text: str) -> str:
	        """Run the full cleaning pipeline and return the stripped result.

	        Returns ``""`` for empty / ``None`` input.

	        Stage order matters:
	          1. NFC-normalize, then repair mojibake, so the Albanian noise
	             patterns can match text that arrived mis-decoded.
	          2. Remove noise and URLs before counting words per line, so a line
	             that is only "label + URL" is judged on its real content.
	          3. Normalize whitespace last, because the earlier stages leave
	             stray spaces and runs of blank lines behind.
	        """
	        if not text:
	            return ""

	        text = self._normalize_unicode(text)
	        text = self._fix_albanian_chars(text)
	        text = self._remove_noise(text)
	        text = self._strip_urls_in_text(text)
	        text = self._remove_short_lines(text)
	        text = self._normalize_whitespace(text)

	        return text.strip()

	    def _normalize_unicode(self, text: str) -> str:
	        """Normalize Unicode to NFC (composed) form."""
	        return unicodedata.normalize("NFC", text)

	    def _remove_noise(self, text: str) -> str:
	        """Delete every match of every compiled noise pattern, in list order."""
	        for pattern in self.compiled_patterns:
	            text = pattern.sub("", text)
	        return text

	    def _normalize_whitespace(self, text: str) -> str:
	        """Normalize whitespace while preserving paragraph structure.

	        Tabs become spaces, runs of spaces collapse to one, trailing
	        whitespace is stripped from each line, and three or more consecutive
	        newlines collapse to a single blank line.
	        """
	        text = text.replace("\t", " ")
	        text = re.sub(r'[ ]{2,}', ' ', text)
	        # Strip line ends BEFORE collapsing newlines: a line holding only
	        # spaces (or a stray '\r') would otherwise hide a run of blank lines.
	        text = "\n".join(line.rstrip() for line in text.split("\n"))
	        text = re.sub(r'\n{3,}', '\n\n', text)
	        return text

	    def _fix_albanian_chars(self, text: str) -> str:
	        """Repair UTF-8-read-as-cp1252 mojibake for Albanian letters and punctuation.

	        Restores ë, ç, Ë, Ç and en/em dashes; curly quotes are mapped to their
	        straight ASCII equivalents.
	        """
	        for broken, fixed in self._MOJIBAKE_REPLACEMENTS.items():
	            text = text.replace(broken, fixed)
	        return text

	    def _remove_short_lines(self, text: str, min_words: int = 3) -> str:
	        """Drop very short lines that are likely navigation or noise.

	        Blank lines are kept (as ``""``) to preserve paragraph breaks. A
	        non-blank line is kept unchanged if it has at least ``min_words``
	        whitespace-separated words or starts like a heading / list item.
	        """
	        kept = []
	        for line in text.split("\n"):
	            stripped = line.strip()
	            if not stripped:
	                kept.append("")
	            elif (
	                len(stripped.split()) >= min_words
	                or self._HEADING_RE.match(stripped)
	            ):
	                kept.append(line)
	        return "\n".join(kept)

	    def _strip_urls_in_text(self, text: str) -> str:
	        """Remove inline ``http://`` / ``https://`` URLs (up to the next whitespace)."""
	        return self._URL_RE.sub('', text)

	    def is_quality_content(
	        self, text: str, min_length: int = 200, min_words: int = 20
	    ) -> bool:
	        """Check whether ``text`` meets a minimal quality threshold.

	        Args:
	            text: Candidate text; empty / ``None`` is rejected.
	            min_length: Minimum length of ``text.strip()`` in characters.
	            min_words: Minimum number of alphabetic runs (ASCII letters plus
	                ë/ç in either case), so content made only of numbers or
	                symbols is rejected.

	        Returns:
	            ``True`` if both thresholds are met, otherwise ``False``.
	        """
	        if not text or len(text.strip()) < min_length:
	            return False
	        return len(self._WORD_RE.findall(text)) >= min_words