# Captured from Space: kiafa/kia-command-center (status at capture: Sleeping)
# File size: 4,524 bytes
# Ref: b96f3a5

"""
Text Cleaner - Cleans and normalizes scraped text.
Removes noise, normalizes encoding, standardizes formatting.
"""
import re
import logging
import unicodedata

# Module-level logger registered under the fixed name "Cleaner" (not the
# module's __name__), so logging configuration must target that name.
logger = logging.getLogger("Cleaner")


class TextCleaner:
    """Clean raw scraped text for dataset preparation.

    The public entry points are :meth:`clean_text`, which runs the full
    cleaning pipeline, and :meth:`is_quality_content`, which applies a simple
    length / word-count threshold. The underscore-prefixed methods are the
    individual pipeline stages and can also be called on their own.
    """

    # Regexes (compiled with IGNORECASE | DOTALL in __init__) describing
    # boilerplate to delete. Most entries consume the rest of the line.
    NOISE_PATTERNS = [
        r'Cookie[s]?\s*(Policy|Settings|Consent).*?(?:\n|$)',
        r'(Pranoj|Refuzoj)\s*cookie.*?(?:\n|$)',
        r'Të gjitha të drejtat.*?(?:\n|$)',
        r'All rights reserved.*?(?:\n|$)',
        r'©\s*\d{4}.*?(?:\n|$)',
        r'Share\s*(on)?\s*(Facebook|Twitter|LinkedIn|Email).*?(?:\n|$)',
        r'(Ndaj|Shpërndaj)\s*(në)?\s*(Facebook|Twitter).*?(?:\n|$)',
        r'(Loading|Duke\s*ngarkuar)\.{2,}',
        r'(Subscribe|Abonohu).*?(newsletter|buletinin).*?(?:\n|$)',
        r'\[.*?banner.*?\]',
        # Word boundaries keep these two from chewing the middle out of
        # longer words such as "advertisements" or "reklamat".
        r'\bAdvertisement\b',
        r'\bReklama\b',
        r'<script.*?</script>',
        r'<style.*?</style>',
        r'{\s*[\w-]+\s*:.*?}',  # CSS remnants
    ]

    # UTF-8 text that was mis-decoded as Windows-1252 -> intended character.
    # Keys are written with escapes so the exact code points are unambiguous;
    # the trailing comment shows how each sequence usually renders.
    MOJIBAKE_REPLACEMENTS = {
        "\u00c3\u00ab": "ë",            # Ã«
        "\u00c3\u00a7": "ç",            # Ã§
        "\u00c3\u2039": "Ë",            # Ã + single left angle quote (C3 8B)
        "\u00c3\u2021": "Ç",            # Ã + double dagger (C3 87)
        "\u00e2\u20ac\u201c": "–",      # en dash, bytes E2 80 93
        "\u00e2\u20ac\u201d": "—",      # em dash, bytes E2 80 94
        # Variants whose third character is the dash itself, retained for
        # backward compatibility with the earlier replacement table.
        "\u00e2\u20ac\u2013": "–",
        "\u00e2\u20ac\u2014": "—",
        "\u00e2\u20ac\u0153": '"',      # left double quote, E2 80 9C
        "\u00e2\u20ac\x9d": '"',        # right double quote, E2 80 9D
        "\u00e2\u20ac\u02dc": "'",      # left single quote, E2 80 98
        "\u00e2\u20ac\u2122": "'",      # right single quote, E2 80 99
    }

    # Helper regexes compiled once for the whole class.
    _HEADING_RE = re.compile(r'^[#\d\.\-\*•]')
    _URL_RE = re.compile(r'https?://\S+')
    _WORD_RE = re.compile(r'[a-zA-ZëçÇË]+')

    def __init__(self):
        """Compile ``NOISE_PATTERNS`` into ``self.compiled_patterns``."""
        self.compiled_patterns = [
            re.compile(pattern, re.IGNORECASE | re.DOTALL)
            for pattern in self.NOISE_PATTERNS
        ]

    def clean_text(self, text: str) -> str:
        """Run the full cleaning pipeline and return the cleaned text.

        Stage order matters:

        1. Unicode NFC normalization.
        2. Mojibake repair, before noise removal so that the Albanian noise
           patterns can match text that arrived mis-decoded.
        3. Noise-pattern removal.
        4. URL stripping, before the short-line filter so a bare URL does
           not count as a word.
        5. Short-line removal.
        6. Whitespace normalization, last so gaps left by the earlier
           stages are collapsed.

        Args:
            text: Raw scraped text. Falsy input (``""`` or ``None``) is
                accepted.

        Returns:
            The cleaned text with surrounding whitespace stripped, or ``""``
            for falsy input.
        """
        if not text:
            return ""

        text = self._normalize_unicode(text)
        text = self._fix_albanian_chars(text)
        text = self._remove_noise(text)
        text = self._strip_urls_in_text(text)
        text = self._remove_short_lines(text)
        text = self._normalize_whitespace(text)

        return text.strip()

    def _normalize_unicode(self, text: str) -> str:
        """Return *text* normalized to Unicode NFC form."""
        return unicodedata.normalize("NFC", text)

    def _remove_noise(self, text: str) -> str:
        """Delete every match of the compiled noise patterns, in order."""
        for pattern in self.compiled_patterns:
            text = pattern.sub("", text)
        return text

    def _normalize_whitespace(self, text: str) -> str:
        """Normalize whitespace while preserving paragraph structure.

        Tabs become spaces, runs of spaces collapse to one, trailing
        whitespace is removed from every line, and three or more consecutive
        newlines collapse to a single blank line.
        """
        text = text.replace("\t", " ")
        text = re.sub(r'[ ]{2,}', ' ', text)
        # Strip line ends BEFORE collapsing newlines: a whitespace-only line
        # only turns into an empty one here, and must still be collapsed.
        text = "\n".join(line.rstrip() for line in text.split("\n"))
        text = re.sub(r'\n{3,}', '\n\n', text)
        return text

    def _fix_albanian_chars(self, text: str) -> str:
        """Repair common mojibake for Albanian letters and punctuation.

        Applies ``MOJIBAKE_REPLACEMENTS`` as plain substring replacements.
        Curly quotes are mapped to their straight ASCII equivalents.
        """
        for broken, fixed in self.MOJIBAKE_REPLACEMENTS.items():
            text = text.replace(broken, fixed)
        return text

    def _remove_short_lines(self, text: str, min_words: int = 3) -> str:
        """Drop very short lines that are likely navigation or noise.

        Args:
            text: Text to filter, processed line by line.
            min_words: Minimum whitespace-separated word count for a line to
                be kept unconditionally.

        Returns:
            The text with short lines removed. Blank lines are kept (as
            empty strings) as paragraph separators, and short lines starting
            with ``#``, a digit, ``.``, ``-``, ``*`` or a bullet are kept
            because they look like headings or list items.
        """
        kept = []
        for line in text.split("\n"):
            stripped = line.strip()
            if not stripped:
                kept.append("")
            elif (len(stripped.split()) >= min_words
                    or self._HEADING_RE.match(stripped)):
                kept.append(line)
        return "\n".join(kept)

    def _strip_urls_in_text(self, text: str) -> str:
        """Remove inline ``http://`` / ``https://`` URLs from the text."""
        return self._URL_RE.sub('', text)

    def is_quality_content(self, text: str, min_length: int = 200) -> bool:
        """Check whether *text* meets a minimal quality threshold.

        Args:
            text: Text to evaluate.
            min_length: Minimum length of the stripped text, in characters.

        Returns:
            ``True`` when the stripped text has at least *min_length*
            characters and contains at least 20 alphabetic runs (ASCII
            letters plus ë, ç, Ç, Ë); ``False`` otherwise.
        """
        if not text or len(text.strip()) < min_length:
            return False

        # Require real words, not just numbers or symbols.
        return len(self._WORD_RE.findall(text)) >= 20