File size: 4,524 Bytes
b96f3a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"""
Text Cleaner - Cleans and normalizes scraped text.
Removes noise, normalizes encoding, standardizes formatting.
"""
import re
import logging
import unicodedata

# Module-level logger, registered under the fixed name "Cleaner".
logger = logging.getLogger("Cleaner")


class TextCleaner:
    """Cleans raw scraped text for dataset preparation.

    The public entry points are :meth:`clean_text`, which runs the whole
    pipeline, and :meth:`is_quality_content`, which applies a simple
    length/word-count threshold. The underscore-prefixed methods are the
    individual pipeline stages and can be called on their own.
    """

    # Patterns to remove. Each one is compiled with IGNORECASE | DOTALL in
    # __init__, so "." also matches newlines; the line-oriented patterns rely
    # on the lazy ".*?" stopping at the first newline.
    NOISE_PATTERNS = [
        r'Cookie[s]?\s*(Policy|Settings|Consent).*?(?:\n|$)',
        r'(Pranoj|Refuzoj)\s*cookie.*?(?:\n|$)',
        r'Të gjitha të drejtat.*?(?:\n|$)',
        r'All rights reserved.*?(?:\n|$)',
        r'©\s*\d{4}.*?(?:\n|$)',
        r'Share\s*(on)?\s*(Facebook|Twitter|LinkedIn|Email).*?(?:\n|$)',
        r'(Ndaj|Shpërndaj)\s*(në)?\s*(Facebook|Twitter).*?(?:\n|$)',
        r'(Loading|Duke\s*ngarkuar)\.{2,}',
        r'(Subscribe|Abonohu).*?(newsletter|buletinin).*?(?:\n|$)',
        # Confined to a single bracket pair: with DOTALL an unrestricted
        # ".*?" would swallow everything between an unrelated "[" and a
        # later "banner" tag.
        r'\[[^\[\]]*banner[^\[\]]*\]',
        # Word boundaries keep longer words ("advertisements", "reklamat")
        # from having their middle cut out.
        r'\bAdvertisement\b',
        r'\bReklama\b',
        r'<script.*?</script>',
        r'<style.*?</style>',
        r'{\s*[\w-]+\s*:.*?}',  # CSS remnants
    ]

    # Ordered (broken, repaired) pairs for UTF-8 text that was decoded as
    # Windows-1252. Written with escapes so the table survives any re-encoding
    # of this source file. These multi-character sequences must be replaced
    # BEFORE single curly quotes are normalized, because the dash sequences
    # themselves contain U+201C / U+201D.
    _MOJIBAKE_FIXES = (
        ("\u00c3\u00ab", "\u00eb"),        # ë
        ("\u00c3\u2039", "\u00cb"),        # Ë
        ("\u00c3\u00a7", "\u00e7"),        # ç
        ("\u00c3\u2021", "\u00c7"),        # Ç
        ("\u00e2\u20ac\u201c", "\u2013"),  # en dash
        ("\u00e2\u20ac\u201d", "\u2014"),  # em dash
        ("\u00e2\u20ac\u0153", '"'),       # left double quote
        ("\u00e2\u20ac\u009d", '"'),       # right double quote (0x9D is unmapped in cp1252)
        ("\u00e2\u20ac\u02dc", "'"),       # left single quote
        ("\u00e2\u20ac\u2122", "'"),       # right single quote
    )

    # Typographic quotes are straightened to their ASCII equivalents.
    _QUOTE_TABLE = str.maketrans({
        "\u201c": '"',
        "\u201d": '"',
        "\u2018": "'",
        "\u2019": "'",
    })

    def __init__(self):
        """Pre-compile every noise pattern (case-insensitive, dot matches newline)."""
        self.compiled_patterns = [
            re.compile(pattern, re.IGNORECASE | re.DOTALL)
            for pattern in self.NOISE_PATTERNS
        ]

    def clean_text(self, text: str) -> str:
        """Run the full cleaning pipeline and return the cleaned text.

        Returns an empty string for empty/None input. Stage order matters:

        1. Unicode normalization and encoding repair run first so the
           Albanian noise patterns can match text that arrived as mojibake.
        2. Noise and URLs are removed next.
        3. Short-line filtering runs after URL removal, so lines that only
           looked long enough because of a URL are dropped too.
        4. Whitespace normalization runs last, collapsing the gaps and
           double spaces that the earlier removals leave behind.
        """
        if not text:
            return ""

        text = self._normalize_unicode(text)
        text = self._fix_albanian_chars(text)
        text = self._remove_noise(text)
        text = self._strip_urls_in_text(text)
        text = self._remove_short_lines(text)
        text = self._normalize_whitespace(text)

        return text.strip()

    def _normalize_unicode(self, text: str) -> str:
        """Normalize Unicode to NFC form."""
        return unicodedata.normalize("NFC", text)

    def _remove_noise(self, text: str) -> str:
        """Delete every match of the compiled noise patterns, in list order."""
        for pattern in self.compiled_patterns:
            text = pattern.sub("", text)
        return text

    def _normalize_whitespace(self, text: str) -> str:
        """Normalize whitespace while preserving paragraph structure.

        Tabs become spaces, runs of spaces collapse to one, trailing
        whitespace is stripped from every line, and three or more
        consecutive newlines collapse to a single blank line.
        """
        text = text.replace("\t", " ")
        text = re.sub(r'[ ]{2,}', ' ', text)
        # Strip line ends BEFORE collapsing newlines so whitespace-only lines
        # (and the "\r" of CRLF endings) count as empty lines.
        text = "\n".join(line.rstrip() for line in text.split("\n"))
        text = re.sub(r'\n{3,}', '\n\n', text)
        return text

    def _fix_albanian_chars(self, text: str) -> str:
        """Repair common mojibake for Albanian letters, dashes and quotes.

        Double-encoded sequences (UTF-8 bytes read as Windows-1252) are
        mapped back to the intended character; typographic quotes, whether
        they arrived intact or as mojibake, are straightened to ASCII quotes.
        """
        for broken, repaired in self._MOJIBAKE_FIXES:
            text = text.replace(broken, repaired)
        return text.translate(self._QUOTE_TABLE)

    def _remove_short_lines(self, text: str, min_words: int = 3) -> str:
        """Remove very short lines that are likely navigation or noise.

        Blank lines are kept as paragraph separators. A non-blank line is
        kept when it has at least ``min_words`` whitespace-separated words,
        or when it starts with a heading/list marker (``#``, a digit, ``.``,
        ``-``, ``*`` or a bullet).
        """
        kept = []
        for line in text.split("\n"):
            stripped = line.strip()
            if not stripped:
                kept.append("")
            elif len(stripped.split()) >= min_words:
                kept.append(line)
            elif re.match(r'^[#\d\.\-\*•]', stripped):
                kept.append(line)
        return "\n".join(kept)

    def _strip_urls_in_text(self, text: str) -> str:
        """Remove inline http(s) URLs from the text body."""
        url_pattern = r'https?://\S+'
        return re.sub(url_pattern, '', text)

    def is_quality_content(self, text: str, min_length: int = 200) -> bool:
        """Check if text meets the quality threshold.

        The text must be at least ``min_length`` characters long after
        stripping and contain at least 20 alphabetic word runs (ASCII
        letters plus ë/ç/Ç/Ë), so pages of numbers or symbols are rejected.
        """
        if not text or len(text.strip()) < min_length:
            return False

        # Check for actual Albanian/English words (not just numbers/symbols)
        words = re.findall(r'[a-zA-ZëçÇË]+', text)
        return len(words) >= 20