File size: 13,048 Bytes
ae3c6b8
 
 
 
 
 
 
 
 
 
 
 
 
 
bb9baa9
df41fce
 
 
 
 
 
 
 
ae3c6b8
df41fce
 
 
bb9baa9
 
 
 
ae3c6b8
bb9baa9
 
 
 
df41fce
ae3c6b8
bb9baa9
 
df41fce
ae3c6b8
 
 
 
df41fce
 
 
 
 
 
 
 
ae3c6b8
df41fce
 
 
bb9baa9
 
 
 
 
df41fce
bb9baa9
 
 
 
df41fce
 
 
bb9baa9
 
df41fce
bb9baa9
 
df41fce
bb9baa9
 
 
df41fce
 
 
bb9baa9
df41fce
bb9baa9
 
df41fce
 
 
bb9baa9
df41fce
bb9baa9
 
df41fce
 
 
 
bb9baa9
 
 
df41fce
 
 
 
 
bb9baa9
 
 
df41fce
bb9baa9
 
df41fce
 
 
 
bb9baa9
 
 
df41fce
bb9baa9
ae3c6b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb9baa9
 
 
 
df41fce
ae3c6b8
bb9baa9
df41fce
 
 
 
 
ae3c6b8
df41fce
 
 
 
bb9baa9
 
df41fce
bb9baa9
df41fce
 
 
 
 
 
 
 
 
 
 
 
 
 
ae3c6b8
 
 
 
 
 
 
df41fce
 
bb9baa9
 
 
df41fce
bb9baa9
 
df41fce
 
 
 
ae3c6b8
df41fce
 
 
bb9baa9
 
 
 
 
 
 
df41fce
ae3c6b8
bb9baa9
df41fce
bb9baa9
 
 
df41fce
bb9baa9
df41fce
 
 
 
 
ae3c6b8
 
 
df41fce
 
 
 
bb9baa9
 
 
 
df41fce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb9baa9
 
 
ae3c6b8
 
df41fce
ae3c6b8
df41fce
 
 
 
 
ae3c6b8
 
 
bb9baa9
 
ae3c6b8
df41fce
 
 
 
 
 
 
 
 
ae3c6b8
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
# ============================================================
# CHANGELOG - review_cleaner.py
# ============================================================
# Issue ID | Change Description                              | Lines Affected
# ------------------------------------------------------------
# PROC-02  | Added duplicate review detection with similarity | Lines ~95-130
#          | - Added _is_duplicate() method with fuzzy match  |
#          | - Added 'removed_duplicates' to stats tracking   |
#          | - Uses simple word overlap similarity (no deps)  |
#          | - Threshold: 85% similarity = duplicate          |
# ============================================================
# IMPORTANT: All other code is UNCHANGED from original working version
# ============================================================

"""
Review Text Cleaner - FIXED VERSION
Less aggressive cleaning that preserves more reviews.

FIXES:
1. Don't discard reviews just because they're short
2. Keep reviews with minimal cleaning
3. Better handling of special characters
4. Log what's being cleaned for debugging
5. [PROC-02] Detect and remove duplicate reviews

Author: Tushar Pingle
Updated: Nov 2024
"""

import re
import unicodedata
from typing import List, Tuple, Set


class ReviewCleaner:
    """
    Cleans review text while preserving as much content as possible.

    Cleaning is deliberately gentle: whitespace is collapsed, emoji
    pictographs and control characters are dropped, typographic quotes are
    normalized to ASCII quotes and very long reviews are truncated.
    ``clean_reviews`` additionally drops empty, too-short and
    (near-)duplicate reviews and records counters in ``self.stats``.
    """

    # Minimum length for a valid review (characters)
    MIN_REVIEW_LENGTH = 10  # Very permissive

    # Reviews longer than this are cut and suffixed with "..."
    MAX_REVIEW_LENGTH = 1500

    # [PROC-02] Similarity threshold for duplicate detection (0.0 to 1.0)
    DUPLICATE_SIMILARITY_THRESHOLD = 0.85

    # Only actual emoji pictographs are removed, not all non-ASCII text.
    # Compiled once at class creation instead of on every call.
    _EMOJI_PATTERN = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags
        "\U0001F900-\U0001F9FF"  # supplemental symbols
        "\U0001FA00-\U0001FA6F"  # chess symbols
        "\U0001FA70-\U0001FAFF"  # symbols extended
        "\U00002702-\U000027B0"  # dingbats
        "]+",
        flags=re.UNICODE
    )

    # Runs of letters in any script (digits and underscores excluded).
    # For pure ASCII text this matches exactly what r'\b[a-z]+\b' matched.
    _WORD_PATTERN = re.compile(r'\b[^\W\d_]+\b')

    # Typographic quotes -> ASCII quotes. Written as escapes so the mapping
    # survives any re-encoding of this source file.
    _QUOTE_TABLE = str.maketrans({
        '\u201c': '"',  # left double quotation mark
        '\u201d': '"',  # right double quotation mark
        '\u2018': "'",  # left single quotation mark
        '\u2019': "'",  # right single quotation mark
    })

    # Simple stop words (common words that don't help identify duplicates)
    _STOP_WORDS = frozenset({
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'is', 'was', 'are', 'were', 'be', 'been', 'being',
        'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
        'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those',
        'i', 'we', 'you', 'they', 'it', 'my', 'our', 'your', 'their', 'its',
        'very', 'really', 'so', 'just', 'also', 'as', 'if', 'when', 'where',
    })

    def __init__(self, verbose: bool = False):
        """
        Args:
            verbose: When True, print a message for every review that is
                dropped by ``clean_reviews``.
        """
        self.verbose = verbose
        self.stats = self._new_stats()

    @staticmethod
    def _new_stats(total: int = 0) -> dict:
        """Return a fresh counters dict for one cleaning run."""
        return {
            'total': total,
            'kept': 0,
            'removed_empty': 0,
            'removed_short': 0,
            'removed_duplicates': 0,  # [PROC-02] Added
            'chars_original': 0,
            'chars_cleaned': 0,
        }

    def clean_review(self, text: str) -> str:
        """
        Clean a single review text.

        Args:
            text: Raw review text. Falsy or non-string values are accepted.

        Returns:
            The cleaned text, or "" when ``text`` is falsy or not a string.
            Character counts are accumulated into ``self.stats`` for every
            string that gets past that initial check.
        """
        if not text or not isinstance(text, str):
            return ""

        original_len = len(text)

        # 1. Whitespace normalization: split() with no argument splits on
        #    every whitespace run, so newlines and tabs become single spaces.
        text = ' '.join(text.split())

        # 2. Remove emoji pictographs (basic punctuation is kept)
        text = self._remove_emojis(text)

        # 3. Normalize typographic quotes to ASCII quotes (don't remove them)
        text = text.translate(self._QUOTE_TABLE)

        # 4. Drop control/format characters (Unicode general category C*).
        #    A plain space is category Zs and is therefore kept.
        text = ''.join(ch for ch in text if unicodedata.category(ch)[0] != 'C')

        # 5. Collapse spaces left behind by the removals above
        text = re.sub(r'\s+', ' ', text)

        # 6. Truncate very long reviews, keeping the total at the limit
        if len(text) > self.MAX_REVIEW_LENGTH:
            text = text[:self.MAX_REVIEW_LENGTH - 3] + "..."

        # 7. Strip leading/trailing whitespace
        text = text.strip()

        # Track stats
        self.stats['chars_original'] += original_len
        self.stats['chars_cleaned'] += len(text)

        return text

    def _remove_emojis(self, text: str) -> str:
        """Return ``text`` with characters in the emoji ranges removed."""
        return self._EMOJI_PATTERN.sub('', text)

    # =========================================================================
    # [PROC-02] DUPLICATE DETECTION
    # =========================================================================
    def _get_word_set(self, text: str) -> Set[str]:
        """
        Extract the set of meaningful words from ``text`` for comparison.

        Words are lowercase runs of letters (any script). Stop words and
        words of one or two characters are ignored.
        """
        words = self._WORD_PATTERN.findall(text.lower())
        return {w for w in words if len(w) > 2 and w not in self._STOP_WORDS}

    @staticmethod
    def _score(text1: str, words1: Set[str], text2: str, words2: Set[str]) -> float:
        """
        Score two texts given their pre-computed word sets.

        Returns the Jaccard similarity of the word sets. When neither text
        has any meaningful word (digits, punctuation, stop words only),
        there is nothing to compare by overlap, so the texts only count as
        identical if they are equal ignoring case and whitespace layout.
        Treating every such pair as identical would discard unrelated
        reviews as duplicates.
        """
        if not words1 and not words2:
            same = ' '.join(text1.split()).casefold() == ' '.join(text2.split()).casefold()
            return 1.0 if same else 0.0
        if not words1 or not words2:
            return 0.0  # One empty = different

        # Jaccard similarity: intersection / union (union is non-empty here)
        return len(words1 & words2) / len(words1 | words2)

    def _calculate_similarity(self, text1: str, text2: str) -> float:
        """
        Calculate similarity between two texts using Jaccard similarity.

        Returns:
            A value from 0.0 (no shared meaningful words) to 1.0 (same set
            of meaningful words). This is a simple, dependency-free
            implementation.
        """
        return self._score(
            text1, self._get_word_set(text1),
            text2, self._get_word_set(text2),
        )

    def _is_duplicate(self, text: str, existing_reviews: List[str]) -> bool:
        """
        Check if ``text`` is a duplicate of any existing review.

        An exact match is tried first; otherwise each existing review is
        compared by word overlap against DUPLICATE_SIMILARITY_THRESHOLD.

        Returns:
            True if ``text`` is a duplicate, False otherwise.
        """
        # Quick exact match check first (fast)
        if text in existing_reviews:
            return True

        # Tokenize the candidate once rather than once per comparison
        candidate_words = self._get_word_set(text)

        # Fuzzy match for near-duplicates
        for existing in existing_reviews:
            similarity = self._score(
                text, candidate_words,
                existing, self._get_word_set(existing),
            )
            if similarity >= self.DUPLICATE_SIMILARITY_THRESHOLD:
                if self.verbose:
                    # \U0001F504 = "counterclockwise arrows" emoji
                    print(f"   \U0001F504 Found duplicate ({similarity:.0%} similar)")
                return True

        return False
    # =========================================================================
    # END [PROC-02] DUPLICATE DETECTION
    # =========================================================================

    def clean_reviews(self, reviews: List[str]) -> List[str]:
        """
        Clean a list of reviews.

        Each review is cleaned with ``clean_review`` and then dropped if the
        result is empty, shorter than MIN_REVIEW_LENGTH, or a (near-)
        duplicate of a review already kept. ``self.stats`` is reset at the
        start of every call.

        Returns:
            The cleaned reviews that were kept, in input order.
        """
        self.stats = self._new_stats(total=len(reviews))

        cleaned = []
        for i, review in enumerate(reviews):
            cleaned_text = self.clean_review(review)

            if not cleaned_text:
                self.stats['removed_empty'] += 1
                if self.verbose:
                    # \u26A0\uFE0F = warning sign emoji
                    print(f"   \u26A0\uFE0F  Review {i} was empty/None, skipping")
                continue

            if len(cleaned_text) < self.MIN_REVIEW_LENGTH:
                self.stats['removed_short'] += 1
                if self.verbose:
                    print(f"   \u26A0\uFE0F  Review {i} too short ({len(cleaned_text)} chars): '{cleaned_text[:50]}'")
                continue

            # [PROC-02] Check for duplicates against the reviews kept so far
            if self._is_duplicate(cleaned_text, cleaned):
                self.stats['removed_duplicates'] += 1
                if self.verbose:
                    print(f"   \U0001F504 Review {i} is a duplicate, skipping")
                continue

            cleaned.append(cleaned_text)
            self.stats['kept'] += 1

        return cleaned

    def get_cleaning_stats(self) -> dict:
        """
        Get statistics about the most recent cleaning run.

        Returns:
            A dict with counts, character totals and ``retention_rate``
            (percentage of reviews kept, rounded to one decimal; 0.0 when
            no reviews were processed).
        """
        return {
            "original_count": self.stats['total'],
            "cleaned_count": self.stats['kept'],
            "removed_empty": self.stats['removed_empty'],
            "removed_short": self.stats['removed_short'],
            "removed_duplicates": self.stats['removed_duplicates'],  # [PROC-02] Added
            "original_chars": self.stats['chars_original'],
            "cleaned_chars": self.stats['chars_cleaned'],
            # max(..., 1) guards against division by zero on an empty run
            "retention_rate": round(self.stats['kept'] / max(self.stats['total'], 1) * 100, 1)
        }


def clean_reviews_for_ai(reviews: List[str], verbose: bool = True) -> List[str]:
    """
    Convenience function to clean reviews.

    Runs a ReviewCleaner over ``reviews`` (cleaning, short/empty filtering
    and [PROC-02] duplicate detection) and optionally prints a summary.

    Args:
        reviews: Raw review texts.
        verbose: When True, print a summary of what was kept and removed.

    Returns:
        The cleaned reviews that were kept.
    """
    # The cleaner itself stays quiet: only the summary below is printed,
    # not one message per dropped review.
    cleaner = ReviewCleaner(verbose=False)
    cleaned = cleaner.clean_reviews(reviews)

    if verbose:
        stats = cleaner.get_cleaning_stats()
        # Emoji are written as escapes so they survive re-encoding of this
        # file: 1F9F9 broom, 2705 check mark, 274C cross mark,
        # 1F504 arrows, 26A0+FE0F warning sign.
        print(f"\U0001F9F9 Cleaned {stats['original_count']} reviews:")
        print(f"   \u2705 Kept: {stats['cleaned_count']} ({stats['retention_rate']}%)")
        if stats['removed_empty'] > 0:
            print(f"   \u274C Empty: {stats['removed_empty']}")
        if stats['removed_short'] > 0:
            print(f"   \u274C Too short: {stats['removed_short']}")
        # [PROC-02] Report duplicates
        if stats['removed_duplicates'] > 0:
            print(f"   \U0001F504 Duplicates: {stats['removed_duplicates']}")

        # Warn if we're losing too many reviews
        if stats['retention_rate'] < 50:
            print(f"   \u26A0\uFE0F  WARNING: Only {stats['retention_rate']}% retention! Check scraper.")

    return cleaned


# Also add a debug function
def analyze_review_loss(reviews: List[str]) -> None:
    """
    Debug function to understand why reviews are being lost.

    Classifies each raw review as empty (falsy or not a string), short
    (fewer than 10 characters after stripping) or valid, prints the first
    three examples of each problematic kind, and then prints a summary.

    Args:
        reviews: Raw review values; may be empty and may contain
            non-string entries such as None.
    """
    print(f"\n{'='*60}")
    print("REVIEW LOSS ANALYSIS")
    print(f"{'='*60}\n")

    empty_count = 0
    short_count = 0
    valid_count = 0

    print("Sample of problematic reviews:\n")

    for i, review in enumerate(reviews):
        if not review or not isinstance(review, str):
            empty_count += 1
            # Only show the first few examples to keep the output readable
            if empty_count <= 3:
                print(f"  [{i}] EMPTY: {repr(review)}")
        elif len(review.strip()) < 10:
            short_count += 1
            if short_count <= 3:
                print(f"  [{i}] SHORT ({len(review)} chars): '{review[:50]}'")
        else:
            valid_count += 1

    total = len(reviews)
    # Avoid ZeroDivisionError when no reviews were supplied
    valid_pct = valid_count / total * 100 if total else 0.0

    print(f"\n{'='*60}")
    print(f"SUMMARY:")
    print(f"  Total: {total}")
    print(f"  Valid: {valid_count} ({valid_pct:.1f}%)")
    print(f"  Empty: {empty_count}")
    print(f"  Short: {short_count}")
    print(f"{'='*60}\n")


if __name__ == "__main__":
    # Manual smoke test: runs the loss analysis and the cleaner over a small
    # sample that includes empty, short, None and duplicate entries.
    test_reviews = [
        'This place is "amazing"! The food was incredible.',
        "The food was great but service was slow. Would come back!",
        'Chef said "it\'s the best" and I agree! Great experience.',
        "Loved everything! Best Italian in town.",
        "",  # Empty
        "Good",  # Too short
        "   ",  # Just whitespace
        None,  # None
        "The pasta was perfectly cooked, al dente just how I like it.",
        # [PROC-02] Test duplicates
        "The food was great but service was slow. Would come back!",  # Exact duplicate
        "The food was great but the service was slow. Would come back again!",  # Near duplicate
    ]

    print("Testing review cleaner with duplicate detection...\n")

    # First analyze
    analyze_review_loss(test_reviews)

    # Then clean
    cleaned = clean_reviews_for_ai(test_reviews, verbose=True)

    print(f"\nCleaned reviews ({len(cleaned)}):")
    for i, review in enumerate(cleaned):
        print(f"  {i+1}. {review[:60]}...")

    # \u2705 = check mark emoji, written as an escape so it survives
    # re-encoding of this file.
    print("\n\u2705 Duplicate detection test complete!")