# ============================================================
# CHANGELOG - review_cleaner.py
# ============================================================
# Issue ID | Change Description                                | Lines Affected
# ------------------------------------------------------------
# PROC-02  | Added duplicate review detection with similarity  | Lines ~95-130
#          |   - Added _is_duplicate() method with fuzzy match |
#          |   - Added 'removed_duplicates' to stats tracking  |
#          |   - Uses simple word overlap similarity (no deps) |
#          |   - Threshold: 85% similarity = duplicate         |
# ============================================================
# IMPORTANT: All other code is UNCHANGED from original working version
# ============================================================
| """ | |
| Review Text Cleaner - FIXED VERSION | |
| Less aggressive cleaning that preserves more reviews. | |
| FIXES: | |
| 1. Don't discard reviews just because they're short | |
| 2. Keep reviews with minimal cleaning | |
| 3. Better handling of special characters | |
| 4. Log what's being cleaned for debugging | |
| 5. [PROC-02] Detect and remove duplicate reviews | |
| Author: Tushar Pingle | |
| Updated: Nov 2024 | |
| """ | |

import re
import unicodedata
from typing import List, Set


class ReviewCleaner:
    """
    Cleans review text while preserving as much content as possible.
    Now includes duplicate detection.
    """

    # Minimum length for a valid review (characters)
    MIN_REVIEW_LENGTH = 10  # Very permissive

    # [PROC-02] Similarity threshold for duplicate detection (0.0 to 1.0)
    DUPLICATE_SIMILARITY_THRESHOLD = 0.85

    def __init__(self, verbose: bool = False):
        self.verbose = verbose
        self.stats = {
            'total': 0,
            'kept': 0,
            'removed_empty': 0,
            'removed_short': 0,
            'removed_duplicates': 0,  # [PROC-02] Added
            'chars_original': 0,
            'chars_cleaned': 0
        }

    def clean_review(self, text: str) -> str:
        """
        Clean a single review text.
        FIXED: Less aggressive cleaning, preserves more content.
        """
        if not text or not isinstance(text, str):
            return ""

        original_len = len(text)

        # 1. Basic whitespace normalization (gentle)
        text = ' '.join(text.split())

        # 2. Remove only truly problematic emojis (keep basic punctuation)
        text = self._remove_emojis(text)

        # 3. Normalize curly quotes to straight quotes (don't remove them)
        text = text.replace('\u201c', '"').replace('\u201d', '"')  # “ ” -> "
        text = text.replace('\u2018', "'").replace('\u2019', "'")  # ‘ ’ -> '

        # 4. Remove control characters only (keep newlines as spaces)
        text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        text = ''.join(char for char in text if unicodedata.category(char)[0] != 'C' or char == ' ')

        # 5. Normalize multiple spaces
        text = re.sub(r'\s+', ' ', text)

        # 6. Truncate very long reviews (>1500 chars) - increased limit
        if len(text) > 1500:
            text = text[:1497] + "..."

        # 7. Strip whitespace
        text = text.strip()

        # Track stats
        self.stats['chars_original'] += original_len
        self.stats['chars_cleaned'] += len(text)

        return text
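
    # Illustrative before/after (added commentary; the sample input is
    # hypothetical, not from the original test suite):
    #   clean_review('  Great\tfood!\n“Loved it” 😀  ')
    #   -> 'Great food! "Loved it"'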

    def _remove_emojis(self, text: str) -> str:
        """
        Remove emojis but keep more unicode characters.
        FIXED: Less aggressive pattern.
        """
        # Only remove actual emoji pictographs, not all unicode
        emoji_pattern = re.compile(
            "["
            "\U0001F600-\U0001F64F"  # emoticons
            "\U0001F300-\U0001F5FF"  # symbols & pictographs
            "\U0001F680-\U0001F6FF"  # transport & map symbols
            "\U0001F1E0-\U0001F1FF"  # flags
            "\U0001F900-\U0001F9FF"  # supplemental symbols
            "\U0001FA00-\U0001FA6F"  # chess symbols
            "\U0001FA70-\U0001FAFF"  # symbols extended
            "\U00002702-\U000027B0"  # dingbats
            "]+",
            flags=re.UNICODE
        )
        return emoji_pattern.sub('', text)
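
    # Example (added for clarity; the sample string is hypothetical):
    # pictographs in the ranges above are stripped, accented text is kept,
    # and leftover spacing is collapsed later by clean_review() step 5.
    #   _remove_emojis('Café was great 🚀👍') -> 'Café was great '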

    # =========================================================================
    # [PROC-02] DUPLICATE DETECTION - NEW METHOD
    # =========================================================================

    def _get_word_set(self, text: str) -> Set[str]:
        """
        Extract set of meaningful words from text for comparison.
        Ignores common stop words and very short words.
        """
        # Simple stop words (common words that don't help identify duplicates)
        stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
            'of', 'with', 'by', 'is', 'was', 'are', 'were', 'be', 'been', 'being',
            'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
            'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those',
            'i', 'we', 'you', 'they', 'it', 'my', 'our', 'your', 'their', 'its',
            'very', 'really', 'so', 'just', 'also', 'as', 'if', 'when', 'where'
        }

        # Extract words (alphabetic only, lowercase)
        words = re.findall(r'\b[a-z]+\b', text.lower())

        # Filter out stop words and very short words
        meaningful = {w for w in words if len(w) > 2 and w not in stop_words}
        return meaningful
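
    # Illustrative example (added; the sample text is hypothetical):
    #   _get_word_set("The food was great, really great!")
    #   -> {'food', 'great'}
    # ('the', 'was', 'really' are stop words; repeats collapse in the set)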

    def _calculate_similarity(self, text1: str, text2: str) -> float:
        """
        Calculate similarity between two texts using Jaccard similarity.
        Returns a value from 0.0 (completely different) to 1.0 (identical).
        This is a simple, dependency-free implementation.
        """
        words1 = self._get_word_set(text1)
        words2 = self._get_word_set(text2)

        # Handle edge cases
        if not words1 and not words2:
            return 1.0  # Both empty = same
        if not words1 or not words2:
            return 0.0  # One empty = different

        # Jaccard similarity: intersection / union
        intersection = len(words1 & words2)
        union = len(words1 | words2)
        return intersection / union if union > 0 else 0.0
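
    # Worked example (added commentary; the texts are hypothetical):
    #   "great food, slow service" -> {'great', 'food', 'slow', 'service'}
    #   "great food, fast service" -> {'great', 'food', 'fast', 'service'}
    #   intersection = 3, union = 5 -> similarity = 3/5 = 0.6
    #   0.6 < 0.85, so these two would NOT be flagged as duplicates.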

    def _is_duplicate(self, text: str, existing_reviews: List[str]) -> bool:
        """
        Check if text is a duplicate of any existing review.
        Uses fuzzy matching to catch near-duplicates.
        Returns True if text is a duplicate, False otherwise.
        """
        # Quick exact match check first (fast)
        if text in existing_reviews:
            return True

        # Fuzzy match for near-duplicates
        for existing in existing_reviews:
            similarity = self._calculate_similarity(text, existing)
            if similarity >= self.DUPLICATE_SIMILARITY_THRESHOLD:
                if self.verbose:
                    print(f"   🔄 Found duplicate ({similarity:.0%} similar)")
                return True

        return False
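
    # Performance note (added): each candidate is compared against every
    # previously kept review, so clean_reviews() is O(n^2) overall. A sketch
    # of one possible fast path for large batches (hypothetical, not part of
    # the original design) is to track exact texts in a set:
    #
    #   seen: Set[str] = set()   # maintained alongside `cleaned`
    #   if text in seen:         # O(1) exact-match short-circuit
    #       return True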

    # =========================================================================
    # END [PROC-02] DUPLICATE DETECTION
    # =========================================================================

    def clean_reviews(self, reviews: List[str]) -> List[str]:
        """
        Clean a list of reviews.
        FIXED: Only removes truly empty reviews, not short ones.
        [PROC-02] Now also removes duplicate reviews.
        """
        self.stats = {
            'total': len(reviews),
            'kept': 0,
            'removed_empty': 0,
            'removed_short': 0,
            'removed_duplicates': 0,  # [PROC-02] Added
            'chars_original': 0,
            'chars_cleaned': 0
        }

        cleaned = []
        for i, review in enumerate(reviews):
            # Clean the review
            cleaned_text = self.clean_review(review)

            # Check if it's still valid
            if not cleaned_text:
                self.stats['removed_empty'] += 1
                if self.verbose:
                    print(f"   ⚠️ Review {i} was empty/None, skipping")
                continue

            if len(cleaned_text) < self.MIN_REVIEW_LENGTH:
                self.stats['removed_short'] += 1
                if self.verbose:
                    print(f"   ⚠️ Review {i} too short ({len(cleaned_text)} chars): '{cleaned_text[:50]}'")
                continue

            # [PROC-02] Check for duplicates
            if self._is_duplicate(cleaned_text, cleaned):
                self.stats['removed_duplicates'] += 1
                if self.verbose:
                    print(f"   🔄 Review {i} is a duplicate, skipping")
                continue

            cleaned.append(cleaned_text)
            self.stats['kept'] += 1

        return cleaned

    def get_cleaning_stats(self) -> dict:
        """Get statistics about the cleaning process."""
        return {
            "original_count": self.stats['total'],
            "cleaned_count": self.stats['kept'],
            "removed_empty": self.stats['removed_empty'],
            "removed_short": self.stats['removed_short'],
            "removed_duplicates": self.stats['removed_duplicates'],  # [PROC-02] Added
            "original_chars": self.stats['chars_original'],
            "cleaned_chars": self.stats['chars_cleaned'],
            "retention_rate": round(self.stats['kept'] / max(self.stats['total'], 1) * 100, 1)
        }


def clean_reviews_for_ai(reviews: List[str], verbose: bool = True) -> List[str]:
    """
    Convenience function to clean reviews.
    FIXED: Better stats reporting, less aggressive cleaning.
    [PROC-02] Now includes duplicate detection.
    """
    cleaner = ReviewCleaner(verbose=False)  # Don't spam individual messages
    cleaned = cleaner.clean_reviews(reviews)

    if verbose:
        stats = cleaner.get_cleaning_stats()
        print(f"🧹 Cleaned {stats['original_count']} reviews:")
        print(f"   ✅ Kept: {stats['cleaned_count']} ({stats['retention_rate']}%)")
        if stats['removed_empty'] > 0:
            print(f"   ❌ Empty: {stats['removed_empty']}")
        if stats['removed_short'] > 0:
            print(f"   ❌ Too short: {stats['removed_short']}")
        # [PROC-02] Report duplicates
        if stats['removed_duplicates'] > 0:
            print(f"   🔄 Duplicates: {stats['removed_duplicates']}")
        # Warn if we're losing too many reviews
        if stats['retention_rate'] < 50:
            print(f"   ⚠️ WARNING: Only {stats['retention_rate']}% retention! Check scraper.")

    return cleaned
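
# Usage sketch (added for illustration; `scraped_reviews` is a hypothetical
# variable standing in for whatever the scraper returns):
#
#   scraped_reviews = ["Great food!", "Great food!", "", "Slow but tasty meal."]
#   cleaned = clean_reviews_for_ai(scraped_reviews, verbose=True)
#   # -> keeps the unique, non-empty reviews and prints a cleaning summary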


# Also add a debug function
def analyze_review_loss(reviews: List[str]) -> None:
    """
    Debug function to understand why reviews are being lost.
    """
    print(f"\n{'='*60}")
    print("REVIEW LOSS ANALYSIS")
    print(f"{'='*60}\n")

    empty_count = 0
    short_count = 0
    valid_count = 0

    print("Sample of problematic reviews:\n")
    for i, review in enumerate(reviews):
        if not review or not isinstance(review, str):
            empty_count += 1
            if empty_count <= 3:
                print(f"  [{i}] EMPTY: {repr(review)}")
        elif len(review.strip()) < 10:
            short_count += 1
            if short_count <= 3:
                print(f"  [{i}] SHORT ({len(review)} chars): '{review[:50]}'")
        else:
            valid_count += 1
| print(f"\n{'='*60}") | |
| print(f"SUMMARY:") | |
| print(f" Total: {len(reviews)}") | |
| print(f" Valid: {valid_count} ({valid_count/len(reviews)*100:.1f}%)") | |
| print(f" Empty: {empty_count}") | |
| print(f" Short: {short_count}") | |
| print(f"{'='*60}\n") | |


if __name__ == "__main__":
    # Test the cleaner (curly quotes exercise the quote normalization)
    test_reviews = [
        'This place is “amazing”! The food was incredible.',
        "The food was great but service was slow. Would come back!",
        'Chef said “it’s the best” and I agree! Great experience.',
        "Loved everything! Best Italian in town.",
        "",  # Empty
        "Good",  # Too short
        "   ",  # Just whitespace
        None,  # None
        "The pasta was perfectly cooked, al dente just how I like it.",
        # [PROC-02] Test duplicates
        "The food was great but service was slow. Would come back!",  # Exact duplicate
        "The food was great but the service was slow. Would come back again!",  # Near duplicate
    ]

    print("Testing review cleaner with duplicate detection...\n")

    # First analyze
    analyze_review_loss(test_reviews)

    # Then clean
    cleaned = clean_reviews_for_ai(test_reviews, verbose=True)

    print(f"\nCleaned reviews ({len(cleaned)}):")
    for i, review in enumerate(cleaned):
        print(f"  {i+1}. {review[:60]}...")

    print("\n✅ Duplicate detection test complete!")