Spaces:

TushP
/

restaurant-intelligence-agent

Sleeping

File size: 13,048 Bytes

ae3c6b8
 
 
 
 
 
 
 
 
 
 
 
 
 
bb9baa9
df41fce
 
 
 
 
 
 
 
ae3c6b8
df41fce
 
 
bb9baa9
 
 
 
ae3c6b8
bb9baa9
 
 
 
df41fce
ae3c6b8
bb9baa9
 
df41fce
ae3c6b8
 
 
 
df41fce
 
 
 
 
 
 
 
ae3c6b8
df41fce
 
 
bb9baa9
 
 
 
 
df41fce
bb9baa9
 
 
 
df41fce
 
 
bb9baa9
 
df41fce
bb9baa9
 
df41fce
bb9baa9
 
 
df41fce
 
 
bb9baa9
df41fce
bb9baa9
 
df41fce
 
 
bb9baa9
df41fce
bb9baa9
 
df41fce
 
 
 
bb9baa9
 
 
df41fce
 
 
 
 
bb9baa9
 
 
df41fce
bb9baa9
 
df41fce
 
 
 
bb9baa9
 
 
df41fce
bb9baa9
ae3c6b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb9baa9
 
 
 
df41fce
ae3c6b8
bb9baa9
df41fce
 
 
 
 
ae3c6b8
df41fce
 
 
 
bb9baa9
 
df41fce
bb9baa9
df41fce
 
 
 
 
 
 
 
 
 
 
 
 
 
ae3c6b8
 
 
 
 
 
 
df41fce
 
bb9baa9
 
 
df41fce
bb9baa9
 
df41fce
 
 
 
ae3c6b8
df41fce
 
 
bb9baa9
 
 
 
 
 
 
df41fce
ae3c6b8
bb9baa9
df41fce
bb9baa9
 
 
df41fce
bb9baa9
df41fce
 
 
 
 
ae3c6b8
 
 
df41fce
 
 
 
bb9baa9
 
 
 
df41fce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb9baa9
 
 
ae3c6b8
 
df41fce
ae3c6b8
df41fce
 
 
 
 
ae3c6b8
 
 
bb9baa9
 
ae3c6b8
df41fce
 
 
 
 
 
 
 
 
ae3c6b8

# ============================================================
# CHANGELOG - review_cleaner.py
# ============================================================
# Issue ID | Change Description                              | Lines Affected
# ------------------------------------------------------------
# PROC-02  | Added duplicate review detection with similarity | Lines ~95-130
#          | - Added _is_duplicate() method with fuzzy match  |
#          | - Added 'removed_duplicates' to stats tracking   |
#          | - Uses simple word overlap similarity (no deps)  |
#          | - Threshold: 85% similarity = duplicate          |
# ============================================================
# IMPORTANT: All other code is UNCHANGED from original working version
# ============================================================

"""
Review Text Cleaner - FIXED VERSION
Less aggressive cleaning that preserves more reviews.

FIXES:
1. Don't discard reviews just because they're short
2. Keep reviews with minimal cleaning
3. Better handling of special characters
4. Log what's being cleaned for debugging
5. [PROC-02] Detect and remove duplicate reviews

Author: Tushar Pingle
Updated: Nov 2024
"""

import re
import unicodedata
from typing import List, Tuple, Set


class ReviewCleaner:
    """
    Cleans review text while preserving as much content as possible.
    Now includes duplicate detection.
    """
    
    # Minimum length for a valid review (characters)
    MIN_REVIEW_LENGTH = 10  # Very permissive
    
    # [PROC-02] Similarity threshold for duplicate detection (0.0 to 1.0)
    DUPLICATE_SIMILARITY_THRESHOLD = 0.85
    
    def __init__(self, verbose: bool = False):
        self.verbose = verbose
        self.stats = {
            'total': 0,
            'kept': 0,
            'removed_empty': 0,
            'removed_short': 0,
            'removed_duplicates': 0,  # [PROC-02] Added
            'chars_original': 0,
            'chars_cleaned': 0
        }
    
    def clean_review(self, text: str) -> str:
        """
        Clean a single review text.
        
        FIXED: Less aggressive cleaning, preserves more content.
        """
        if not text or not isinstance(text, str):
            return ""
        
        original_len = len(text)
        
        # 1. Basic whitespace normalization (gentle)
        text = ' '.join(text.split())
        
        # 2. Remove only truly problematic emojis (keep basic punctuation)
        text = self._remove_emojis(text)
        
        # 3. Normalize quotes (don't remove them)
        text = text.replace('"', '"').replace('"', '"')
        text = text.replace("'", "'").replace("'", "'")
        
        # 4. Remove control characters only (keep newlines as spaces)
        text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        text = ''.join(char for char in text if unicodedata.category(char)[0] != 'C' or char == ' ')
        
        # 5. Normalize multiple spaces
        text = re.sub(r'\s+', ' ', text)
        
        # 6. Truncate very long reviews (>1500 chars) - increased limit
        if len(text) > 1500:
            text = text[:1497] + "..."
        
        # 7. Strip whitespace
        text = text.strip()
        
        # Track stats
        self.stats['chars_original'] += original_len
        self.stats['chars_cleaned'] += len(text)
        
        return text
    
    def _remove_emojis(self, text: str) -> str:
        """
        Remove emojis but keep more unicode characters.
        FIXED: Less aggressive pattern.
        """
        # Only remove actual emoji pictographs, not all unicode
        emoji_pattern = re.compile(
            "["
            "\U0001F600-\U0001F64F"  # emoticons
            "\U0001F300-\U0001F5FF"  # symbols & pictographs  
            "\U0001F680-\U0001F6FF"  # transport & map symbols
            "\U0001F1E0-\U0001F1FF"  # flags
            "\U0001F900-\U0001F9FF"  # supplemental symbols
            "\U0001FA00-\U0001FA6F"  # chess symbols
            "\U0001FA70-\U0001FAFF"  # symbols extended
            "\U00002702-\U000027B0"  # dingbats
            "]+",
            flags=re.UNICODE
        )
        return emoji_pattern.sub('', text)
    
    # =========================================================================
    # [PROC-02] DUPLICATE DETECTION - NEW METHOD
    # =========================================================================
    def _get_word_set(self, text: str) -> Set[str]:
        """
        Extract set of meaningful words from text for comparison.
        Ignores common stop words and very short words.
        """
        # Simple stop words (common words that don't help identify duplicates)
        stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
            'of', 'with', 'by', 'is', 'was', 'are', 'were', 'be', 'been', 'being',
            'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
            'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those',
            'i', 'we', 'you', 'they', 'it', 'my', 'our', 'your', 'their', 'its',
            'very', 'really', 'so', 'just', 'also', 'as', 'if', 'when', 'where'
        }
        
        # Extract words (alphanumeric only, lowercase)
        words = re.findall(r'\b[a-z]+\b', text.lower())
        
        # Filter out stop words and very short words
        meaningful = {w for w in words if len(w) > 2 and w not in stop_words}
        
        return meaningful
    
    def _calculate_similarity(self, text1: str, text2: str) -> float:
        """
        Calculate similarity between two texts using Jaccard similarity.
        Returns value from 0.0 (completely different) to 1.0 (identical).
        
        This is a simple, dependency-free implementation.
        """
        words1 = self._get_word_set(text1)
        words2 = self._get_word_set(text2)
        
        # Handle edge cases
        if not words1 and not words2:
            return 1.0  # Both empty = same
        if not words1 or not words2:
            return 0.0  # One empty = different
        
        # Jaccard similarity: intersection / union
        intersection = len(words1 & words2)
        union = len(words1 | words2)
        
        return intersection / union if union > 0 else 0.0
    
    def _is_duplicate(self, text: str, existing_reviews: List[str]) -> bool:
        """
        Check if text is a duplicate of any existing review.
        Uses fuzzy matching to catch near-duplicates.
        
        Returns True if text is a duplicate, False otherwise.
        """
        # Quick exact match check first (fast)
        if text in existing_reviews:
            return True
        
        # Fuzzy match for near-duplicates
        for existing in existing_reviews:
            similarity = self._calculate_similarity(text, existing)
            if similarity >= self.DUPLICATE_SIMILARITY_THRESHOLD:
                if self.verbose:
                    print(f"   🔄 Found duplicate ({similarity:.0%} similar)")
                return True
        
        return False
    # =========================================================================
    # END [PROC-02] DUPLICATE DETECTION
    # =========================================================================
    
    def clean_reviews(self, reviews: List[str]) -> List[str]:
        """
        Clean a list of reviews.
        
        FIXED: Only removes truly empty reviews, not short ones.
        [PROC-02] Now also removes duplicate reviews.
        """
        self.stats = {
            'total': len(reviews),
            'kept': 0,
            'removed_empty': 0,
            'removed_short': 0,
            'removed_duplicates': 0,  # [PROC-02] Added
            'chars_original': 0,
            'chars_cleaned': 0
        }
        
        cleaned = []
        for i, review in enumerate(reviews):
            # Clean the review
            cleaned_text = self.clean_review(review)
            
            # Check if it's still valid
            if not cleaned_text:
                self.stats['removed_empty'] += 1
                if self.verbose:
                    print(f"   ⚠️  Review {i} was empty/None, skipping")
                continue
            
            if len(cleaned_text) < self.MIN_REVIEW_LENGTH:
                self.stats['removed_short'] += 1
                if self.verbose:
                    print(f"   ⚠️  Review {i} too short ({len(cleaned_text)} chars): '{cleaned_text[:50]}'")
                continue
            
            # [PROC-02] Check for duplicates
            if self._is_duplicate(cleaned_text, cleaned):
                self.stats['removed_duplicates'] += 1
                if self.verbose:
                    print(f"   🔄 Review {i} is a duplicate, skipping")
                continue
            
            cleaned.append(cleaned_text)
            self.stats['kept'] += 1
        
        return cleaned
    
    def get_cleaning_stats(self) -> dict:
        """Get statistics about the cleaning process."""
        return {
            "original_count": self.stats['total'],
            "cleaned_count": self.stats['kept'],
            "removed_empty": self.stats['removed_empty'],
            "removed_short": self.stats['removed_short'],
            "removed_duplicates": self.stats['removed_duplicates'],  # [PROC-02] Added
            "original_chars": self.stats['chars_original'],
            "cleaned_chars": self.stats['chars_cleaned'],
            "retention_rate": round(self.stats['kept'] / max(self.stats['total'], 1) * 100, 1)
        }


def clean_reviews_for_ai(reviews: List[str], verbose: bool = True) -> List[str]:
    """
    Convenience function to clean reviews.
    
    FIXED: Better stats reporting, less aggressive cleaning.
    [PROC-02] Now includes duplicate detection.
    """
    cleaner = ReviewCleaner(verbose=False)  # Don't spam individual messages
    cleaned = cleaner.clean_reviews(reviews)
    
    if verbose:
        stats = cleaner.get_cleaning_stats()
        print(f"🧹 Cleaned {stats['original_count']} reviews:")
        print(f"   ✅ Kept: {stats['cleaned_count']} ({stats['retention_rate']}%)")
        if stats['removed_empty'] > 0:
            print(f"   ❌ Empty: {stats['removed_empty']}")
        if stats['removed_short'] > 0:
            print(f"   ❌ Too short: {stats['removed_short']}")
        # [PROC-02] Report duplicates
        if stats['removed_duplicates'] > 0:
            print(f"   🔄 Duplicates: {stats['removed_duplicates']}")
        
        # Warn if we're losing too many reviews
        if stats['retention_rate'] < 50:
            print(f"   ⚠️  WARNING: Only {stats['retention_rate']}% retention! Check scraper.")
    
    return cleaned


# Also add a debug function
def analyze_review_loss(reviews: List[str]) -> None:
    """
    Debug function to understand why reviews are being lost.
    """
    print(f"\n{'='*60}")
    print("REVIEW LOSS ANALYSIS")
    print(f"{'='*60}\n")
    
    empty_count = 0
    short_count = 0
    valid_count = 0
    
    print("Sample of problematic reviews:\n")
    
    for i, review in enumerate(reviews):
        if not review or not isinstance(review, str):
            empty_count += 1
            if empty_count <= 3:
                print(f"  [{i}] EMPTY: {repr(review)}")
        elif len(review.strip()) < 10:
            short_count += 1
            if short_count <= 3:
                print(f"  [{i}] SHORT ({len(review)} chars): '{review[:50]}'")
        else:
            valid_count += 1
    
    print(f"\n{'='*60}")
    print(f"SUMMARY:")
    print(f"  Total: {len(reviews)}")
    print(f"  Valid: {valid_count} ({valid_count/len(reviews)*100:.1f}%)")
    print(f"  Empty: {empty_count}")
    print(f"  Short: {short_count}")
    print(f"{'='*60}\n")


if __name__ == "__main__":
    # Test the cleaner
    test_reviews = [
        'This place is "amazing"! The food was incredible.',
        "The food was great but service was slow. Would come back!",
        'Chef said "it\'s the best" and I agree! Great experience.',
        "Loved everything! Best Italian in town.",
        "",  # Empty
        "Good",  # Too short
        "   ",  # Just whitespace
        None,  # None
        "The pasta was perfectly cooked, al dente just how I like it.",
        # [PROC-02] Test duplicates
        "The food was great but service was slow. Would come back!",  # Exact duplicate
        "The food was great but the service was slow. Would come back again!",  # Near duplicate
    ]
    
    print("Testing review cleaner with duplicate detection...\n")
    
    # First analyze
    analyze_review_loss(test_reviews)
    
    # Then clean
    cleaned = clean_reviews_for_ai(test_reviews, verbose=True)
    
    print(f"\nCleaned reviews ({len(cleaned)}):")
    for i, review in enumerate(cleaned):
        print(f"  {i+1}. {review[:60]}...")
    
    print("\n✅ Duplicate detection test complete!")