TushP's picture
Upload folder using huggingface_hub
ae3c6b8 verified
# ============================================================
# CHANGELOG - review_processor.py
# ============================================================
# Issue ID | Change Description | Lines Affected
# ------------------------------------------------------------
# PROC-01 | Added multi-format handling (NESTED + legacy) | Lines ~30-80
# | - Detects format from scraper result |
# | - Handles both OpenTable and Google Maps |
# NEW | Dynamic source detection from metadata | Lines ~85-90
# | - Uses metadata.source if available |
# | - Fallback to 'unknown' |
# NEW | Graceful handling of missing rating fields | Lines ~50-70
# | - Google Maps lacks food/service/ambience |
# | - Fills with 0.0 if missing |
# ============================================================
# IMPORTANT: All other code is UNCHANGED from original working version
# ============================================================
"""
Review data processor - Converts scraped JSON to clean pandas DataFrame
UPDATED: Now supports both OpenTable and Google Maps scrapers
- Handles NESTED format (new standard)
- Handles legacy FLAT format (backwards compatible)
- Graceful handling of missing fields (Google Maps doesn't have sub-ratings)
"""
import pandas as pd
from typing import Dict, Any, List, Optional
from pathlib import Path
def process_reviews(scraper_result: Dict[str, Any]) -> pd.DataFrame:
"""
Convert scraper output to clean pandas DataFrame.
Supports multiple input formats:
1. NESTED format: {'reviews': {'names': [...], 'review_texts': [...], ...}}
2. FLAT format (legacy): {'names': [...], 'reviews': [...], ...}
Args:
scraper_result: Output from scrape_opentable() or scrape_google_maps()
Returns:
DataFrame with columns: name, date, overall_rating, food_rating,
service_rating, ambience_rating, review_text, source
"""
if not scraper_result.get('success', False):
raise ValueError(f"Scraper failed: {scraper_result.get('error', 'Unknown error')}")
# =========================================================================
# [PROC-01] Detect format and extract reviews data
# =========================================================================
reviews_data = scraper_result.get('reviews', {})
# FORMAT 1: NESTED dict (new standard - both scrapers use this now)
# {'reviews': {'names': [...], 'dates': [...], 'review_texts': [...], ...}}
if isinstance(reviews_data, dict) and 'review_texts' in reviews_data:
print("📋 Detected NESTED format")
n = len(reviews_data.get('review_texts', []))
if n == 0:
raise ValueError("No reviews found in NESTED format response")
df = pd.DataFrame({
'name': _safe_get_list(reviews_data, 'names', n),
'date': _safe_get_list(reviews_data, 'dates', n),
'overall_rating': _safe_get_list(reviews_data, 'overall_ratings', n, default=0.0),
'food_rating': _safe_get_list(reviews_data, 'food_ratings', n, default=0.0),
'service_rating': _safe_get_list(reviews_data, 'service_ratings', n, default=0.0),
'ambience_rating': _safe_get_list(reviews_data, 'ambience_ratings', n, default=0.0),
'review_text': reviews_data.get('review_texts', [])
})
# FORMAT 2: FLAT format (legacy - for backwards compatibility)
# {'names': [...], 'dates': [...], 'reviews': [...], ...}
elif 'names' in scraper_result and isinstance(scraper_result.get('names'), list):
print("📋 Detected FLAT format (legacy)")
# Try 'review_texts' first, then 'reviews' as fallback
review_texts = scraper_result.get('review_texts', scraper_result.get('reviews', []))
n = len(review_texts) if isinstance(review_texts, list) else 0
if n == 0:
raise ValueError("No reviews found in FLAT format response")
df = pd.DataFrame({
'name': _safe_get_list(scraper_result, 'names', n),
'date': _safe_get_list(scraper_result, 'dates', n),
'overall_rating': _safe_get_list(scraper_result, 'overall_ratings', n, default=0.0),
'food_rating': _safe_get_list(scraper_result, 'food_ratings', n, default=0.0),
'service_rating': _safe_get_list(scraper_result, 'service_ratings', n, default=0.0),
'ambience_rating': _safe_get_list(scraper_result, 'ambience_ratings', n, default=0.0),
'review_text': review_texts
})
# FORMAT 3: Simple list of reviews (minimal format)
elif isinstance(reviews_data, list) and len(reviews_data) > 0:
print("📋 Detected simple list format")
n = len(reviews_data)
df = pd.DataFrame({
'name': [''] * n,
'date': _safe_get_list(scraper_result, 'dates', n),
'overall_rating': _safe_get_list(scraper_result, 'overall_ratings', n, default=0.0),
'food_rating': [0.0] * n,
'service_rating': [0.0] * n,
'ambience_rating': [0.0] * n,
'review_text': reviews_data
})
else:
raise ValueError(f"Unknown scraper result format. Keys: {list(scraper_result.keys())}")
print(f"✅ Created DataFrame with {len(df)} reviews")
# =========================================================================
# Convert ratings to numeric
# =========================================================================
for col in ['overall_rating', 'food_rating', 'service_rating', 'ambience_rating']:
if col in df.columns:
df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0)
# =========================================================================
# Clean text fields
# =========================================================================
df['review_text'] = df['review_text'].astype(str).str.strip()
df['name'] = df['name'].astype(str).str.strip()
# =========================================================================
# Add metadata - DYNAMIC source detection
# =========================================================================
metadata = scraper_result.get('metadata', {})
source = metadata.get('source', scraper_result.get('source', 'unknown'))
df['source'] = source
df['scrape_timestamp'] = pd.Timestamp.now()
print(f"📊 Source: {source}")
return df
def _safe_get_list(data: Dict, key: str, expected_len: int, default: Any = '') -> List:
"""
Safely get a list from dict, padding with default if too short.
This handles cases where Google Maps doesn't have certain fields
that OpenTable has (like food_rating, service_rating, ambience_rating).
"""
values = data.get(key, [])
if not isinstance(values, list):
values = []
# Pad with default value if list is too short
if len(values) < expected_len:
values = values + [default] * (expected_len - len(values))
# Truncate if too long
return values[:expected_len]
def save_to_csv(df: pd.DataFrame, output_path: str = 'data/raw/reviews.csv'):
"""
Save DataFrame to CSV.
Args:
df: Processed reviews DataFrame
output_path: Where to save the CSV file
"""
output_file = Path(output_path)
output_file.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_file, index=False, encoding='utf-8')
print(f"✅ Saved {len(df)} reviews to {output_path}")
return output_path
def get_review_stats(df: pd.DataFrame) -> Dict[str, Any]:
"""
Get statistics about the processed reviews.
Args:
df: Processed reviews DataFrame
Returns:
Dict with review statistics
"""
stats = {
'total_reviews': len(df),
'unique_reviewers': df['name'].nunique(),
'date_range': {
'earliest': df['date'].min() if 'date' in df.columns else None,
'latest': df['date'].max() if 'date' in df.columns else None
},
'ratings': {}
}
# Calculate rating stats for non-zero ratings
for col in ['overall_rating', 'food_rating', 'service_rating', 'ambience_rating']:
if col in df.columns:
valid_ratings = df[col][df[col] > 0]
if len(valid_ratings) > 0:
stats['ratings'][col] = {
'count': len(valid_ratings),
'mean': round(valid_ratings.mean(), 2),
'min': valid_ratings.min(),
'max': valid_ratings.max()
}
# Source breakdown
if 'source' in df.columns:
stats['sources'] = df['source'].value_counts().to_dict()
return stats
if __name__ == "__main__":
# Test with mock data
print("Testing review processor with both formats...\n")
# Test 1: NESTED format (new standard)
print("=" * 60)
print("TEST 1: NESTED format")
print("=" * 60)
nested_result = {
'success': True,
'reviews': {
'names': ['Alice', 'Bob', 'Charlie'],
'dates': ['2 days ago', '1 week ago', '3 weeks ago'],
'overall_ratings': [5.0, 4.0, 3.5],
'food_ratings': [5.0, 4.5, 3.0],
'service_ratings': [4.5, 4.0, 4.0],
'ambience_ratings': [5.0, 3.5, 3.5],
'review_texts': [
'Amazing food! The sushi was incredible.',
'Good but a bit pricey. Service was slow.',
'Average experience. Nothing special.'
]
},
'metadata': {
'source': 'opentable',
'url': 'https://opentable.com/test'
}
}
df1 = process_reviews(nested_result)
print(f"\nDataFrame shape: {df1.shape}")
print(f"Columns: {list(df1.columns)}")
print(f"\nFirst review:\n{df1.iloc[0].to_dict()}\n")
# Test 2: Google Maps format (no sub-ratings)
print("=" * 60)
print("TEST 2: Google Maps format (missing sub-ratings)")
print("=" * 60)
gmaps_result = {
'success': True,
'reviews': {
'names': ['Dave', 'Eve'],
'dates': ['a month ago', '2 months ago'],
'overall_ratings': [4.0, 5.0],
# Note: NO food_ratings, service_ratings, ambience_ratings
'review_texts': [
'Great place for dinner!',
'Best restaurant in town.'
]
},
'metadata': {
'source': 'google_maps'
}
}
df2 = process_reviews(gmaps_result)
print(f"\nDataFrame shape: {df2.shape}")
print(f"Food rating (should be 0.0): {df2['food_rating'].tolist()}")
print(f"Source: {df2['source'].unique()}\n")
# Test 3: Stats
print("=" * 60)
print("TEST 3: Review statistics")
print("=" * 60)
stats = get_review_stats(df1)
print(f"\nStats for nested format:")
print(f" Total reviews: {stats['total_reviews']}")
print(f" Unique reviewers: {stats['unique_reviewers']}")
print(f" Rating stats: {stats['ratings']}")
print("\n✅ All tests passed!")