TushP committed on
Commit
3b719a9
Β·
verified Β·
1 Parent(s): 2fd9100

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. modal_backend.py +105 -39
modal_backend.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Modal Backend for Restaurant Intelligence Agent - PARALLEL OPTIMIZED
3
  Version 3.0 - Uses Modal's parallel processing for 5x speed improvement
4
 
5
  KEY OPTIMIZATIONS:
@@ -9,6 +9,8 @@ KEY OPTIMIZATIONS:
9
  4. Reduced timeout since parallel is faster
10
 
11
  TARGET: 1000 reviews in ~5 minutes (down from 15+ minutes)
 
 
12
  """
13
 
14
  import modal
@@ -479,6 +481,7 @@ def full_analysis_parallel(url: str, max_reviews: int = 100) -> Dict[str, Any]:
479
  Target: 1000 reviews in ~5 minutes
480
  """
481
  import time
 
482
  start_time = time.time()
483
 
484
  print(f"πŸš€ Starting PARALLEL analysis for {url}")
@@ -505,54 +508,103 @@ def full_analysis_parallel(url: str, max_reviews: int = 100) -> Dict[str, Any]:
505
  if not result.get("success"):
506
  return {"success": False, "error": result.get("error", "Scraping failed")}
507
 
508
- # Check if we actually got any reviews
509
- review_count = result.get('total_reviews', 0)
510
- reviews_list = result.get('reviews', [])
511
- if review_count == 0 and len(reviews_list) == 0:
512
- return {"success": False, "error": "No reviews found. The restaurant may have no reviews or the scraper couldn't access them."}
513
-
514
  print(f"βœ… Scraping complete in {time.time() - scrape_start:.1f}s")
 
515
 
516
- # Process reviews - FIXED: Handle both old and new scraper formats
 
 
517
  from src.data_processing import clean_reviews_for_ai
518
- import pandas as pd
519
 
520
- # The scraper returns data at top level, not nested under 'reviews'
521
- # Build DataFrame directly from scraper result
522
- if 'names' in result:
523
- # OpenTable format: data at top level with parallel arrays
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
524
  df = pd.DataFrame({
525
- 'name': result.get('names', []),
526
- 'date': result.get('dates', []),
527
- 'overall_rating': result.get('overall_ratings', []),
528
- 'food_rating': result.get('food_ratings', []),
529
- 'service_rating': result.get('service_ratings', []),
530
- 'ambience_rating': result.get('ambience_ratings', []),
531
- 'review_text': result.get('reviews', [])
532
  })
533
- elif 'reviews' in result and isinstance(result['reviews'], list):
534
- # Google Maps format: just a list of review texts
535
- reviews_list = result.get('reviews', [])
536
- dates_list = result.get('dates', [''] * len(reviews_list))
537
- ratings_list = result.get('ratings', [0] * len(reviews_list))
 
 
 
538
 
539
- # Ensure all lists are same length
540
- n = len(reviews_list)
541
  df = pd.DataFrame({
542
  'name': [''] * n,
543
  'date': (dates_list + [''] * n)[:n],
544
- 'overall_rating': (ratings_list + [0] * n)[:n],
545
- 'food_rating': [0] * n,
546
- 'service_rating': [0] * n,
547
- 'ambience_rating': [0] * n,
548
- 'review_text': reviews_list
549
  })
 
 
 
550
  else:
551
- # Fallback: try old format with process_reviews
552
- from src.data_processing import process_reviews
553
- df = process_reviews(result)
 
 
 
 
 
554
 
 
 
 
 
 
555
  # Convert ratings to numeric (handles both numeric and text ratings)
 
556
  def parse_rating(val):
557
  """Convert rating to numeric. OpenTable uses text ratings like 'Excellent', 'Good', etc."""
558
  if pd.isna(val) or val == '' or val is None:
@@ -561,7 +613,7 @@ def full_analysis_parallel(url: str, max_reviews: int = 100) -> Dict[str, Any]:
561
  # If already numeric
562
  try:
563
  num = float(val)
564
- if 1 <= num <= 5:
565
  return num
566
  except (ValueError, TypeError):
567
  pass
@@ -593,7 +645,7 @@ def full_analysis_parallel(url: str, max_reviews: int = 100) -> Dict[str, Any]:
593
  # Get clean review texts
594
  reviews = clean_reviews_for_ai(df["review_text"].dropna().tolist(), verbose=False)
595
 
596
- print(f"πŸ“Š Total reviews: {len(reviews)}")
597
 
598
  # Debug: Check what ratings we got
599
  valid_ratings = df['overall_rating'][df['overall_rating'] > 0]
@@ -601,7 +653,9 @@ def full_analysis_parallel(url: str, max_reviews: int = 100) -> Dict[str, Any]:
601
  if len(valid_ratings) > 0:
602
  print(f"πŸ“Š Rating range: {valid_ratings.min():.1f} to {valid_ratings.max():.1f}, avg: {valid_ratings.mean():.2f}")
603
 
604
- # Create trend data with better defaults
 
 
605
  trend_data = []
606
  for _, row in df.iterrows():
607
  text = str(row.get("review_text", ""))
@@ -613,12 +667,24 @@ def full_analysis_parallel(url: str, max_reviews: int = 100) -> Dict[str, Any]:
613
  # Map sentiment (-1 to 1) to rating (1 to 5)
614
  rating = round((sentiment + 1) * 2 + 1, 1) # -1β†’1, 0β†’3, 1β†’5
615
 
 
 
 
 
 
 
 
616
  trend_data.append({
617
- "date": str(row.get("date", "")),
618
  "rating": rating,
619
  "sentiment": sentiment
620
  })
621
 
 
 
 
 
 
622
  # Extract restaurant name
623
  if platform == "opentable":
624
  restaurant_name = url.split("/")[-1].split("?")[0].replace("-", " ").title()
 
1
  """
2
+ New Modal Backend for Restaurant Intelligence Agent - PARALLEL OPTIMIZED
3
  Version 3.0 - Uses Modal's parallel processing for 5x speed improvement
4
 
5
  KEY OPTIMIZATIONS:
 
9
  4. Reduced timeout since parallel is faster
10
 
11
  TARGET: 1000 reviews in ~5 minutes (down from 15+ minutes)
12
+
13
+ FIXED: Proper handling of both OpenTable and Google Maps scraper response formats
14
  """
15
 
16
  import modal
 
481
  Target: 1000 reviews in ~5 minutes
482
  """
483
  import time
484
+ import pandas as pd
485
  start_time = time.time()
486
 
487
  print(f"πŸš€ Starting PARALLEL analysis for {url}")
 
508
  if not result.get("success"):
509
  return {"success": False, "error": result.get("error", "Scraping failed")}
510
 
 
 
 
 
 
 
511
  print(f"βœ… Scraping complete in {time.time() - scrape_start:.1f}s")
512
+ print(f"πŸ“¦ Raw result keys: {list(result.keys())}")
513
 
514
+ # =========================================================================
515
+ # FIXED: Properly handle BOTH OpenTable and Google Maps response formats
516
+ # =========================================================================
517
  from src.data_processing import clean_reviews_for_ai
 
518
 
519
+ df = None
520
+
521
+ # Check what format we got
522
+ reviews_data = result.get('reviews', {})
523
+ print(f"πŸ“¦ reviews_data type: {type(reviews_data)}")
524
+
525
+ # FORMAT 1: Google Maps - nested dict with 'review_texts' key
526
+ # {'reviews': {'names': [...], 'dates': [...], 'overall_ratings': [...], 'review_texts': [...]}}
527
+ if isinstance(reviews_data, dict) and 'review_texts' in reviews_data:
528
+ print("πŸ“‹ Detected Google Maps format (nested dict with review_texts)")
529
+ review_texts = reviews_data.get('review_texts', [])
530
+ n = len(review_texts)
531
+
532
+ if n == 0:
533
+ return {"success": False, "error": "No reviews found in Google Maps response."}
534
+
535
+ df = pd.DataFrame({
536
+ 'name': (reviews_data.get('names', []) + [''] * n)[:n],
537
+ 'date': (reviews_data.get('dates', []) + [''] * n)[:n],
538
+ 'overall_rating': (reviews_data.get('overall_ratings', []) + [0.0] * n)[:n],
539
+ 'food_rating': reviews_data.get('food_ratings', [0.0] * n)[:n],
540
+ 'service_rating': reviews_data.get('service_ratings', [0.0] * n)[:n],
541
+ 'ambience_rating': reviews_data.get('ambience_ratings', [0.0] * n)[:n],
542
+ 'review_text': review_texts
543
+ })
544
+ print(f"βœ… Built DataFrame from Google Maps: {len(df)} reviews")
545
+
546
+ # FORMAT 2: OpenTable - flat structure at top level
547
+ # {'names': [...], 'dates': [...], 'overall_ratings': [...], 'reviews': [...]}
548
+ elif 'names' in result and isinstance(result.get('names'), list):
549
+ print("πŸ“‹ Detected OpenTable format (flat top-level arrays)")
550
+ review_texts = result.get('reviews', [])
551
+ n = len(review_texts) if isinstance(review_texts, list) else 0
552
+
553
+ if n == 0:
554
+ # Try 'review_texts' key as fallback
555
+ review_texts = result.get('review_texts', [])
556
+ n = len(review_texts)
557
+
558
+ if n == 0:
559
+ return {"success": False, "error": "No reviews found in OpenTable response."}
560
+
561
  df = pd.DataFrame({
562
+ 'name': (result.get('names', []) + [''] * n)[:n],
563
+ 'date': (result.get('dates', []) + [''] * n)[:n],
564
+ 'overall_rating': (result.get('overall_ratings', []) + [0.0] * n)[:n],
565
+ 'food_rating': (result.get('food_ratings', []) + [0.0] * n)[:n],
566
+ 'service_rating': (result.get('service_ratings', []) + [0.0] * n)[:n],
567
+ 'ambience_rating': (result.get('ambience_ratings', []) + [0.0] * n)[:n],
568
+ 'review_text': review_texts
569
  })
570
+ print(f"βœ… Built DataFrame from OpenTable: {len(df)} reviews")
571
+
572
+ # FORMAT 3: Simple list of reviews (legacy format)
573
+ elif isinstance(reviews_data, list) and len(reviews_data) > 0:
574
+ print("πŸ“‹ Detected simple list format")
575
+ n = len(reviews_data)
576
+ dates_list = result.get('dates', [''] * n)
577
+ ratings_list = result.get('ratings', result.get('overall_ratings', [0.0] * n))
578
 
 
 
579
  df = pd.DataFrame({
580
  'name': [''] * n,
581
  'date': (dates_list + [''] * n)[:n],
582
+ 'overall_rating': (ratings_list + [0.0] * n)[:n],
583
+ 'food_rating': [0.0] * n,
584
+ 'service_rating': [0.0] * n,
585
+ 'ambience_rating': [0.0] * n,
586
+ 'review_text': reviews_data
587
  })
588
+ print(f"βœ… Built DataFrame from list: {len(df)} reviews")
589
+
590
+ # FORMAT 4: Fallback to process_reviews
591
  else:
592
+ print("πŸ“‹ Using fallback process_reviews()")
593
+ try:
594
+ from src.data_processing import process_reviews
595
+ df = process_reviews(result)
596
+ print(f"βœ… Built DataFrame via process_reviews: {len(df)} reviews")
597
+ except Exception as e:
598
+ print(f"❌ process_reviews failed: {e}")
599
+ return {"success": False, "error": f"Could not parse reviews: {e}"}
600
 
601
+ # Validate we got something
602
+ if df is None or len(df) == 0:
603
+ return {"success": False, "error": "No reviews found. The restaurant may have no reviews or the scraper couldn't access them."}
604
+
605
+ # =========================================================================
606
  # Convert ratings to numeric (handles both numeric and text ratings)
607
+ # =========================================================================
608
  def parse_rating(val):
609
  """Convert rating to numeric. OpenTable uses text ratings like 'Excellent', 'Good', etc."""
610
  if pd.isna(val) or val == '' or val is None:
 
613
  # If already numeric
614
  try:
615
  num = float(val)
616
+ if 0 <= num <= 5:
617
  return num
618
  except (ValueError, TypeError):
619
  pass
 
645
  # Get clean review texts
646
  reviews = clean_reviews_for_ai(df["review_text"].dropna().tolist(), verbose=False)
647
 
648
+ print(f"πŸ“Š Total clean reviews: {len(reviews)}")
649
 
650
  # Debug: Check what ratings we got
651
  valid_ratings = df['overall_rating'][df['overall_rating'] > 0]
 
653
  if len(valid_ratings) > 0:
654
  print(f"πŸ“Š Rating range: {valid_ratings.min():.1f} to {valid_ratings.max():.1f}, avg: {valid_ratings.mean():.2f}")
655
 
656
+ # =========================================================================
657
+ # Create trend_data with proper date handling
658
+ # =========================================================================
659
  trend_data = []
660
  for _, row in df.iterrows():
661
  text = str(row.get("review_text", ""))
 
667
  # Map sentiment (-1 to 1) to rating (1 to 5)
668
  rating = round((sentiment + 1) * 2 + 1, 1) # -1β†’1, 0β†’3, 1β†’5
669
 
670
+ date_val = row.get("date", "")
671
+ # Clean up date string
672
+ if pd.isna(date_val):
673
+ date_val = ""
674
+ else:
675
+ date_val = str(date_val).strip()
676
+
677
  trend_data.append({
678
+ "date": date_val,
679
  "rating": rating,
680
  "sentiment": sentiment
681
  })
682
 
683
+ print(f"πŸ“Š Trend data points: {len(trend_data)}")
684
+ if trend_data:
685
+ sample_dates = [t['date'] for t in trend_data[:5]]
686
+ print(f"πŸ“Š Sample dates: {sample_dates}")
687
+
688
  # Extract restaurant name
689
  if platform == "opentable":
690
  restaurant_name = url.split("/")[-1].split("?")[0].replace("-", " ").title()