Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- modal_backend.py +105 -39
modal_backend.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
-
Modal Backend for Restaurant Intelligence Agent - PARALLEL OPTIMIZED
|
| 3 |
Version 3.0 - Uses Modal's parallel processing for 5x speed improvement
|
| 4 |
|
| 5 |
KEY OPTIMIZATIONS:
|
|
@@ -9,6 +9,8 @@ KEY OPTIMIZATIONS:
|
|
| 9 |
4. Reduced timeout since parallel is faster
|
| 10 |
|
| 11 |
TARGET: 1000 reviews in ~5 minutes (down from 15+ minutes)
|
|
|
|
|
|
|
| 12 |
"""
|
| 13 |
|
| 14 |
import modal
|
|
@@ -479,6 +481,7 @@ def full_analysis_parallel(url: str, max_reviews: int = 100) -> Dict[str, Any]:
|
|
| 479 |
Target: 1000 reviews in ~5 minutes
|
| 480 |
"""
|
| 481 |
import time
|
|
|
|
| 482 |
start_time = time.time()
|
| 483 |
|
| 484 |
print(f"π Starting PARALLEL analysis for {url}")
|
|
@@ -505,54 +508,103 @@ def full_analysis_parallel(url: str, max_reviews: int = 100) -> Dict[str, Any]:
|
|
| 505 |
if not result.get("success"):
|
| 506 |
return {"success": False, "error": result.get("error", "Scraping failed")}
|
| 507 |
|
| 508 |
-
# Check if we actually got any reviews
|
| 509 |
-
review_count = result.get('total_reviews', 0)
|
| 510 |
-
reviews_list = result.get('reviews', [])
|
| 511 |
-
if review_count == 0 and len(reviews_list) == 0:
|
| 512 |
-
return {"success": False, "error": "No reviews found. The restaurant may have no reviews or the scraper couldn't access them."}
|
| 513 |
-
|
| 514 |
print(f"β
Scraping complete in {time.time() - scrape_start:.1f}s")
|
|
|
|
| 515 |
|
| 516 |
-
#
|
|
|
|
|
|
|
| 517 |
from src.data_processing import clean_reviews_for_ai
|
| 518 |
-
import pandas as pd
|
| 519 |
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 524 |
df = pd.DataFrame({
|
| 525 |
-
'name': result.get('names', []),
|
| 526 |
-
'date': result.get('dates', []),
|
| 527 |
-
'overall_rating': result.get('overall_ratings', []),
|
| 528 |
-
'food_rating': result.get('food_ratings', []),
|
| 529 |
-
'service_rating': result.get('service_ratings', []),
|
| 530 |
-
'ambience_rating': result.get('ambience_ratings', []),
|
| 531 |
-
'review_text':
|
| 532 |
})
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
|
|
|
|
|
|
|
|
|
| 538 |
|
| 539 |
-
# Ensure all lists are same length
|
| 540 |
-
n = len(reviews_list)
|
| 541 |
df = pd.DataFrame({
|
| 542 |
'name': [''] * n,
|
| 543 |
'date': (dates_list + [''] * n)[:n],
|
| 544 |
-
'overall_rating': (ratings_list + [0] * n)[:n],
|
| 545 |
-
'food_rating': [0] * n,
|
| 546 |
-
'service_rating': [0] * n,
|
| 547 |
-
'ambience_rating': [0] * n,
|
| 548 |
-
'review_text':
|
| 549 |
})
|
|
|
|
|
|
|
|
|
|
| 550 |
else:
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 554 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 555 |
# Convert ratings to numeric (handles both numeric and text ratings)
|
|
|
|
| 556 |
def parse_rating(val):
|
| 557 |
"""Convert rating to numeric. OpenTable uses text ratings like 'Excellent', 'Good', etc."""
|
| 558 |
if pd.isna(val) or val == '' or val is None:
|
|
@@ -561,7 +613,7 @@ def full_analysis_parallel(url: str, max_reviews: int = 100) -> Dict[str, Any]:
|
|
| 561 |
# If already numeric
|
| 562 |
try:
|
| 563 |
num = float(val)
|
| 564 |
-
if
|
| 565 |
return num
|
| 566 |
except (ValueError, TypeError):
|
| 567 |
pass
|
|
@@ -593,7 +645,7 @@ def full_analysis_parallel(url: str, max_reviews: int = 100) -> Dict[str, Any]:
|
|
| 593 |
# Get clean review texts
|
| 594 |
reviews = clean_reviews_for_ai(df["review_text"].dropna().tolist(), verbose=False)
|
| 595 |
|
| 596 |
-
print(f"π Total reviews: {len(reviews)}")
|
| 597 |
|
| 598 |
# Debug: Check what ratings we got
|
| 599 |
valid_ratings = df['overall_rating'][df['overall_rating'] > 0]
|
|
@@ -601,7 +653,9 @@ def full_analysis_parallel(url: str, max_reviews: int = 100) -> Dict[str, Any]:
|
|
| 601 |
if len(valid_ratings) > 0:
|
| 602 |
print(f"π Rating range: {valid_ratings.min():.1f} to {valid_ratings.max():.1f}, avg: {valid_ratings.mean():.2f}")
|
| 603 |
|
| 604 |
-
#
|
|
|
|
|
|
|
| 605 |
trend_data = []
|
| 606 |
for _, row in df.iterrows():
|
| 607 |
text = str(row.get("review_text", ""))
|
|
@@ -613,12 +667,24 @@ def full_analysis_parallel(url: str, max_reviews: int = 100) -> Dict[str, Any]:
|
|
| 613 |
# Map sentiment (-1 to 1) to rating (1 to 5)
|
| 614 |
rating = round((sentiment + 1) * 2 + 1, 1) # -1→1, 0→3, 1→5
|
| 615 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 616 |
trend_data.append({
|
| 617 |
-
"date":
|
| 618 |
"rating": rating,
|
| 619 |
"sentiment": sentiment
|
| 620 |
})
|
| 621 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 622 |
# Extract restaurant name
|
| 623 |
if platform == "opentable":
|
| 624 |
restaurant_name = url.split("/")[-1].split("?")[0].replace("-", " ").title()
|
|
|
|
| 1 |
"""
|
| 2 |
+
New Modal Backend for Restaurant Intelligence Agent - PARALLEL OPTIMIZED
|
| 3 |
Version 3.0 - Uses Modal's parallel processing for 5x speed improvement
|
| 4 |
|
| 5 |
KEY OPTIMIZATIONS:
|
|
|
|
| 9 |
4. Reduced timeout since parallel is faster
|
| 10 |
|
| 11 |
TARGET: 1000 reviews in ~5 minutes (down from 15+ minutes)
|
| 12 |
+
|
| 13 |
+
FIXED: Proper handling of both OpenTable and Google Maps scraper response formats
|
| 14 |
"""
|
| 15 |
|
| 16 |
import modal
|
|
|
|
| 481 |
Target: 1000 reviews in ~5 minutes
|
| 482 |
"""
|
| 483 |
import time
|
| 484 |
+
import pandas as pd
|
| 485 |
start_time = time.time()
|
| 486 |
|
| 487 |
print(f"π Starting PARALLEL analysis for {url}")
|
|
|
|
| 508 |
if not result.get("success"):
|
| 509 |
return {"success": False, "error": result.get("error", "Scraping failed")}
|
| 510 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 511 |
print(f"β
Scraping complete in {time.time() - scrape_start:.1f}s")
|
| 512 |
+
print(f"π¦ Raw result keys: {list(result.keys())}")
|
| 513 |
|
| 514 |
+
# =========================================================================
|
| 515 |
+
# FIXED: Properly handle BOTH OpenTable and Google Maps response formats
|
| 516 |
+
# =========================================================================
|
| 517 |
from src.data_processing import clean_reviews_for_ai
|
|
|
|
| 518 |
|
| 519 |
+
df = None
|
| 520 |
+
|
| 521 |
+
# Check what format we got
|
| 522 |
+
reviews_data = result.get('reviews', {})
|
| 523 |
+
print(f"π¦ reviews_data type: {type(reviews_data)}")
|
| 524 |
+
|
| 525 |
+
# FORMAT 1: Google Maps - nested dict with 'review_texts' key
|
| 526 |
+
# {'reviews': {'names': [...], 'dates': [...], 'overall_ratings': [...], 'review_texts': [...]}}
|
| 527 |
+
if isinstance(reviews_data, dict) and 'review_texts' in reviews_data:
|
| 528 |
+
print("π Detected Google Maps format (nested dict with review_texts)")
|
| 529 |
+
review_texts = reviews_data.get('review_texts', [])
|
| 530 |
+
n = len(review_texts)
|
| 531 |
+
|
| 532 |
+
if n == 0:
|
| 533 |
+
return {"success": False, "error": "No reviews found in Google Maps response."}
|
| 534 |
+
|
| 535 |
+
df = pd.DataFrame({
|
| 536 |
+
'name': (reviews_data.get('names', []) + [''] * n)[:n],
|
| 537 |
+
'date': (reviews_data.get('dates', []) + [''] * n)[:n],
|
| 538 |
+
'overall_rating': (reviews_data.get('overall_ratings', []) + [0.0] * n)[:n],
|
| 539 |
+
'food_rating': reviews_data.get('food_ratings', [0.0] * n)[:n],
|
| 540 |
+
'service_rating': reviews_data.get('service_ratings', [0.0] * n)[:n],
|
| 541 |
+
'ambience_rating': reviews_data.get('ambience_ratings', [0.0] * n)[:n],
|
| 542 |
+
'review_text': review_texts
|
| 543 |
+
})
|
| 544 |
+
print(f"β
Built DataFrame from Google Maps: {len(df)} reviews")
|
| 545 |
+
|
| 546 |
+
# FORMAT 2: OpenTable - flat structure at top level
|
| 547 |
+
# {'names': [...], 'dates': [...], 'overall_ratings': [...], 'reviews': [...]}
|
| 548 |
+
elif 'names' in result and isinstance(result.get('names'), list):
|
| 549 |
+
print("π Detected OpenTable format (flat top-level arrays)")
|
| 550 |
+
review_texts = result.get('reviews', [])
|
| 551 |
+
n = len(review_texts) if isinstance(review_texts, list) else 0
|
| 552 |
+
|
| 553 |
+
if n == 0:
|
| 554 |
+
# Try 'review_texts' key as fallback
|
| 555 |
+
review_texts = result.get('review_texts', [])
|
| 556 |
+
n = len(review_texts)
|
| 557 |
+
|
| 558 |
+
if n == 0:
|
| 559 |
+
return {"success": False, "error": "No reviews found in OpenTable response."}
|
| 560 |
+
|
| 561 |
df = pd.DataFrame({
|
| 562 |
+
'name': (result.get('names', []) + [''] * n)[:n],
|
| 563 |
+
'date': (result.get('dates', []) + [''] * n)[:n],
|
| 564 |
+
'overall_rating': (result.get('overall_ratings', []) + [0.0] * n)[:n],
|
| 565 |
+
'food_rating': (result.get('food_ratings', []) + [0.0] * n)[:n],
|
| 566 |
+
'service_rating': (result.get('service_ratings', []) + [0.0] * n)[:n],
|
| 567 |
+
'ambience_rating': (result.get('ambience_ratings', []) + [0.0] * n)[:n],
|
| 568 |
+
'review_text': review_texts
|
| 569 |
})
|
| 570 |
+
print(f"β
Built DataFrame from OpenTable: {len(df)} reviews")
|
| 571 |
+
|
| 572 |
+
# FORMAT 3: Simple list of reviews (legacy format)
|
| 573 |
+
elif isinstance(reviews_data, list) and len(reviews_data) > 0:
|
| 574 |
+
print("π Detected simple list format")
|
| 575 |
+
n = len(reviews_data)
|
| 576 |
+
dates_list = result.get('dates', [''] * n)
|
| 577 |
+
ratings_list = result.get('ratings', result.get('overall_ratings', [0.0] * n))
|
| 578 |
|
|
|
|
|
|
|
| 579 |
df = pd.DataFrame({
|
| 580 |
'name': [''] * n,
|
| 581 |
'date': (dates_list + [''] * n)[:n],
|
| 582 |
+
'overall_rating': (ratings_list + [0.0] * n)[:n],
|
| 583 |
+
'food_rating': [0.0] * n,
|
| 584 |
+
'service_rating': [0.0] * n,
|
| 585 |
+
'ambience_rating': [0.0] * n,
|
| 586 |
+
'review_text': reviews_data
|
| 587 |
})
|
| 588 |
+
print(f"β
Built DataFrame from list: {len(df)} reviews")
|
| 589 |
+
|
| 590 |
+
# FORMAT 4: Fallback to process_reviews
|
| 591 |
else:
|
| 592 |
+
print("π Using fallback process_reviews()")
|
| 593 |
+
try:
|
| 594 |
+
from src.data_processing import process_reviews
|
| 595 |
+
df = process_reviews(result)
|
| 596 |
+
print(f"β
Built DataFrame via process_reviews: {len(df)} reviews")
|
| 597 |
+
except Exception as e:
|
| 598 |
+
print(f"β process_reviews failed: {e}")
|
| 599 |
+
return {"success": False, "error": f"Could not parse reviews: {e}"}
|
| 600 |
|
| 601 |
+
# Validate we got something
|
| 602 |
+
if df is None or len(df) == 0:
|
| 603 |
+
return {"success": False, "error": "No reviews found. The restaurant may have no reviews or the scraper couldn't access them."}
|
| 604 |
+
|
| 605 |
+
# =========================================================================
|
| 606 |
# Convert ratings to numeric (handles both numeric and text ratings)
|
| 607 |
+
# =========================================================================
|
| 608 |
def parse_rating(val):
|
| 609 |
"""Convert rating to numeric. OpenTable uses text ratings like 'Excellent', 'Good', etc."""
|
| 610 |
if pd.isna(val) or val == '' or val is None:
|
|
|
|
| 613 |
# If already numeric
|
| 614 |
try:
|
| 615 |
num = float(val)
|
| 616 |
+
if 0 <= num <= 5:
|
| 617 |
return num
|
| 618 |
except (ValueError, TypeError):
|
| 619 |
pass
|
|
|
|
| 645 |
# Get clean review texts
|
| 646 |
reviews = clean_reviews_for_ai(df["review_text"].dropna().tolist(), verbose=False)
|
| 647 |
|
| 648 |
+
print(f"π Total clean reviews: {len(reviews)}")
|
| 649 |
|
| 650 |
# Debug: Check what ratings we got
|
| 651 |
valid_ratings = df['overall_rating'][df['overall_rating'] > 0]
|
|
|
|
| 653 |
if len(valid_ratings) > 0:
|
| 654 |
print(f"π Rating range: {valid_ratings.min():.1f} to {valid_ratings.max():.1f}, avg: {valid_ratings.mean():.2f}")
|
| 655 |
|
| 656 |
+
# =========================================================================
|
| 657 |
+
# Create trend_data with proper date handling
|
| 658 |
+
# =========================================================================
|
| 659 |
trend_data = []
|
| 660 |
for _, row in df.iterrows():
|
| 661 |
text = str(row.get("review_text", ""))
|
|
|
|
| 667 |
# Map sentiment (-1 to 1) to rating (1 to 5)
|
| 668 |
rating = round((sentiment + 1) * 2 + 1, 1) # -1→1, 0→3, 1→5
|
| 669 |
|
| 670 |
+
date_val = row.get("date", "")
|
| 671 |
+
# Clean up date string
|
| 672 |
+
if pd.isna(date_val):
|
| 673 |
+
date_val = ""
|
| 674 |
+
else:
|
| 675 |
+
date_val = str(date_val).strip()
|
| 676 |
+
|
| 677 |
trend_data.append({
|
| 678 |
+
"date": date_val,
|
| 679 |
"rating": rating,
|
| 680 |
"sentiment": sentiment
|
| 681 |
})
|
| 682 |
|
| 683 |
+
print(f"π Trend data points: {len(trend_data)}")
|
| 684 |
+
if trend_data:
|
| 685 |
+
sample_dates = [t['date'] for t in trend_data[:5]]
|
| 686 |
+
print(f"π Sample dates: {sample_dates}")
|
| 687 |
+
|
| 688 |
# Extract restaurant name
|
| 689 |
if platform == "opentable":
|
| 690 |
restaurant_name = url.split("/")[-1].split("?")[0].replace("-", " ").title()
|