Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- modal_backend.py +425 -253
- src/data_processing/review_cleaner.py +158 -74
- src/scrapers/opentable_scraper.py +219 -149
modal_backend.py
CHANGED
|
@@ -1,19 +1,20 @@
|
|
| 1 |
"""
|
| 2 |
-
Modal Backend for Restaurant Intelligence Agent
|
| 3 |
-
|
| 4 |
|
| 5 |
-
|
| 6 |
-
-
|
| 7 |
-
|
| 8 |
-
|
|
|
|
| 9 |
|
| 10 |
-
|
| 11 |
-
1. Analysis API endpoint
|
| 12 |
-
2. MCP Server endpoint
|
| 13 |
"""
|
| 14 |
|
| 15 |
import modal
|
| 16 |
from typing import Dict, Any, List
|
|
|
|
|
|
|
| 17 |
|
| 18 |
# Create Modal app
|
| 19 |
app = modal.App("restaurant-intelligence")
|
|
@@ -33,14 +34,14 @@ image = (
|
|
| 33 |
"matplotlib",
|
| 34 |
"fastapi[standard]",
|
| 35 |
"httpx",
|
| 36 |
-
"fastmcp",
|
| 37 |
)
|
| 38 |
.add_local_python_source("src")
|
| 39 |
)
|
| 40 |
|
| 41 |
|
| 42 |
# ============================================================================
|
| 43 |
-
# HELPER
|
| 44 |
# ============================================================================
|
| 45 |
|
| 46 |
def calculate_sentiment(text: str) -> float:
|
|
@@ -65,186 +66,277 @@ def calculate_sentiment(text: str) -> float:
|
|
| 65 |
|
| 66 |
|
| 67 |
# ============================================================================
|
| 68 |
-
#
|
| 69 |
# ============================================================================
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
def mcp_server():
|
| 79 |
-
"""
|
| 80 |
-
TRUE MCP Server - exposes tools via MCP protocol over HTTP.
|
| 81 |
"""
|
| 82 |
-
|
| 83 |
-
from pydantic import BaseModel
|
| 84 |
-
from datetime import datetime
|
| 85 |
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
|
|
|
| 91 |
|
| 92 |
-
|
| 93 |
-
restaurant_name: str
|
| 94 |
-
reviews: List[str]
|
| 95 |
|
| 96 |
-
|
| 97 |
-
restaurant_name: str
|
| 98 |
-
question: str
|
| 99 |
-
top_k: int = 5
|
| 100 |
|
| 101 |
-
#
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
"restaurant": restaurant_name,
|
| 107 |
-
"indexed_count": len(reviews),
|
| 108 |
-
"message": f"Indexed {len(reviews)} reviews for {restaurant_name}"
|
| 109 |
-
}
|
| 110 |
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
scored.sort(reverse=True, key=lambda x: x[0])
|
| 119 |
|
| 120 |
-
|
| 121 |
-
"success": True,
|
| 122 |
-
"restaurant": restaurant_name,
|
| 123 |
-
"question": question,
|
| 124 |
-
"relevant_reviews": [r[1] for r in scored[:top_k]],
|
| 125 |
-
"review_count": min(top_k, len(reviews))
|
| 126 |
-
}
|
| 127 |
-
|
| 128 |
-
def save_report(restaurant_name: str, report_data: Dict, report_type: str = "analysis") -> Dict[str, Any]:
|
| 129 |
-
report_id = f"{restaurant_name}_{report_type}_{datetime.now().isoformat()}"
|
| 130 |
-
ANALYSIS_CACHE[report_id] = {"restaurant": restaurant_name, "type": report_type, "data": report_data}
|
| 131 |
-
return {"success": True, "report_id": report_id}
|
| 132 |
-
|
| 133 |
-
def list_tools() -> Dict[str, Any]:
|
| 134 |
-
return {
|
| 135 |
-
"success": True,
|
| 136 |
-
"tools": [
|
| 137 |
-
{"name": "index_reviews", "description": "Index reviews for RAG Q&A"},
|
| 138 |
-
{"name": "query_reviews", "description": "Answer questions about reviews"},
|
| 139 |
-
{"name": "save_report", "description": "Save analysis report"},
|
| 140 |
-
]
|
| 141 |
-
}
|
| 142 |
-
|
| 143 |
-
@mcp_api.get("/")
|
| 144 |
-
async def root():
|
| 145 |
-
return {"name": "Restaurant Intelligence MCP Server", "protocol": "MCP", "version": "1.0"}
|
| 146 |
-
|
| 147 |
-
@mcp_api.get("/health")
|
| 148 |
-
async def health():
|
| 149 |
-
return {"status": "healthy", "mcp": "enabled"}
|
| 150 |
-
|
| 151 |
-
@mcp_api.get("/tools")
|
| 152 |
-
async def get_tools():
|
| 153 |
-
return list_tools()
|
| 154 |
-
|
| 155 |
-
@mcp_api.post("/mcp/call")
|
| 156 |
-
async def call_tool(request: ToolRequest):
|
| 157 |
-
"""TRUE MCP interface - agent calls tools via this endpoint."""
|
| 158 |
-
tool_map = {
|
| 159 |
-
"index_reviews": lambda args: index_reviews(args["restaurant_name"], args["reviews"]),
|
| 160 |
-
"query_reviews": lambda args: query_reviews(args["restaurant_name"], args["question"], args.get("top_k", 5)),
|
| 161 |
-
"save_report": lambda args: save_report(args["restaurant_name"], args["report_data"], args.get("report_type", "analysis")),
|
| 162 |
-
"list_tools": lambda args: list_tools()
|
| 163 |
-
}
|
| 164 |
|
| 165 |
-
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
|
| 182 |
-
|
| 183 |
|
|
|
|
| 184 |
|
| 185 |
-
|
| 186 |
-
# MAIN ANALYSIS FUNCTIONS
|
| 187 |
-
# ============================================================================
|
| 188 |
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
|
|
|
| 192 |
|
|
|
|
|
|
|
| 193 |
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
return {
|
| 228 |
-
"
|
| 229 |
-
"
|
| 230 |
-
"
|
| 231 |
-
"
|
| 232 |
-
"metadata": result.get("metadata", {}),
|
| 233 |
}
|
| 234 |
|
| 235 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
@app.function(
|
| 237 |
image=image,
|
| 238 |
secrets=[modal.Secret.from_name("anthropic-api-key")],
|
| 239 |
-
timeout=
|
| 240 |
)
|
| 241 |
-
def
|
| 242 |
"""
|
| 243 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
|
| 245 |
-
|
| 246 |
-
This reduces response size by ~97% and fixes timeout issues.
|
| 247 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
|
| 249 |
# Detect platform
|
| 250 |
url_lower = url.lower()
|
|
@@ -253,7 +345,10 @@ def full_analysis_modal(url: str, max_reviews: int = 100) -> Dict[str, Any]:
|
|
| 253 |
if platform == "unknown":
|
| 254 |
return {"success": False, "error": "Unsupported platform. Use OpenTable or Google Maps."}
|
| 255 |
|
| 256 |
-
#
|
|
|
|
|
|
|
|
|
|
| 257 |
if platform == "opentable":
|
| 258 |
from src.scrapers.opentable_scraper import scrape_opentable
|
| 259 |
result = scrape_opentable(url=url, max_reviews=max_reviews, headless=True)
|
|
@@ -262,52 +357,181 @@ def full_analysis_modal(url: str, max_reviews: int = 100) -> Dict[str, Any]:
|
|
| 262 |
result = scrape_google_maps(url=url, max_reviews=max_reviews, headless=True)
|
| 263 |
|
| 264 |
if not result.get("success"):
|
| 265 |
-
return {"success": False, "error": result.get("error")}
|
| 266 |
|
|
|
|
|
|
|
|
|
|
| 267 |
from src.data_processing import process_reviews, clean_reviews_for_ai
|
| 268 |
-
from src.agent.base_agent import RestaurantAnalysisAgent
|
| 269 |
|
| 270 |
df = process_reviews(result)
|
| 271 |
reviews = clean_reviews_for_ai(df["review_text"].tolist(), verbose=False)
|
| 272 |
|
| 273 |
-
|
| 274 |
-
|
|
|
|
| 275 |
trend_data = []
|
| 276 |
for _, row in df.iterrows():
|
| 277 |
text = str(row.get("review_text", ""))
|
| 278 |
trend_data.append({
|
| 279 |
"date": str(row.get("date", "")),
|
| 280 |
"rating": float(row.get("overall_rating", 0) or 0),
|
| 281 |
-
"sentiment": calculate_sentiment(text)
|
| 282 |
})
|
| 283 |
|
| 284 |
-
# Extract restaurant name
|
| 285 |
if platform == "opentable":
|
| 286 |
restaurant_name = url.split("/")[-1].split("?")[0].replace("-", " ").title()
|
| 287 |
else:
|
| 288 |
-
# Google Maps
|
| 289 |
if '/place/' in url:
|
| 290 |
restaurant_name = url.split('/place/')[1].split('/')[0].replace('+', ' ').replace('%20', ' ')
|
| 291 |
else:
|
| 292 |
restaurant_name = "Restaurant"
|
| 293 |
|
| 294 |
-
#
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
|
| 302 |
-
|
| 303 |
-
REVIEW_INDEX[restaurant_name] = reviews
|
| 304 |
|
| 305 |
-
#
|
| 306 |
-
|
| 307 |
-
|
|
|
|
| 308 |
|
| 309 |
-
|
| 310 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
response_size = len(json.dumps(analysis))
|
| 312 |
print(f"[MODAL] Response size: {response_size / 1024:.1f} KB")
|
| 313 |
|
|
@@ -315,113 +539,61 @@ def full_analysis_modal(url: str, max_reviews: int = 100) -> Dict[str, Any]:
|
|
| 315 |
|
| 316 |
|
| 317 |
# ============================================================================
|
| 318 |
-
# FASTAPI APP
|
| 319 |
# ============================================================================
|
| 320 |
|
| 321 |
@app.function(
|
| 322 |
image=image,
|
| 323 |
secrets=[modal.Secret.from_name("anthropic-api-key")],
|
| 324 |
-
timeout=
|
| 325 |
)
|
| 326 |
@modal.asgi_app()
|
| 327 |
def fastapi_app():
|
| 328 |
-
"""Main API
|
| 329 |
from fastapi import FastAPI, HTTPException
|
| 330 |
from pydantic import BaseModel
|
| 331 |
|
| 332 |
-
web_app = FastAPI(title="Restaurant Intelligence API
|
| 333 |
|
| 334 |
class AnalyzeRequest(BaseModel):
|
| 335 |
url: str
|
| 336 |
max_reviews: int = 100
|
| 337 |
|
| 338 |
-
class MCPCallRequest(BaseModel):
|
| 339 |
-
tool_name: str
|
| 340 |
-
arguments: Dict[str, Any] = {}
|
| 341 |
-
|
| 342 |
@web_app.get("/")
|
| 343 |
async def root():
|
| 344 |
return {
|
| 345 |
"name": "Restaurant Intelligence API",
|
| 346 |
-
"version": "3.0",
|
| 347 |
-
"
|
| 348 |
-
"
|
| 349 |
-
"endpoints": {
|
| 350 |
-
"analyze": "/analyze",
|
| 351 |
-
"mcp_tools": "/mcp/call",
|
| 352 |
-
"mcp_list": "/mcp/tools"
|
| 353 |
-
}
|
| 354 |
}
|
| 355 |
|
| 356 |
@web_app.get("/health")
|
| 357 |
async def health():
|
| 358 |
-
return {"status": "healthy", "
|
| 359 |
|
| 360 |
@web_app.post("/analyze")
|
| 361 |
async def analyze(request: AnalyzeRequest):
|
| 362 |
try:
|
| 363 |
-
result =
|
| 364 |
return result
|
| 365 |
except Exception as e:
|
|
|
|
|
|
|
| 366 |
raise HTTPException(status_code=500, detail=str(e))
|
| 367 |
|
| 368 |
-
# MCP Endpoints
|
| 369 |
-
@web_app.get("/mcp/tools")
|
| 370 |
-
async def mcp_list_tools():
|
| 371 |
-
return {
|
| 372 |
-
"tools": [
|
| 373 |
-
{"name": "index_reviews", "description": "Index reviews for RAG Q&A"},
|
| 374 |
-
{"name": "query_reviews", "description": "Answer questions about reviews"},
|
| 375 |
-
{"name": "save_report", "description": "Save analysis report"},
|
| 376 |
-
]
|
| 377 |
-
}
|
| 378 |
-
|
| 379 |
-
@web_app.post("/mcp/call")
|
| 380 |
-
async def mcp_call(request: MCPCallRequest):
|
| 381 |
-
"""TRUE MCP interface."""
|
| 382 |
-
# For now, this delegates to local functions
|
| 383 |
-
if request.tool_name == "index_reviews":
|
| 384 |
-
args = request.arguments
|
| 385 |
-
REVIEW_INDEX[args["restaurant_name"]] = args["reviews"]
|
| 386 |
-
return {"success": True, "indexed": len(args["reviews"])}
|
| 387 |
-
|
| 388 |
-
elif request.tool_name == "query_reviews":
|
| 389 |
-
args = request.arguments
|
| 390 |
-
reviews = REVIEW_INDEX.get(args["restaurant_name"], [])
|
| 391 |
-
if not reviews:
|
| 392 |
-
return {"success": False, "error": "No reviews indexed"}
|
| 393 |
-
|
| 394 |
-
question_words = set(args["question"].lower().split())
|
| 395 |
-
scored = [(len(question_words & set(r.lower().split())), r) for r in reviews]
|
| 396 |
-
scored.sort(reverse=True, key=lambda x: x[0])
|
| 397 |
-
top_k = args.get("top_k", 5)
|
| 398 |
-
|
| 399 |
-
return {
|
| 400 |
-
"success": True,
|
| 401 |
-
"relevant_reviews": [r[1] for r in scored[:top_k]]
|
| 402 |
-
}
|
| 403 |
-
|
| 404 |
-
return {"success": False, "error": f"Unknown tool: {request.tool_name}"}
|
| 405 |
-
|
| 406 |
return web_app
|
| 407 |
|
| 408 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 409 |
@app.local_entrypoint()
|
| 410 |
def main():
|
| 411 |
-
print("🧪 Testing Modal deployment
|
| 412 |
-
|
| 413 |
-
print("1️⃣ Testing connection...")
|
| 414 |
-
result = hello.remote()
|
| 415 |
-
print(f"✅ {result}\n")
|
| 416 |
-
|
| 417 |
-
print("2️⃣ MCP Server deployed at:")
|
| 418 |
-
print(" https://tushar-pingle--restaurant-intelligence-mcp-server.modal.run")
|
| 419 |
|
| 420 |
-
print("
|
| 421 |
print(" https://tushar-pingle--restaurant-intelligence-fastapi-app.modal.run")
|
| 422 |
|
| 423 |
-
print("\n✅
|
| 424 |
-
print("\n📊 Optimizations enabled:")
|
| 425 |
-
print(" - Slim trend_data (no full review text)")
|
| 426 |
-
print(" - Pre-calculated sentiment in backend")
|
| 427 |
-
print(" - ~97% smaller response size")
|
|
|
|
| 1 |
"""
|
| 2 |
+
Modal Backend for Restaurant Intelligence Agent - PARALLEL OPTIMIZED
|
| 3 |
+
Version 3.0 - Uses Modal's parallel processing for 5x speed improvement
|
| 4 |
|
| 5 |
+
KEY OPTIMIZATIONS:
|
| 6 |
+
1. Parallel batch processing with .map() - Process all batches simultaneously
|
| 7 |
+
2. Parallel insights generation - Chef + Manager at same time
|
| 8 |
+
3. Larger batch sizes (30 reviews instead of 20)
|
| 9 |
+
4. Reduced timeout since parallel is faster
|
| 10 |
|
| 11 |
+
TARGET: 1000 reviews in ~5 minutes (down from 15+ minutes)
|
|
|
|
|
|
|
| 12 |
"""
|
| 13 |
|
| 14 |
import modal
|
| 15 |
from typing import Dict, Any, List
|
| 16 |
+
import os
|
| 17 |
+
import json
|
| 18 |
|
| 19 |
# Create Modal app
|
| 20 |
app = modal.App("restaurant-intelligence")
|
|
|
|
| 34 |
"matplotlib",
|
| 35 |
"fastapi[standard]",
|
| 36 |
"httpx",
|
| 37 |
+
"fastmcp",
|
| 38 |
)
|
| 39 |
.add_local_python_source("src")
|
| 40 |
)
|
| 41 |
|
| 42 |
|
| 43 |
# ============================================================================
|
| 44 |
+
# HELPER FUNCTIONS
|
| 45 |
# ============================================================================
|
| 46 |
|
| 47 |
def calculate_sentiment(text: str) -> float:
|
|
|
|
| 66 |
|
| 67 |
|
| 68 |
# ============================================================================
|
| 69 |
+
# PARALLEL BATCH PROCESSOR - The key optimization!
|
| 70 |
# ============================================================================
|
| 71 |
|
| 72 |
+
def _attach_review_text(entries: List[Dict[str, Any]], reviews: List[str], start_index: int) -> None:
    """Replace each entry's cited review numbers with index/text records, in place.

    Args:
        entries: Extracted items (food, drinks or aspects) as parsed from the
            model's JSON output.
        reviews: The review texts of the current batch; cited numbers index
            into this list.
        start_index: Offset of the batch's first review, added to every cited
            number to produce ``review_index``.

    Cited values that are not in-range integers are dropped. A missing or
    non-list ``related_reviews`` is treated as empty and a non-string ``name``
    is left untouched, so one malformed entry does not fail the whole batch.
    """
    for entry in entries:
        cited = entry.get('related_reviews')
        if not isinstance(cited, list):
            cited = []
        entry['related_reviews'] = [
            {'review_index': start_index + idx, 'review_text': reviews[idx]}
            for idx in cited
            if isinstance(idx, int) and 0 <= idx < len(reviews)
        ]
        name = entry.get('name')
        if isinstance(name, str):
            entry['name'] = name.lower()


@app.function(
    image=image,
    secrets=[modal.Secret.from_name("anthropic-api-key")],
    timeout=120,  # 2 min per batch is plenty
    retries=2,
)
def process_batch(batch_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Process a single batch of reviews - runs in PARALLEL across containers!

    This function is called via .map() to process all batches simultaneously.

    Args:
        batch_data: Dict with the keys ``reviews`` (list of review texts),
            ``restaurant_name``, ``batch_index`` (used for logging and echoed
            back) and ``start_index`` (offset of this batch's first review).

    Returns:
        ``{"success": bool, "batch_index": ..., "data": {...}}`` where ``data``
        is the parsed model output with ``food_items``, ``drinks`` and
        ``aspects``. On any failure ``success`` is False and the three lists
        are empty.

    NOTE(review): every exception is caught below and converted into a
    ``success: False`` result, so ``retries=2`` presumably never triggers --
    confirm against Modal's retry semantics.
    """
    from anthropic import Anthropic
    import os

    reviews = batch_data["reviews"]
    restaurant_name = batch_data["restaurant_name"]
    batch_index = batch_data["batch_index"]
    start_index = batch_data["start_index"]

    print(f"🔄 Processing batch {batch_index} ({len(reviews)} reviews)...")

    client = Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])

    # Build extraction prompt; reviews are numbered from 0 within the batch so
    # the model can cite them by number instead of repeating their text.
    reviews_text = "\n\n".join(
        f"[Review {i}]: {review}" for i, review in enumerate(reviews)
    )

    prompt = f"""You are analyzing customer reviews for {restaurant_name}. Extract BOTH menu items AND aspects in ONE PASS.

REVIEWS:
{reviews_text}

YOUR TASK - Extract THREE things simultaneously:
1. **MENU ITEMS** (food & drinks mentioned)
2. **ASPECTS** (what customers care about: service, ambience, etc.)
3. **SENTIMENT** for each

SENTIMENT SCALE (IMPORTANT):
- **Positive (0.6 to 1.0):** Customer clearly enjoyed/praised this item or aspect
- **Neutral (0.0 to 0.59):** Mixed feelings, okay but not exceptional, or simply mentioned without strong opinion
- **Negative (-1.0 to -0.01):** Customer complained, criticized, or expressed disappointment

RULES:
- Specific items only: "salmon sushi", "miso soup", "sake"
- Separate food from drinks
- Lowercase names
- For EACH item/aspect, list which review NUMBERS mention it (just indices, not text)

OUTPUT (JSON):
{{
"food_items": [
{{"name": "item name", "mention_count": 2, "sentiment": 0.85, "category": "type", "related_reviews": [0, 5]}}
],
"drinks": [
{{"name": "drink name", "mention_count": 1, "sentiment": 0.7, "category": "alcohol", "related_reviews": [3]}}
],
"aspects": [
{{"name": "service speed", "mention_count": 3, "sentiment": 0.65, "description": "brief desc", "related_reviews": [1, 2, 7]}}
]
}}

CRITICAL: Output ONLY valid JSON, no other text. Use sentiment scale: >= 0.6 positive, 0-0.59 neutral, < 0 negative

Extract everything:"""

    # Shape returned to the caller whenever the batch cannot be processed.
    empty_data = {"food_items": [], "drinks": [], "aspects": []}

    try:
        response = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=4000,
            temperature=0.3,
            messages=[{"role": "user", "content": prompt}]
        )

        # Strip Markdown code fences the model may wrap around the JSON.
        result_text = response.content[0].text
        result_text = result_text.replace('```json', '').replace('```', '').strip()

        data = json.loads(result_text)

        # Map review indices back to full text (same treatment for all three
        # result categories).
        for category in ('food_items', 'drinks', 'aspects'):
            _attach_review_text(data.get(category) or [], reviews, start_index)

        print(f"✅ Batch {batch_index} complete: {len(data.get('food_items', []))} food, {len(data.get('drinks', []))} drinks, {len(data.get('aspects', []))} aspects")
        return {"success": True, "batch_index": batch_index, "data": data}

    except json.JSONDecodeError as e:
        print(f"⚠️ Batch {batch_index} JSON error: {e}")
        return {"success": False, "batch_index": batch_index, "data": empty_data}
    except Exception as e:
        # Deliberate best-effort: one failed batch must not abort the others.
        print(f"❌ Batch {batch_index} error: {e}")
        return {"success": False, "batch_index": batch_index, "data": empty_data}
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
def _format_sentiment_lines(header: str, entries: List[Dict[str, Any]]) -> str:
    """Render one prompt section: a header followed by one line per entry.

    Each line shows a traffic-light emoji (>= 0.6 green, >= 0 yellow, else
    red), the entry name, its signed sentiment and its mention count. A
    missing, null or non-numeric sentiment is treated as 0.0 so that a single
    malformed entry cannot raise while the prompt is being built.
    """
    lines = [header]
    for entry in entries:
        try:
            score = float(entry.get('sentiment', 0) or 0)
        except (TypeError, ValueError):
            score = 0.0
        emoji = "🟢" if score >= 0.6 else "🟡" if score >= 0 else "🔴"
        lines.append(f" {emoji} {entry.get('name', '?')}: sentiment {score:+.2f}, {entry.get('mention_count', 0)} mentions")
    return "\n".join(lines)


@app.function(
    image=image,
    secrets=[modal.Secret.from_name("anthropic-api-key")],
    timeout=180,  # 3 min for insights
)
def generate_insights_parallel(analysis_data: Dict[str, Any], restaurant_name: str, role: str) -> Dict[str, Any]:
    """Generate insights for a single role - runs in parallel with other insights.

    Args:
        analysis_data: Aggregated analysis; read from
            ``analysis_data['menu_analysis']['food_items']`` (first 20) and
            ``analysis_data['aspect_analysis']['aspects']`` (first 20).
            Drinks are not included in the prompt.
        restaurant_name: Name interpolated into the prompt.
        role: ``'chef'`` selects the kitchen-focused prompt; any other value
            selects the manager/operations prompt.

    Returns:
        ``{"role": role, "insights": {...}}``. When the model call fails or
        its reply contains no parsable JSON object, ``insights`` is the
        result of ``_fallback_insights(role)``.
    """
    from anthropic import Anthropic
    import os
    import re

    print(f"🧠 Generating {role} insights...")

    client = Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])

    # Build prompt based on role
    menu_items = analysis_data.get('menu_analysis', {}).get('food_items', [])[:20]
    aspects = analysis_data.get('aspect_analysis', {}).get('aspects', [])[:20]

    menu_summary = _format_sentiment_lines("TOP MENU ITEMS:", menu_items)
    aspect_summary = _format_sentiment_lines("TOP ASPECTS:", aspects)

    if role == 'chef':
        audience = "HEAD CHEF"
        focus = "Focus on: Food quality, menu items, ingredients, presentation, portions, consistency"
        topic_filter = "ONLY on food/kitchen topics"
    else:
        audience = "RESTAURANT MANAGER"
        focus = "Focus on: Service, staff, wait times, ambience, value, cleanliness"
        topic_filter = "ONLY on operations/service topics"

    prompt = f"""You are an expert restaurant consultant analyzing feedback for {restaurant_name}.

{menu_summary}

{aspect_summary}

SENTIMENT SCALE:
- 🟢 POSITIVE (>= 0.6): Highlight as STRENGTH
- 🟡 NEUTRAL (0 to 0.59): Room for improvement
- 🔴 NEGATIVE (< 0): Flag as CONCERN

YOUR TASK: Generate insights for the {audience}.
{focus}

RULES:
1. Focus {topic_filter}
2. STRENGTHS from items with sentiment >= 0.6
3. CONCERNS from items with sentiment < 0
4. Output ONLY valid JSON

OUTPUT:
{{
"summary": "2-3 sentence executive summary",
"strengths": ["strength 1", "strength 2", "strength 3", "strength 4", "strength 5"],
"concerns": ["concern 1", "concern 2", "concern 3"],
"recommendations": [
{{"priority": "high", "action": "action", "reason": "why", "evidence": "data"}},
{{"priority": "medium", "action": "action", "reason": "why", "evidence": "data"}},
{{"priority": "low", "action": "action", "reason": "why", "evidence": "data"}}
]
}}

Generate {role} insights:"""

    try:
        response = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=2000,
            temperature=0.4,
            messages=[{"role": "user", "content": prompt}]
        )

        # Strip Markdown code fences the model may wrap around the JSON.
        result_text = response.content[0].text.strip()
        result_text = result_text.replace('```json', '').replace('```', '').strip()

        # Find JSON in response: greedy match from the first '{' to the last
        # '}' so surrounding commentary is ignored.
        match = re.search(r'\{[\s\S]*\}', result_text)
        if match:
            insights = json.loads(match.group())
            print(f"✅ {role.title()} insights generated")
            return {"role": role, "insights": insights}
        else:
            print(f"⚠️ No JSON found in {role} response")
            return {"role": role, "insights": _fallback_insights(role)}

    except Exception as e:
        # Best-effort: API errors and invalid JSON both degrade to the
        # placeholder insights instead of failing the caller.
        print(f"❌ Error generating {role} insights: {e}")
        return {"role": role, "insights": _fallback_insights(role)}
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
def _fallback_insights(role: str) -> Dict[str, Any]:
|
| 306 |
+
"""Fallback insights if generation fails."""
|
| 307 |
return {
|
| 308 |
+
"summary": f"Analysis complete. See data for {role} insights.",
|
| 309 |
+
"strengths": ["Data available in charts"],
|
| 310 |
+
"concerns": ["Review individual items for details"],
|
| 311 |
+
"recommendations": [{"priority": "medium", "action": "Review data", "reason": "Auto-generated", "evidence": "N/A"}]
|
|
|
|
| 312 |
}
|
| 313 |
|
| 314 |
|
| 315 |
+
# ============================================================================
|
| 316 |
+
# MAIN ANALYSIS FUNCTION - PARALLEL OPTIMIZED
|
| 317 |
+
# ============================================================================
|
| 318 |
+
|
| 319 |
@app.function(
|
| 320 |
image=image,
|
| 321 |
secrets=[modal.Secret.from_name("anthropic-api-key")],
|
| 322 |
+
timeout=600, # 10 min max (down from 40 min)
|
| 323 |
)
|
| 324 |
+
def full_analysis_parallel(url: str, max_reviews: int = 100) -> Dict[str, Any]:
|
| 325 |
"""
|
| 326 |
+
PARALLEL OPTIMIZED analysis pipeline.
|
| 327 |
+
|
| 328 |
+
Speed improvements:
|
| 329 |
+
1. Batches processed in PARALLEL via .map()
|
| 330 |
+
2. Chef + Manager insights generated in PARALLEL
|
| 331 |
+
3. Larger batch size (30 reviews)
|
| 332 |
|
| 333 |
+
Target: 1000 reviews in ~5 minutes
|
|
|
|
| 334 |
"""
|
| 335 |
+
import time
|
| 336 |
+
start_time = time.time()
|
| 337 |
+
|
| 338 |
+
print(f"🚀 Starting PARALLEL analysis for {url}")
|
| 339 |
+
print(f"📊 Max reviews: {max_reviews}")
|
| 340 |
|
| 341 |
# Detect platform
|
| 342 |
url_lower = url.lower()
|
|
|
|
| 345 |
if platform == "unknown":
|
| 346 |
return {"success": False, "error": "Unsupported platform. Use OpenTable or Google Maps."}
|
| 347 |
|
| 348 |
+
# Phase 1: Scrape reviews
|
| 349 |
+
print("📥 Phase 1: Scraping reviews...")
|
| 350 |
+
scrape_start = time.time()
|
| 351 |
+
|
| 352 |
if platform == "opentable":
|
| 353 |
from src.scrapers.opentable_scraper import scrape_opentable
|
| 354 |
result = scrape_opentable(url=url, max_reviews=max_reviews, headless=True)
|
|
|
|
| 357 |
result = scrape_google_maps(url=url, max_reviews=max_reviews, headless=True)
|
| 358 |
|
| 359 |
if not result.get("success"):
|
| 360 |
+
return {"success": False, "error": result.get("error", "Scraping failed")}
|
| 361 |
|
| 362 |
+
print(f"✅ Scraping complete in {time.time() - scrape_start:.1f}s")
|
| 363 |
+
|
| 364 |
+
# Process reviews
|
| 365 |
from src.data_processing import process_reviews, clean_reviews_for_ai
|
|
|
|
| 366 |
|
| 367 |
df = process_reviews(result)
|
| 368 |
reviews = clean_reviews_for_ai(df["review_text"].tolist(), verbose=False)
|
| 369 |
|
| 370 |
+
print(f"📊 Total reviews: {len(reviews)}")
|
| 371 |
+
|
| 372 |
+
# Create trend data
|
| 373 |
trend_data = []
|
| 374 |
for _, row in df.iterrows():
|
| 375 |
text = str(row.get("review_text", ""))
|
| 376 |
trend_data.append({
|
| 377 |
"date": str(row.get("date", "")),
|
| 378 |
"rating": float(row.get("overall_rating", 0) or 0),
|
| 379 |
+
"sentiment": calculate_sentiment(text)
|
| 380 |
})
|
| 381 |
|
| 382 |
+
# Extract restaurant name
|
| 383 |
if platform == "opentable":
|
| 384 |
restaurant_name = url.split("/")[-1].split("?")[0].replace("-", " ").title()
|
| 385 |
else:
|
|
|
|
| 386 |
if '/place/' in url:
|
| 387 |
restaurant_name = url.split('/place/')[1].split('/')[0].replace('+', ' ').replace('%20', ' ')
|
| 388 |
else:
|
| 389 |
restaurant_name = "Restaurant"
|
| 390 |
|
| 391 |
+
# Phase 2: PARALLEL batch extraction
|
| 392 |
+
print("🔄 Phase 2: PARALLEL batch extraction...")
|
| 393 |
+
extract_start = time.time()
|
| 394 |
+
|
| 395 |
+
BATCH_SIZE = 30 # Larger batches = fewer API calls
|
| 396 |
+
batches = []
|
| 397 |
+
for i in range(0, len(reviews), BATCH_SIZE):
|
| 398 |
+
batch_reviews = reviews[i:i+BATCH_SIZE]
|
| 399 |
+
batches.append({
|
| 400 |
+
"reviews": batch_reviews,
|
| 401 |
+
"restaurant_name": restaurant_name,
|
| 402 |
+
"batch_index": len(batches) + 1,
|
| 403 |
+
"start_index": i
|
| 404 |
+
})
|
| 405 |
+
|
| 406 |
+
print(f"📦 Created {len(batches)} batches of ~{BATCH_SIZE} reviews each")
|
| 407 |
+
print(f"🚀 Processing ALL batches in PARALLEL...")
|
| 408 |
+
|
| 409 |
+
# THIS IS THE KEY: Process all batches in parallel!
|
| 410 |
+
batch_results = list(process_batch.map(batches))
|
| 411 |
|
| 412 |
+
print(f"✅ All batches complete in {time.time() - extract_start:.1f}s")
|
|
|
|
| 413 |
|
| 414 |
+
# Merge results from all batches
|
| 415 |
+
all_food_items = {}
|
| 416 |
+
all_drinks = {}
|
| 417 |
+
all_aspects = {}
|
| 418 |
|
| 419 |
+
for batch_result in batch_results:
|
| 420 |
+
if not batch_result.get("success"):
|
| 421 |
+
continue
|
| 422 |
+
|
| 423 |
+
data = batch_result.get("data", {})
|
| 424 |
+
|
| 425 |
+
# Merge food items
|
| 426 |
+
for item in data.get('food_items', []):
|
| 427 |
+
name = item.get('name', '').lower()
|
| 428 |
+
if not name:
|
| 429 |
+
continue
|
| 430 |
+
if name in all_food_items:
|
| 431 |
+
all_food_items[name]['mention_count'] += item.get('mention_count', 1)
|
| 432 |
+
all_food_items[name]['related_reviews'].extend(item.get('related_reviews', []))
|
| 433 |
+
# Weighted average sentiment
|
| 434 |
+
old_count = all_food_items[name]['mention_count'] - item.get('mention_count', 1)
|
| 435 |
+
new_count = item.get('mention_count', 1)
|
| 436 |
+
if old_count + new_count > 0:
|
| 437 |
+
old_sent = all_food_items[name]['sentiment']
|
| 438 |
+
new_sent = item.get('sentiment', 0)
|
| 439 |
+
all_food_items[name]['sentiment'] = (old_sent * old_count + new_sent * new_count) / (old_count + new_count)
|
| 440 |
+
else:
|
| 441 |
+
all_food_items[name] = item
|
| 442 |
+
|
| 443 |
+
# Merge drinks
|
| 444 |
+
for item in data.get('drinks', []):
|
| 445 |
+
name = item.get('name', '').lower()
|
| 446 |
+
if not name:
|
| 447 |
+
continue
|
| 448 |
+
if name in all_drinks:
|
| 449 |
+
all_drinks[name]['mention_count'] += item.get('mention_count', 1)
|
| 450 |
+
all_drinks[name]['related_reviews'].extend(item.get('related_reviews', []))
|
| 451 |
+
old_count = all_drinks[name]['mention_count'] - item.get('mention_count', 1)
|
| 452 |
+
new_count = item.get('mention_count', 1)
|
| 453 |
+
if old_count + new_count > 0:
|
| 454 |
+
old_sent = all_drinks[name]['sentiment']
|
| 455 |
+
new_sent = item.get('sentiment', 0)
|
| 456 |
+
all_drinks[name]['sentiment'] = (old_sent * old_count + new_sent * new_count) / (old_count + new_count)
|
| 457 |
+
else:
|
| 458 |
+
all_drinks[name] = item
|
| 459 |
+
|
| 460 |
+
# Merge aspects
|
| 461 |
+
for aspect in data.get('aspects', []):
|
| 462 |
+
name = aspect.get('name', '').lower()
|
| 463 |
+
if not name:
|
| 464 |
+
continue
|
| 465 |
+
if name in all_aspects:
|
| 466 |
+
all_aspects[name]['mention_count'] += aspect.get('mention_count', 1)
|
| 467 |
+
all_aspects[name]['related_reviews'].extend(aspect.get('related_reviews', []))
|
| 468 |
+
old_count = all_aspects[name]['mention_count'] - aspect.get('mention_count', 1)
|
| 469 |
+
new_count = aspect.get('mention_count', 1)
|
| 470 |
+
if old_count + new_count > 0:
|
| 471 |
+
old_sent = all_aspects[name]['sentiment']
|
| 472 |
+
new_sent = aspect.get('sentiment', 0)
|
| 473 |
+
all_aspects[name]['sentiment'] = (old_sent * old_count + new_sent * new_count) / (old_count + new_count)
|
| 474 |
+
else:
|
| 475 |
+
all_aspects[name] = aspect
|
| 476 |
+
|
| 477 |
+
# Sort by mention count
|
| 478 |
+
food_list = sorted(all_food_items.values(), key=lambda x: x.get('mention_count', 0), reverse=True)
|
| 479 |
+
drinks_list = sorted(all_drinks.values(), key=lambda x: x.get('mention_count', 0), reverse=True)
|
| 480 |
+
aspects_list = sorted(all_aspects.values(), key=lambda x: x.get('mention_count', 0), reverse=True)
|
| 481 |
+
|
| 482 |
+
print(f"📊 Discovered: {len(food_list)} food + {len(drinks_list)} drinks + {len(aspects_list)} aspects")
|
| 483 |
+
|
| 484 |
+
# Build analysis data
|
| 485 |
+
analysis_data = {
|
| 486 |
+
"menu_analysis": {
|
| 487 |
+
"food_items": food_list,
|
| 488 |
+
"drinks": drinks_list
|
| 489 |
+
},
|
| 490 |
+
"aspect_analysis": {
|
| 491 |
+
"aspects": aspects_list
|
| 492 |
+
}
|
| 493 |
+
}
|
| 494 |
+
|
| 495 |
+
# Phase 3: PARALLEL insights generation
|
| 496 |
+
print("🧠 Phase 3: PARALLEL insights generation...")
|
| 497 |
+
insights_start = time.time()
|
| 498 |
+
|
| 499 |
+
# Generate both insights in parallel!
|
| 500 |
+
insight_inputs = [
|
| 501 |
+
(analysis_data, restaurant_name, "chef"),
|
| 502 |
+
(analysis_data, restaurant_name, "manager")
|
| 503 |
+
]
|
| 504 |
+
|
| 505 |
+
insight_results = list(generate_insights_parallel.starmap(insight_inputs))
|
| 506 |
+
|
| 507 |
+
insights = {}
|
| 508 |
+
for result in insight_results:
|
| 509 |
+
insights[result["role"]] = result["insights"]
|
| 510 |
+
|
| 511 |
+
print(f"✅ Insights complete in {time.time() - insights_start:.1f}s")
|
| 512 |
+
|
| 513 |
+
# Build final response
|
| 514 |
+
total_time = time.time() - start_time
|
| 515 |
+
print(f"🎉 TOTAL TIME: {total_time:.1f}s ({total_time/60:.1f} min)")
|
| 516 |
+
|
| 517 |
+
analysis = {
|
| 518 |
+
"success": True,
|
| 519 |
+
"restaurant_name": restaurant_name,
|
| 520 |
+
"menu_analysis": analysis_data["menu_analysis"],
|
| 521 |
+
"aspect_analysis": analysis_data["aspect_analysis"],
|
| 522 |
+
"insights": insights,
|
| 523 |
+
"trend_data": trend_data,
|
| 524 |
+
"source": platform,
|
| 525 |
+
"stats": {
|
| 526 |
+
"total_reviews": len(reviews),
|
| 527 |
+
"food_items": len(food_list),
|
| 528 |
+
"drinks": len(drinks_list),
|
| 529 |
+
"aspects": len(aspects_list),
|
| 530 |
+
"processing_time_seconds": round(total_time, 1)
|
| 531 |
+
}
|
| 532 |
+
}
|
| 533 |
+
|
| 534 |
+
# Log response size
|
| 535 |
response_size = len(json.dumps(analysis))
|
| 536 |
print(f"[MODAL] Response size: {response_size / 1024:.1f} KB")
|
| 537 |
|
|
|
|
| 539 |
|
| 540 |
|
| 541 |
# ============================================================================
|
| 542 |
+
# FASTAPI APP - Updated to use parallel function
|
| 543 |
# ============================================================================
|
| 544 |
|
| 545 |
@app.function(
    image=image,
    secrets=[modal.Secret.from_name("anthropic-api-key")],
    timeout=900,  # 15 min timeout for the API endpoint
)
@modal.asgi_app()
def fastapi_app():
    """
    Build the FastAPI application served by Modal.

    Routes:
        GET  /         - static service metadata.
        GET  /health   - liveness probe.
        POST /analyze  - runs ``full_analysis_parallel`` remotely for the
                         given URL and returns its result dict unchanged.

    Returns:
        The configured ``FastAPI`` instance.
    """
    from fastapi import FastAPI, HTTPException
    from pydantic import BaseModel

    web_app = FastAPI(title="Restaurant Intelligence API - PARALLEL OPTIMIZED")

    class AnalyzeRequest(BaseModel):
        # Restaurant page URL to analyze.
        url: str
        # Upper bound on the number of reviews to scrape.
        max_reviews: int = 100

    @web_app.get("/")
    async def root():
        return {
            "name": "Restaurant Intelligence API",
            "version": "3.0-parallel",
            "optimizations": ["parallel_batches", "parallel_insights", "larger_batch_size"],
            "target": "1000 reviews in ~5 minutes"
        }

    @web_app.get("/health")
    async def health():
        return {"status": "healthy", "version": "parallel"}

    # NOTE: deliberately a plain ``def`` rather than ``async def``. The
    # ``.remote()`` call below blocks until the analysis finishes (minutes);
    # inside an ``async def`` handler that would freeze the event loop and
    # stall every other request, including /health. FastAPI runs sync
    # handlers in its threadpool, so the loop stays responsive.
    @web_app.post("/analyze")
    def analyze(request: AnalyzeRequest):
        try:
            return full_analysis_parallel.remote(
                url=request.url,
                max_reviews=request.max_reviews,
            )
        except Exception as e:
            # Top-level boundary: log the full traceback server-side and
            # surface the failure to the client as a 500.
            import traceback
            traceback.print_exc()
            raise HTTPException(status_code=500, detail=str(e)) from e

    return web_app
|
| 586 |
|
| 587 |
|
| 588 |
+
# ============================================================================
|
| 589 |
+
# LOCAL ENTRYPOINT FOR TESTING
|
| 590 |
+
# ============================================================================
|
| 591 |
+
|
| 592 |
@app.local_entrypoint()
def main():
    """Print the expected deployment URL and the deploy command.

    Only writes to stdout; no remote function is invoked.
    """
    messages = (
        "🧪 Testing PARALLEL Modal deployment...\n",
        "1️⃣ API will be deployed at:",
        " https://tushar-pingle--restaurant-intelligence-fastapi-app.modal.run",
        "\n✅ Deploy with: modal deploy modal_backend.py",
    )
    for message in messages:
        print(message)
|
|
|
|
|
|
|
|
|
|
|
|
src/data_processing/review_cleaner.py
CHANGED
|
@@ -1,113 +1,151 @@
|
|
| 1 |
"""
|
| 2 |
-
Review Text Cleaner
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
import re
|
| 7 |
import unicodedata
|
| 8 |
-
from typing import List
|
| 9 |
|
| 10 |
|
| 11 |
class ReviewCleaner:
|
| 12 |
"""
|
| 13 |
-
Cleans review text
|
| 14 |
"""
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
def clean_review(self, text: str) -> str:
|
| 20 |
"""
|
| 21 |
Clean a single review text.
|
| 22 |
|
| 23 |
-
|
| 24 |
-
text: Raw review text
|
| 25 |
-
|
| 26 |
-
Returns:
|
| 27 |
-
Cleaned text safe for AI processing
|
| 28 |
"""
|
| 29 |
if not text or not isinstance(text, str):
|
| 30 |
return ""
|
| 31 |
|
| 32 |
-
|
|
|
|
|
|
|
| 33 |
text = ' '.join(text.split())
|
| 34 |
|
| 35 |
-
# 2. Remove emojis
|
| 36 |
text = self._remove_emojis(text)
|
| 37 |
|
| 38 |
-
# 3.
|
| 39 |
text = text.replace('"', '"').replace('"', '"')
|
| 40 |
text = text.replace("'", "'").replace("'", "'")
|
| 41 |
|
| 42 |
-
# 4. Remove
|
| 43 |
-
text = text.replace('\n', ' ')
|
| 44 |
-
text = text.
|
| 45 |
-
text = text.replace('\t', ' ') # Remove tabs
|
| 46 |
|
| 47 |
-
# 5.
|
| 48 |
-
text = ''.join(char for char in text if unicodedata.category(char)[0] != 'C')
|
| 49 |
-
|
| 50 |
-
# 6. Normalize multiple spaces
|
| 51 |
text = re.sub(r'\s+', ' ', text)
|
| 52 |
|
| 53 |
-
#
|
| 54 |
-
if len(text) >
|
| 55 |
-
text = text[:
|
| 56 |
|
| 57 |
-
#
|
| 58 |
text = text.strip()
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
return text
|
| 61 |
|
| 62 |
def _remove_emojis(self, text: str) -> str:
|
| 63 |
-
"""
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
| 65 |
emoji_pattern = re.compile(
|
| 66 |
"["
|
| 67 |
"\U0001F600-\U0001F64F" # emoticons
|
| 68 |
-
"\U0001F300-\U0001F5FF" # symbols & pictographs
|
| 69 |
"\U0001F680-\U0001F6FF" # transport & map symbols
|
| 70 |
"\U0001F1E0-\U0001F1FF" # flags
|
| 71 |
-
"\
|
| 72 |
-
"\
|
|
|
|
|
|
|
| 73 |
"]+",
|
| 74 |
flags=re.UNICODE
|
| 75 |
)
|
| 76 |
-
return emoji_pattern.sub(
|
| 77 |
|
| 78 |
def clean_reviews(self, reviews: List[str]) -> List[str]:
|
| 79 |
"""
|
| 80 |
Clean a list of reviews.
|
| 81 |
|
| 82 |
-
|
| 83 |
-
reviews: List of raw review texts
|
| 84 |
-
|
| 85 |
-
Returns:
|
| 86 |
-
List of cleaned review texts
|
| 87 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
cleaned = []
|
| 89 |
for i, review in enumerate(reviews):
|
|
|
|
| 90 |
cleaned_text = self.clean_review(review)
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
return cleaned
|
| 97 |
|
| 98 |
-
def get_cleaning_stats(self
|
| 99 |
"""Get statistics about the cleaning process."""
|
| 100 |
-
original_chars = sum(len(r) for r in original)
|
| 101 |
-
cleaned_chars = sum(len(r) for r in cleaned)
|
| 102 |
-
|
| 103 |
return {
|
| 104 |
-
"original_count":
|
| 105 |
-
"cleaned_count":
|
| 106 |
-
"
|
| 107 |
-
"
|
| 108 |
-
"
|
| 109 |
-
"
|
| 110 |
-
"
|
| 111 |
}
|
| 112 |
|
| 113 |
|
|
@@ -115,39 +153,85 @@ def clean_reviews_for_ai(reviews: List[str], verbose: bool = True) -> List[str]:
|
|
| 115 |
"""
|
| 116 |
Convenience function to clean reviews.
|
| 117 |
|
| 118 |
-
|
| 119 |
-
reviews: Raw review texts
|
| 120 |
-
verbose: Print cleaning stats
|
| 121 |
-
|
| 122 |
-
Returns:
|
| 123 |
-
Cleaned review texts
|
| 124 |
"""
|
| 125 |
-
cleaner = ReviewCleaner()
|
| 126 |
cleaned = cleaner.clean_reviews(reviews)
|
| 127 |
|
| 128 |
if verbose:
|
| 129 |
-
stats = cleaner.get_cleaning_stats(
|
| 130 |
print(f"🧹 Cleaned {stats['original_count']} reviews:")
|
| 131 |
-
print(f"
|
| 132 |
-
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
return cleaned
|
| 136 |
|
| 137 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
if __name__ == "__main__":
|
| 139 |
# Test the cleaner
|
| 140 |
test_reviews = [
|
| 141 |
-
'This place is "amazing"!
|
| 142 |
-
"The food was great\n\nbut service was slow",
|
| 143 |
-
'Chef said "it\'s the best" and I agree!
|
| 144 |
-
"🍕🍝🍷 Loved everything!!!",
|
| 145 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
]
|
| 147 |
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
Review Text Cleaner - FIXED VERSION
|
| 3 |
+
Less aggressive cleaning that preserves more reviews.
|
| 4 |
+
|
| 5 |
+
FIXES:
|
| 6 |
+
1. Don't discard reviews just because they're short
|
| 7 |
+
2. Keep reviews with minimal cleaning
|
| 8 |
+
3. Better handling of special characters
|
| 9 |
+
4. Log what's being cleaned for debugging
|
| 10 |
+
|
| 11 |
+
Author: Tushar Pingle
|
| 12 |
+
Updated: Nov 2024
|
| 13 |
"""
|
| 14 |
|
| 15 |
import re
|
| 16 |
import unicodedata
|
| 17 |
+
from typing import List, Tuple
|
| 18 |
|
| 19 |
|
| 20 |
class ReviewCleaner:
    """
    Cleans review text while preserving as much content as possible.

    Cleaning collapses whitespace, strips emoji pictographs, normalizes
    typographic quotes to ASCII, removes control/format characters and
    truncates very long reviews. Per-run statistics are kept in
    ``self.stats`` and exposed through ``get_cleaning_stats()``.
    """

    # Minimum length (characters, measured after cleaning) for a review to be
    # kept by clean_reviews().
    MIN_REVIEW_LENGTH = 10

    # Cleaned reviews longer than this are cut and suffixed with "...".
    MAX_REVIEW_LENGTH = 1500

    # Typographic quotes -> ASCII equivalents. Written as escapes so the
    # mapping cannot silently degrade into a no-op if the file is re-encoded.
    _QUOTE_TABLE = str.maketrans({
        "\u201c": '"',
        "\u201d": '"',
        "\u2018": "'",
        "\u2019": "'",
    })

    # Compiled once at class creation instead of on every call.
    _EMOJI_PATTERN = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags
        "\U0001F900-\U0001F9FF"  # supplemental symbols
        "\U0001FA00-\U0001FA6F"  # chess symbols
        "\U0001FA70-\U0001FAFF"  # symbols extended
        "\U00002702-\U000027B0"  # dingbats
        "\uFE0F"                 # variation selector left behind by a removed emoji
        "]+",
        flags=re.UNICODE,
    )

    def __init__(self, verbose: bool = False):
        """
        Args:
            verbose: When True, clean_reviews() prints a line for every
                review it drops.
        """
        self.verbose = verbose
        self.stats = self._fresh_stats()

    @staticmethod
    def _fresh_stats(total: int = 0) -> dict:
        """Return a zeroed statistics dict with ``total`` pre-filled."""
        return {
            'total': total,
            'kept': 0,
            'removed_empty': 0,
            'removed_short': 0,
            'chars_original': 0,
            'chars_cleaned': 0
        }

    def clean_review(self, text: str) -> str:
        """
        Clean a single review text.

        Args:
            text: Raw review text. Falsy or non-string values are accepted.

        Returns:
            The cleaned text, or "" when ``text`` is falsy or not a string.
            Character counts of processed strings are accumulated in
            ``self.stats``.
        """
        if not text or not isinstance(text, str):
            return ""

        original_len = len(text)

        # 1. Collapse every whitespace run (including newlines and tabs)
        #    into a single space.
        text = ' '.join(text.split())

        # 2. Remove emoji pictographs.
        text = self._remove_emojis(text)

        # 3. Normalize curly quotes to ASCII quotes (they are kept, not removed).
        text = text.translate(self._QUOTE_TABLE)

        # 4. Drop characters in any Unicode "C" (control/format/other)
        #    category. Spaces are category Zs, so they survive this filter.
        text = ''.join(ch for ch in text if unicodedata.category(ch)[0] != 'C')

        # 5. Removals above can leave doubled spaces; collapse them again.
        text = re.sub(r'\s+', ' ', text)

        # 6. Truncate very long reviews, reserving three characters for "...".
        if len(text) > self.MAX_REVIEW_LENGTH:
            text = text[:self.MAX_REVIEW_LENGTH - 3] + "..."

        # 7. Strip leading/trailing whitespace.
        text = text.strip()

        # Track stats
        self.stats['chars_original'] += original_len
        self.stats['chars_cleaned'] += len(text)

        return text

    def _remove_emojis(self, text: str) -> str:
        """
        Remove emoji pictographs (and orphaned variation selectors) from
        ``text``; all other Unicode characters are left untouched.
        """
        return self._EMOJI_PATTERN.sub('', text)

    def clean_reviews(self, reviews: List[str]) -> List[str]:
        """
        Clean a list of reviews.

        Resets ``self.stats``, cleans each review, and drops any review
        that is empty after cleaning or shorter than ``MIN_REVIEW_LENGTH``
        characters.

        Args:
            reviews: Raw review texts (entries may be None or non-strings).

        Returns:
            The cleaned reviews that were kept, in their original order.
        """
        self.stats = self._fresh_stats(total=len(reviews))

        cleaned = []
        for i, review in enumerate(reviews):
            cleaned_text = self.clean_review(review)

            if not cleaned_text:
                self.stats['removed_empty'] += 1
                if self.verbose:
                    print(f" ⚠️ Review {i} was empty/None, skipping")
                continue

            if len(cleaned_text) < self.MIN_REVIEW_LENGTH:
                self.stats['removed_short'] += 1
                if self.verbose:
                    print(f" ⚠️ Review {i} too short ({len(cleaned_text)} chars): '{cleaned_text[:50]}'")
                continue

            cleaned.append(cleaned_text)
            self.stats['kept'] += 1

        return cleaned

    def get_cleaning_stats(self) -> dict:
        """
        Get statistics about the most recent clean_reviews() run.

        Returns:
            Dict with counts of original/kept/removed reviews, character
            totals, and ``retention_rate`` as a percentage rounded to one
            decimal place (0.0 when no reviews were supplied).
        """
        return {
            "original_count": self.stats['total'],
            "cleaned_count": self.stats['kept'],
            "removed_empty": self.stats['removed_empty'],
            "removed_short": self.stats['removed_short'],
            "original_chars": self.stats['chars_original'],
            "cleaned_chars": self.stats['chars_cleaned'],
            # max(..., 1) avoids division by zero for an empty input list.
            "retention_rate": round(self.stats['kept'] / max(self.stats['total'], 1) * 100, 1)
        }
|
| 150 |
|
| 151 |
|
|
|
|
| 153 |
"""
|
| 154 |
Convenience function to clean reviews.
|
| 155 |
|
| 156 |
+
FIXED: Better stats reporting, less aggressive cleaning.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
"""
|
| 158 |
+
cleaner = ReviewCleaner(verbose=False) # Don't spam individual messages
|
| 159 |
cleaned = cleaner.clean_reviews(reviews)
|
| 160 |
|
| 161 |
if verbose:
|
| 162 |
+
stats = cleaner.get_cleaning_stats()
|
| 163 |
print(f"🧹 Cleaned {stats['original_count']} reviews:")
|
| 164 |
+
print(f" ✅ Kept: {stats['cleaned_count']} ({stats['retention_rate']}%)")
|
| 165 |
+
if stats['removed_empty'] > 0:
|
| 166 |
+
print(f" ❌ Empty: {stats['removed_empty']}")
|
| 167 |
+
if stats['removed_short'] > 0:
|
| 168 |
+
print(f" ❌ Too short: {stats['removed_short']}")
|
| 169 |
+
|
| 170 |
+
# Warn if we're losing too many reviews
|
| 171 |
+
if stats['retention_rate'] < 50:
|
| 172 |
+
print(f" ⚠️ WARNING: Only {stats['retention_rate']}% retention! Check scraper.")
|
| 173 |
|
| 174 |
return cleaned
|
| 175 |
|
| 176 |
|
| 177 |
+
# Also add a debug function
|
| 178 |
+
def analyze_review_loss(reviews: List[str], min_length: int = 10) -> None:
    """
    Debug function to understand why reviews are being lost.

    Classifies each raw review as empty (falsy or not a string), short
    (stripped length below ``min_length``) or valid, prints up to three
    samples of each problematic kind, and then prints a summary.

    Args:
        reviews: Raw review texts (entries may be None or non-strings).
        min_length: Stripped length below which a review counts as short.

    Returns:
        None. All output goes to stdout.
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print("REVIEW LOSS ANALYSIS")
    print(f"{rule}\n")

    empty_count = 0
    short_count = 0
    valid_count = 0

    print("Sample of problematic reviews:\n")

    for i, review in enumerate(reviews):
        if not review or not isinstance(review, str):
            empty_count += 1
            if empty_count <= 3:
                print(f" [{i}] EMPTY: {repr(review)}")
        elif len(review.strip()) < min_length:
            short_count += 1
            if short_count <= 3:
                print(f" [{i}] SHORT ({len(review)} chars): '{review[:50]}'")
        else:
            valid_count += 1

    total = len(reviews)
    # Guard against an empty input list, which previously raised
    # ZeroDivisionError when computing the percentage.
    valid_pct = valid_count / total * 100 if total else 0.0

    print(f"\n{rule}")
    print(f"SUMMARY:")
    print(f" Total: {total}")
    print(f" Valid: {valid_count} ({valid_pct:.1f}%)")
    print(f" Empty: {empty_count}")
    print(f" Short: {short_count}")
    print(f"{rule}\n")
|
| 211 |
+
|
| 212 |
+
|
| 213 |
if __name__ == "__main__":
|
| 214 |
# Test the cleaner
|
| 215 |
test_reviews = [
|
| 216 |
+
'This place is "amazing"! 😍😍😍 The food was incredible.',
|
| 217 |
+
"The food was great\n\nbut service was slow. Would come back!",
|
| 218 |
+
'Chef said "it\'s the best" and I agree! Great experience.',
|
| 219 |
+
"🍕🍝🍷 Loved everything!!! Best Italian in town.",
|
| 220 |
+
"", # Empty
|
| 221 |
+
"Good", # Too short
|
| 222 |
+
" ", # Just whitespace
|
| 223 |
+
None, # None
|
| 224 |
+
"The pasta was perfectly cooked, al dente just how I like it.",
|
| 225 |
]
|
| 226 |
|
| 227 |
+
print("Testing review cleaner...\n")
|
| 228 |
+
|
| 229 |
+
# First analyze
|
| 230 |
+
analyze_review_loss(test_reviews)
|
| 231 |
+
|
| 232 |
+
# Then clean
|
| 233 |
+
cleaned = clean_reviews_for_ai(test_reviews, verbose=True)
|
| 234 |
+
|
| 235 |
+
print(f"\nCleaned reviews ({len(cleaned)}):")
|
| 236 |
+
for i, review in enumerate(cleaned):
|
| 237 |
+
print(f" {i+1}. {review[:60]}...")
|
src/scrapers/opentable_scraper.py
CHANGED
|
@@ -1,33 +1,42 @@
|
|
| 1 |
"""
|
| 2 |
-
OpenTable
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
import time
|
| 7 |
-
from typing import
|
| 8 |
from selenium import webdriver
|
| 9 |
-
from selenium.
|
| 10 |
-
|
| 11 |
-
StaleElementReferenceException,
|
| 12 |
-
TimeoutException
|
| 13 |
-
)
|
| 14 |
from selenium.webdriver.common.by import By
|
| 15 |
from selenium.webdriver.support.ui import WebDriverWait
|
| 16 |
from selenium.webdriver.support import expected_conditions as EC
|
| 17 |
-
from selenium.
|
| 18 |
-
from selenium.webdriver.chrome.service import Service
|
| 19 |
|
| 20 |
|
| 21 |
class OpenTableScraper:
|
| 22 |
-
"""
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
#
|
| 25 |
SELECTORS = {
|
| 26 |
"review_cards": [
|
|
|
|
| 27 |
"//li[@data-test='reviews-list-item']",
|
| 28 |
-
|
| 29 |
-
"//section[
|
| 30 |
-
|
|
|
|
| 31 |
],
|
| 32 |
"next_button": [
|
| 33 |
"//a[@aria-label='Go to the next page']",
|
|
@@ -65,15 +74,16 @@ class OpenTableScraper:
|
|
| 65 |
".//li[contains(., 'Ambience')]//span"
|
| 66 |
],
|
| 67 |
"review_text": [
|
| 68 |
-
#
|
| 69 |
".//span[@data-test='wrapper-tag']",
|
| 70 |
".//div[@data-test='wrapper-tag']",
|
| 71 |
".//p[@data-test='review-text']",
|
| 72 |
-
|
| 73 |
-
".//div[contains(@class,'review')]
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
".//
|
|
|
|
| 77 |
]
|
| 78 |
}
|
| 79 |
|
|
@@ -82,6 +92,7 @@ class OpenTableScraper:
|
|
| 82 |
self.page_load_strategy = page_load_strategy
|
| 83 |
self.driver = None
|
| 84 |
self.wait = None
|
|
|
|
| 85 |
|
| 86 |
def scrape_reviews(
|
| 87 |
self,
|
|
@@ -89,8 +100,11 @@ class OpenTableScraper:
|
|
| 89 |
max_reviews: Optional[int] = None,
|
| 90 |
progress_callback: Optional[Callable[[str], None]] = None
|
| 91 |
) -> Dict[str, Any]:
|
| 92 |
-
"""
|
|
|
|
| 93 |
|
|
|
|
|
|
|
| 94 |
if not self._validate_url(url):
|
| 95 |
return {'success': False, 'error': 'Invalid OpenTable URL', 'reviews': []}
|
| 96 |
|
|
@@ -116,7 +130,8 @@ class OpenTableScraper:
|
|
| 116 |
reviews = []
|
| 117 |
|
| 118 |
page_count = 0
|
| 119 |
-
review_count = 0
|
|
|
|
| 120 |
|
| 121 |
while True:
|
| 122 |
page_count += 1
|
|
@@ -132,32 +147,44 @@ class OpenTableScraper:
|
|
| 132 |
self._log_progress("⚠️ No reviews found on page.", progress_callback)
|
| 133 |
if page_count == 1:
|
| 134 |
# Save page source for debugging
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
| 138 |
break
|
| 139 |
|
| 140 |
-
self._log_progress(f"
|
| 141 |
|
| 142 |
# Extract data from each review
|
|
|
|
|
|
|
|
|
|
| 143 |
for idx, review in enumerate(review_elements):
|
| 144 |
if max_reviews and review_count >= max_reviews:
|
| 145 |
self._log_progress(f"🎯 Reached max reviews ({max_reviews}).", progress_callback)
|
| 146 |
break
|
| 147 |
|
| 148 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
name = self._extract_text_with_fallback(review, self.SELECTORS["name"])
|
| 150 |
date = self._extract_text_with_fallback(review, self.SELECTORS["date"])
|
| 151 |
overall_rating = self._extract_text_with_fallback(review, self.SELECTORS["overall_rating"])
|
| 152 |
food_rating = self._extract_text_with_fallback(review, self.SELECTORS["food_rating"])
|
| 153 |
service_rating = self._extract_text_with_fallback(review, self.SELECTORS["service_rating"])
|
| 154 |
ambience_rating = self._extract_text_with_fallback(review, self.SELECTORS["ambience_rating"])
|
| 155 |
-
review_text = self._extract_text_with_fallback(review, self.SELECTORS["review_text"])
|
| 156 |
-
|
| 157 |
-
# Clean review text (remove date if it leaked in)
|
| 158 |
-
if review_text and "Dined on" in review_text:
|
| 159 |
-
review_text = ""
|
| 160 |
|
|
|
|
| 161 |
names.append(name)
|
| 162 |
dates.append(date)
|
| 163 |
overall_ratings.append(overall_rating)
|
|
@@ -167,14 +194,18 @@ class OpenTableScraper:
|
|
| 167 |
reviews.append(review_text)
|
| 168 |
|
| 169 |
review_count += 1
|
|
|
|
| 170 |
|
| 171 |
-
if review_count %
|
| 172 |
-
self._log_progress(f"📊 Extracted {review_count} reviews so far...", progress_callback)
|
| 173 |
|
| 174 |
except Exception as e:
|
| 175 |
self._log_progress(f"⚠️ Error on review {idx + 1}: {str(e)}", progress_callback)
|
| 176 |
continue
|
| 177 |
|
|
|
|
|
|
|
|
|
|
| 178 |
if max_reviews and review_count >= max_reviews:
|
| 179 |
break
|
| 180 |
|
|
@@ -185,64 +216,130 @@ class OpenTableScraper:
|
|
| 185 |
|
| 186 |
time.sleep(3) # Wait for new page to load
|
| 187 |
|
| 188 |
-
self._log_progress(f"✅ DONE! Scraped {review_count} reviews from {page_count} pages", progress_callback)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
|
| 190 |
return {
|
| 191 |
'success': True,
|
| 192 |
-
'total_reviews': review_count,
|
| 193 |
-
'
|
| 194 |
-
'
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
|
|
|
|
|
|
|
|
|
| 202 |
}
|
| 203 |
}
|
| 204 |
|
| 205 |
except Exception as e:
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
|
|
|
| 209 |
finally:
|
| 210 |
self._cleanup()
|
| 211 |
|
| 212 |
-
def
|
| 213 |
-
"""
|
| 214 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
|
| 216 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
try:
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
|
| 228 |
-
#
|
| 229 |
aria_disabled = (btn.get_attribute("aria-disabled") or "").lower()
|
| 230 |
if aria_disabled in ("true", "1"):
|
| 231 |
return False
|
| 232 |
|
| 233 |
-
#
|
| 234 |
try:
|
| 235 |
self.driver.execute_script("arguments[0].scrollIntoView({block:'center'});", btn)
|
| 236 |
time.sleep(0.15)
|
| 237 |
-
except
|
| 238 |
pass
|
| 239 |
|
| 240 |
-
# Try
|
| 241 |
try:
|
| 242 |
WebDriverWait(self.driver, 5).until(EC.element_to_be_clickable((By.XPATH, xp)))
|
| 243 |
btn.click()
|
| 244 |
-
except
|
| 245 |
-
# JS fallback (needed in headless mode)
|
| 246 |
self.driver.execute_script("arguments[0].click();", btn)
|
| 247 |
|
| 248 |
return True
|
|
@@ -250,55 +347,59 @@ class OpenTableScraper:
|
|
| 250 |
except TimeoutException:
|
| 251 |
continue
|
| 252 |
except StaleElementReferenceException:
|
| 253 |
-
# Re-find once
|
| 254 |
try:
|
| 255 |
btn = self.driver.find_element(By.XPATH, xp)
|
| 256 |
self.driver.execute_script("arguments[0].scrollIntoView({block:'center'});", btn)
|
| 257 |
self.driver.execute_script("arguments[0].click();", btn)
|
| 258 |
return True
|
| 259 |
-
except
|
| 260 |
continue
|
| 261 |
-
except
|
| 262 |
continue
|
| 263 |
|
| 264 |
return False
|
| 265 |
|
| 266 |
-
def
|
| 267 |
-
"""
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
|
| 300 |
def _init_driver(self):
|
| 301 |
-
"""Initialize Chrome WebDriver
|
| 302 |
chrome_options = Options()
|
| 303 |
chrome_options.page_load_strategy = self.page_load_strategy
|
| 304 |
|
|
@@ -308,18 +409,13 @@ class OpenTableScraper:
|
|
| 308 |
chrome_options.add_argument('--disable-dev-shm-usage')
|
| 309 |
chrome_options.add_argument('--disable-gpu')
|
| 310 |
|
| 311 |
-
# Realistic user agent to avoid bot detection
|
| 312 |
chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36')
|
| 313 |
-
|
| 314 |
-
# Additional anti-detection measures
|
| 315 |
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
|
| 316 |
chrome_options.add_experimental_option('useAutomationExtension', False)
|
| 317 |
|
| 318 |
service = Service('/usr/local/bin/chromedriver')
|
| 319 |
self.driver = webdriver.Chrome(service=service, options=chrome_options)
|
| 320 |
self.driver.set_page_load_timeout(30)
|
| 321 |
-
|
| 322 |
-
# Initialize WebDriverWait
|
| 323 |
self.wait = WebDriverWait(self.driver, 10)
|
| 324 |
|
| 325 |
def _cleanup(self):
|
|
@@ -336,7 +432,7 @@ class OpenTableScraper:
|
|
| 336 |
return 'opentable.c' in url.lower()
|
| 337 |
|
| 338 |
def _log_progress(self, message: str, callback: Optional[Callable]):
|
| 339 |
-
"""Log progress
|
| 340 |
print(message)
|
| 341 |
if callback:
|
| 342 |
callback(message)
|
|
@@ -347,49 +443,23 @@ class OpenTableScraper:
|
|
| 347 |
|
| 348 |
def scrape_opentable(url: str, max_reviews: Optional[int] = None, headless: bool = True) -> Dict[str, Any]:
|
| 349 |
"""
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
Args:
|
| 353 |
-
url: OpenTable restaurant URL
|
| 354 |
-
max_reviews: Maximum number of reviews to scrape (None = all)
|
| 355 |
-
headless: Run browser in headless mode
|
| 356 |
|
| 357 |
-
|
| 358 |
-
Dict with 'success', 'total_reviews', 'total_pages', and 'reviews' data
|
| 359 |
"""
|
| 360 |
scraper = OpenTableScraper(headless=headless)
|
| 361 |
-
return scraper.scrape_reviews(url, max_reviews
|
| 362 |
|
| 363 |
|
| 364 |
if __name__ == "__main__":
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
test_url = "https://www.opentable.ca/r/miku-restaurant-vancouver"
|
| 370 |
|
| 371 |
-
print(f"
|
| 372 |
-
print("
|
| 373 |
-
print("
|
| 374 |
-
|
| 375 |
-
result
|
| 376 |
-
|
| 377 |
-
print("
|
| 378 |
-
if result['success']:
|
| 379 |
-
print("✅ SUCCESS!")
|
| 380 |
-
print(f" 📊 Total reviews scraped: {result['total_reviews']}")
|
| 381 |
-
print(f" 📄 Total pages visited: {result['total_pages']}")
|
| 382 |
-
|
| 383 |
-
if result['total_reviews'] > 0:
|
| 384 |
-
print(f"\n 🔍 Sample (first review):")
|
| 385 |
-
print(f" 👤 Name: {result['reviews']['names'][0]}")
|
| 386 |
-
print(f" 📅 Date: {result['reviews']['dates'][0]}")
|
| 387 |
-
print(f" ⭐ Overall: {result['reviews']['overall_ratings'][0]}")
|
| 388 |
-
print(f" 🍜 Food: {result['reviews']['food_ratings'][0]}")
|
| 389 |
-
print(f" 💁 Service: {result['reviews']['service_ratings'][0]}")
|
| 390 |
-
print(f" 🏮 Ambience: {result['reviews']['ambience_ratings'][0]}")
|
| 391 |
-
print(f" 💬 Review: {result['reviews']['review_texts'][0][:150]}...")
|
| 392 |
-
else:
|
| 393 |
-
print("❌ FAILED")
|
| 394 |
-
print(f" Error: {result.get('error', 'Unknown error')}")
|
| 395 |
-
print("=" * 80)
|
|
|
|
| 1 |
"""
|
| 2 |
+
OpenTable Scraper - FIXED VERSION
|
| 3 |
+
Production-ready scraper that doesn't lose reviews.
|
| 4 |
+
|
| 5 |
+
FIXES:
|
| 6 |
+
1. Only counts reviews that have actual text
|
| 7 |
+
2. Better selector specificity
|
| 8 |
+
3. Logs empty vs real reviews for debugging
|
| 9 |
+
4. Continues even if individual reviews fail
|
| 10 |
+
|
| 11 |
+
Author: Tushar Pingle
|
| 12 |
+
Updated: Nov 2024
|
| 13 |
"""
|
| 14 |
|
| 15 |
import time
|
| 16 |
+
from typing import Dict, Any, List, Optional, Callable
|
| 17 |
from selenium import webdriver
|
| 18 |
+
from selenium.webdriver.chrome.options import Options
|
| 19 |
+
from selenium.webdriver.chrome.service import Service
|
|
|
|
|
|
|
|
|
|
| 20 |
from selenium.webdriver.common.by import By
|
| 21 |
from selenium.webdriver.support.ui import WebDriverWait
|
| 22 |
from selenium.webdriver.support import expected_conditions as EC
|
| 23 |
+
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
class OpenTableScraper:
|
| 27 |
+
"""
|
| 28 |
+
Production OpenTable scraper with improved review extraction.
|
| 29 |
+
"""
|
| 30 |
|
| 31 |
+
# Updated selectors - more specific for actual review cards
|
| 32 |
SELECTORS = {
|
| 33 |
"review_cards": [
|
| 34 |
+
# Most specific first - only match list items that contain actual review content
|
| 35 |
"//li[@data-test='reviews-list-item']",
|
| 36 |
+
# Fallback: items in reviews section that have both date AND substantial text
|
| 37 |
+
"//section[@id='reviews']//li[contains(., 'Dined') and .//span[string-length(normalize-space()) > 30]]",
|
| 38 |
+
# Generic fallback
|
| 39 |
+
"//section[.//h2[contains(., 'people are saying') or contains(., 'Reviews')]]//li[.//p[string-length(normalize-space()) > 30] or .//span[string-length(normalize-space()) > 30]]",
|
| 40 |
],
|
| 41 |
"next_button": [
|
| 42 |
"//a[@aria-label='Go to the next page']",
|
|
|
|
| 74 |
".//li[contains(., 'Ambience')]//span"
|
| 75 |
],
|
| 76 |
"review_text": [
|
| 77 |
+
# Priority order - most specific first
|
| 78 |
".//span[@data-test='wrapper-tag']",
|
| 79 |
".//div[@data-test='wrapper-tag']",
|
| 80 |
".//p[@data-test='review-text']",
|
| 81 |
+
# Get text content from review body
|
| 82 |
+
".//div[contains(@class,'review')]//p[string-length(normalize-space()) > 20]",
|
| 83 |
+
".//div[contains(@class,'review')]//span[string-length(normalize-space()) > 20]",
|
| 84 |
+
# Fallback: any paragraph/span with substantial text that's not date/rating
|
| 85 |
+
".//p[not(contains(., 'Dined')) and not(contains(., 'Overall')) and not(contains(., 'Food')) and not(contains(., 'Service')) and not(contains(., 'Ambience')) and string-length(normalize-space()) > 20]",
|
| 86 |
+
".//span[not(contains(., 'Dined')) and not(ancestor::li[contains(., 'Overall')]) and string-length(normalize-space()) > 20]",
|
| 87 |
]
|
| 88 |
}
|
| 89 |
|
|
|
|
| 92 |
self.page_load_strategy = page_load_strategy
|
| 93 |
self.driver = None
|
| 94 |
self.wait = None
|
| 95 |
+
self.empty_count = 0 # Track empty reviews for debugging
|
| 96 |
|
| 97 |
def scrape_reviews(
|
| 98 |
self,
|
|
|
|
| 100 |
max_reviews: Optional[int] = None,
|
| 101 |
progress_callback: Optional[Callable[[str], None]] = None
|
| 102 |
) -> Dict[str, Any]:
|
| 103 |
+
"""
|
| 104 |
+
Scrape reviews from OpenTable restaurant page.
|
| 105 |
|
| 106 |
+
FIXED: Only counts and returns reviews that have actual text content.
|
| 107 |
+
"""
|
| 108 |
if not self._validate_url(url):
|
| 109 |
return {'success': False, 'error': 'Invalid OpenTable URL', 'reviews': []}
|
| 110 |
|
|
|
|
| 130 |
reviews = []
|
| 131 |
|
| 132 |
page_count = 0
|
| 133 |
+
review_count = 0 # Only counts VALID reviews with text
|
| 134 |
+
self.empty_count = 0 # Track skipped empty reviews
|
| 135 |
|
| 136 |
while True:
|
| 137 |
page_count += 1
|
|
|
|
| 147 |
self._log_progress("⚠️ No reviews found on page.", progress_callback)
|
| 148 |
if page_count == 1:
|
| 149 |
# Save page source for debugging
|
| 150 |
+
try:
|
| 151 |
+
with open('debug_page_source.html', 'w', encoding='utf-8') as f:
|
| 152 |
+
f.write(self.driver.page_source)
|
| 153 |
+
self._log_progress("💾 Saved page source to debug_page_source.html", progress_callback)
|
| 154 |
+
except:
|
| 155 |
+
pass
|
| 156 |
break
|
| 157 |
|
| 158 |
+
self._log_progress(f"📋 Found {len(review_elements)} review cards on page", progress_callback)
|
| 159 |
|
| 160 |
# Extract data from each review
|
| 161 |
+
page_valid = 0
|
| 162 |
+
page_empty = 0
|
| 163 |
+
|
| 164 |
for idx, review in enumerate(review_elements):
|
| 165 |
if max_reviews and review_count >= max_reviews:
|
| 166 |
self._log_progress(f"🎯 Reached max reviews ({max_reviews}).", progress_callback)
|
| 167 |
break
|
| 168 |
|
| 169 |
try:
|
| 170 |
+
# Extract review text FIRST - this is the critical field
|
| 171 |
+
review_text = self._extract_review_text(review)
|
| 172 |
+
|
| 173 |
+
# FIXED: Skip reviews without actual text content
|
| 174 |
+
if not review_text or len(review_text.strip()) < 10:
|
| 175 |
+
page_empty += 1
|
| 176 |
+
self.empty_count += 1
|
| 177 |
+
continue # Don't append, don't count
|
| 178 |
+
|
| 179 |
+
# Now extract other fields
|
| 180 |
name = self._extract_text_with_fallback(review, self.SELECTORS["name"])
|
| 181 |
date = self._extract_text_with_fallback(review, self.SELECTORS["date"])
|
| 182 |
overall_rating = self._extract_text_with_fallback(review, self.SELECTORS["overall_rating"])
|
| 183 |
food_rating = self._extract_text_with_fallback(review, self.SELECTORS["food_rating"])
|
| 184 |
service_rating = self._extract_text_with_fallback(review, self.SELECTORS["service_rating"])
|
| 185 |
ambience_rating = self._extract_text_with_fallback(review, self.SELECTORS["ambience_rating"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
+
# Append valid review
|
| 188 |
names.append(name)
|
| 189 |
dates.append(date)
|
| 190 |
overall_ratings.append(overall_rating)
|
|
|
|
| 194 |
reviews.append(review_text)
|
| 195 |
|
| 196 |
review_count += 1
|
| 197 |
+
page_valid += 1
|
| 198 |
|
| 199 |
+
if review_count % 50 == 0:
|
| 200 |
+
self._log_progress(f"📊 Extracted {review_count} valid reviews so far...", progress_callback)
|
| 201 |
|
| 202 |
except Exception as e:
|
| 203 |
self._log_progress(f"⚠️ Error on review {idx + 1}: {str(e)}", progress_callback)
|
| 204 |
continue
|
| 205 |
|
| 206 |
+
# Log page summary
|
| 207 |
+
self._log_progress(f" ✅ Page {page_count}: {page_valid} valid, {page_empty} empty", progress_callback)
|
| 208 |
+
|
| 209 |
if max_reviews and review_count >= max_reviews:
|
| 210 |
break
|
| 211 |
|
|
|
|
| 216 |
|
| 217 |
time.sleep(3) # Wait for new page to load
|
| 218 |
|
| 219 |
+
self._log_progress(f"✅ DONE! Scraped {review_count} valid reviews from {page_count} pages", progress_callback)
|
| 220 |
+
if self.empty_count > 0:
|
| 221 |
+
self._log_progress(f" ℹ️ Skipped {self.empty_count} empty/invalid review cards", progress_callback)
|
| 222 |
+
|
| 223 |
+
# Extract restaurant metadata
|
| 224 |
+
metadata = self._extract_metadata()
|
| 225 |
|
| 226 |
return {
|
| 227 |
'success': True,
|
| 228 |
+
'total_reviews': review_count, # Now correctly represents VALID reviews
|
| 229 |
+
'names': names,
|
| 230 |
+
'dates': dates,
|
| 231 |
+
'overall_ratings': overall_ratings,
|
| 232 |
+
'food_ratings': food_ratings,
|
| 233 |
+
'service_ratings': service_ratings,
|
| 234 |
+
'ambience_ratings': ambience_ratings,
|
| 235 |
+
'reviews': reviews,
|
| 236 |
+
'metadata': metadata,
|
| 237 |
+
'stats': {
|
| 238 |
+
'pages_scraped': page_count,
|
| 239 |
+
'valid_reviews': review_count,
|
| 240 |
+
'empty_skipped': self.empty_count
|
| 241 |
}
|
| 242 |
}
|
| 243 |
|
| 244 |
except Exception as e:
|
| 245 |
+
import traceback
|
| 246 |
+
error_msg = f"Scraping error: {str(e)}\n{traceback.format_exc()}"
|
| 247 |
+
self._log_progress(f"❌ {error_msg}", progress_callback)
|
| 248 |
+
return {'success': False, 'error': error_msg, 'reviews': []}
|
| 249 |
finally:
|
| 250 |
self._cleanup()
|
| 251 |
|
| 252 |
+
def _extract_review_text(self, review_element) -> str:
|
| 253 |
+
"""
|
| 254 |
+
Extract review text with multiple fallback strategies.
|
| 255 |
+
Returns empty string if no valid text found.
|
| 256 |
+
"""
|
| 257 |
+
# Try each selector
|
| 258 |
+
for selector in self.SELECTORS["review_text"]:
|
| 259 |
+
try:
|
| 260 |
+
elements = review_element.find_elements(By.XPATH, selector)
|
| 261 |
+
for elem in elements:
|
| 262 |
+
text = elem.text.strip()
|
| 263 |
+
# Validate it's actual review content
|
| 264 |
+
if text and len(text) > 20:
|
| 265 |
+
# Filter out dates and ratings that might have leaked
|
| 266 |
+
if "Dined on" in text or text.startswith("Overall") or text.startswith("Food"):
|
| 267 |
+
continue
|
| 268 |
+
# Filter out very short generic text
|
| 269 |
+
if text in ["See more", "Read more", "Show more"]:
|
| 270 |
+
continue
|
| 271 |
+
return text
|
| 272 |
+
except:
|
| 273 |
+
continue
|
| 274 |
|
| 275 |
+
# Last resort: try to get all text from the review card and extract the main content
|
| 276 |
+
try:
|
| 277 |
+
full_text = review_element.text
|
| 278 |
+
# Split by newlines and find the longest substantial text
|
| 279 |
+
lines = [line.strip() for line in full_text.split('\n') if line.strip()]
|
| 280 |
+
# Filter out dates, ratings, names
|
| 281 |
+
content_lines = []
|
| 282 |
+
for line in lines:
|
| 283 |
+
if len(line) > 30: # Substantial text
|
| 284 |
+
if not any(skip in line for skip in ['Dined on', 'Overall', 'Food', 'Service', 'Ambience', 'VIP']):
|
| 285 |
+
content_lines.append(line)
|
| 286 |
+
|
| 287 |
+
if content_lines:
|
| 288 |
+
# Return the longest line as the review
|
| 289 |
+
return max(content_lines, key=len)
|
| 290 |
+
except:
|
| 291 |
+
pass
|
| 292 |
+
|
| 293 |
+
return ""
|
| 294 |
+
|
| 295 |
+
def _extract_text_with_fallback(self, parent_element, selectors: List[str]) -> str:
|
| 296 |
+
"""Extract text using fallback XPath selectors."""
|
| 297 |
+
for selector in selectors:
|
| 298 |
try:
|
| 299 |
+
element = parent_element.find_element(By.XPATH, selector)
|
| 300 |
+
text = element.text.strip()
|
| 301 |
+
if text:
|
| 302 |
+
return text
|
| 303 |
+
except:
|
| 304 |
+
continue
|
| 305 |
+
return ""
|
| 306 |
+
|
| 307 |
+
def _find_elements_with_fallback(self, selectors: List[str], by: By) -> List:
|
| 308 |
+
"""Try multiple selectors until one works."""
|
| 309 |
+
for selector in selectors:
|
| 310 |
+
try:
|
| 311 |
+
elements = self.driver.find_elements(by, selector)
|
| 312 |
+
if elements:
|
| 313 |
+
return elements
|
| 314 |
+
except:
|
| 315 |
+
continue
|
| 316 |
+
return []
|
| 317 |
+
|
| 318 |
+
def _click_next(self) -> bool:
|
| 319 |
+
"""Click the next page button."""
|
| 320 |
+
for xp in self.SELECTORS["next_button"]:
|
| 321 |
+
try:
|
| 322 |
+
btn = WebDriverWait(self.driver, 3).until(
|
| 323 |
+
EC.presence_of_element_located((By.XPATH, xp))
|
| 324 |
+
)
|
| 325 |
|
| 326 |
+
# Check if disabled
|
| 327 |
aria_disabled = (btn.get_attribute("aria-disabled") or "").lower()
|
| 328 |
if aria_disabled in ("true", "1"):
|
| 329 |
return False
|
| 330 |
|
| 331 |
+
# Scroll into view
|
| 332 |
try:
|
| 333 |
self.driver.execute_script("arguments[0].scrollIntoView({block:'center'});", btn)
|
| 334 |
time.sleep(0.15)
|
| 335 |
+
except:
|
| 336 |
pass
|
| 337 |
|
| 338 |
+
# Try clicking
|
| 339 |
try:
|
| 340 |
WebDriverWait(self.driver, 5).until(EC.element_to_be_clickable((By.XPATH, xp)))
|
| 341 |
btn.click()
|
| 342 |
+
except:
|
|
|
|
| 343 |
self.driver.execute_script("arguments[0].click();", btn)
|
| 344 |
|
| 345 |
return True
|
|
|
|
| 347 |
except TimeoutException:
|
| 348 |
continue
|
| 349 |
except StaleElementReferenceException:
|
|
|
|
| 350 |
try:
|
| 351 |
btn = self.driver.find_element(By.XPATH, xp)
|
| 352 |
self.driver.execute_script("arguments[0].scrollIntoView({block:'center'});", btn)
|
| 353 |
self.driver.execute_script("arguments[0].click();", btn)
|
| 354 |
return True
|
| 355 |
+
except:
|
| 356 |
continue
|
| 357 |
+
except:
|
| 358 |
continue
|
| 359 |
|
| 360 |
return False
|
| 361 |
|
| 362 |
+
def _extract_metadata(self) -> Dict[str, Any]:
|
| 363 |
+
"""Extract restaurant metadata from page."""
|
| 364 |
+
metadata = {}
|
| 365 |
+
try:
|
| 366 |
+
# Restaurant name
|
| 367 |
+
name_selectors = [
|
| 368 |
+
"//h1",
|
| 369 |
+
"//h1[@data-test='restaurant-name']",
|
| 370 |
+
"//div[contains(@class,'restaurant-name')]//h1"
|
| 371 |
+
]
|
| 372 |
+
for sel in name_selectors:
|
| 373 |
+
try:
|
| 374 |
+
elem = self.driver.find_element(By.XPATH, sel)
|
| 375 |
+
if elem.text.strip():
|
| 376 |
+
metadata['restaurant_name'] = elem.text.strip()
|
| 377 |
+
break
|
| 378 |
+
except:
|
| 379 |
+
continue
|
| 380 |
+
|
| 381 |
+
# Cuisine type
|
| 382 |
+
cuisine_selectors = [
|
| 383 |
+
"//span[contains(@class,'cuisine')]",
|
| 384 |
+
"//p[contains(@class,'cuisine')]",
|
| 385 |
+
"//div[contains(@class,'cuisine')]"
|
| 386 |
+
]
|
| 387 |
+
for sel in cuisine_selectors:
|
| 388 |
+
try:
|
| 389 |
+
elem = self.driver.find_element(By.XPATH, sel)
|
| 390 |
+
if elem.text.strip():
|
| 391 |
+
metadata['cuisine'] = elem.text.strip()
|
| 392 |
+
break
|
| 393 |
+
except:
|
| 394 |
+
continue
|
| 395 |
+
|
| 396 |
+
except:
|
| 397 |
+
pass
|
| 398 |
+
|
| 399 |
+
return metadata
|
| 400 |
|
| 401 |
def _init_driver(self):
|
| 402 |
+
"""Initialize Chrome WebDriver."""
|
| 403 |
chrome_options = Options()
|
| 404 |
chrome_options.page_load_strategy = self.page_load_strategy
|
| 405 |
|
|
|
|
| 409 |
chrome_options.add_argument('--disable-dev-shm-usage')
|
| 410 |
chrome_options.add_argument('--disable-gpu')
|
| 411 |
|
|
|
|
| 412 |
chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36')
|
|
|
|
|
|
|
| 413 |
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
|
| 414 |
chrome_options.add_experimental_option('useAutomationExtension', False)
|
| 415 |
|
| 416 |
service = Service('/usr/local/bin/chromedriver')
|
| 417 |
self.driver = webdriver.Chrome(service=service, options=chrome_options)
|
| 418 |
self.driver.set_page_load_timeout(30)
|
|
|
|
|
|
|
| 419 |
self.wait = WebDriverWait(self.driver, 10)
|
| 420 |
|
| 421 |
def _cleanup(self):
|
|
|
|
| 432 |
return 'opentable.c' in url.lower()
|
| 433 |
|
| 434 |
def _log_progress(self, message: str, callback: Optional[Callable]):
|
| 435 |
+
"""Log progress."""
|
| 436 |
print(message)
|
| 437 |
if callback:
|
| 438 |
callback(message)
|
|
|
|
| 443 |
|
| 444 |
def scrape_opentable(url: str, max_reviews: Optional[int] = None, headless: bool = True) -> Dict[str, Any]:
    """
    Convenience wrapper: build an OpenTableScraper and scrape one restaurant page.

    Args:
        url: OpenTable restaurant URL.
        max_reviews: Passed to ``scrape_reviews`` as the review limit
            (``None`` means no limit is requested).
        headless: Forwarded to the ``OpenTableScraper`` constructor.

    Returns:
        The result dict produced by ``OpenTableScraper.scrape_reviews``.
    """
    return OpenTableScraper(headless=headless).scrape_reviews(url, max_reviews)
|
| 452 |
|
| 453 |
|
| 454 |
if __name__ == "__main__":
    # Manual smoke test: scrape up to 50 reviews from a live listing and
    # print a short summary of the outcome.
    test_url = "https://www.opentable.ca/r/dockside-restaurant-vancouver-vancouver"
    result = scrape_opentable(test_url, max_reviews=50)

    report = [
        f"\n{'='*60}",
        "Results:",
        f" Success: {result.get('success')}",
        f" Total valid reviews: {result.get('total_reviews')}",
    ]
    # Stats are only present on a successful scrape.
    if result.get('stats'):
        report.append(f" Empty skipped: {result['stats'].get('empty_skipped', 0)}")
    report.append(f"{'='*60}")

    for line in report:
        print(line)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|