TushP committed on
Commit
df41fce
·
verified ·
1 Parent(s): 7dab48f

Upload folder using huggingface_hub

Browse files
modal_backend.py CHANGED
@@ -1,19 +1,20 @@
1
  """
2
- Modal Backend for Restaurant Intelligence Agent
3
- With TRUE MCP Server Integration
4
 
5
- UPDATED: Now sends slim trend_data instead of full raw_reviews
6
- - Reduces response size by ~97%
7
- - Pre-calculates sentiment in backend
8
- - Fixes HuggingFace timeout issues
 
9
 
10
- Deploys:
11
- 1. Analysis API endpoint
12
- 2. MCP Server endpoint
13
  """
14
 
15
  import modal
16
  from typing import Dict, Any, List
 
 
17
 
18
  # Create Modal app
19
  app = modal.App("restaurant-intelligence")
@@ -33,14 +34,14 @@ image = (
33
  "matplotlib",
34
  "fastapi[standard]",
35
  "httpx",
36
- "fastmcp", # Required by src/mcp_integrations modules
37
  )
38
  .add_local_python_source("src")
39
  )
40
 
41
 
42
  # ============================================================================
43
- # HELPER FUNCTION - Calculate sentiment from text
44
  # ============================================================================
45
 
46
  def calculate_sentiment(text: str) -> float:
@@ -65,186 +66,277 @@ def calculate_sentiment(text: str) -> float:
65
 
66
 
67
  # ============================================================================
68
- # MCP SERVER (TRUE MCP INTEGRATION)
69
  # ============================================================================
70
 
71
- # In-memory storage for MCP
72
- REVIEW_INDEX: Dict[str, List[str]] = {}
73
- ANALYSIS_CACHE: Dict[str, Dict[str, Any]] = {}
74
-
75
-
76
- @app.function(image=image, timeout=300)
77
- @modal.asgi_app()
78
- def mcp_server():
79
- """
80
- TRUE MCP Server - exposes tools via MCP protocol over HTTP.
81
  """
82
- from fastapi import FastAPI, HTTPException
83
- from pydantic import BaseModel
84
- from datetime import datetime
85
 
86
- mcp_api = FastAPI(title="Restaurant Intelligence MCP Server")
 
 
 
 
87
 
88
- class ToolRequest(BaseModel):
89
- tool_name: str
90
- arguments: Dict[str, Any] = {}
 
91
 
92
- class IndexReviewsRequest(BaseModel):
93
- restaurant_name: str
94
- reviews: List[str]
95
 
96
- class QueryReviewsRequest(BaseModel):
97
- restaurant_name: str
98
- question: str
99
- top_k: int = 5
100
 
101
- # MCP Tools
102
- def index_reviews(restaurant_name: str, reviews: List[str]) -> Dict[str, Any]:
103
- REVIEW_INDEX[restaurant_name] = reviews
104
- return {
105
- "success": True,
106
- "restaurant": restaurant_name,
107
- "indexed_count": len(reviews),
108
- "message": f"Indexed {len(reviews)} reviews for {restaurant_name}"
109
- }
110
 
111
- def query_reviews(restaurant_name: str, question: str, top_k: int = 5) -> Dict[str, Any]:
112
- reviews = REVIEW_INDEX.get(restaurant_name, [])
113
- if not reviews:
114
- return {"success": False, "error": f"No reviews indexed for {restaurant_name}"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
- question_words = set(question.lower().split())
117
- scored = [(len(question_words & set(r.lower().split())), r) for r in reviews]
118
- scored.sort(reverse=True, key=lambda x: x[0])
119
 
120
- return {
121
- "success": True,
122
- "restaurant": restaurant_name,
123
- "question": question,
124
- "relevant_reviews": [r[1] for r in scored[:top_k]],
125
- "review_count": min(top_k, len(reviews))
126
- }
127
-
128
- def save_report(restaurant_name: str, report_data: Dict, report_type: str = "analysis") -> Dict[str, Any]:
129
- report_id = f"{restaurant_name}_{report_type}_{datetime.now().isoformat()}"
130
- ANALYSIS_CACHE[report_id] = {"restaurant": restaurant_name, "type": report_type, "data": report_data}
131
- return {"success": True, "report_id": report_id}
132
-
133
- def list_tools() -> Dict[str, Any]:
134
- return {
135
- "success": True,
136
- "tools": [
137
- {"name": "index_reviews", "description": "Index reviews for RAG Q&A"},
138
- {"name": "query_reviews", "description": "Answer questions about reviews"},
139
- {"name": "save_report", "description": "Save analysis report"},
140
- ]
141
- }
142
-
143
- @mcp_api.get("/")
144
- async def root():
145
- return {"name": "Restaurant Intelligence MCP Server", "protocol": "MCP", "version": "1.0"}
146
-
147
- @mcp_api.get("/health")
148
- async def health():
149
- return {"status": "healthy", "mcp": "enabled"}
150
-
151
- @mcp_api.get("/tools")
152
- async def get_tools():
153
- return list_tools()
154
-
155
- @mcp_api.post("/mcp/call")
156
- async def call_tool(request: ToolRequest):
157
- """TRUE MCP interface - agent calls tools via this endpoint."""
158
- tool_map = {
159
- "index_reviews": lambda args: index_reviews(args["restaurant_name"], args["reviews"]),
160
- "query_reviews": lambda args: query_reviews(args["restaurant_name"], args["question"], args.get("top_k", 5)),
161
- "save_report": lambda args: save_report(args["restaurant_name"], args["report_data"], args.get("report_type", "analysis")),
162
- "list_tools": lambda args: list_tools()
163
- }
164
 
165
- if request.tool_name not in tool_map:
166
- raise HTTPException(status_code=404, detail=f"Tool '{request.tool_name}' not found")
 
 
 
 
 
 
 
 
 
 
167
 
168
- try:
169
- result = tool_map[request.tool_name](request.arguments)
170
- return {"success": True, "tool": request.tool_name, "result": result}
171
- except Exception as e:
172
- raise HTTPException(status_code=500, detail=str(e))
173
-
174
- @mcp_api.post("/tools/index_reviews")
175
- async def api_index_reviews(request: IndexReviewsRequest):
176
- return index_reviews(request.restaurant_name, request.reviews)
177
-
178
- @mcp_api.post("/tools/query_reviews")
179
- async def api_query_reviews(request: QueryReviewsRequest):
180
- return query_reviews(request.restaurant_name, request.question, request.top_k)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
 
182
- return mcp_api
183
 
 
184
 
185
- # ============================================================================
186
- # MAIN ANALYSIS FUNCTIONS
187
- # ============================================================================
188
 
189
- @app.function(image=image)
190
- def hello() -> Dict[str, Any]:
191
- return {"status": "Modal is working!", "mcp": "enabled"}
 
192
 
 
 
193
 
194
- @app.function(image=image, timeout=600)
195
- def scrape_restaurant_modal(url: str, max_reviews: int = 100) -> Dict[str, Any]:
196
- """Scrape reviews from OpenTable or Google Maps."""
197
-
198
- # Detect platform
199
- url_lower = url.lower()
200
- if 'opentable' in url_lower:
201
- from src.scrapers.opentable_scraper import scrape_opentable
202
- result = scrape_opentable(url=url, max_reviews=max_reviews, headless=True)
203
- elif any(x in url_lower for x in ['google.com/maps', 'goo.gl/maps', 'maps.google', 'maps.app.goo.gl']):
204
- from src.scrapers.google_maps_scraper import scrape_google_maps
205
- result = scrape_google_maps(url=url, max_reviews=max_reviews, headless=True)
206
- else:
207
- return {"success": False, "error": "Unsupported platform. Use OpenTable or Google Maps."}
208
-
209
- if not result.get("success"):
210
- return {"success": False, "error": result.get("error")}
211
-
212
- from src.data_processing import process_reviews, clean_reviews_for_ai
213
-
214
- df = process_reviews(result)
215
- reviews = clean_reviews_for_ai(df["review_text"].tolist(), verbose=False)
216
-
217
- # Create SLIM trend_data (pre-calculate sentiment, no text!)
218
- trend_data = []
219
- for _, row in df.iterrows():
220
- text = str(row.get("review_text", ""))
221
- trend_data.append({
222
- "date": str(row.get("date", "")),
223
- "rating": float(row.get("overall_rating", 0) or 0),
224
- "sentiment": calculate_sentiment(text) # Pre-calculate!
225
- })
226
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  return {
228
- "success": True,
229
- "total_reviews": len(reviews),
230
- "reviews": reviews,
231
- "trend_data": trend_data, # Slim version, no text!
232
- "metadata": result.get("metadata", {}),
233
  }
234
 
235
 
 
 
 
 
236
  @app.function(
237
  image=image,
238
  secrets=[modal.Secret.from_name("anthropic-api-key")],
239
- timeout=2400,
240
  )
241
- def full_analysis_modal(url: str, max_reviews: int = 100) -> Dict[str, Any]:
242
  """
243
- Complete end-to-end analysis with MCP integration.
 
 
 
 
 
244
 
245
- UPDATED: Returns slim trend_data instead of full raw_reviews.
246
- This reduces response size by ~97% and fixes timeout issues.
247
  """
 
 
 
 
 
248
 
249
  # Detect platform
250
  url_lower = url.lower()
@@ -253,7 +345,10 @@ def full_analysis_modal(url: str, max_reviews: int = 100) -> Dict[str, Any]:
253
  if platform == "unknown":
254
  return {"success": False, "error": "Unsupported platform. Use OpenTable or Google Maps."}
255
 
256
- # Import scrapers
 
 
 
257
  if platform == "opentable":
258
  from src.scrapers.opentable_scraper import scrape_opentable
259
  result = scrape_opentable(url=url, max_reviews=max_reviews, headless=True)
@@ -262,52 +357,181 @@ def full_analysis_modal(url: str, max_reviews: int = 100) -> Dict[str, Any]:
262
  result = scrape_google_maps(url=url, max_reviews=max_reviews, headless=True)
263
 
264
  if not result.get("success"):
265
- return {"success": False, "error": result.get("error")}
266
 
 
 
 
267
  from src.data_processing import process_reviews, clean_reviews_for_ai
268
- from src.agent.base_agent import RestaurantAnalysisAgent
269
 
270
  df = process_reviews(result)
271
  reviews = clean_reviews_for_ai(df["review_text"].tolist(), verbose=False)
272
 
273
- # Create SLIM trend_data (pre-calculate sentiment in backend!)
274
- # This is ~97% smaller than sending full review text
 
275
  trend_data = []
276
  for _, row in df.iterrows():
277
  text = str(row.get("review_text", ""))
278
  trend_data.append({
279
  "date": str(row.get("date", "")),
280
  "rating": float(row.get("overall_rating", 0) or 0),
281
- "sentiment": calculate_sentiment(text) # Pre-calculated!
282
  })
283
 
284
- # Extract restaurant name from URL
285
  if platform == "opentable":
286
  restaurant_name = url.split("/")[-1].split("?")[0].replace("-", " ").title()
287
  else:
288
- # Google Maps
289
  if '/place/' in url:
290
  restaurant_name = url.split('/place/')[1].split('/')[0].replace('+', ' ').replace('%20', ' ')
291
  else:
292
  restaurant_name = "Restaurant"
293
 
294
- # Run analysis
295
- agent = RestaurantAnalysisAgent()
296
- analysis = agent.analyze_restaurant(
297
- restaurant_url=url,
298
- restaurant_name=restaurant_name,
299
- reviews=reviews,
300
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
301
 
302
- # Store in MCP cache for Q&A
303
- REVIEW_INDEX[restaurant_name] = reviews
304
 
305
- # Add slim trend_data (NOT full raw_reviews!)
306
- analysis['trend_data'] = trend_data
307
- analysis['source'] = platform
 
308
 
309
- # Log response size for debugging
310
- import json
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  response_size = len(json.dumps(analysis))
312
  print(f"[MODAL] Response size: {response_size / 1024:.1f} KB")
313
 
@@ -315,113 +539,61 @@ def full_analysis_modal(url: str, max_reviews: int = 100) -> Dict[str, Any]:
315
 
316
 
317
  # ============================================================================
318
- # FASTAPI APP (serves both analysis and MCP)
319
  # ============================================================================
320
 
321
  @app.function(
322
  image=image,
323
  secrets=[modal.Secret.from_name("anthropic-api-key")],
324
- timeout=2400,
325
  )
326
  @modal.asgi_app()
327
  def fastapi_app():
328
- """Main API with MCP integration."""
329
  from fastapi import FastAPI, HTTPException
330
  from pydantic import BaseModel
331
 
332
- web_app = FastAPI(title="Restaurant Intelligence API with MCP")
333
 
334
  class AnalyzeRequest(BaseModel):
335
  url: str
336
  max_reviews: int = 100
337
 
338
- class MCPCallRequest(BaseModel):
339
- tool_name: str
340
- arguments: Dict[str, Any] = {}
341
-
342
  @web_app.get("/")
343
  async def root():
344
  return {
345
  "name": "Restaurant Intelligence API",
346
- "version": "3.0",
347
- "mcp": "enabled",
348
- "optimizations": ["slim_trend_data", "pre_calculated_sentiment"],
349
- "endpoints": {
350
- "analyze": "/analyze",
351
- "mcp_tools": "/mcp/call",
352
- "mcp_list": "/mcp/tools"
353
- }
354
  }
355
 
356
  @web_app.get("/health")
357
  async def health():
358
- return {"status": "healthy", "mcp": "enabled"}
359
 
360
  @web_app.post("/analyze")
361
  async def analyze(request: AnalyzeRequest):
362
  try:
363
- result = full_analysis_modal.remote(url=request.url, max_reviews=request.max_reviews)
364
  return result
365
  except Exception as e:
 
 
366
  raise HTTPException(status_code=500, detail=str(e))
367
 
368
- # MCP Endpoints
369
- @web_app.get("/mcp/tools")
370
- async def mcp_list_tools():
371
- return {
372
- "tools": [
373
- {"name": "index_reviews", "description": "Index reviews for RAG Q&A"},
374
- {"name": "query_reviews", "description": "Answer questions about reviews"},
375
- {"name": "save_report", "description": "Save analysis report"},
376
- ]
377
- }
378
-
379
- @web_app.post("/mcp/call")
380
- async def mcp_call(request: MCPCallRequest):
381
- """TRUE MCP interface."""
382
- # For now, this delegates to local functions
383
- if request.tool_name == "index_reviews":
384
- args = request.arguments
385
- REVIEW_INDEX[args["restaurant_name"]] = args["reviews"]
386
- return {"success": True, "indexed": len(args["reviews"])}
387
-
388
- elif request.tool_name == "query_reviews":
389
- args = request.arguments
390
- reviews = REVIEW_INDEX.get(args["restaurant_name"], [])
391
- if not reviews:
392
- return {"success": False, "error": "No reviews indexed"}
393
-
394
- question_words = set(args["question"].lower().split())
395
- scored = [(len(question_words & set(r.lower().split())), r) for r in reviews]
396
- scored.sort(reverse=True, key=lambda x: x[0])
397
- top_k = args.get("top_k", 5)
398
-
399
- return {
400
- "success": True,
401
- "relevant_reviews": [r[1] for r in scored[:top_k]]
402
- }
403
-
404
- return {"success": False, "error": f"Unknown tool: {request.tool_name}"}
405
-
406
  return web_app
407
 
408
 
 
 
 
 
409
  @app.local_entrypoint()
410
  def main():
411
- print("🧪 Testing Modal deployment with MCP...\n")
412
-
413
- print("1️⃣ Testing connection...")
414
- result = hello.remote()
415
- print(f"✅ {result}\n")
416
-
417
- print("2️⃣ MCP Server deployed at:")
418
- print(" https://tushar-pingle--restaurant-intelligence-mcp-server.modal.run")
419
 
420
- print("\n3️⃣ Analysis API deployed at:")
421
  print(" https://tushar-pingle--restaurant-intelligence-fastapi-app.modal.run")
422
 
423
- print("\n✅ Both endpoints ready!")
424
- print("\n📊 Optimizations enabled:")
425
- print(" - Slim trend_data (no full review text)")
426
- print(" - Pre-calculated sentiment in backend")
427
- print(" - ~97% smaller response size")
 
1
  """
2
+ Modal Backend for Restaurant Intelligence Agent - PARALLEL OPTIMIZED
3
+ Version 3.0 - Uses Modal's parallel processing for 5x speed improvement
4
 
5
+ KEY OPTIMIZATIONS:
6
+ 1. Parallel batch processing with .map() - Process all batches simultaneously
7
+ 2. Parallel insights generation - Chef + Manager at same time
8
+ 3. Larger batch sizes (30 reviews instead of 20)
9
+ 4. Reduced timeout since parallel is faster
10
 
11
+ TARGET: 1000 reviews in ~5 minutes (down from 15+ minutes)
 
 
12
  """
13
 
14
  import modal
15
  from typing import Dict, Any, List
16
+ import os
17
+ import json
18
 
19
  # Create Modal app
20
  app = modal.App("restaurant-intelligence")
 
34
  "matplotlib",
35
  "fastapi[standard]",
36
  "httpx",
37
+ "fastmcp",
38
  )
39
  .add_local_python_source("src")
40
  )
41
 
42
 
43
  # ============================================================================
44
+ # HELPER FUNCTIONS
45
  # ============================================================================
46
 
47
  def calculate_sentiment(text: str) -> float:
 
66
 
67
 
68
  # ============================================================================
69
+ # PARALLEL BATCH PROCESSOR - The key optimization!
70
  # ============================================================================
71
 
72
@app.function(
    image=image,
    secrets=[modal.Secret.from_name("anthropic-api-key")],
    timeout=120,  # 2 min per batch is plenty
    retries=2,
)
def process_batch(batch_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Process a single batch of reviews - runs in PARALLEL across containers!

    This function is called via .map() to process all batches simultaneously.
    Modal will spin up multiple containers to handle batches in parallel.

    Args:
        batch_data: dict with keys:
            - "reviews": list[str] of review texts for this batch
            - "restaurant_name": name used in the extraction prompt
            - "batch_index": 1-based index, used only for logging
            - "start_index": offset of this batch within the full review
              list, so per-batch indices can be mapped back to global ones

    Returns:
        {"success": bool, "batch_index": int, "data": {...}} where data
        contains "food_items", "drinks" and "aspects" lists. On any failure
        the lists are empty so the caller's merge step can proceed unchanged.
    """
    from anthropic import Anthropic
    import os

    reviews = batch_data["reviews"]
    restaurant_name = batch_data["restaurant_name"]
    batch_index = batch_data["batch_index"]
    start_index = batch_data["start_index"]

    print(f"🔄 Processing batch {batch_index} ({len(reviews)} reviews)...")

    client = Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])

    # Number the reviews so the model can cite indices instead of echoing text
    numbered_reviews = []
    for i, review in enumerate(reviews):
        numbered_reviews.append(f"[Review {i}]: {review}")
    reviews_text = "\n\n".join(numbered_reviews)

    prompt = f"""You are analyzing customer reviews for {restaurant_name}. Extract BOTH menu items AND aspects in ONE PASS.

REVIEWS:
{reviews_text}

YOUR TASK - Extract THREE things simultaneously:
1. **MENU ITEMS** (food & drinks mentioned)
2. **ASPECTS** (what customers care about: service, ambience, etc.)
3. **SENTIMENT** for each

SENTIMENT SCALE (IMPORTANT):
- **Positive (0.6 to 1.0):** Customer clearly enjoyed/praised this item or aspect
- **Neutral (0.0 to 0.59):** Mixed feelings, okay but not exceptional, or simply mentioned without strong opinion
- **Negative (-1.0 to -0.01):** Customer complained, criticized, or expressed disappointment

RULES:
- Specific items only: "salmon sushi", "miso soup", "sake"
- Separate food from drinks
- Lowercase names
- For EACH item/aspect, list which review NUMBERS mention it (just indices, not text)

OUTPUT (JSON):
{{
  "food_items": [
    {{"name": "item name", "mention_count": 2, "sentiment": 0.85, "category": "type", "related_reviews": [0, 5]}}
  ],
  "drinks": [
    {{"name": "drink name", "mention_count": 1, "sentiment": 0.7, "category": "alcohol", "related_reviews": [3]}}
  ],
  "aspects": [
    {{"name": "service speed", "mention_count": 3, "sentiment": 0.65, "description": "brief desc", "related_reviews": [1, 2, 7]}}
  ]
}}

CRITICAL: Output ONLY valid JSON, no other text. Use sentiment scale: >= 0.6 positive, 0-0.59 neutral, < 0 negative

Extract everything:"""

    def _attach_review_texts(entries: List[Dict[str, Any]]) -> None:
        """Replace model-reported batch-local review indices with records
        carrying the global index and full review text; normalize names to
        lowercase. Out-of-range or non-int indices are dropped."""
        for entry in entries:
            indices = entry.get('related_reviews', [])
            entry['related_reviews'] = [
                {
                    'review_index': start_index + idx,
                    'review_text': reviews[idx],
                }
                for idx in indices
                if isinstance(idx, int) and 0 <= idx < len(reviews)
            ]
            if 'name' in entry:
                entry['name'] = entry['name'].lower()

    try:
        response = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=4000,
            temperature=0.3,
            messages=[{"role": "user", "content": prompt}]
        )

        result_text = response.content[0].text
        # Strip markdown fences the model sometimes wraps JSON in
        result_text = result_text.replace('```json', '').replace('```', '').strip()

        data = json.loads(result_text)

        # Map review indices back to full text for all three categories.
        # (Previously three copy-pasted identical loops; now one helper.)
        for key in ('food_items', 'drinks', 'aspects'):
            _attach_review_texts(data.get(key, []))

        print(f"✅ Batch {batch_index} complete: {len(data.get('food_items', []))} food, {len(data.get('drinks', []))} drinks, {len(data.get('aspects', []))} aspects")
        return {"success": True, "batch_index": batch_index, "data": data}

    except json.JSONDecodeError as e:
        # Model emitted non-JSON; return empty data so the merge step still works
        print(f"⚠️ Batch {batch_index} JSON error: {e}")
        return {"success": False, "batch_index": batch_index, "data": {"food_items": [], "drinks": [], "aspects": []}}
    except Exception as e:
        print(f"❌ Batch {batch_index} error: {e}")
        return {"success": False, "batch_index": batch_index, "data": {"food_items": [], "drinks": [], "aspects": []}}
200
+
201
+
202
@app.function(
    image=image,
    secrets=[modal.Secret.from_name("anthropic-api-key")],
    timeout=180,  # 3 min for insights
)
def generate_insights_parallel(analysis_data: Dict[str, Any], restaurant_name: str, role: str) -> Dict[str, Any]:
    """Generate insights for a single role - runs in parallel with other insights.

    Args:
        analysis_data: merged extraction results; reads
            analysis_data["menu_analysis"]["food_items"/"drinks"] and
            analysis_data["aspect_analysis"]["aspects"].
        restaurant_name: name injected into the consultant prompt.
        role: "chef" selects food/kitchen focus; any other value selects
            the manager (operations/service) focus.

    Returns:
        {"role": role, "insights": dict}. On LLM/parse failure the insights
        come from _fallback_insights(role), so the schema is always intact.
    """
    from anthropic import Anthropic
    import os
    import re

    print(f"🧠 Generating {role} insights...")

    client = Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])

    # Build prompt based on role; cap list sizes to keep the prompt small
    menu_items = analysis_data.get('menu_analysis', {}).get('food_items', [])[:20]
    # NOTE(review): `drinks` is computed but never referenced in the prompt
    # below — confirm whether drinks should be included in the summary.
    drinks = analysis_data.get('menu_analysis', {}).get('drinks', [])[:10]
    aspects = analysis_data.get('aspect_analysis', {}).get('aspects', [])[:20]

    # Format menu summary (emoji encodes the sentiment bucket)
    menu_lines = ["TOP MENU ITEMS:"]
    for item in menu_items:
        s = item.get('sentiment', 0)
        emoji = "🟢" if s >= 0.6 else "🟡" if s >= 0 else "🔴"
        menu_lines.append(f"  {emoji} {item.get('name', '?')}: sentiment {s:+.2f}, {item.get('mention_count', 0)} mentions")
    menu_summary = "\n".join(menu_lines)

    # Format aspect summary (same bucketing as the menu summary)
    aspect_lines = ["TOP ASPECTS:"]
    for a in aspects:
        s = a.get('sentiment', 0)
        emoji = "🟢" if s >= 0.6 else "🟡" if s >= 0 else "🔴"
        aspect_lines.append(f"  {emoji} {a.get('name', '?')}: sentiment {s:+.2f}, {a.get('mention_count', 0)} mentions")
    aspect_summary = "\n".join(aspect_lines)

    # Role-specific focus text spliced into the prompt
    if role == 'chef':
        focus = "Focus on: Food quality, menu items, ingredients, presentation, portions, consistency"
        topic_filter = "ONLY on food/kitchen topics"
    else:
        focus = "Focus on: Service, staff, wait times, ambience, value, cleanliness"
        topic_filter = "ONLY on operations/service topics"

    prompt = f"""You are an expert restaurant consultant analyzing feedback for {restaurant_name}.

{menu_summary}

{aspect_summary}

SENTIMENT SCALE:
- 🟢 POSITIVE (>= 0.6): Highlight as STRENGTH
- 🟡 NEUTRAL (0 to 0.59): Room for improvement
- 🔴 NEGATIVE (< 0): Flag as CONCERN

YOUR TASK: Generate insights for the {"HEAD CHEF" if role == "chef" else "RESTAURANT MANAGER"}.
{focus}

RULES:
1. Focus {topic_filter}
2. STRENGTHS from items with sentiment >= 0.6
3. CONCERNS from items with sentiment < 0
4. Output ONLY valid JSON

OUTPUT:
{{
  "summary": "2-3 sentence executive summary",
  "strengths": ["strength 1", "strength 2", "strength 3", "strength 4", "strength 5"],
  "concerns": ["concern 1", "concern 2", "concern 3"],
  "recommendations": [
    {{"priority": "high", "action": "action", "reason": "why", "evidence": "data"}},
    {{"priority": "medium", "action": "action", "reason": "why", "evidence": "data"}},
    {{"priority": "low", "action": "action", "reason": "why", "evidence": "data"}}
  ]
}}

Generate {role} insights:"""

    try:
        response = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=2000,
            temperature=0.4,
            messages=[{"role": "user", "content": prompt}]
        )

        result_text = response.content[0].text.strip()
        # Strip markdown fences before extracting the JSON payload
        result_text = result_text.replace('```json', '').replace('```', '').strip()

        # Find JSON in response (first '{' through last '}', greedy)
        match = re.search(r'\{[\s\S]*\}', result_text)
        if match:
            insights = json.loads(match.group())
            print(f"✅ {role.title()} insights generated")
            return {"role": role, "insights": insights}
        else:
            print(f"⚠️ No JSON found in {role} response")
            return {"role": role, "insights": _fallback_insights(role)}

    except Exception as e:
        # Any failure (API error, bad JSON) degrades to the fallback payload
        print(f"❌ Error generating {role} insights: {e}")
        return {"role": role, "insights": _fallback_insights(role)}
303
+
304
+
305
+ def _fallback_insights(role: str) -> Dict[str, Any]:
306
+ """Fallback insights if generation fails."""
307
  return {
308
+ "summary": f"Analysis complete. See data for {role} insights.",
309
+ "strengths": ["Data available in charts"],
310
+ "concerns": ["Review individual items for details"],
311
+ "recommendations": [{"priority": "medium", "action": "Review data", "reason": "Auto-generated", "evidence": "N/A"}]
 
312
  }
313
 
314
 
315
+ # ============================================================================
316
+ # MAIN ANALYSIS FUNCTION - PARALLEL OPTIMIZED
317
+ # ============================================================================
318
+
319
  @app.function(
320
  image=image,
321
  secrets=[modal.Secret.from_name("anthropic-api-key")],
322
+ timeout=600, # 10 min max (down from 40 min)
323
  )
324
+ def full_analysis_parallel(url: str, max_reviews: int = 100) -> Dict[str, Any]:
325
  """
326
+ PARALLEL OPTIMIZED analysis pipeline.
327
+
328
+ Speed improvements:
329
+ 1. Batches processed in PARALLEL via .map()
330
+ 2. Chef + Manager insights generated in PARALLEL
331
+ 3. Larger batch size (30 reviews)
332
 
333
+ Target: 1000 reviews in ~5 minutes
 
334
  """
335
+ import time
336
+ start_time = time.time()
337
+
338
+ print(f"🚀 Starting PARALLEL analysis for {url}")
339
+ print(f"📊 Max reviews: {max_reviews}")
340
 
341
  # Detect platform
342
  url_lower = url.lower()
 
345
  if platform == "unknown":
346
  return {"success": False, "error": "Unsupported platform. Use OpenTable or Google Maps."}
347
 
348
+ # Phase 1: Scrape reviews
349
+ print("📥 Phase 1: Scraping reviews...")
350
+ scrape_start = time.time()
351
+
352
  if platform == "opentable":
353
  from src.scrapers.opentable_scraper import scrape_opentable
354
  result = scrape_opentable(url=url, max_reviews=max_reviews, headless=True)
 
357
  result = scrape_google_maps(url=url, max_reviews=max_reviews, headless=True)
358
 
359
  if not result.get("success"):
360
+ return {"success": False, "error": result.get("error", "Scraping failed")}
361
 
362
+ print(f"✅ Scraping complete in {time.time() - scrape_start:.1f}s")
363
+
364
+ # Process reviews
365
  from src.data_processing import process_reviews, clean_reviews_for_ai
 
366
 
367
  df = process_reviews(result)
368
  reviews = clean_reviews_for_ai(df["review_text"].tolist(), verbose=False)
369
 
370
+ print(f"📊 Total reviews: {len(reviews)}")
371
+
372
+ # Create trend data
373
  trend_data = []
374
  for _, row in df.iterrows():
375
  text = str(row.get("review_text", ""))
376
  trend_data.append({
377
  "date": str(row.get("date", "")),
378
  "rating": float(row.get("overall_rating", 0) or 0),
379
+ "sentiment": calculate_sentiment(text)
380
  })
381
 
382
+ # Extract restaurant name
383
  if platform == "opentable":
384
  restaurant_name = url.split("/")[-1].split("?")[0].replace("-", " ").title()
385
  else:
 
386
  if '/place/' in url:
387
  restaurant_name = url.split('/place/')[1].split('/')[0].replace('+', ' ').replace('%20', ' ')
388
  else:
389
  restaurant_name = "Restaurant"
390
 
391
+ # Phase 2: PARALLEL batch extraction
392
+ print("🔄 Phase 2: PARALLEL batch extraction...")
393
+ extract_start = time.time()
394
+
395
+ BATCH_SIZE = 30 # Larger batches = fewer API calls
396
+ batches = []
397
+ for i in range(0, len(reviews), BATCH_SIZE):
398
+ batch_reviews = reviews[i:i+BATCH_SIZE]
399
+ batches.append({
400
+ "reviews": batch_reviews,
401
+ "restaurant_name": restaurant_name,
402
+ "batch_index": len(batches) + 1,
403
+ "start_index": i
404
+ })
405
+
406
+ print(f"📦 Created {len(batches)} batches of ~{BATCH_SIZE} reviews each")
407
+ print(f"🚀 Processing ALL batches in PARALLEL...")
408
+
409
+ # THIS IS THE KEY: Process all batches in parallel!
410
+ batch_results = list(process_batch.map(batches))
411
 
412
+ print(f"✅ All batches complete in {time.time() - extract_start:.1f}s")
 
413
 
414
+ # Merge results from all batches
415
+ all_food_items = {}
416
+ all_drinks = {}
417
+ all_aspects = {}
418
 
419
+ for batch_result in batch_results:
420
+ if not batch_result.get("success"):
421
+ continue
422
+
423
+ data = batch_result.get("data", {})
424
+
425
+ # Merge food items
426
+ for item in data.get('food_items', []):
427
+ name = item.get('name', '').lower()
428
+ if not name:
429
+ continue
430
+ if name in all_food_items:
431
+ all_food_items[name]['mention_count'] += item.get('mention_count', 1)
432
+ all_food_items[name]['related_reviews'].extend(item.get('related_reviews', []))
433
+ # Weighted average sentiment
434
+ old_count = all_food_items[name]['mention_count'] - item.get('mention_count', 1)
435
+ new_count = item.get('mention_count', 1)
436
+ if old_count + new_count > 0:
437
+ old_sent = all_food_items[name]['sentiment']
438
+ new_sent = item.get('sentiment', 0)
439
+ all_food_items[name]['sentiment'] = (old_sent * old_count + new_sent * new_count) / (old_count + new_count)
440
+ else:
441
+ all_food_items[name] = item
442
+
443
+ # Merge drinks
444
+ for item in data.get('drinks', []):
445
+ name = item.get('name', '').lower()
446
+ if not name:
447
+ continue
448
+ if name in all_drinks:
449
+ all_drinks[name]['mention_count'] += item.get('mention_count', 1)
450
+ all_drinks[name]['related_reviews'].extend(item.get('related_reviews', []))
451
+ old_count = all_drinks[name]['mention_count'] - item.get('mention_count', 1)
452
+ new_count = item.get('mention_count', 1)
453
+ if old_count + new_count > 0:
454
+ old_sent = all_drinks[name]['sentiment']
455
+ new_sent = item.get('sentiment', 0)
456
+ all_drinks[name]['sentiment'] = (old_sent * old_count + new_sent * new_count) / (old_count + new_count)
457
+ else:
458
+ all_drinks[name] = item
459
+
460
+ # Merge aspects
461
+ for aspect in data.get('aspects', []):
462
+ name = aspect.get('name', '').lower()
463
+ if not name:
464
+ continue
465
+ if name in all_aspects:
466
+ all_aspects[name]['mention_count'] += aspect.get('mention_count', 1)
467
+ all_aspects[name]['related_reviews'].extend(aspect.get('related_reviews', []))
468
+ old_count = all_aspects[name]['mention_count'] - aspect.get('mention_count', 1)
469
+ new_count = aspect.get('mention_count', 1)
470
+ if old_count + new_count > 0:
471
+ old_sent = all_aspects[name]['sentiment']
472
+ new_sent = aspect.get('sentiment', 0)
473
+ all_aspects[name]['sentiment'] = (old_sent * old_count + new_sent * new_count) / (old_count + new_count)
474
+ else:
475
+ all_aspects[name] = aspect
476
+
477
+ # Sort by mention count
478
+ food_list = sorted(all_food_items.values(), key=lambda x: x.get('mention_count', 0), reverse=True)
479
+ drinks_list = sorted(all_drinks.values(), key=lambda x: x.get('mention_count', 0), reverse=True)
480
+ aspects_list = sorted(all_aspects.values(), key=lambda x: x.get('mention_count', 0), reverse=True)
481
+
482
+ print(f"📊 Discovered: {len(food_list)} food + {len(drinks_list)} drinks + {len(aspects_list)} aspects")
483
+
484
+ # Build analysis data
485
+ analysis_data = {
486
+ "menu_analysis": {
487
+ "food_items": food_list,
488
+ "drinks": drinks_list
489
+ },
490
+ "aspect_analysis": {
491
+ "aspects": aspects_list
492
+ }
493
+ }
494
+
495
+ # Phase 3: PARALLEL insights generation
496
+ print("🧠 Phase 3: PARALLEL insights generation...")
497
+ insights_start = time.time()
498
+
499
+ # Generate both insights in parallel!
500
+ insight_inputs = [
501
+ (analysis_data, restaurant_name, "chef"),
502
+ (analysis_data, restaurant_name, "manager")
503
+ ]
504
+
505
+ insight_results = list(generate_insights_parallel.starmap(insight_inputs))
506
+
507
+ insights = {}
508
+ for result in insight_results:
509
+ insights[result["role"]] = result["insights"]
510
+
511
+ print(f"✅ Insights complete in {time.time() - insights_start:.1f}s")
512
+
513
+ # Build final response
514
+ total_time = time.time() - start_time
515
+ print(f"🎉 TOTAL TIME: {total_time:.1f}s ({total_time/60:.1f} min)")
516
+
517
+ analysis = {
518
+ "success": True,
519
+ "restaurant_name": restaurant_name,
520
+ "menu_analysis": analysis_data["menu_analysis"],
521
+ "aspect_analysis": analysis_data["aspect_analysis"],
522
+ "insights": insights,
523
+ "trend_data": trend_data,
524
+ "source": platform,
525
+ "stats": {
526
+ "total_reviews": len(reviews),
527
+ "food_items": len(food_list),
528
+ "drinks": len(drinks_list),
529
+ "aspects": len(aspects_list),
530
+ "processing_time_seconds": round(total_time, 1)
531
+ }
532
+ }
533
+
534
+ # Log response size
535
  response_size = len(json.dumps(analysis))
536
  print(f"[MODAL] Response size: {response_size / 1024:.1f} KB")
537
 
 
539
 
540
 
541
  # ============================================================================
542
+ # FASTAPI APP - Updated to use parallel function
543
  # ============================================================================
544
 
545
  @app.function(
546
  image=image,
547
  secrets=[modal.Secret.from_name("anthropic-api-key")],
548
+ timeout=900, # 15 min timeout for the API endpoint
549
  )
550
  @modal.asgi_app()
551
  def fastapi_app():
552
+ """Main API - uses parallel processing for speed."""
553
  from fastapi import FastAPI, HTTPException
554
  from pydantic import BaseModel
555
 
556
+ web_app = FastAPI(title="Restaurant Intelligence API - PARALLEL OPTIMIZED")
557
 
558
  class AnalyzeRequest(BaseModel):
559
  url: str
560
  max_reviews: int = 100
561
 
 
 
 
 
562
  @web_app.get("/")
563
  async def root():
564
  return {
565
  "name": "Restaurant Intelligence API",
566
+ "version": "3.0-parallel",
567
+ "optimizations": ["parallel_batches", "parallel_insights", "larger_batch_size"],
568
+ "target": "1000 reviews in ~5 minutes"
 
 
 
 
 
569
  }
570
 
571
  @web_app.get("/health")
572
  async def health():
573
+ return {"status": "healthy", "version": "parallel"}
574
 
575
  @web_app.post("/analyze")
576
  async def analyze(request: AnalyzeRequest):
577
  try:
578
+ result = full_analysis_parallel.remote(url=request.url, max_reviews=request.max_reviews)
579
  return result
580
  except Exception as e:
581
+ import traceback
582
+ traceback.print_exc()
583
  raise HTTPException(status_code=500, detail=str(e))
584
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
585
  return web_app
586
 
587
 
588
+ # ============================================================================
589
+ # LOCAL ENTRYPOINT FOR TESTING
590
+ # ============================================================================
591
+
592
  @app.local_entrypoint()
593
  def main():
594
+ print("🧪 Testing PARALLEL Modal deployment...\n")
 
 
 
 
 
 
 
595
 
596
+ print("1️⃣ API will be deployed at:")
597
  print(" https://tushar-pingle--restaurant-intelligence-fastapi-app.modal.run")
598
 
599
+ print("\n✅ Deploy with: modal deploy modal_backend.py")
 
 
 
 
src/data_processing/review_cleaner.py CHANGED
@@ -1,113 +1,151 @@
1
  """
2
- Review Text Cleaner
3
- Sanitizes review text before sending to AI to prevent JSON parsing errors.
 
 
 
 
 
 
 
 
 
4
  """
5
 
6
  import re
7
  import unicodedata
8
- from typing import List
9
 
10
 
11
  class ReviewCleaner:
12
  """
13
- Cleans review text to prevent JSON parsing errors and reduce tokens.
14
  """
15
 
16
- def __init__(self):
17
- pass
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  def clean_review(self, text: str) -> str:
20
  """
21
  Clean a single review text.
22
 
23
- Args:
24
- text: Raw review text
25
-
26
- Returns:
27
- Cleaned text safe for AI processing
28
  """
29
  if not text or not isinstance(text, str):
30
  return ""
31
 
32
- # 1. Remove excessive whitespace
 
 
33
  text = ' '.join(text.split())
34
 
35
- # 2. Remove emojis and special unicode
36
  text = self._remove_emojis(text)
37
 
38
- # 3. Fix quotes - replace smart quotes with straight quotes
39
  text = text.replace('"', '"').replace('"', '"')
40
  text = text.replace("'", "'").replace("'", "'")
41
 
42
- # 4. Remove or escape problematic characters
43
- text = text.replace('\n', ' ') # Remove newlines
44
- text = text.replace('\r', ' ') # Remove carriage returns
45
- text = text.replace('\t', ' ') # Remove tabs
46
 
47
- # 5. Remove control characters
48
- text = ''.join(char for char in text if unicodedata.category(char)[0] != 'C')
49
-
50
- # 6. Normalize multiple spaces
51
  text = re.sub(r'\s+', ' ', text)
52
 
53
- # 7. Truncate very long reviews (>1000 chars)
54
- if len(text) > 1000:
55
- text = text[:997] + "..."
56
 
57
- # 8. Strip leading/trailing whitespace
58
  text = text.strip()
59
 
 
 
 
 
60
  return text
61
 
62
  def _remove_emojis(self, text: str) -> str:
63
- """Remove emojis and other pictographic characters."""
64
- # Emoji pattern
 
 
 
65
  emoji_pattern = re.compile(
66
  "["
67
  "\U0001F600-\U0001F64F" # emoticons
68
- "\U0001F300-\U0001F5FF" # symbols & pictographs
69
  "\U0001F680-\U0001F6FF" # transport & map symbols
70
  "\U0001F1E0-\U0001F1FF" # flags
71
- "\U00002702-\U000027B0"
72
- "\U000024C2-\U0001F251"
 
 
73
  "]+",
74
  flags=re.UNICODE
75
  )
76
- return emoji_pattern.sub(r'', text)
77
 
78
  def clean_reviews(self, reviews: List[str]) -> List[str]:
79
  """
80
  Clean a list of reviews.
81
 
82
- Args:
83
- reviews: List of raw review texts
84
-
85
- Returns:
86
- List of cleaned review texts
87
  """
 
 
 
 
 
 
 
 
 
88
  cleaned = []
89
  for i, review in enumerate(reviews):
 
90
  cleaned_text = self.clean_review(review)
91
- if cleaned_text: # Only include non-empty reviews
92
- cleaned.append(cleaned_text)
93
- else:
94
- print(f" ⚠️ Review {i} became empty after cleaning, skipping")
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
  return cleaned
97
 
98
- def get_cleaning_stats(self, original: List[str], cleaned: List[str]) -> dict:
99
  """Get statistics about the cleaning process."""
100
- original_chars = sum(len(r) for r in original)
101
- cleaned_chars = sum(len(r) for r in cleaned)
102
-
103
  return {
104
- "original_count": len(original),
105
- "cleaned_count": len(cleaned),
106
- "removed_count": len(original) - len(cleaned),
107
- "original_chars": original_chars,
108
- "cleaned_chars": cleaned_chars,
109
- "chars_saved": original_chars - cleaned_chars,
110
- "reduction_pct": round((1 - cleaned_chars / original_chars) * 100, 1) if original_chars > 0 else 0
111
  }
112
 
113
 
@@ -115,39 +153,85 @@ def clean_reviews_for_ai(reviews: List[str], verbose: bool = True) -> List[str]:
115
  """
116
  Convenience function to clean reviews.
117
 
118
- Args:
119
- reviews: Raw review texts
120
- verbose: Print cleaning stats
121
-
122
- Returns:
123
- Cleaned review texts
124
  """
125
- cleaner = ReviewCleaner()
126
  cleaned = cleaner.clean_reviews(reviews)
127
 
128
  if verbose:
129
- stats = cleaner.get_cleaning_stats(reviews, cleaned)
130
  print(f"🧹 Cleaned {stats['original_count']} reviews:")
131
- print(f" Removed: {stats['removed_count']} empty reviews")
132
- print(f" Characters: {stats['original_chars']:,} {stats['cleaned_chars']:,}")
133
- print(f" Saved: {stats['chars_saved']:,} chars ({stats['reduction_pct']}% reduction)")
 
 
 
 
 
 
134
 
135
  return cleaned
136
 
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  if __name__ == "__main__":
139
  # Test the cleaner
140
  test_reviews = [
141
- 'This place is "amazing"! ��😍😍',
142
- "The food was great\n\nbut service was slow",
143
- 'Chef said "it\'s the best" and I agree! \t\t\t',
144
- "🍕🍝🍷 Loved everything!!!",
145
- "A" * 1500 # Very long review
 
 
 
 
146
  ]
147
 
148
- cleaner = ReviewCleaner()
149
- for i, review in enumerate(test_reviews):
150
- cleaned = cleaner.clean_review(review)
151
- print(f"Original {i+1}: {review[:50]}...")
152
- print(f"Cleaned {i+1}: {cleaned[:50]}...")
153
- print()
 
 
 
 
 
 
1
  """
2
+ Review Text Cleaner - FIXED VERSION
3
+ Less aggressive cleaning that preserves more reviews.
4
+
5
+ FIXES:
6
+ 1. Don't discard reviews just because they're short
7
+ 2. Keep reviews with minimal cleaning
8
+ 3. Better handling of special characters
9
+ 4. Log what's being cleaned for debugging
10
+
11
+ Author: Tushar Pingle
12
+ Updated: Nov 2024
13
  """
14
 
15
  import re
16
  import unicodedata
17
+ from typing import List, Tuple
18
 
19
 
20
  class ReviewCleaner:
21
  """
22
+ Cleans review text while preserving as much content as possible.
23
  """
24
 
25
+ # Minimum length for a valid review (characters)
26
+ MIN_REVIEW_LENGTH = 10 # Very permissive - was effectively 0 but cleaned to nothing
27
+
28
+ def __init__(self, verbose: bool = False):
29
+ self.verbose = verbose
30
+ self.stats = {
31
+ 'total': 0,
32
+ 'kept': 0,
33
+ 'removed_empty': 0,
34
+ 'removed_short': 0,
35
+ 'chars_original': 0,
36
+ 'chars_cleaned': 0
37
+ }
38
 
39
  def clean_review(self, text: str) -> str:
40
  """
41
  Clean a single review text.
42
 
43
+ FIXED: Less aggressive cleaning, preserves more content.
 
 
 
 
44
  """
45
  if not text or not isinstance(text, str):
46
  return ""
47
 
48
+ original_len = len(text)
49
+
50
+ # 1. Basic whitespace normalization (gentle)
51
  text = ' '.join(text.split())
52
 
53
+ # 2. Remove only truly problematic emojis (keep basic punctuation)
54
  text = self._remove_emojis(text)
55
 
56
+ # 3. Normalize quotes (don't remove them)
57
  text = text.replace('"', '"').replace('"', '"')
58
  text = text.replace("'", "'").replace("'", "'")
59
 
60
+ # 4. Remove control characters only (keep newlines as spaces)
61
+ text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
62
+ text = ''.join(char for char in text if unicodedata.category(char)[0] != 'C' or char == ' ')
 
63
 
64
+ # 5. Normalize multiple spaces
 
 
 
65
  text = re.sub(r'\s+', ' ', text)
66
 
67
+ # 6. Truncate very long reviews (>1500 chars) - increased limit
68
+ if len(text) > 1500:
69
+ text = text[:1497] + "..."
70
 
71
+ # 7. Strip whitespace
72
  text = text.strip()
73
 
74
+ # Track stats
75
+ self.stats['chars_original'] += original_len
76
+ self.stats['chars_cleaned'] += len(text)
77
+
78
  return text
79
 
80
  def _remove_emojis(self, text: str) -> str:
81
+ """
82
+ Remove emojis but keep more unicode characters.
83
+ FIXED: Less aggressive pattern.
84
+ """
85
+ # Only remove actual emoji pictographs, not all unicode
86
  emoji_pattern = re.compile(
87
  "["
88
  "\U0001F600-\U0001F64F" # emoticons
89
+ "\U0001F300-\U0001F5FF" # symbols & pictographs
90
  "\U0001F680-\U0001F6FF" # transport & map symbols
91
  "\U0001F1E0-\U0001F1FF" # flags
92
+ "\U0001F900-\U0001F9FF" # supplemental symbols
93
+ "\U0001FA00-\U0001FA6F" # chess symbols
94
+ "\U0001FA70-\U0001FAFF" # symbols extended
95
+ "\U00002702-\U000027B0" # dingbats
96
  "]+",
97
  flags=re.UNICODE
98
  )
99
+ return emoji_pattern.sub('', text)
100
 
101
  def clean_reviews(self, reviews: List[str]) -> List[str]:
102
  """
103
  Clean a list of reviews.
104
 
105
+ FIXED: Only removes truly empty reviews, not short ones.
 
 
 
 
106
  """
107
+ self.stats = {
108
+ 'total': len(reviews),
109
+ 'kept': 0,
110
+ 'removed_empty': 0,
111
+ 'removed_short': 0,
112
+ 'chars_original': 0,
113
+ 'chars_cleaned': 0
114
+ }
115
+
116
  cleaned = []
117
  for i, review in enumerate(reviews):
118
+ # Clean the review
119
  cleaned_text = self.clean_review(review)
120
+
121
+ # Check if it's still valid
122
+ if not cleaned_text:
123
+ self.stats['removed_empty'] += 1
124
+ if self.verbose:
125
+ print(f" ⚠️ Review {i} was empty/None, skipping")
126
+ continue
127
+
128
+ if len(cleaned_text) < self.MIN_REVIEW_LENGTH:
129
+ self.stats['removed_short'] += 1
130
+ if self.verbose:
131
+ print(f" ⚠️ Review {i} too short ({len(cleaned_text)} chars): '{cleaned_text[:50]}'")
132
+ continue
133
+
134
+ cleaned.append(cleaned_text)
135
+ self.stats['kept'] += 1
136
 
137
  return cleaned
138
 
139
+ def get_cleaning_stats(self) -> dict:
140
  """Get statistics about the cleaning process."""
 
 
 
141
  return {
142
+ "original_count": self.stats['total'],
143
+ "cleaned_count": self.stats['kept'],
144
+ "removed_empty": self.stats['removed_empty'],
145
+ "removed_short": self.stats['removed_short'],
146
+ "original_chars": self.stats['chars_original'],
147
+ "cleaned_chars": self.stats['chars_cleaned'],
148
+ "retention_rate": round(self.stats['kept'] / max(self.stats['total'], 1) * 100, 1)
149
  }
150
 
151
 
 
153
  """
154
  Convenience function to clean reviews.
155
 
156
+ FIXED: Better stats reporting, less aggressive cleaning.
 
 
 
 
 
157
  """
158
+ cleaner = ReviewCleaner(verbose=False) # Don't spam individual messages
159
  cleaned = cleaner.clean_reviews(reviews)
160
 
161
  if verbose:
162
+ stats = cleaner.get_cleaning_stats()
163
  print(f"🧹 Cleaned {stats['original_count']} reviews:")
164
+ print(f" ✅ Kept: {stats['cleaned_count']} ({stats['retention_rate']}%)")
165
+ if stats['removed_empty'] > 0:
166
+ print(f" ❌ Empty: {stats['removed_empty']}")
167
+ if stats['removed_short'] > 0:
168
+ print(f" ❌ Too short: {stats['removed_short']}")
169
+
170
+ # Warn if we're losing too many reviews
171
+ if stats['retention_rate'] < 50:
172
+ print(f" ⚠️ WARNING: Only {stats['retention_rate']}% retention! Check scraper.")
173
 
174
  return cleaned
175
 
176
 
177
+ # Also add a debug function
178
+ def analyze_review_loss(reviews: List[str]) -> None:
179
+ """
180
+ Debug function to understand why reviews are being lost.
181
+ """
182
+ print(f"\n{'='*60}")
183
+ print("REVIEW LOSS ANALYSIS")
184
+ print(f"{'='*60}\n")
185
+
186
+ empty_count = 0
187
+ short_count = 0
188
+ valid_count = 0
189
+
190
+ print("Sample of problematic reviews:\n")
191
+
192
+ for i, review in enumerate(reviews):
193
+ if not review or not isinstance(review, str):
194
+ empty_count += 1
195
+ if empty_count <= 3:
196
+ print(f" [{i}] EMPTY: {repr(review)}")
197
+ elif len(review.strip()) < 10:
198
+ short_count += 1
199
+ if short_count <= 3:
200
+ print(f" [{i}] SHORT ({len(review)} chars): '{review[:50]}'")
201
+ else:
202
+ valid_count += 1
203
+
204
+ print(f"\n{'='*60}")
205
+ print(f"SUMMARY:")
206
+ print(f" Total: {len(reviews)}")
207
+ print(f" Valid: {valid_count} ({valid_count/len(reviews)*100:.1f}%)")
208
+ print(f" Empty: {empty_count}")
209
+ print(f" Short: {short_count}")
210
+ print(f"{'='*60}\n")
211
+
212
+
213
  if __name__ == "__main__":
214
  # Test the cleaner
215
  test_reviews = [
216
+ 'This place is "amazing"! 😍😍😍 The food was incredible.',
217
+ "The food was great\n\nbut service was slow. Would come back!",
218
+ 'Chef said "it\'s the best" and I agree! Great experience.',
219
+ "🍕🍝🍷 Loved everything!!! Best Italian in town.",
220
+ "", # Empty
221
+ "Good", # Too short
222
+ " ", # Just whitespace
223
+ None, # None
224
+ "The pasta was perfectly cooked, al dente just how I like it.",
225
  ]
226
 
227
+ print("Testing review cleaner...\n")
228
+
229
+ # First analyze
230
+ analyze_review_loss(test_reviews)
231
+
232
+ # Then clean
233
+ cleaned = clean_reviews_for_ai(test_reviews, verbose=True)
234
+
235
+ print(f"\nCleaned reviews ({len(cleaned)}):")
236
+ for i, review in enumerate(cleaned):
237
+ print(f" {i+1}. {review[:60]}...")
src/scrapers/opentable_scraper.py CHANGED
@@ -1,33 +1,42 @@
1
  """
2
- OpenTable Review Scraper - 2025 Production Version
3
- Fixed review text extraction based on actual DOM structure.
 
 
 
 
 
 
 
 
 
4
  """
5
 
6
  import time
7
- from typing import List, Dict, Any, Optional, Callable
8
  from selenium import webdriver
9
- from selenium.common.exceptions import (
10
- NoSuchElementException,
11
- StaleElementReferenceException,
12
- TimeoutException
13
- )
14
  from selenium.webdriver.common.by import By
15
  from selenium.webdriver.support.ui import WebDriverWait
16
  from selenium.webdriver.support import expected_conditions as EC
17
- from selenium.webdriver.chrome.options import Options
18
- from selenium.webdriver.chrome.service import Service
19
 
20
 
21
  class OpenTableScraper:
22
- """Scrapes restaurant reviews from OpenTable using production-tested selectors."""
 
 
23
 
24
- # Production selectors discovered from live DOM inspection (Jan 2025)
25
  SELECTORS = {
26
  "review_cards": [
 
27
  "//li[@data-test='reviews-list-item']",
28
- "//section[@id='reviews']//li[contains(., 'Dined')]",
29
- "//section[.//h2[contains(., 'people are saying') or contains(., 'Reviews')]]//li[.//p or .//span or .//time]",
30
- "//li[@data-test='review']"
 
31
  ],
32
  "next_button": [
33
  "//a[@aria-label='Go to the next page']",
@@ -65,15 +74,16 @@ class OpenTableScraper:
65
  ".//li[contains(., 'Ambience')]//span"
66
  ],
67
  "review_text": [
68
- # NEW: Based on actual DOM structure from screenshot
69
  ".//span[@data-test='wrapper-tag']",
70
  ".//div[@data-test='wrapper-tag']",
71
  ".//p[@data-test='review-text']",
72
- ".//div[contains(@class,'review')]/p",
73
- ".//div[contains(@class,'review')]/span",
74
- # Fallback: Get any paragraph/span with substantial text, excluding ratings/dates
75
- ".//p[not(contains(., 'Dined')) and not(.//*) and string-length(normalize-space())>20]",
76
- ".//span[not(contains(., 'Dined')) and not(.//*) and string-length(normalize-space())>20]"
 
77
  ]
78
  }
79
 
@@ -82,6 +92,7 @@ class OpenTableScraper:
82
  self.page_load_strategy = page_load_strategy
83
  self.driver = None
84
  self.wait = None
 
85
 
86
  def scrape_reviews(
87
  self,
@@ -89,8 +100,11 @@ class OpenTableScraper:
89
  max_reviews: Optional[int] = None,
90
  progress_callback: Optional[Callable[[str], None]] = None
91
  ) -> Dict[str, Any]:
92
- """Scrape reviews from OpenTable restaurant page."""
 
93
 
 
 
94
  if not self._validate_url(url):
95
  return {'success': False, 'error': 'Invalid OpenTable URL', 'reviews': []}
96
 
@@ -116,7 +130,8 @@ class OpenTableScraper:
116
  reviews = []
117
 
118
  page_count = 0
119
- review_count = 0
 
120
 
121
  while True:
122
  page_count += 1
@@ -132,32 +147,44 @@ class OpenTableScraper:
132
  self._log_progress("⚠️ No reviews found on page.", progress_callback)
133
  if page_count == 1:
134
  # Save page source for debugging
135
- with open('debug_page_source.html', 'w', encoding='utf-8') as f:
136
- f.write(self.driver.page_source)
137
- self._log_progress("💾 Saved page source to debug_page_source.html", progress_callback)
 
 
 
138
  break
139
 
140
- self._log_progress(f" Found {len(review_elements)} review cards", progress_callback)
141
 
142
  # Extract data from each review
 
 
 
143
  for idx, review in enumerate(review_elements):
144
  if max_reviews and review_count >= max_reviews:
145
  self._log_progress(f"🎯 Reached max reviews ({max_reviews}).", progress_callback)
146
  break
147
 
148
  try:
 
 
 
 
 
 
 
 
 
 
149
  name = self._extract_text_with_fallback(review, self.SELECTORS["name"])
150
  date = self._extract_text_with_fallback(review, self.SELECTORS["date"])
151
  overall_rating = self._extract_text_with_fallback(review, self.SELECTORS["overall_rating"])
152
  food_rating = self._extract_text_with_fallback(review, self.SELECTORS["food_rating"])
153
  service_rating = self._extract_text_with_fallback(review, self.SELECTORS["service_rating"])
154
  ambience_rating = self._extract_text_with_fallback(review, self.SELECTORS["ambience_rating"])
155
- review_text = self._extract_text_with_fallback(review, self.SELECTORS["review_text"])
156
-
157
- # Clean review text (remove date if it leaked in)
158
- if review_text and "Dined on" in review_text:
159
- review_text = ""
160
 
 
161
  names.append(name)
162
  dates.append(date)
163
  overall_ratings.append(overall_rating)
@@ -167,14 +194,18 @@ class OpenTableScraper:
167
  reviews.append(review_text)
168
 
169
  review_count += 1
 
170
 
171
- if review_count % 10 == 0:
172
- self._log_progress(f"📊 Extracted {review_count} reviews so far...", progress_callback)
173
 
174
  except Exception as e:
175
  self._log_progress(f"⚠️ Error on review {idx + 1}: {str(e)}", progress_callback)
176
  continue
177
 
 
 
 
178
  if max_reviews and review_count >= max_reviews:
179
  break
180
 
@@ -185,64 +216,130 @@ class OpenTableScraper:
185
 
186
  time.sleep(3) # Wait for new page to load
187
 
188
- self._log_progress(f"✅ DONE! Scraped {review_count} reviews from {page_count} pages", progress_callback)
 
 
 
 
 
189
 
190
  return {
191
  'success': True,
192
- 'total_reviews': review_count,
193
- 'total_pages': page_count,
194
- 'reviews': {
195
- 'names': names,
196
- 'dates': dates,
197
- 'overall_ratings': overall_ratings,
198
- 'food_ratings': food_ratings,
199
- 'service_ratings': service_ratings,
200
- 'ambience_ratings': ambience_ratings,
201
- 'review_texts': reviews
 
 
 
202
  }
203
  }
204
 
205
  except Exception as e:
206
- self._log_progress(f"❌ Fatal error: {str(e)}", progress_callback)
207
- return {'success': False, 'error': str(e), 'reviews': []}
208
-
 
209
  finally:
210
  self._cleanup()
211
 
212
- def _click_next(self) -> bool:
213
- """Click 'Next' button with robust error handling."""
214
- xpaths = self.SELECTORS["next_button"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
 
216
- for xp in xpaths:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  try:
218
- # Wait until present & visible
219
- btn = self.wait.until(EC.presence_of_element_located((By.XPATH, xp)))
220
-
221
- # If we matched the inner <div data-test="pagination-next">, climb to <a>
222
- if btn.tag_name.lower() != "a":
223
- try:
224
- btn = btn.find_element(By.XPATH, "ancestor::a[1]")
225
- except Exception:
226
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
 
228
- # Guard: disabled?
229
  aria_disabled = (btn.get_attribute("aria-disabled") or "").lower()
230
  if aria_disabled in ("true", "1"):
231
  return False
232
 
233
- # Bring into view
234
  try:
235
  self.driver.execute_script("arguments[0].scrollIntoView({block:'center'});", btn)
236
  time.sleep(0.15)
237
- except Exception:
238
  pass
239
 
240
- # Try native click first
241
  try:
242
  WebDriverWait(self.driver, 5).until(EC.element_to_be_clickable((By.XPATH, xp)))
243
  btn.click()
244
- except Exception:
245
- # JS fallback (needed in headless mode)
246
  self.driver.execute_script("arguments[0].click();", btn)
247
 
248
  return True
@@ -250,55 +347,59 @@ class OpenTableScraper:
250
  except TimeoutException:
251
  continue
252
  except StaleElementReferenceException:
253
- # Re-find once
254
  try:
255
  btn = self.driver.find_element(By.XPATH, xp)
256
  self.driver.execute_script("arguments[0].scrollIntoView({block:'center'});", btn)
257
  self.driver.execute_script("arguments[0].click();", btn)
258
  return True
259
- except Exception:
260
  continue
261
- except Exception:
262
  continue
263
 
264
  return False
265
 
266
- def _find_elements_with_fallback(self, selectors: List[str], by: By) -> List:
267
- """Try multiple selectors until one works."""
268
- for selector in selectors:
269
- try:
270
- elements = self.driver.find_elements(by, selector)
271
- if elements:
272
- return elements
273
- except:
274
- continue
275
- return []
276
-
277
- def _find_element_with_fallback(self, selectors: List[str], by: By):
278
- """Try multiple selectors until one works."""
279
- for selector in selectors:
280
- try:
281
- element = self.driver.find_element(by, selector)
282
- if element:
283
- return element
284
- except:
285
- continue
286
- return None
287
-
288
- def _extract_text_with_fallback(self, parent_element, selectors: List[str]) -> str:
289
- """Extract text using fallback XPath selectors."""
290
- for selector in selectors:
291
- try:
292
- element = parent_element.find_element(By.XPATH, selector)
293
- text = element.text.strip()
294
- if text:
295
- return text
296
- except:
297
- continue
298
- return ""
 
 
 
 
 
299
 
300
  def _init_driver(self):
301
- """Initialize Chrome WebDriver with production settings."""
302
  chrome_options = Options()
303
  chrome_options.page_load_strategy = self.page_load_strategy
304
 
@@ -308,18 +409,13 @@ class OpenTableScraper:
308
  chrome_options.add_argument('--disable-dev-shm-usage')
309
  chrome_options.add_argument('--disable-gpu')
310
 
311
- # Realistic user agent to avoid bot detection
312
  chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36')
313
-
314
- # Additional anti-detection measures
315
  chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
316
  chrome_options.add_experimental_option('useAutomationExtension', False)
317
 
318
  service = Service('/usr/local/bin/chromedriver')
319
  self.driver = webdriver.Chrome(service=service, options=chrome_options)
320
  self.driver.set_page_load_timeout(30)
321
-
322
- # Initialize WebDriverWait
323
  self.wait = WebDriverWait(self.driver, 10)
324
 
325
  def _cleanup(self):
@@ -336,7 +432,7 @@ class OpenTableScraper:
336
  return 'opentable.c' in url.lower()
337
 
338
  def _log_progress(self, message: str, callback: Optional[Callable]):
339
- """Log progress with emoji indicators."""
340
  print(message)
341
  if callback:
342
  callback(message)
@@ -347,49 +443,23 @@ class OpenTableScraper:
347
 
348
  def scrape_opentable(url: str, max_reviews: Optional[int] = None, headless: bool = True) -> Dict[str, Any]:
349
  """
350
- Scrape reviews from OpenTable.
351
-
352
- Args:
353
- url: OpenTable restaurant URL
354
- max_reviews: Maximum number of reviews to scrape (None = all)
355
- headless: Run browser in headless mode
356
 
357
- Returns:
358
- Dict with 'success', 'total_reviews', 'total_pages', and 'reviews' data
359
  """
360
  scraper = OpenTableScraper(headless=headless)
361
- return scraper.scrape_reviews(url, max_reviews=max_reviews)
362
 
363
 
364
  if __name__ == "__main__":
365
- print("=" * 80)
366
- print("🍽️ OpenTable Review Scraper - Production Test (2025)")
367
- print("=" * 80 + "\n")
368
-
369
- test_url = "https://www.opentable.ca/r/miku-restaurant-vancouver"
370
 
371
- print(f"🎯 Target: {test_url}")
372
- print("📊 Limit: 20 reviews (test mode)")
373
- print("🤖 Mode: HEADLESS\n")
374
-
375
- result = scrape_opentable(test_url, max_reviews=20, headless=True)
376
-
377
- print("\n" + "=" * 80)
378
- if result['success']:
379
- print("✅ SUCCESS!")
380
- print(f" 📊 Total reviews scraped: {result['total_reviews']}")
381
- print(f" 📄 Total pages visited: {result['total_pages']}")
382
-
383
- if result['total_reviews'] > 0:
384
- print(f"\n 🔍 Sample (first review):")
385
- print(f" 👤 Name: {result['reviews']['names'][0]}")
386
- print(f" 📅 Date: {result['reviews']['dates'][0]}")
387
- print(f" ⭐ Overall: {result['reviews']['overall_ratings'][0]}")
388
- print(f" 🍜 Food: {result['reviews']['food_ratings'][0]}")
389
- print(f" 💁 Service: {result['reviews']['service_ratings'][0]}")
390
- print(f" 🏮 Ambience: {result['reviews']['ambience_ratings'][0]}")
391
- print(f" 💬 Review: {result['reviews']['review_texts'][0][:150]}...")
392
- else:
393
- print("❌ FAILED")
394
- print(f" Error: {result.get('error', 'Unknown error')}")
395
- print("=" * 80)
 
1
  """
2
+ OpenTable Scraper - FIXED VERSION
3
+ Production-ready scraper that doesn't lose reviews.
4
+
5
+ FIXES:
6
+ 1. Only counts reviews that have actual text
7
+ 2. Better selector specificity
8
+ 3. Logs empty vs real reviews for debugging
9
+ 4. Continues even if individual reviews fail
10
+
11
+ Author: Tushar Pingle
12
+ Updated: Nov 2024
13
  """
14
 
15
  import time
16
+ from typing import Dict, Any, List, Optional, Callable
17
  from selenium import webdriver
18
+ from selenium.webdriver.chrome.options import Options
19
+ from selenium.webdriver.chrome.service import Service
 
 
 
20
  from selenium.webdriver.common.by import By
21
  from selenium.webdriver.support.ui import WebDriverWait
22
  from selenium.webdriver.support import expected_conditions as EC
23
+ from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
 
24
 
25
 
26
  class OpenTableScraper:
27
+ """
28
+ Production OpenTable scraper with improved review extraction.
29
+ """
30
 
31
+ # Updated selectors - more specific for actual review cards
32
  SELECTORS = {
33
  "review_cards": [
34
+ # Most specific first - only match list items that contain actual review content
35
  "//li[@data-test='reviews-list-item']",
36
+ # Fallback: items in reviews section that have both date AND substantial text
37
+ "//section[@id='reviews']//li[contains(., 'Dined') and .//span[string-length(normalize-space()) > 30]]",
38
+ # Generic fallback
39
+ "//section[.//h2[contains(., 'people are saying') or contains(., 'Reviews')]]//li[.//p[string-length(normalize-space()) > 30] or .//span[string-length(normalize-space()) > 30]]",
40
  ],
41
  "next_button": [
42
  "//a[@aria-label='Go to the next page']",
 
74
  ".//li[contains(., 'Ambience')]//span"
75
  ],
76
  "review_text": [
77
+ # Priority order - most specific first
78
  ".//span[@data-test='wrapper-tag']",
79
  ".//div[@data-test='wrapper-tag']",
80
  ".//p[@data-test='review-text']",
81
+ # Get text content from review body
82
+ ".//div[contains(@class,'review')]//p[string-length(normalize-space()) > 20]",
83
+ ".//div[contains(@class,'review')]//span[string-length(normalize-space()) > 20]",
84
+ # Fallback: any paragraph/span with substantial text that's not date/rating
85
+ ".//p[not(contains(., 'Dined')) and not(contains(., 'Overall')) and not(contains(., 'Food')) and not(contains(., 'Service')) and not(contains(., 'Ambience')) and string-length(normalize-space()) > 20]",
86
+ ".//span[not(contains(., 'Dined')) and not(ancestor::li[contains(., 'Overall')]) and string-length(normalize-space()) > 20]",
87
  ]
88
  }
89
 
 
92
  self.page_load_strategy = page_load_strategy
93
  self.driver = None
94
  self.wait = None
95
+ self.empty_count = 0 # Track empty reviews for debugging
96
 
97
  def scrape_reviews(
98
  self,
 
100
  max_reviews: Optional[int] = None,
101
  progress_callback: Optional[Callable[[str], None]] = None
102
  ) -> Dict[str, Any]:
103
+ """
104
+ Scrape reviews from OpenTable restaurant page.
105
 
106
+ FIXED: Only counts and returns reviews that have actual text content.
107
+ """
108
  if not self._validate_url(url):
109
  return {'success': False, 'error': 'Invalid OpenTable URL', 'reviews': []}
110
 
 
130
  reviews = []
131
 
132
  page_count = 0
133
+ review_count = 0 # Only counts VALID reviews with text
134
+ self.empty_count = 0 # Track skipped empty reviews
135
 
136
  while True:
137
  page_count += 1
 
147
  self._log_progress("⚠️ No reviews found on page.", progress_callback)
148
  if page_count == 1:
149
  # Save page source for debugging
150
+ try:
151
+ with open('debug_page_source.html', 'w', encoding='utf-8') as f:
152
+ f.write(self.driver.page_source)
153
+ self._log_progress("💾 Saved page source to debug_page_source.html", progress_callback)
154
+ except:
155
+ pass
156
  break
157
 
158
+ self._log_progress(f"📋 Found {len(review_elements)} review cards on page", progress_callback)
159
 
160
  # Extract data from each review
161
+ page_valid = 0
162
+ page_empty = 0
163
+
164
  for idx, review in enumerate(review_elements):
165
  if max_reviews and review_count >= max_reviews:
166
  self._log_progress(f"🎯 Reached max reviews ({max_reviews}).", progress_callback)
167
  break
168
 
169
  try:
170
+ # Extract review text FIRST - this is the critical field
171
+ review_text = self._extract_review_text(review)
172
+
173
+ # FIXED: Skip reviews without actual text content
174
+ if not review_text or len(review_text.strip()) < 10:
175
+ page_empty += 1
176
+ self.empty_count += 1
177
+ continue # Don't append, don't count
178
+
179
+ # Now extract other fields
180
  name = self._extract_text_with_fallback(review, self.SELECTORS["name"])
181
  date = self._extract_text_with_fallback(review, self.SELECTORS["date"])
182
  overall_rating = self._extract_text_with_fallback(review, self.SELECTORS["overall_rating"])
183
  food_rating = self._extract_text_with_fallback(review, self.SELECTORS["food_rating"])
184
  service_rating = self._extract_text_with_fallback(review, self.SELECTORS["service_rating"])
185
  ambience_rating = self._extract_text_with_fallback(review, self.SELECTORS["ambience_rating"])
 
 
 
 
 
186
 
187
+ # Append valid review
188
  names.append(name)
189
  dates.append(date)
190
  overall_ratings.append(overall_rating)
 
194
  reviews.append(review_text)
195
 
196
  review_count += 1
197
+ page_valid += 1
198
 
199
+ if review_count % 50 == 0:
200
+ self._log_progress(f"📊 Extracted {review_count} valid reviews so far...", progress_callback)
201
 
202
  except Exception as e:
203
  self._log_progress(f"⚠️ Error on review {idx + 1}: {str(e)}", progress_callback)
204
  continue
205
 
206
+ # Log page summary
207
+ self._log_progress(f" ✅ Page {page_count}: {page_valid} valid, {page_empty} empty", progress_callback)
208
+
209
  if max_reviews and review_count >= max_reviews:
210
  break
211
 
 
216
 
217
  time.sleep(3) # Wait for new page to load
218
 
219
+ self._log_progress(f"✅ DONE! Scraped {review_count} valid reviews from {page_count} pages", progress_callback)
220
+ if self.empty_count > 0:
221
+ self._log_progress(f" ℹ️ Skipped {self.empty_count} empty/invalid review cards", progress_callback)
222
+
223
+ # Extract restaurant metadata
224
+ metadata = self._extract_metadata()
225
 
226
  return {
227
  'success': True,
228
+ 'total_reviews': review_count, # Now correctly represents VALID reviews
229
+ 'names': names,
230
+ 'dates': dates,
231
+ 'overall_ratings': overall_ratings,
232
+ 'food_ratings': food_ratings,
233
+ 'service_ratings': service_ratings,
234
+ 'ambience_ratings': ambience_ratings,
235
+ 'reviews': reviews,
236
+ 'metadata': metadata,
237
+ 'stats': {
238
+ 'pages_scraped': page_count,
239
+ 'valid_reviews': review_count,
240
+ 'empty_skipped': self.empty_count
241
  }
242
  }
243
 
244
  except Exception as e:
245
+ import traceback
246
+ error_msg = f"Scraping error: {str(e)}\n{traceback.format_exc()}"
247
+ self._log_progress(f"❌ {error_msg}", progress_callback)
248
+ return {'success': False, 'error': error_msg, 'reviews': []}
249
  finally:
250
  self._cleanup()
251
 
252
+ def _extract_review_text(self, review_element) -> str:
253
+ """
254
+ Extract review text with multiple fallback strategies.
255
+ Returns empty string if no valid text found.
256
+ """
257
+ # Try each selector
258
+ for selector in self.SELECTORS["review_text"]:
259
+ try:
260
+ elements = review_element.find_elements(By.XPATH, selector)
261
+ for elem in elements:
262
+ text = elem.text.strip()
263
+ # Validate it's actual review content
264
+ if text and len(text) > 20:
265
+ # Filter out dates and ratings that might have leaked
266
+ if "Dined on" in text or text.startswith("Overall") or text.startswith("Food"):
267
+ continue
268
+ # Filter out very short generic text
269
+ if text in ["See more", "Read more", "Show more"]:
270
+ continue
271
+ return text
272
+ except:
273
+ continue
274
 
275
+ # Last resort: try to get all text from the review card and extract the main content
276
+ try:
277
+ full_text = review_element.text
278
+ # Split by newlines and find the longest substantial text
279
+ lines = [line.strip() for line in full_text.split('\n') if line.strip()]
280
+ # Filter out dates, ratings, names
281
+ content_lines = []
282
+ for line in lines:
283
+ if len(line) > 30: # Substantial text
284
+ if not any(skip in line for skip in ['Dined on', 'Overall', 'Food', 'Service', 'Ambience', 'VIP']):
285
+ content_lines.append(line)
286
+
287
+ if content_lines:
288
+ # Return the longest line as the review
289
+ return max(content_lines, key=len)
290
+ except:
291
+ pass
292
+
293
+ return ""
294
+
295
+ def _extract_text_with_fallback(self, parent_element, selectors: List[str]) -> str:
296
+ """Extract text using fallback XPath selectors."""
297
+ for selector in selectors:
298
  try:
299
+ element = parent_element.find_element(By.XPATH, selector)
300
+ text = element.text.strip()
301
+ if text:
302
+ return text
303
+ except:
304
+ continue
305
+ return ""
306
+
307
+ def _find_elements_with_fallback(self, selectors: List[str], by: By) -> List:
308
+ """Try multiple selectors until one works."""
309
+ for selector in selectors:
310
+ try:
311
+ elements = self.driver.find_elements(by, selector)
312
+ if elements:
313
+ return elements
314
+ except:
315
+ continue
316
+ return []
317
+
318
+ def _click_next(self) -> bool:
319
+ """Click the next page button."""
320
+ for xp in self.SELECTORS["next_button"]:
321
+ try:
322
+ btn = WebDriverWait(self.driver, 3).until(
323
+ EC.presence_of_element_located((By.XPATH, xp))
324
+ )
325
 
326
+ # Check if disabled
327
  aria_disabled = (btn.get_attribute("aria-disabled") or "").lower()
328
  if aria_disabled in ("true", "1"):
329
  return False
330
 
331
+ # Scroll into view
332
  try:
333
  self.driver.execute_script("arguments[0].scrollIntoView({block:'center'});", btn)
334
  time.sleep(0.15)
335
+ except:
336
  pass
337
 
338
+ # Try clicking
339
  try:
340
  WebDriverWait(self.driver, 5).until(EC.element_to_be_clickable((By.XPATH, xp)))
341
  btn.click()
342
+ except:
 
343
  self.driver.execute_script("arguments[0].click();", btn)
344
 
345
  return True
 
347
  except TimeoutException:
348
  continue
349
  except StaleElementReferenceException:
 
350
  try:
351
  btn = self.driver.find_element(By.XPATH, xp)
352
  self.driver.execute_script("arguments[0].scrollIntoView({block:'center'});", btn)
353
  self.driver.execute_script("arguments[0].click();", btn)
354
  return True
355
+ except:
356
  continue
357
+ except:
358
  continue
359
 
360
  return False
361
 
362
+ def _extract_metadata(self) -> Dict[str, Any]:
363
+ """Extract restaurant metadata from page."""
364
+ metadata = {}
365
+ try:
366
+ # Restaurant name
367
+ name_selectors = [
368
+ "//h1",
369
+ "//h1[@data-test='restaurant-name']",
370
+ "//div[contains(@class,'restaurant-name')]//h1"
371
+ ]
372
+ for sel in name_selectors:
373
+ try:
374
+ elem = self.driver.find_element(By.XPATH, sel)
375
+ if elem.text.strip():
376
+ metadata['restaurant_name'] = elem.text.strip()
377
+ break
378
+ except:
379
+ continue
380
+
381
+ # Cuisine type
382
+ cuisine_selectors = [
383
+ "//span[contains(@class,'cuisine')]",
384
+ "//p[contains(@class,'cuisine')]",
385
+ "//div[contains(@class,'cuisine')]"
386
+ ]
387
+ for sel in cuisine_selectors:
388
+ try:
389
+ elem = self.driver.find_element(By.XPATH, sel)
390
+ if elem.text.strip():
391
+ metadata['cuisine'] = elem.text.strip()
392
+ break
393
+ except:
394
+ continue
395
+
396
+ except:
397
+ pass
398
+
399
+ return metadata
400
 
401
  def _init_driver(self):
402
+ """Initialize Chrome WebDriver."""
403
  chrome_options = Options()
404
  chrome_options.page_load_strategy = self.page_load_strategy
405
 
 
409
  chrome_options.add_argument('--disable-dev-shm-usage')
410
  chrome_options.add_argument('--disable-gpu')
411
 
 
412
  chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36')
 
 
413
  chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
414
  chrome_options.add_experimental_option('useAutomationExtension', False)
415
 
416
  service = Service('/usr/local/bin/chromedriver')
417
  self.driver = webdriver.Chrome(service=service, options=chrome_options)
418
  self.driver.set_page_load_timeout(30)
 
 
419
  self.wait = WebDriverWait(self.driver, 10)
420
 
421
  def _cleanup(self):
 
432
  return 'opentable.c' in url.lower()
433
 
434
  def _log_progress(self, message: str, callback: Optional[Callable]):
435
+ """Log progress."""
436
  print(message)
437
  if callback:
438
  callback(message)
 
443
 
444
  def scrape_opentable(url: str, max_reviews: Optional[int] = None, headless: bool = True) -> Dict[str, Any]:
445
  """
446
+ Convenience function to scrape OpenTable reviews.
 
 
 
 
 
447
 
448
+ FIXED: Only returns reviews with actual text content.
 
449
  """
450
  scraper = OpenTableScraper(headless=headless)
451
+ return scraper.scrape_reviews(url, max_reviews)
452
 
453
 
454
  if __name__ == "__main__":
455
+ # Test the scraper
456
+ test_url = "https://www.opentable.ca/r/dockside-restaurant-vancouver-vancouver"
457
+ result = scrape_opentable(test_url, max_reviews=50)
 
 
458
 
459
+ print(f"\n{'='*60}")
460
+ print(f"Results:")
461
+ print(f" Success: {result.get('success')}")
462
+ print(f" Total valid reviews: {result.get('total_reviews')}")
463
+ if result.get('stats'):
464
+ print(f" Empty skipped: {result['stats'].get('empty_skipped', 0)}")
465
+ print(f"{'='*60}")