riazmo commited on
Commit
087ac11
·
verified ·
1 Parent(s): 771474c

Upload 11 files

Browse files
app.py ADDED
@@ -0,0 +1,751 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HuggingFace Spaces - Review Intelligence System (Streamlit)
3
+ Complete app with URL input, progress tracking, and interactive dashboard
4
+ """
5
+
6
+ import streamlit as st
7
+ import pandas as pd
8
+ import plotly.express as px
9
+ import plotly.graph_objects as go
10
+ import os
11
+ from datetime import datetime
12
+ from typing import List, Dict, Optional
13
+ import time
14
+
15
+ from gradio_pipeline import GradioPipeline
16
+
17
+
18
# ============================================================================
# PAGE CONFIGURATION
# ============================================================================

# Must run before any other st.* call in the script.
st.set_page_config(
    page_title="Review Intelligence System",
    page_icon="🎯",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS: metric cards, large text helper, and the gradient "success" banner
# used by the results dashboard header.
st.markdown("""
<style>
    .main {
        padding: 0rem 1rem;
    }
    .stMetric {
        background-color: #f0f2f6;
        padding: 15px;
        border-radius: 5px;
    }
    .big-font {
        font-size: 24px !important;
        font-weight: bold;
    }
    .success-box {
        padding: 20px;
        border-radius: 10px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        margin: 20px 0;
    }
</style>
""", unsafe_allow_html=True)


# ============================================================================
# SESSION STATE INITIALIZATION
# ============================================================================
# Streamlit reruns the whole script on every interaction; these guards make
# sure the keys exist without clobbering values from a previous run.

# True once a batch of reviews has been scraped + processed successfully.
if 'processing_complete' not in st.session_state:
    st.session_state.processing_complete = False

# Per-review processed states (list of dicts) from the LangGraph pipeline.
if 'results' not in st.session_state:
    st.session_state.results = None

# Batch-level aggregates (sentiment/priority/department distributions, etc.).
if 'insights' not in st.session_state:
    st.session_state.insights = None

# How many reviews were saved to the DB during the scraping stage.
if 'scraped_count' not in st.session_state:
    st.session_state.scraped_count = 0
71
+
72
# ============================================================================
# PROCESSING FUNCTIONS
# ============================================================================

def process_reviews_streamlit(app_store_urls: str, play_store_urls: str,
                              hf_api_key: str, review_limit: int) -> bool:
    """
    Scrape and process reviews end-to-end, with Streamlit progress tracking.

    Args:
        app_store_urls: Newline-separated App Store app IDs.
        play_store_urls: Newline-separated Play Store package names.
        hf_api_key: HuggingFace API key (exported to the environment for the
            pipeline's model calls).
        review_limit: Maximum reviews to scrape per app and to process overall.

    Returns:
        True on success (results stored in st.session_state), False on any
        validation failure or error.
    """

    # Validate inputs
    if not hf_api_key or not hf_api_key.strip():
        st.error("❌ Please provide your HuggingFace API key")
        return False

    if not app_store_urls.strip() and not play_store_urls.strip():
        st.error("❌ Please provide at least one App Store or Play Store URL")
        return False

    try:
        # Set API key (the pipeline presumably reads it from the environment
        # — NOTE(review): confirm against GradioPipeline)
        os.environ['HUGGINGFACE_API_KEY'] = hf_api_key.strip()

        # Progress indicators
        progress_bar = st.progress(0)
        status_text = st.empty()

        # Initialize pipeline
        status_text.text("🚀 Initializing pipeline...")
        progress_bar.progress(5)
        pipeline = GradioPipeline(review_limit=review_limit)

        # Parse URLs: one identifier per line, blank lines ignored
        app_urls = [url.strip() for url in app_store_urls.split('\n') if url.strip()]
        play_urls = [url.strip() for url in play_store_urls.split('\n') if url.strip()]

        # Stage 0: Scraping (progress 10% -> 30%)
        status_text.text("🕷️ Scraping reviews from stores...")
        progress_bar.progress(10)

        scraped_count = 0
        total_apps = len(app_urls) + len(play_urls)

        for i, app_id in enumerate(app_urls, 1):
            status_text.text(f"🍎 Scraping App Store ({i}/{total_apps}): {app_id}")
            reviews = pipeline.scraper.scrape_app_store_rss(app_id, country="ae", limit=review_limit)
            saved = pipeline.scraper.save_reviews_to_db(reviews)
            scraped_count += saved
            progress_bar.progress(10 + int(20 * i / total_apps))
            # brief pause between store requests (rate-limit courtesy)
            time.sleep(1)

        for i, package in enumerate(play_urls, 1):
            status_text.text(f"🤖 Scraping Play Store ({i}/{total_apps}): {package}")
            reviews = pipeline.scraper.scrape_play_store_api(package, country="ae", limit=review_limit)
            saved = pipeline.scraper.save_reviews_to_db(reviews)
            scraped_count += saved
            progress_bar.progress(10 + int(20 * (len(app_urls) + i) / total_apps))
            time.sleep(1)

        if scraped_count == 0:
            st.warning("⚠️ No reviews scraped. Please check your URLs and try again.")
            progress_bar.empty()
            status_text.empty()
            return False

        st.session_state.scraped_count = scraped_count

        # Stage 1-3: Per-review processing via the LangGraph graph
        # (progress 30% -> 90%)
        status_text.text("🤖 Processing reviews with AI models...")
        progress_bar.progress(30)

        reviews = pipeline.db.get_pending_reviews(limit=review_limit)
        total_reviews = len(reviews)

        processed_states = []

        for i, review in enumerate(reviews, 1):
            review_id = review.get('review_id', 'unknown')[:20]
            status_text.text(f"🤖 Processing review {i}/{total_reviews}: {review_id}...")
            progress_bar.progress(30 + int(60 * i / total_reviews))

            try:
                from langgraph_state import create_initial_state
                state = create_initial_state(review)
                # One checkpoint thread per review so graph state is isolated
                config = {"configurable": {"thread_id": f"review_{review.get('review_id')}"}}
                final_state = pipeline.review_graph.invoke(state, config=config)
                processed_states.append(dict(final_state))
            except Exception as e:
                # Best-effort: skip a failing review rather than abort the batch
                st.warning(f"⚠️ Error processing review: {str(e)}")
                continue

        if len(processed_states) == 0:
            st.error("❌ No reviews were processed successfully.")
            progress_bar.empty()
            status_text.empty()
            return False

        # Stage 4: Batch Analysis (progress 90% -> 100%)
        status_text.text("📊 Generating batch insights...")
        progress_bar.progress(90)

        insights = pipeline.analyze_batch(processed_states)

        # Store in session state so the dashboard survives reruns
        st.session_state.results = processed_states
        st.session_state.insights = insights
        st.session_state.processing_complete = True

        # Complete
        progress_bar.progress(100)
        status_text.text("✅ Analysis complete!")
        time.sleep(1)
        progress_bar.empty()
        status_text.empty()

        return True

    except Exception as e:
        # Top-level boundary: surface the error + traceback in the UI
        st.error(f"❌ Error during processing: {str(e)}")
        import traceback
        st.code(traceback.format_exc())
        return False
194
+
195
+
196
# ============================================================================
# VISUALIZATION FUNCTIONS
# ============================================================================

def create_summary_section(scraped_count: int, results: List[Dict], insights: Dict):
    """Render the dashboard summary: banner, headline metrics, recommendations.

    Args:
        scraped_count: Number of reviews saved during the scraping stage.
        results: Per-review processed states.
        insights: Batch aggregates (sentiment/priority distributions,
            churn_risk percentage, recommendations list).
    """

    total = len(results)
    sentiment_dist = insights.get('sentiment_distribution', {})
    positive = sentiment_dist.get('POSITIVE', 0)
    negative = sentiment_dist.get('NEGATIVE', 0)
    critical = insights.get('priority_distribution', {}).get('critical', 0)
    churn_risk = insights.get('churn_risk', 0)

    # Success header
    st.markdown(
        f"""
        <div class="success-box">
            <h1 style="margin: 0;">✅ Analysis Complete!</h1>
            <p style="margin: 10px 0 0 0; font-size: 1.2em; opacity: 0.9;">
                Review Intelligence System Results
            </p>
        </div>
        """,
        unsafe_allow_html=True
    )

    # Headline metrics (guard against division by zero on empty batches)
    col1, col2, col3, col4, col5 = st.columns(5)

    with col1:
        st.metric("Total Reviews", total, f"Scraped: {scraped_count}")

    with col2:
        pos_pct = (positive / total * 100) if total > 0 else 0
        st.metric("Positive", positive, f"{pos_pct:.1f}%")

    with col3:
        neg_pct = (negative / total * 100) if total > 0 else 0
        st.metric("Negative", negative, f"{neg_pct:.1f}%")

    with col4:
        st.metric("Critical Issues", critical, "🚨" if critical > 0 else "✅")

    with col5:
        # Fix: the original computed delta_color but never passed it to
        # st.metric, so high churn risk was not highlighted.
        delta_color = "inverse" if churn_risk > 30 else "normal"
        st.metric("Churn Risk", f"{churn_risk:.1f}%",
                  "⚠️ High" if churn_risk > 30 else "✅ Low",
                  delta_color=delta_color)

    # Recommendations
    st.markdown("### 💡 Key Recommendations")
    for rec in insights.get('recommendations', []):
        st.info(rec)
249
+
250
+
251
def create_sentiment_chart(insights: Dict):
    """Create sentiment distribution donut chart.

    Colors are mapped per sentiment label. The original used a fixed color
    list that assumed the dict keys arrived in POSITIVE/NEUTRAL/NEGATIVE
    order, which could paint NEGATIVE green when a label was missing or
    ordered differently.
    """
    sentiment_dist = insights.get('sentiment_distribution', {})

    labels = list(sentiment_dist.keys())
    values = list(sentiment_dist.values())
    # Explicit label -> color mapping; unknown labels fall back to gray.
    color_map = {'POSITIVE': '#2ca02c', 'NEUTRAL': '#ff7f0e', 'NEGATIVE': '#d62728'}
    colors = [color_map.get(label, '#7f7f7f') for label in labels]

    fig = go.Figure(data=[go.Pie(
        labels=labels,
        values=values,
        hole=0.5,
        marker_colors=colors,
        textinfo='label+percent',
        textposition='outside',
        textfont_size=14
    )])

    fig.update_layout(
        title="😊 Sentiment Distribution",
        showlegend=True,
        height=400
    )

    return fig
276
+
277
+
278
def create_priority_chart(insights: Dict):
    """Create priority distribution bar chart.

    Colors are mapped per priority name. The original sliced a fixed color
    list (`colors[:len(labels)]`), so when a level was absent (e.g. no
    'critical' reviews) the remaining bars shifted onto the wrong colors —
    'high' would be drawn in critical red.
    """
    priority_dist = insights.get('priority_distribution', {})

    # Stable severity ordering for the x-axis.
    priority_order = ['critical', 'high', 'medium', 'low']
    color_map = {
        'critical': '#d62728',
        'high': '#ff7f0e',
        'medium': '#1f77b4',
        'low': '#2ca02c',
    }
    labels = [p for p in priority_order if p in priority_dist]
    values = [priority_dist.get(p, 0) for p in labels]
    colors = [color_map[p] for p in labels]

    fig = go.Figure(data=[go.Bar(
        x=labels,
        y=values,
        marker_color=colors,
        text=values,
        textposition='auto'
    )])

    fig.update_layout(
        title="🎯 Priority Levels",
        xaxis_title="Priority",
        yaxis_title="Count",
        height=400
    )

    return fig
303
+
304
+
305
def create_department_chart(insights: Dict):
    """Build a horizontal bar chart of issue counts routed to each department."""
    dept_counts = insights.get('department_distribution', {})

    departments = list(dept_counts.keys())
    counts = list(dept_counts.values())

    # Horizontal bars: counts on x, department names on y.
    trace = go.Bar(
        x=counts,
        y=departments,
        orientation='h',
        marker_color='#667eea',
        text=counts,
        textposition='auto'
    )
    fig = go.Figure(data=[trace])

    fig.update_layout(
        title="🏢 Department Routing",
        xaxis_title="Number of Issues",
        yaxis_title="Department",
        height=400
    )

    return fig
329
+
330
+
331
def create_emotion_chart(insights: Dict):
    """Build a bar chart of review counts per detected emotion, shaded by count."""
    emotion_counts = insights.get('emotion_distribution', {})

    emotions = list(emotion_counts.keys())
    counts = list(emotion_counts.values())

    # Continuous Viridis scale colors each bar by its height.
    fig = px.bar(
        x=emotions,
        y=counts,
        labels={'x': 'Emotion', 'y': 'Count'},
        color=counts,
        color_continuous_scale='Viridis'
    )

    fig.update_layout(
        title="😊 Emotional Analysis",
        xaxis_title="Emotion Type",
        yaxis_title="Number of Reviews",
        height=300,
        showlegend=False
    )

    return fig
355
+
356
+
357
def create_reviews_dataframe(results: List[Dict]) -> pd.DataFrame:
    """Build the summary DataFrame backing the reviews table and CSV export.

    Args:
        results: Per-review processed states from the pipeline.

    Returns:
        One row per review with truncated ID/text and the key
        classification fields.

    Fixes over the original:
    - '...' was appended unconditionally, even to short or empty text;
      now it is added only when the text is actually truncated.
    - a None 'review_id' value crashed the slice; now treated as 'N/A'.
    """
    df_data = []
    for review in results:
        text = review.get('review_text') or ''
        # Truncate long review text to 100 chars with an ellipsis marker.
        snippet = text[:100] + '...' if len(text) > 100 else (text or 'N/A')
        review_id = review.get('review_id') or 'N/A'
        df_data.append({
            'Review ID': review_id[:20],
            'Rating': review.get('rating', 0),
            'Review': snippet,
            'Sentiment': review.get('stage3_final_sentiment', 'N/A'),
            'Type': review.get('stage1_llm1_type', 'N/A'),
            'Department': review.get('stage1_llm1_department', 'N/A'),
            'Priority': review.get('stage1_llm1_priority', 'N/A'),
            'Emotion': review.get('stage1_llm2_emotion', 'N/A'),
            'Needs Review': '🚨 Yes' if review.get('stage3_needs_human_review') else '✅ No'
        })

    return pd.DataFrame(df_data)
375
+
376
+
377
# ============================================================================
# MAIN APP
# ============================================================================

def main():
    """Top-level Streamlit entry point: header, sidebar, then one of two views."""
    # Page header
    st.title("🎯 Review Intelligence System")
    st.markdown("### Multi-Stage AI Analysis for App Store & Play Store Reviews")
    st.markdown("Powered by **LangGraph** + **HuggingFace** • 4-Stage Processing Pipeline")
    st.markdown("---")

    # Sidebar control panel: reset button after a run, hint before one.
    with st.sidebar:
        st.header("🎛️ Control Panel")
        if st.session_state.processing_complete:
            st.success("✅ Analysis Complete!")
            if st.button("🔄 Start New Analysis", use_container_width=True):
                # Clear cached results and rerun in input mode.
                st.session_state.processing_complete = False
                st.session_state.results = None
                st.session_state.insights = None
                st.rerun()
        else:
            st.info("👈 Enter URLs below to start")

    # Route the main area: results dashboard once processing is done,
    # otherwise the input form.
    if st.session_state.processing_complete:
        show_results_dashboard()
    else:
        show_input_form()
412
+
413
def show_input_form():
    """Render the input view: store URL fields, API key, review limit, and
    the Start button that kicks off process_reviews_streamlit()."""

    st.markdown("### 📝 Step 1: Enter Store URLs")

    # Two side-by-side input areas: App Store IDs and Play Store packages.
    col1, col2 = st.columns(2)

    with col1:
        st.markdown("#### 🍎 App Store IDs")
        st.markdown(
            """
            **Format:** Just paste the app ID
            - Example: `1158907446` (UAE)
            - Example: `1234567890` (US)
            """
        )
        app_store_urls = st.text_area(
            "App Store IDs (one per line)",
            placeholder="1158907446\n1234567890",
            height=150,
            key="app_urls"
        )

    with col2:
        st.markdown("#### 🤖 Play Store Packages")
        st.markdown(
            """
            **Format:** Package name
            - Example: `com.yas.app`
            - Example: `com.company.app`
            """
        )
        play_store_urls = st.text_area(
            "Play Store Package Names (one per line)",
            placeholder="com.yas.app\ncom.company.app",
            height=150,
            key="play_urls"
        )

    st.markdown("---")
    st.markdown("### 🔑 Step 2: Configure Settings")

    col1, col2 = st.columns([2, 1])

    with col1:
        # Password-masked; the key is only exported to the environment later.
        hf_api_key = st.text_input(
            "🔑 HuggingFace API Key",
            type="password",
            placeholder="hf_...",
            help="Get your key from: https://huggingface.co/settings/tokens",
            key="hf_key"
        )

    with col2:
        review_limit = st.slider(
            "📊 Reviews per App",
            min_value=5,
            max_value=100,
            value=20,
            step=5,
            help="More reviews = longer processing time",
            key="review_limit"
        )

    st.markdown("---")

    # Submit button (middle column keeps it centered)
    col1, col2, col3 = st.columns([1, 1, 1])

    with col2:
        if st.button("🚀 Start Analysis", use_container_width=True, type="primary"):
            with st.spinner("Processing..."):
                success = process_reviews_streamlit(
                    app_store_urls,
                    play_store_urls,
                    hf_api_key,
                    review_limit
                )

            # On success, rerun so main() switches to the results dashboard.
            if success:
                st.balloons()
                st.rerun()

    # Documentation
    with st.expander("📚 How to Use"):
        st.markdown("""
        ### 📖 Quick Guide

        **1. Get HuggingFace API Key:**
        - Visit: https://huggingface.co/settings/tokens
        - Create new token (Read access)
        - Copy token (starts with `hf_`)

        **2. Enter URLs:**
        - **App Store**: Just the ID number (e.g., `1234567890`)
        - **Play Store**: Package name (e.g., `com.company.app`)
        - One per line

        **3. Click Start:**
        - Watch progress bar
        - Wait for completion (~7 sec per review)
        - View results automatically

        ### 🏗️ What Happens:
        - 🕷️ **Stage 0**: Scrapes reviews from stores
        - 🤖 **Stage 1**: Classifies with 3 AI models (Type, Department, Priority)
        - 😊 **Stage 2**: Analyzes sentiment with dual BERT models
        - 📊 **Stage 3**: Synthesizes insights and recommendations
        - 💡 **Stage 4**: Generates batch analytics

        ### ⚡ Performance:
        - ~7 seconds per review
        - 7 AI models working together
        - Parallel execution for speed
        """)
528
+
529
+
530
def show_results_dashboard():
    """Render the results view: summary metrics plus four tabs
    (charts, critical issues, filterable table, exports).

    Reads results/insights/scraped_count from st.session_state, which
    process_reviews_streamlit() populated on the previous run.
    """

    results = st.session_state.results
    insights = st.session_state.insights
    scraped_count = st.session_state.scraped_count

    # Summary section (banner + headline metrics + recommendations)
    create_summary_section(scraped_count, results, insights)

    st.markdown("---")

    # Tabs for different views
    tab1, tab2, tab3, tab4 = st.tabs([
        "📊 Sentiment Analysis",
        "🚨 Critical Issues",
        "📋 All Reviews",
        "📥 Export"
    ])

    # TAB 1: Sentiment Analysis — four batch-level charts
    with tab1:
        st.header("📊 Sentiment Analysis Overview")

        col1, col2 = st.columns(2)

        with col1:
            fig_sentiment = create_sentiment_chart(insights)
            st.plotly_chart(fig_sentiment, use_container_width=True)

        with col2:
            fig_priority = create_priority_chart(insights)
            st.plotly_chart(fig_priority, use_container_width=True)

        st.markdown("### 🏢 Department Routing")
        fig_dept = create_department_chart(insights)
        st.plotly_chart(fig_dept, use_container_width=True)

        st.markdown("### 😊 Emotional Analysis")
        fig_emotion = create_emotion_chart(insights)
        st.plotly_chart(fig_emotion, use_container_width=True)

    # TAB 2: Critical Issues — one expander per flagged review
    with tab2:
        st.header("🚨 Critical Issues Requiring Attention")

        # A review is "critical" if any of: critical priority, flagged for
        # human review, or negative sentiment with a rating of 2 or below.
        critical_reviews = [
            r for r in results
            if (r.get('stage1_llm1_priority') == 'critical' or
                r.get('stage3_needs_human_review') or
                (r.get('stage3_final_sentiment') == 'NEGATIVE' and r.get('rating', 5) <= 2))
        ]

        if len(critical_reviews) == 0:
            st.success("✅ No critical issues found! All reviews are in good shape.")
        else:
            st.warning(f"Found {len(critical_reviews)} critical issues")

            for review in critical_reviews:
                with st.expander(
                    f"⚠️ {review.get('review_id', 'Unknown')[:30]} - "
                    f"Rating: {review.get('rating', 'N/A')}/5"
                ):
                    col1, col2 = st.columns([2, 1])

                    with col1:
                        st.markdown("**Review Text:**")
                        st.write(review.get('review_text', 'No text available'))

                        st.markdown("**Reasoning:**")
                        st.info(review.get('stage3_reasoning', 'No reasoning available'))

                    with col2:
                        st.markdown("**Classification:**")
                        st.write(f"📌 Type: {review.get('stage1_llm1_type', 'N/A')}")
                        st.write(f"🏢 Department: {review.get('stage1_llm1_department', 'N/A')}")
                        st.write(f"🎯 Priority: {review.get('stage1_llm1_priority', 'N/A')}")
                        st.write(f"😔 Emotion: {review.get('stage1_llm2_emotion', 'N/A')}")
                        st.write(f"💭 Sentiment: {review.get('stage3_final_sentiment', 'N/A')}")

                        st.markdown("**Action:**")
                        st.error(review.get('stage3_action_recommendation', 'No action specified'))

    # TAB 3: All Reviews — filterable table
    with tab3:
        st.header("📋 Detailed Review Analysis")

        # Create DataFrame
        df = create_reviews_dataframe(results)

        # Filters (multiselects default to "everything selected")
        col1, col2, col3 = st.columns(3)

        with col1:
            sentiment_filter = st.multiselect(
                "Filter by Sentiment",
                options=df['Sentiment'].unique(),
                default=df['Sentiment'].unique()
            )

        with col2:
            dept_filter = st.multiselect(
                "Filter by Department",
                options=df['Department'].unique(),
                default=df['Department'].unique()
            )

        with col3:
            priority_filter = st.multiselect(
                "Filter by Priority",
                options=df['Priority'].unique(),
                default=df['Priority'].unique()
            )

        # Apply filters (logical AND across the three dimensions)
        filtered_df = df[
            (df['Sentiment'].isin(sentiment_filter)) &
            (df['Department'].isin(dept_filter)) &
            (df['Priority'].isin(priority_filter))
        ]

        st.info(f"Showing {len(filtered_df)} of {len(df)} reviews")

        # Display table
        st.dataframe(
            filtered_df,
            use_container_width=True,
            height=600
        )

    # TAB 4: Export — CSV / JSON downloads plus summary stats
    with tab4:
        st.header("📥 Export Results")

        st.markdown("### Download Options")

        col1, col2 = st.columns(2)

        with col1:
            st.markdown("#### 📊 CSV Export")
            st.write("Download complete analysis with all classifications")

            df = create_reviews_dataframe(results)
            csv = df.to_csv(index=False)

            st.download_button(
                label="📥 Download CSV Report",
                data=csv,
                file_name=f"review_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                mime="text/csv",
                use_container_width=True
            )

        with col2:
            st.markdown("#### 📋 JSON Export")
            st.write("Download raw data with all details")

            import json
            # Raw dump: full per-review states + batch insights + metadata
            json_data = json.dumps({
                'results': results,
                'insights': insights,
                'scraped_count': scraped_count,
                'export_date': datetime.now().isoformat()
            }, indent=2)

            st.download_button(
                label="📥 Download JSON Data",
                data=json_data,
                file_name=f"review_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                mime="application/json",
                use_container_width=True
            )

        st.markdown("---")
        st.markdown("### 📊 Summary Statistics")

        col1, col2, col3 = st.columns(3)

        with col1:
            st.metric("Total Reviews Analyzed", len(results))

        with col2:
            positive = insights.get('sentiment_distribution', {}).get('POSITIVE', 0)
            total = len(results)
            pct = (positive / total * 100) if total > 0 else 0
            st.metric("Positive Rate", f"{pct:.1f}%")

        with col3:
            critical = insights.get('priority_distribution', {}).get('critical', 0)
            st.metric("Critical Issues", critical)
721
+
722
+
723
# ============================================================================
# FOOTER
# ============================================================================

def show_footer():
    """Show footer with credits (static HTML banner under the app body)."""
    st.markdown("---")
    st.markdown(
        """
        <div style='text-align: center'>
            <p>🤖 Powered by Multi-Stage AI Pipeline |
            Stage 1: Classification (Qwen, Mistral, Llama) |
            Stage 2: Sentiment (Twitter-BERT) |
            Stage 3: Finalization (Llama 70B) |
            Stage 4: Batch Analysis</p>
            <p>Built with ❤️ using LangGraph + HuggingFace + Streamlit</p>
        </div>
        """,
        unsafe_allow_html=True
    )
743
+
744
+
745
# ============================================================================
# RUN APP
# ============================================================================

# Streamlit executes this script top-to-bottom on every rerun; the guard
# keeps main()/show_footer() from firing if the module is merely imported.
if __name__ == "__main__":
    main()
    show_footer()
config.yaml ADDED
@@ -0,0 +1,508 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🎯 Review Intelligence System Configuration
2
+ # Edit this file to customize agent behavior, prompts, and models
3
+
4
+ # =============================================================================
5
+ # MODELS CONFIGURATION
6
+ # =============================================================================
7
+ models:
8
+ # Stage 1: Classification Models
9
+ stage1:
10
+ llm1:
11
+ name: "Qwen/Qwen2.5-72B-Instruct"
12
+ temperature: 0.1
13
+ max_tokens: 200
14
+ role: "Type, Department, Priority classifier"
15
+
16
+ llm2:
17
+ name: "mistralai/Mistral-7B-Instruct-v0.3"
18
+ temperature: 0.1
19
+ max_tokens: 200
20
+ role: "User type, Emotion, Context analyzer"
21
+
22
+ manager:
23
+ name: "meta-llama/Llama-3.1-8B-Instruct"
24
+ temperature: 0.1
25
+ max_tokens: 250
26
+ role: "Synthesis manager"
27
+
28
+ # Stage 2: Sentiment Models (Local BERT)
29
+ stage2:
30
+ best_model:
31
+ name: "cardiffnlp/twitter-roberta-base-sentiment-latest"
32
+ type: "local"
33
+ description: "Twitter-trained RoBERTa (124M tweets)"
34
+
35
+ alternate_model:
36
+ name: "finiteautomata/bertweet-base-sentiment-analysis"
37
+ type: "local"
38
+ description: "BERTweet (850M tweets)"
39
+
40
+ # Stage 3: Finalization Model
41
+ stage3:
42
+ llm3:
43
+ name: "meta-llama/Llama-3.1-70B-Instruct"
44
+ temperature: 0.1
45
+ max_tokens: 400
46
+ role: "Final synthesis and reasoning"
47
+
48
+ # =============================================================================
49
+ # AGENT PERSONAS & PROMPTS
50
+ # =============================================================================
51
+ personas:
52
+ # LLM1: Classification Expert
53
+ llm1:
54
+ name: "Classification Specialist"
55
+ expertise: "Expert at classifying customer reviews for theme park and attraction apps"
56
+ personality: "Analytical, precise, focused on categorization"
57
+ tone: "Professional and systematic"
58
+
59
+ system_prompt: |
60
+ You are an expert at classifying customer reviews for theme park and attraction apps.
61
+ Your job is to analyze reviews and categorize them across multiple dimensions.
62
+ Be precise, analytical, and consistent in your classifications.
63
+
64
+ categories:
65
+ type:
66
+ - complaint: "Customer reports a problem"
67
+ - praise: "Customer expresses satisfaction"
68
+ - suggestion: "Customer proposes improvement"
69
+ - question: "Customer asks about something"
70
+ - bug_report: "Technical issue described"
71
+
72
+ department:
73
+ - engineering: "Technical issues, bugs, crashes"
74
+ - ux: "Design, usability, interface issues"
75
+ - support: "Customer service, help needed"
76
+ - business: "Pricing, policies, marketing"
77
+
78
+ priority:
79
+ - critical: "Service down, major blocker"
80
+ - high: "Significant problem affecting use"
81
+ - medium: "Inconvenience but not blocking"
82
+ - low: "Minor issue or suggestion"
83
+
84
+ # LLM2: Psychology Expert
85
+ llm2:
86
+ name: "User Psychology Analyst"
87
+ expertise: "Expert at understanding customer psychology and emotional context"
88
+ personality: "Empathetic, insightful, human-centered"
89
+ tone: "Warm yet professional"
90
+
91
+ system_prompt: |
92
+ You are an expert at understanding customer psychology and emotional context.
93
+ Your job is to analyze the human behind the review - their emotions, user type, and context.
94
+ Be empathetic, insightful, and focus on the human experience.
95
+
96
+ categories:
97
+ user_type:
98
+ - new_user: "First-time or new user"
99
+ - regular_user: "Returning customer"
100
+ - power_user: "Heavy user, tech-savvy"
101
+ - churning_user: "Considering leaving"
102
+
103
+ emotion:
104
+ - anger: "Angry, hostile tone"
105
+ - frustration: "Frustrated but not angry"
106
+ - joy: "Happy, satisfied"
107
+ - satisfaction: "Content, pleased"
108
+ - disappointment: "Let down, sad"
109
+ - confusion: "Unclear, needs help"
110
+
111
+ # Manager: Synthesis Expert
112
+ manager:
113
+ name: "Synthesis Manager"
114
+ expertise: "Expert at reconciling multiple AI analyses and making final decisions"
115
+ personality: "Balanced, fair, decisive"
116
+ tone: "Authoritative yet collaborative"
117
+
118
+ system_prompt: |
119
+ You are a synthesis manager evaluating two AI analyses of the same review.
120
+ Your job is to validate both analyses, resolve conflicts, and make final classification decisions.
121
+ Be thorough, fair, and provide clear reasoning for your decisions.
122
+
123
+ # LLM3: Strategic Analyst
124
+ llm3:
125
+ name: "Strategic Decision Maker"
126
+ expertise: "Expert at synthesizing complex data and providing actionable recommendations"
127
+ personality: "Strategic, comprehensive, business-focused"
128
+ tone: "Executive-level, actionable"
129
+
130
+ system_prompt: |
131
+ You are a final decision-making AI analyzing customer feedback for a theme park/attraction app.
132
+ Your job is to synthesize all previous analysis stages and provide comprehensive, actionable insights.
133
+ Think strategically about business impact, user satisfaction, and operational priorities.
134
+ Your recommendations should be clear, specific, and immediately actionable.
135
+
136
+ # =============================================================================
137
+ # CLASSIFICATION RULES
138
+ # =============================================================================
139
+ classification_rules:
140
+ # Priority escalation rules
141
+ priority_escalation:
142
+ keywords_critical:
143
+ - "crash"
144
+ - "doesn't work"
145
+ - "broken"
146
+ - "can't use"
147
+ - "completely unusable"
148
+ - "emergency"
149
+ - "urgent"
150
+
151
+ keywords_high:
152
+ - "bug"
153
+ - "error"
154
+ - "problem"
155
+ - "issue"
156
+ - "not working"
157
+ - "frustrated"
158
+
159
+ rating_thresholds:
160
+ critical: 1 # 1-star reviews are critical
161
+ high: 2 # 2-star reviews are high priority
162
+
163
+ # Department routing rules
164
+ department_keywords:
165
+ engineering:
166
+ - "crash"
167
+ - "bug"
168
+ - "error"
169
+ - "not loading"
170
+ - "freeze"
171
+ - "slow"
172
+ - "technical"
173
+
174
+ ux:
175
+ - "confusing"
176
+ - "hard to use"
177
+ - "can't find"
178
+ - "design"
179
+ - "layout"
180
+ - "interface"
181
+ - "navigation"
182
+
183
+ support:
184
+ - "help"
185
+ - "contact"
186
+ - "customer service"
187
+ - "support"
188
+ - "assistance"
189
+ - "question"
190
+
191
+ business:
192
+ - "price"
193
+ - "refund"
194
+ - "subscription"
195
+ - "billing"
196
+ - "expensive"
197
+ - "policy"
198
+
199
+ # Churn risk indicators
200
+ churn_indicators:
201
+ high_risk:
202
+ - "switching to"
203
+ - "deleted the app"
204
+ - "uninstalling"
205
+ - "terrible experience"
206
+ - "never again"
207
+ - "disappointed"
208
+
209
+ medium_risk:
210
+ - "might switch"
211
+ - "considering alternatives"
212
+ - "getting worse"
213
+ - "used to be better"
214
+
215
+ # =============================================================================
216
+ # SENTIMENT ANALYSIS SETTINGS
217
+ # =============================================================================
218
+ sentiment:
219
+ # Agreement thresholds
220
+ agreement:
221
+ strong_threshold: 0.9 # Both models >0.9 confidence
222
+ weak_threshold: 0.6 # One model <0.6 confidence
223
+
224
+ # Confidence weighting
225
+ confidence:
226
+ minimum_acceptable: 0.5
227
+ high_confidence: 0.8
228
+ very_high_confidence: 0.95
229
+
230
+ # Override rules
231
+ override_rules:
232
+ # If rating is 1-star but sentiment is positive, flag for review
233
+ rating_sentiment_mismatch:
234
+ enabled: true
235
+ flag_threshold: 2 # 2-star difference
236
+
237
+ # =============================================================================
238
+ # BATCH ANALYSIS SETTINGS
239
+ # =============================================================================
240
+ batch_analysis:
241
+ # Critical issues detection
242
+ critical_issues:
243
+ max_display: 10
244
+ criteria:
245
+ - priority: "critical"
246
+ - sentiment: "NEGATIVE"
247
+ - rating: "<=2"
248
+ - needs_human_review: true
249
+
250
+ # Quick wins detection
251
+ quick_wins:
252
+ max_display: 10
253
+ criteria:
254
+ - type: "suggestion"
255
+ - priority: ["low", "medium"]
256
+ - feasibility: "easy"
257
+
258
+ # Churn risk calculation
259
+ churn_risk:
260
+ weights:
261
+ churning_user: 2.0
262
+ negative_low_rating: 1.5
263
+ rating_1_star: 1.0
264
+
265
+ thresholds:
266
+ high: 30 # >30% is high risk
267
+ medium: 15 # 15-30% is medium risk
268
+ low: 0 # <15% is low risk
269
+
270
+ # =============================================================================
271
+ # PROMPT TEMPLATES
272
+ # =============================================================================
273
+ prompt_templates:
274
+ # Stage 1 LLM1 Prompt
275
+ stage1_llm1: |
276
+ You are an expert at classifying customer reviews for theme park and attraction apps.
277
+
278
+ REVIEW:
279
+ Rating: {rating}/5
280
+ Text: {review_text}
281
+
282
+ Classify this review across these dimensions:
283
+
284
+ 1. TYPE (choose ONE): {type_options}
285
+ 2. DEPARTMENT (choose ONE): {department_options}
286
+ 3. PRIORITY (choose ONE): {priority_options}
287
+ 4. CONFIDENCE (0.0-1.0): How confident are you in this classification?
288
+ 5. REASONING: Brief one-sentence explanation
289
+
290
+ Respond ONLY in valid JSON format:
291
+ {{
292
+ "type": "complaint/praise/suggestion/question/bug_report",
293
+ "department": "engineering/ux/support/business",
294
+ "priority": "critical/high/medium/low",
295
+ "confidence": 0.0-1.0,
296
+ "reasoning": "brief explanation"
297
+ }}
298
+
299
+ # Stage 1 LLM2 Prompt
300
+ stage1_llm2: |
301
+ You are an expert at understanding customer psychology and emotional context.
302
+
303
+ REVIEW:
304
+ Rating: {rating}/5
305
+ Text: {review_text}
306
+
307
+ Analyze the user and emotional context:
308
+
309
+ 1. USER_TYPE (choose ONE): {user_type_options}
310
+ 2. EMOTION (choose ONE): {emotion_options}
311
+ 3. CONTEXT (brief): What is the underlying issue or situation? 1-2 words summary
312
+ 4. CONFIDENCE (0.0-1.0): How confident are you?
313
+ 5. REASONING: Brief one-sentence explanation
314
+
315
+ Respond ONLY in valid JSON format:
316
+ {{
317
+ "user_type": "new_user/regular_user/power_user/churning_user",
318
+ "emotion": "anger/frustration/joy/satisfaction/disappointment/confusion",
319
+ "context": "brief context",
320
+ "confidence": 0.0-1.0,
321
+ "reasoning": "brief explanation"
322
+ }}
323
+
324
+ # Stage 1 Manager Prompt
325
+ stage1_manager: |
326
+ You are a synthesis manager evaluating two AI analyses of the same review.
327
+
328
+ REVIEW:
329
+ Rating: {rating}/5
330
+ Text: {review_text}
331
+
332
+ LLM1 ANALYSIS (Type/Dept/Priority):
333
+ {llm1_result}
334
+
335
+ LLM2 ANALYSIS (User/Emotion/Context):
336
+ {llm2_result}
337
+
338
+ Your task:
339
+ 1. Validate both analyses
340
+ 2. Resolve any conflicts
341
+ 3. Make final classification decision
342
+ 4. Provide synthesis reasoning
343
+
344
+ Respond ONLY in valid JSON format:
345
+ {{
346
+ "final_type": "from llm1 or adjusted",
347
+ "final_department": "from llm1 or adjusted",
348
+ "final_priority": "from llm1 or adjusted",
349
+ "final_user_type": "from llm2 or adjusted",
350
+ "final_emotion": "from llm2 or adjusted",
351
+ "confidence": 0.0-1.0,
352
+ "reasoning": "synthesis explanation",
353
+ "conflicts_found": "any conflicts between LLM1 and LLM2, or 'none'"
354
+ }}
355
+
356
+ # Stage 3 LLM3 Prompt
357
+ stage3_llm3: |
358
+ You are a final decision-making AI analyzing customer feedback for a theme park/attraction app.
359
+
360
+ REVIEW DATA:
361
+ Rating: {rating}/5
362
+ Text: {review_text}
363
+
364
+ STAGE 1 CLASSIFICATION:
365
+ - Review Type: {type}
366
+ - Department: {department}
367
+ - Priority: {priority}
368
+ - User Type: {user_type}
369
+ - Emotion: {emotion}
370
+
371
+ STAGE 2 SENTIMENT ANALYSIS:
372
+ - Best Model: {best_sentiment} (confidence: {best_confidence})
373
+ - Alternate Model: {alt_sentiment} (confidence: {alt_confidence})
374
+ - Models Agreement: {agreement}
375
+
376
+ YOUR TASK:
377
+ 1. Review all data from both stages
378
+ 2. Make FINAL sentiment decision (POSITIVE, NEGATIVE, or NEUTRAL)
379
+ 3. Validate that classification and sentiment align
380
+ 4. Provide comprehensive reasoning
381
+ 5. Identify any conflicts between stages
382
+ 6. Generate action recommendation
383
+ 7. Flag if human review is needed
384
+
385
+ Respond ONLY in valid JSON format:
386
+ {{
387
+ "final_sentiment": "POSITIVE/NEGATIVE/NEUTRAL",
388
+ "confidence": 0.0-1.0,
389
+ "reasoning": "Comprehensive explanation synthesizing all stages",
390
+ "validation_notes": "Does classification match sentiment?",
391
+ "conflicts_found": "any conflicts or 'none'",
392
+ "action_recommendation": "Specific action to take",
393
+ "needs_human_review": true/false
394
+ }}
395
+
396
+ # =============================================================================
397
+ # PROCESSING SETTINGS
398
+ # =============================================================================
399
+ processing:
400
+ # Batch settings
401
+ batch_size: 10
402
+ max_workers: 3
403
+ timeout_seconds: 30
404
+ retry_attempts: 3
405
+
406
+ # Rate limiting (for HF API)
407
+ rate_limit:
408
+ requests_per_minute: 60
409
+ requests_per_day: 10000 # HF Pro limit
410
+
411
+ # Logging
412
+ logging:
413
+ level: "INFO" # DEBUG, INFO, WARNING, ERROR
414
+ save_logs: true
415
+ log_file: "processing.log"
416
+
417
+ # Checkpointing
418
+ checkpoint:
419
+ enabled: true
420
+ save_after_each_stage: true
421
+ auto_resume: true
422
+
423
+ # =============================================================================
424
+ # DASHBOARD SETTINGS
425
+ # =============================================================================
426
+ dashboard:
427
+ # UI Configuration
428
+ ui:
429
+ title: "Review Intelligence System"
430
+ icon: "🎯"
431
+ layout: "wide"
432
+ theme: "light" # light or dark
433
+
434
+ # Chart colors
435
+ colors:
436
+ positive: "#2ca02c"
437
+ neutral: "#ff7f0e"
438
+ negative: "#d62728"
439
+ critical: "#d62728"
440
+ high: "#ff7f0e"
441
+ medium: "#1f77b4"
442
+ low: "#2ca02c"
443
+
444
+ # Filters
445
+ filters:
446
+ enable_sentiment: true
447
+ enable_department: true
448
+ enable_priority: true
449
+ enable_date_range: false # Future feature
450
+
451
+ # Display limits
452
+ display:
453
+ max_critical_issues: 20
454
+ max_quick_wins: 15
455
+ reviews_per_page: 50
456
+ auto_refresh_seconds: 60
457
+
458
+ # =============================================================================
459
+ # DOMAIN-SPECIFIC CUSTOMIZATION (Theme Parks / Attractions)
460
+ # =============================================================================
461
+ domain:
462
+ name: "Theme Parks & Attractions"
463
+
464
+ # Common features to look for
465
+ features:
466
+ - "ticket booking"
467
+ - "queue times"
468
+ - "express pass"
469
+ - "meal plans"
470
+ - "park maps"
471
+ - "show times"
472
+ - "photo pass"
473
+ - "virtual queue"
474
+ - "ride reservations"
475
+ - "mobile ordering"
476
+
477
+ # Pain points to prioritize
478
+ pain_points:
479
+ high_impact:
480
+ - "can't book tickets"
481
+ - "app crashes during booking"
482
+ - "payment fails"
483
+ - "queue times wrong"
484
+ - "can't access tickets"
485
+
486
+ medium_impact:
487
+ - "map doesn't load"
488
+ - "slow performance"
489
+ - "confusing navigation"
490
+ - "notifications not working"
491
+
492
+ # Positive signals
493
+ positive_signals:
494
+ - "easy booking"
495
+ - "fast check-in"
496
+ - "helpful features"
497
+ - "saved time"
498
+ - "convenient"
499
+ - "great experience"
500
+
501
+ # =============================================================================
502
+ # NOTES
503
+ # =============================================================================
504
+ # - Edit this file to customize agent behavior
505
+ # - Prompts support variables in {curly_braces}
506
+ # - Model names must match HuggingFace model IDs
507
+ # - Temperature: 0.0 = deterministic, 1.0 = creative
508
+ # - Changes take effect on next run (no restart needed for some settings)
config_loader.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration Loader
3
+ Loads settings from config.yaml for agent personas and prompts
4
+ """
5
+
6
+ import yaml
7
+ import os
8
+ from typing import Dict, Any
9
+
10
class Config:
    """
    Configuration manager for the Review Intelligence System.

    Loads settings from a YAML file and exposes accessor methods for each
    configuration section (models, personas, prompt templates, rules, ...).
    Falls back to a built-in default configuration when the file is missing,
    unreadable, empty, or does not contain a top-level mapping.
    """

    def __init__(self, config_file: str = "config.yaml"):
        self.config_file = config_file
        # Parsed configuration mapping; never None (defaults used on failure).
        self.config = self._load_config()

    def _load_config(self) -> Dict[str, Any]:
        """Load configuration from the YAML file.

        Returns:
            The parsed configuration mapping, or the default configuration
            when the file is absent, cannot be parsed, or does not parse
            to a mapping.
        """
        if not os.path.exists(self.config_file):
            print(f"⚠️ Config file not found: {self.config_file}")
            print("   Using default configuration")
            return self._default_config()

        try:
            with open(self.config_file, 'r', encoding='utf-8') as f:
                config = yaml.safe_load(f)
        except Exception as e:
            print(f"⚠️ Error loading config: {e}")
            print("   Using default configuration")
            return self._default_config()

        # BUG FIX: yaml.safe_load returns None for an empty file (and may
        # return a scalar or list for unexpected input); the old code stored
        # that directly, so every accessor then crashed on .get().
        if not isinstance(config, dict):
            print(f"⚠️ Config file is empty or not a mapping: {self.config_file}")
            print("   Using default configuration")
            return self._default_config()

        print(f"✅ Configuration loaded from {self.config_file}")
        return config

    def _default_config(self) -> Dict[str, Any]:
        """Return the default configuration used when YAML is not available."""
        return {
            'models': {
                'stage1': {
                    'llm1': {'name': 'Qwen/Qwen2.5-72B-Instruct', 'temperature': 0.1},
                    'llm2': {'name': 'mistralai/Mistral-7B-Instruct-v0.3', 'temperature': 0.1},
                    'manager': {'name': 'meta-llama/Llama-3.1-8B-Instruct', 'temperature': 0.1}
                },
                'stage2': {
                    'best_model': {'name': 'cardiffnlp/twitter-roberta-base-sentiment-latest'},
                    'alternate_model': {'name': 'finiteautomata/bertweet-base-sentiment-analysis'}
                },
                'stage3': {
                    'llm3': {'name': 'meta-llama/Llama-3.1-70B-Instruct', 'temperature': 0.1}
                }
            }
        }

    def get_model(self, stage: str, model_key: str) -> Dict[str, Any]:
        """Get model configuration for a specific stage (empty dict if absent)."""
        return self.config.get('models', {}).get(stage, {}).get(model_key, {})

    def get_persona(self, agent: str) -> Dict[str, Any]:
        """Get persona configuration for an agent (empty dict if absent)."""
        return self.config.get('personas', {}).get(agent, {})

    def get_prompt_template(self, template_name: str) -> str:
        """Get a prompt template string (empty string if absent)."""
        return self.config.get('prompt_templates', {}).get(template_name, '')

    def get_classification_rules(self) -> Dict[str, Any]:
        """Get classification rules."""
        return self.config.get('classification_rules', {})

    def get_sentiment_settings(self) -> Dict[str, Any]:
        """Get sentiment analysis settings."""
        return self.config.get('sentiment', {})

    def get_batch_settings(self) -> Dict[str, Any]:
        """Get batch analysis settings."""
        return self.config.get('batch_analysis', {})

    def get_processing_settings(self) -> Dict[str, Any]:
        """Get processing settings."""
        return self.config.get('processing', {})

    def get_dashboard_settings(self) -> Dict[str, Any]:
        """Get dashboard settings."""
        return self.config.get('dashboard', {})
87
+
88
+
89
# Module-level singleton holder for the shared configuration object.
_config_instance = None


def get_config(config_file: str = "config.yaml") -> Config:
    """Return the shared Config singleton, creating it on first use.

    NOTE: ``config_file`` is only honoured on the very first call; later
    calls return the already-created instance unchanged.
    """
    global _config_instance
    instance = _config_instance
    if instance is None:
        instance = Config(config_file)
        _config_instance = instance
    return instance
98
+
99
+
100
if __name__ == "__main__":
    # Smoke-test the configuration loader from the command line.
    banner = "=" * 60
    print(f"\n{banner}\n🧪 TESTING CONFIG LOADER\n{banner}\n")

    config = get_config()

    # Model lookup
    llm1_config = config.get_model('stage1', 'llm1')
    print(f"LLM1 Model: {llm1_config.get('name', 'Not found')}")

    # Persona lookup
    llm1_persona = config.get_persona('llm1')
    print(f"LLM1 Persona: {llm1_persona.get('name', 'Not found')}")

    # Prompt-template lookup
    prompt = config.get_prompt_template('stage1_llm1')
    print(f"Prompt template loaded: {len(prompt)} characters")

    print("\n✅ Config loader test complete!")
database_enhanced.py ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Enhanced Database Schema for Multi-Stage Review Analysis
3
+ Adds Stage 1-4 columns to existing reviews table
4
+ """
5
+
6
+ import sqlite3
7
+ from datetime import datetime
8
+ from typing import Dict, List, Any, Optional
9
+ import json
10
+
11
class EnhancedDatabase:
    """
    Manages the enhanced database schema with Stage 1-4 columns.

    Adds classification (stage 1), sentiment (stage 2), finalization
    (stage 3) and processing-metadata columns to the existing ``reviews``
    table, plus an LLM-decision audit-log table and a batch-insights table.
    All schema changes are non-destructive: existing data is kept.
    """

    def __init__(self, db_file: str = "review_database.db"):
        self.db_file = db_file
        self.conn = None  # set by connect()
        print(f"📁 Database: {db_file}")

    def connect(self):
        """Open the SQLite connection with dict-like row access.

        Any previously opened connection is closed first, so repeated
        calls do not leak handles (fix).

        Returns:
            The live sqlite3 connection.
        """
        if self.conn is not None:
            self.conn.close()
        self.conn = sqlite3.connect(self.db_file)
        self.conn.row_factory = sqlite3.Row
        print("✅ Connected to database")
        return self.conn

    def close(self):
        """Close the database connection (no-op when not connected)."""
        if self.conn:
            self.conn.close()
            self.conn = None  # fix: drop the stale handle after closing
            print("✅ Database connection closed")

    def enhance_schema(self):
        """
        Add Stage 1-3 and metadata columns to the existing reviews table.

        Non-destructive: keeps all existing data and skips columns that
        already exist. Also creates the llm_decision_logs and
        batch_insights tables.

        Returns:
            Number of columns actually added.
        """
        print("\n" + "="*60)
        print("🔧 ENHANCING DATABASE SCHEMA")
        print("="*60)

        cursor = self.conn.cursor()

        # Discover which columns already exist so the ALTERs are idempotent.
        cursor.execute("PRAGMA table_info(reviews)")
        existing_columns = [row[1] for row in cursor.fetchall()]
        print(f"📋 Existing columns: {len(existing_columns)}")

        # Stage 1: Classification columns
        stage1_columns = [
            ("stage1_llm1_type", "TEXT"),
            ("stage1_llm1_department", "TEXT"),
            ("stage1_llm1_priority", "TEXT"),
            ("stage1_llm1_confidence", "REAL"),
            ("stage1_llm1_reasoning", "TEXT"),
            ("stage1_llm2_user_type", "TEXT"),
            ("stage1_llm2_emotion", "TEXT"),
            ("stage1_llm2_context", "TEXT"),
            ("stage1_llm2_confidence", "REAL"),
            ("stage1_llm2_reasoning", "TEXT"),
            ("stage1_manager_classification", "TEXT"),
            ("stage1_manager_reasoning", "TEXT"),
            ("stage1_completed_at", "TIMESTAMP"),
        ]

        # Stage 2: Sentiment columns
        stage2_columns = [
            ("stage2_best_sentiment", "TEXT"),
            ("stage2_best_confidence", "REAL"),
            ("stage2_best_prob_positive", "REAL"),
            ("stage2_best_prob_neutral", "REAL"),
            ("stage2_best_prob_negative", "REAL"),
            ("stage2_alt_sentiment", "TEXT"),
            ("stage2_alt_confidence", "REAL"),
            ("stage2_alt_prob_positive", "REAL"),
            ("stage2_alt_prob_neutral", "REAL"),
            ("stage2_alt_prob_negative", "REAL"),
            ("stage2_agreement", "BOOLEAN"),
            ("stage2_layer_sentiment", "TEXT"),
            ("stage2_completed_at", "TIMESTAMP"),
        ]

        # Stage 3: Finalization columns
        stage3_columns = [
            ("stage3_final_sentiment", "TEXT"),
            ("stage3_confidence", "REAL"),
            ("stage3_reasoning", "TEXT"),
            ("stage3_validation_notes", "TEXT"),
            ("stage3_conflicts_found", "TEXT"),
            ("stage3_action_recommendation", "TEXT"),
            ("stage3_needs_human_review", "BOOLEAN"),
            ("stage3_completed_at", "TIMESTAMP"),
        ]

        # Processing metadata
        metadata_columns = [
            ("processing_status", "TEXT DEFAULT 'pending'"),
            ("processing_version", "TEXT DEFAULT 'v1.0'"),
            ("processing_started_at", "TIMESTAMP"),
            ("processing_completed_at", "TIMESTAMP"),
        ]

        all_new_columns = (
            stage1_columns +
            stage2_columns +
            stage3_columns +
            metadata_columns
        )

        # Add only the columns that don't exist yet. Column names/types are
        # internal constants, so f-string DDL is safe here.
        added_count = 0
        for col_name, col_type in all_new_columns:
            if col_name not in existing_columns:
                try:
                    cursor.execute(f"ALTER TABLE reviews ADD COLUMN {col_name} {col_type}")
                    added_count += 1
                    print(f"   ✅ Added column: {col_name}")
                except sqlite3.OperationalError as e:
                    # Duplicate-column races are benign; report anything else.
                    if "duplicate column" not in str(e).lower():
                        print(f"   ⚠️ Error adding {col_name}: {e}")

        self.conn.commit()
        print(f"\n✅ Schema enhanced: {added_count} new columns added")

        # Create logs table for LLM decisions
        self._create_logs_table(cursor)

        # Create batch insights table
        self._create_batch_insights_table(cursor)

        return added_count

    def _create_logs_table(self, cursor):
        """Create the llm_decision_logs audit table (idempotent)."""
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS llm_decision_logs (
                log_id INTEGER PRIMARY KEY AUTOINCREMENT,
                review_id TEXT NOT NULL,
                stage TEXT NOT NULL,
                model_name TEXT NOT NULL,
                input_prompt TEXT,
                output_response TEXT,
                confidence REAL,
                reasoning TEXT,
                processing_time_seconds REAL,
                timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (review_id) REFERENCES reviews(review_id)
            )
        """)

        cursor.execute("""
            CREATE INDEX IF NOT EXISTS idx_logs_review_id
            ON llm_decision_logs(review_id)
        """)

        cursor.execute("""
            CREATE INDEX IF NOT EXISTS idx_logs_stage
            ON llm_decision_logs(stage)
        """)

        self.conn.commit()
        print("   ✅ Created llm_decision_logs table")

    def _create_batch_insights_table(self, cursor):
        """Create the batch_insights analytics table (idempotent)."""
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS batch_insights (
                batch_id INTEGER PRIMARY KEY AUTOINCREMENT,
                analysis_date DATE,
                total_reviews INTEGER,
                sentiment_positive INTEGER,
                sentiment_neutral INTEGER,
                sentiment_negative INTEGER,
                priority_critical INTEGER,
                priority_high INTEGER,
                priority_medium INTEGER,
                priority_low INTEGER,
                dept_engineering INTEGER,
                dept_ux INTEGER,
                dept_support INTEGER,
                dept_business INTEGER,
                critical_issues TEXT,
                quick_wins TEXT,
                recommendations TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)

        self.conn.commit()
        print("   ✅ Created batch_insights table")

    def get_pending_reviews(self, limit: Optional[int] = None) -> List[Dict]:
        """Return reviews not processed yet (status NULL or 'pending').

        Args:
            limit: Optional maximum number of rows (falsy values mean
                "no limit", matching the original behavior).
        """
        cursor = self.conn.cursor()

        query = """
            SELECT * FROM reviews
            WHERE processing_status IS NULL OR processing_status = 'pending'
            ORDER BY scraped_at DESC
        """

        # FIX: bind LIMIT as a query parameter instead of f-string
        # interpolation into the SQL text.
        params = ()
        if limit:
            query += " LIMIT ?"
            params = (limit,)

        cursor.execute(query, params)
        rows = cursor.fetchall()

        return [dict(row) for row in rows]

    def update_stage1(self, review_id: str, data: Dict[str, Any]):
        """Persist Stage 1 classification data and mark the review
        'stage1_complete'. Missing keys in ``data`` are stored as NULL."""
        cursor = self.conn.cursor()

        cursor.execute("""
            UPDATE reviews SET
                stage1_llm1_type = ?,
                stage1_llm1_department = ?,
                stage1_llm1_priority = ?,
                stage1_llm1_confidence = ?,
                stage1_llm1_reasoning = ?,
                stage1_llm2_user_type = ?,
                stage1_llm2_emotion = ?,
                stage1_llm2_context = ?,
                stage1_llm2_confidence = ?,
                stage1_llm2_reasoning = ?,
                stage1_manager_classification = ?,
                stage1_manager_reasoning = ?,
                stage1_completed_at = ?,
                processing_status = 'stage1_complete'
            WHERE review_id = ?
        """, (
            data.get('llm1_type'),
            data.get('llm1_department'),
            data.get('llm1_priority'),
            data.get('llm1_confidence'),
            data.get('llm1_reasoning'),
            data.get('llm2_user_type'),
            data.get('llm2_emotion'),
            data.get('llm2_context'),
            data.get('llm2_confidence'),
            data.get('llm2_reasoning'),
            data.get('manager_classification'),
            data.get('manager_reasoning'),
            datetime.now().isoformat(),
            review_id
        ))

        self.conn.commit()

    def update_stage2(self, review_id: str, data: Dict[str, Any]):
        """Persist Stage 2 sentiment data and mark the review
        'stage2_complete'. Missing keys in ``data`` are stored as NULL."""
        cursor = self.conn.cursor()

        cursor.execute("""
            UPDATE reviews SET
                stage2_best_sentiment = ?,
                stage2_best_confidence = ?,
                stage2_best_prob_positive = ?,
                stage2_best_prob_neutral = ?,
                stage2_best_prob_negative = ?,
                stage2_alt_sentiment = ?,
                stage2_alt_confidence = ?,
                stage2_alt_prob_positive = ?,
                stage2_alt_prob_neutral = ?,
                stage2_alt_prob_negative = ?,
                stage2_agreement = ?,
                stage2_layer_sentiment = ?,
                stage2_completed_at = ?,
                processing_status = 'stage2_complete'
            WHERE review_id = ?
        """, (
            data.get('best_sentiment'),
            data.get('best_confidence'),
            data.get('best_prob_positive'),
            data.get('best_prob_neutral'),
            data.get('best_prob_negative'),
            data.get('alt_sentiment'),
            data.get('alt_confidence'),
            data.get('alt_prob_positive'),
            data.get('alt_prob_neutral'),
            data.get('alt_prob_negative'),
            data.get('agreement'),
            data.get('layer_sentiment'),
            datetime.now().isoformat(),
            review_id
        ))

        self.conn.commit()

    def update_stage3(self, review_id: str, data: Dict[str, Any]):
        """Persist Stage 3 finalization data and mark the review
        'complete' with a completion timestamp."""
        cursor = self.conn.cursor()

        cursor.execute("""
            UPDATE reviews SET
                stage3_final_sentiment = ?,
                stage3_confidence = ?,
                stage3_reasoning = ?,
                stage3_validation_notes = ?,
                stage3_conflicts_found = ?,
                stage3_action_recommendation = ?,
                stage3_needs_human_review = ?,
                stage3_completed_at = ?,
                processing_status = 'complete',
                processing_completed_at = ?
            WHERE review_id = ?
        """, (
            data.get('final_sentiment'),
            data.get('confidence'),
            data.get('reasoning'),
            data.get('validation_notes'),
            data.get('conflicts_found'),
            data.get('action_recommendation'),
            data.get('needs_human_review'),
            datetime.now().isoformat(),
            datetime.now().isoformat(),
            review_id
        ))

        self.conn.commit()

    def log_llm_decision(self, review_id: str, stage: str, model_name: str,
                         input_prompt: str, output_response: str,
                         confidence: float, reasoning: str, processing_time: float):
        """Append one LLM decision to the audit trail (llm_decision_logs)."""
        cursor = self.conn.cursor()

        cursor.execute("""
            INSERT INTO llm_decision_logs
            (review_id, stage, model_name, input_prompt, output_response,
             confidence, reasoning, processing_time_seconds)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """, (
            review_id, stage, model_name, input_prompt, output_response,
            confidence, reasoning, processing_time
        ))

        self.conn.commit()

    def get_all_processed_reviews(self) -> List[Dict]:
        """Return all fully processed reviews, newest completion first."""
        cursor = self.conn.cursor()

        cursor.execute("""
            SELECT * FROM reviews
            WHERE processing_status = 'complete'
            ORDER BY processing_completed_at DESC
        """)

        rows = cursor.fetchall()
        return [dict(row) for row in rows]

    def save_batch_insights(self, insights: Dict[str, Any]):
        """Save one batch-analytics snapshot; list-valued fields are stored
        as JSON text."""
        cursor = self.conn.cursor()

        cursor.execute("""
            INSERT INTO batch_insights
            (analysis_date, total_reviews, sentiment_positive, sentiment_neutral,
             sentiment_negative, priority_critical, priority_high, priority_medium,
             priority_low, dept_engineering, dept_ux, dept_support, dept_business,
             critical_issues, quick_wins, recommendations)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, (
            datetime.now().date(),
            insights.get('total_reviews', 0),
            insights.get('sentiment_positive', 0),
            insights.get('sentiment_neutral', 0),
            insights.get('sentiment_negative', 0),
            insights.get('priority_critical', 0),
            insights.get('priority_high', 0),
            insights.get('priority_medium', 0),
            insights.get('priority_low', 0),
            insights.get('dept_engineering', 0),
            insights.get('dept_ux', 0),
            insights.get('dept_support', 0),
            insights.get('dept_business', 0),
            json.dumps(insights.get('critical_issues', [])),
            json.dumps(insights.get('quick_wins', [])),
            json.dumps(insights.get('recommendations', []))
        ))

        self.conn.commit()
        print("   ✅ Batch insights saved to database")
386
+
387
+
388
if __name__ == "__main__":
    # Smoke-test: enhance the schema, then count pending reviews.
    banner = "=" * 60
    print(f"\n{banner}\n🧪 TESTING DATABASE ENHANCEMENT\n{banner}\n")

    db = EnhancedDatabase()
    db.connect()
    db.enhance_schema()

    pending = db.get_pending_reviews(limit=5)
    print(f"\n📋 Found {len(pending)} pending reviews")

    db.close()
    print("\n✅ Database enhancement test complete!")
gradio_pipeline.py ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio Pipeline - Streamlined processing for HuggingFace Spaces
3
+ Integrates scraping, classification, sentiment, and batch analysis with progress tracking
4
+ """
5
+
6
+ import os
7
+ import sqlite3
8
+ import time
9
+ from typing import List, Dict, Any, Optional, Callable
10
+ from datetime import datetime
11
+ import json
12
+
13
+ # Import existing modules
14
+ from stage0_scraper import Stage0WebScraper
15
+ from langgraph_state import ReviewState, create_initial_state
16
+ from langgraph_graph import build_review_graph, build_batch_graph
17
+ from database_enhanced import EnhancedDatabase
18
+ from stage4_batch_analysis import Stage4BatchAnalysis
19
+
20
+
21
+ class GradioPipeline:
22
+ """
23
+ Streamlined pipeline for Gradio interface
24
+ Handles scraping, processing, and analysis with progress callbacks
25
+ """
26
+
27
+ def __init__(self, db_file: str = "review_database.db", review_limit: int = 20):
28
+ self.db_file = db_file
29
+ self.review_limit = review_limit
30
+
31
+ # Initialize database
32
+ self.db = EnhancedDatabase(db_file)
33
+ self.db.connect()
34
+ self.db.enhance_schema()
35
+
36
+ # Initialize scraper
37
+ self.scraper = Stage0WebScraper(db_file)
38
+ self.scraper.create_reviews_table()
39
+
40
+ # Build graphs
41
+ self.review_graph = build_review_graph()
42
+ self.batch_graph = build_batch_graph()
43
+
44
+ print("✅ Gradio Pipeline initialized")
45
+
46
+ def scrape_reviews(
47
+ self,
48
+ app_store_ids: List[str],
49
+ play_store_packages: List[str],
50
+ progress_callback: Optional[Callable] = None
51
+ ) -> int:
52
+ """
53
+ Scrape reviews from App Store and Play Store
54
+
55
+ Args:
56
+ app_store_ids: List of App Store IDs
57
+ play_store_packages: List of Play Store package names
58
+ progress_callback: Optional Gradio progress callback
59
+
60
+ Returns:
61
+ Total number of reviews scraped
62
+ """
63
+ total_scraped = 0
64
+ total_apps = len(app_store_ids) + len(play_store_packages)
65
+
66
+ if total_apps == 0:
67
+ return 0
68
+
69
+ current_app = 0
70
+
71
+ # Scrape App Store
72
+ for app_id in app_store_ids:
73
+ current_app += 1
74
+ if progress_callback:
75
+ progress_val = 0.1 + (0.2 * current_app / total_apps)
76
+ progress_callback(
77
+ progress_val,
78
+ desc=f"🍎 Scraping App Store ({current_app}/{total_apps}): {app_id}"
79
+ )
80
+
81
+ try:
82
+ reviews = self.scraper.scrape_app_store_rss(
83
+ app_id,
84
+ country="ae",
85
+ limit=self.review_limit
86
+ )
87
+ saved = self.scraper.save_reviews_to_db(reviews)
88
+ total_scraped += saved
89
+ print(f" ✅ App Store {app_id}: {saved} reviews")
90
+ except Exception as e:
91
+ print(f" ❌ App Store {app_id} error: {e}")
92
+ continue
93
+
94
+ time.sleep(1) # Rate limiting
95
+
96
+ # Scrape Play Store
97
+ for package in play_store_packages:
98
+ current_app += 1
99
+ if progress_callback:
100
+ progress_val = 0.1 + (0.2 * current_app / total_apps)
101
+ progress_callback(
102
+ progress_val,
103
+ desc=f"🤖 Scraping Play Store ({current_app}/{total_apps}): {package}"
104
+ )
105
+
106
+ try:
107
+ reviews = self.scraper.scrape_play_store_api(
108
+ package,
109
+ country="ae",
110
+ limit=self.review_limit
111
+ )
112
+ saved = self.scraper.save_reviews_to_db(reviews)
113
+ total_scraped += saved
114
+ print(f" ✅ Play Store {package}: {saved} reviews")
115
+ except Exception as e:
116
+ print(f" ❌ Play Store {package} error: {e}")
117
+ continue
118
+
119
+ time.sleep(1) # Rate limiting
120
+
121
+ print(f"\n✅ Total scraped: {total_scraped} reviews")
122
+ return total_scraped
123
+
124
+ def process_reviews(
125
+ self,
126
+ progress_callback: Optional[Callable] = None
127
+ ) -> List[Dict[str, Any]]:
128
+ """
129
+ Process reviews through Stages 1-3
130
+
131
+ Args:
132
+ progress_callback: Optional Gradio progress callback
133
+
134
+ Returns:
135
+ List of processed review dictionaries
136
+ """
137
+ # Get pending reviews
138
+ reviews = self.db.get_pending_reviews(limit=self.review_limit)
139
+ total_reviews = len(reviews)
140
+
141
+ if total_reviews == 0:
142
+ print("⚠️ No pending reviews to process")
143
+ return []
144
+
145
+ print(f"\n📊 Processing {total_reviews} reviews...")
146
+
147
+ processed_states = []
148
+
149
+ for i, review in enumerate(reviews, 1):
150
+ review_id = review.get('review_id', 'unknown')
151
+
152
+ if progress_callback:
153
+ progress_val = 0.3 + (0.6 * i / total_reviews)
154
+ progress_callback(
155
+ progress_val,
156
+ desc=f"🤖 Processing review {i}/{total_reviews}: {review_id[:20]}..."
157
+ )
158
+
159
+ try:
160
+ # Create initial state
161
+ state = create_initial_state(review)
162
+
163
+ # Run through LangGraph
164
+ config = {"configurable": {"thread_id": f"review_{review_id}"}}
165
+ final_state = self.review_graph.invoke(state, config=config)
166
+
167
+ # Convert state to dict for easier handling
168
+ processed_states.append(dict(final_state))
169
+
170
+ print(f" ✅ Review {i}/{total_reviews} processed")
171
+
172
+ except Exception as e:
173
+ print(f" ❌ Error processing review {review_id}: {e}")
174
+ continue
175
+
176
+ print(f"\n✅ Processed {len(processed_states)}/{total_reviews} reviews")
177
+ return processed_states
178
+
179
+ def analyze_batch(
180
+ self,
181
+ processed_reviews: List[Dict[str, Any]]
182
+ ) -> Dict[str, Any]:
183
+ """
184
+ Run Stage 4: Batch Analysis
185
+
186
+ Args:
187
+ processed_reviews: List of processed review states
188
+
189
+ Returns:
190
+ Batch insights dictionary
191
+ """
192
+ if not processed_reviews:
193
+ return {}
194
+
195
+ print(f"\n📊 Running batch analysis on {len(processed_reviews)} reviews...")
196
+
197
+ # Convert states to review dicts for Stage 4
198
+ reviews_for_analysis = []
199
+ for state in processed_reviews:
200
+ review_dict = {
201
+ 'review_id': state.get('review_id'),
202
+ 'review_text': state.get('review_text'),
203
+ 'rating': state.get('rating'),
204
+ 'stage1_llm1_type': state.get('classification_type'),
205
+ 'stage1_llm1_department': state.get('department'),
206
+ 'stage1_llm1_priority': state.get('priority'),
207
+ 'stage1_llm2_user_type': state.get('user_type'),
208
+ 'stage1_llm2_emotion': state.get('emotion'),
209
+ 'stage2_agreement': state.get('sentiment_agreement'),
210
+ 'stage3_final_sentiment': state.get('final_sentiment'),
211
+ 'stage3_needs_human_review': state.get('needs_human_review'),
212
+ 'stage3_reasoning': state.get('reasoning'),
213
+ 'stage3_action_recommendation': state.get('action_recommendation'),
214
+ }
215
+ reviews_for_analysis.append(review_dict)
216
+
217
+ # Run Stage 4
218
+ stage4 = Stage4BatchAnalysis()
219
+ insights = stage4.analyze_batch(reviews_for_analysis)
220
+
221
+ # Save to database
222
+ self.db.save_batch_insights(insights)
223
+
224
+ print("✅ Batch analysis complete")
225
+ return insights
226
+
227
+ def get_all_processed_reviews(self) -> List[Dict[str, Any]]:
228
+ """Get all processed reviews from database"""
229
+ return self.db.get_all_processed_reviews()
230
+
231
+ def close(self):
232
+ """Clean up"""
233
+ self.db.close()
234
+
235
+
236
+ # ============================================================================
237
+ # HELPER FUNCTIONS FOR GRADIO
238
+ # ============================================================================
239
+
240
def parse_app_store_url(url: str) -> Optional[str]:
    """
    Extract an App Store ID from a URL, or return the input as-is when it is
    already a bare numeric ID.

    Examples:
        - "1234567890" -> "1234567890"
        - "https://apps.apple.com/us/app/name/id1234567890" -> "1234567890"

    Returns None when no plausible ID can be found.
    """
    candidate = url.strip()

    # Already a bare numeric ID
    if candidate.isdigit():
        return candidate

    # Canonical App Store URL: the ID immediately follows "/id"
    if 'apps.apple.com' in candidate:
        _, sep, tail = candidate.partition('/id')
        if sep:
            digits = tail.split('?')[0].split('/')[0]
            if digits.isdigit():
                return digits

    # Last resort: take the longest run of digits anywhere in the string
    import re
    runs = re.findall(r'\d+', candidate)
    if runs:
        return max(runs, key=len)

    return None
270
+
271
+
272
def parse_play_store_url(url: str) -> Optional[str]:
    """
    Extract a package name from a Play Store URL, or return the input as-is
    when it is already a package name.

    Examples:
        - "com.company.app" -> "com.company.app"
        - "https://play.google.com/store/apps/details?id=com.company.app" -> "com.company.app"

    Returns:
        The package name, or None when no package could be extracted.
    """
    url = url.strip()

    # Already a package name (dotted identifier, not a URL)
    if '.' in url and not url.startswith('http'):
        return url

    # Play Store URLs carry the package in the "id" query parameter
    if 'play.google.com' in url and 'id=' in url:
        return url.split('id=')[1].split('&')[0]

    # Fix: previously any http URL containing a dot (e.g. a Play Store URL
    # without an "id=" parameter) fell through and was returned verbatim as
    # if it were a package name. A URL without an extractable id is a failure.
    if url.startswith('http'):
        return None

    return url if '.' in url else None
293
+
294
+
295
+ if __name__ == "__main__":
296
+ print("\n" + "="*60)
297
+ print("🧪 TESTING GRADIO PIPELINE")
298
+ print("="*60)
299
+
300
+ # Test URL parsing
301
+ print("\n📱 Testing URL parsing:")
302
+
303
+ test_app_urls = [
304
+ "1234567890",
305
+ "https://apps.apple.com/us/app/name/id1234567890",
306
+ ]
307
+
308
+ for url in test_app_urls:
309
+ app_id = parse_app_store_url(url)
310
+ print(f" {url} -> {app_id}")
311
+
312
+ test_play_urls = [
313
+ "com.company.app",
314
+ "https://play.google.com/store/apps/details?id=com.company.app",
315
+ ]
316
+
317
+ for url in test_play_urls:
318
+ package = parse_play_store_url(url)
319
+ print(f" {url} -> {package}")
320
+
321
+ print("\n✅ Gradio pipeline test complete!")
langgraph_graph.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LangGraph Graph Definition
3
+ Defines the review processing workflow with conditional routing
4
+ """
5
+
6
+ from langgraph.graph import StateGraph, END
7
+ from langgraph.checkpoint.memory import MemorySaver
8
+ from typing import Literal
9
+
10
+ from langgraph_state import ReviewState, BatchState, create_initial_state
11
+ from langgraph_nodes import (
12
+ stage1_classification_node,
13
+ stage2_sentiment_node,
14
+ stage3_finalization_node
15
+ )
16
+ from stage4_batch_analysis import Stage4BatchAnalysis
17
+ from database_enhanced import EnhancedDatabase
18
+
19
+
20
+ # ============================================================================
21
+ # DATABASE SYNC NODES
22
+ # ============================================================================
23
+
24
def save_stage1_to_db_node(state: ReviewState) -> dict:
    """Save Stage 1 results to database.

    Persists the LLM1 / LLM2 / manager outputs for this review. On failure the
    exception is recorded in the state's ``errors`` list instead of being
    raised, so the graph keeps running.

    Fixes over the previous version: the connection is now closed in a single
    ``finally`` (instead of duplicated close calls per branch), and the
    incoming state's ``errors`` list is copied rather than mutated in place.
    """
    db = EnhancedDatabase()
    db.connect()

    try:
        llm1 = state['llm1_result']
        llm2 = state['llm2_result']
        manager = state['manager_result']

        stage1_data = {
            'llm1_type': llm1.get('type'),
            'llm1_department': llm1.get('department'),
            'llm1_priority': llm1.get('priority'),
            'llm1_confidence': llm1.get('confidence'),
            'llm1_reasoning': llm1.get('reasoning'),

            'llm2_user_type': llm2.get('user_type'),
            'llm2_emotion': llm2.get('emotion'),
            'llm2_context': llm2.get('context'),
            'llm2_confidence': llm2.get('confidence'),
            'llm2_reasoning': llm2.get('reasoning'),

            # Stored as its string form; schema expects a text column here
            'manager_classification': str(manager),
            'manager_reasoning': manager.get('reasoning'),
        }

        db.update_stage1(state['review_id'], stage1_data)
        return {"db_stage1_saved": True}
    except Exception as e:
        # Copy before appending so the incoming state is never mutated in place
        errors = list(state.get('errors', []))
        errors.append(f"DB Stage 1 save error: {str(e)}")
        return {"errors": errors}
    finally:
        # Single close point covering both success and failure paths
        db.close()
56
+
57
+
58
def save_stage2_to_db_node(state: ReviewState) -> dict:
    """Save Stage 2 results to database.

    Persists both sentiment models' outputs plus the combined layer result.
    Failures are captured into the state's ``errors`` list so the graph keeps
    running.

    Fixes over the previous version: the connection is closed in a single
    ``finally`` block, and the incoming ``errors`` list is copied rather than
    mutated in place.
    """
    db = EnhancedDatabase()
    db.connect()

    try:
        best = state['best_sentiment_result']
        alt = state['alt_sentiment_result']

        stage2_data = {
            'best_sentiment': best['sentiment'],
            'best_confidence': best['confidence'],
            'best_prob_positive': best['prob_positive'],
            'best_prob_neutral': best['prob_neutral'],
            'best_prob_negative': best['prob_negative'],

            'alt_sentiment': alt['sentiment'],
            'alt_confidence': alt['confidence'],
            'alt_prob_positive': alt['prob_positive'],
            'alt_prob_neutral': alt['prob_neutral'],
            'alt_prob_negative': alt['prob_negative'],

            'agreement': state['sentiment_agreement'],
            'layer_sentiment': state['sentiment'],
        }

        db.update_stage2(state['review_id'], stage2_data)
        return {"db_stage2_saved": True}
    except Exception as e:
        # Copy before appending so the incoming state is never mutated in place
        errors = list(state.get('errors', []))
        errors.append(f"DB Stage 2 save error: {str(e)}")
        return {"errors": errors}
    finally:
        # Single close point covering both success and failure paths
        db.close()
90
+
91
+
92
def save_stage3_to_db_node(state: ReviewState) -> dict:
    """Save Stage 3 results to database.

    Persists the final sentiment decision and its audit fields. Failures are
    captured into the state's ``errors`` list so the graph keeps running.

    Fixes over the previous version: the connection is closed in a single
    ``finally`` block, and the incoming ``errors`` list is copied rather than
    mutated in place.
    """
    db = EnhancedDatabase()
    db.connect()

    try:
        stage3_data = {
            'final_sentiment': state['final_sentiment'],
            'confidence': state['final_confidence'],
            'reasoning': state['reasoning'],
            'validation_notes': state['validation_notes'],
            'conflicts_found': state['conflicts_found'],
            'action_recommendation': state['action_recommendation'],
            'needs_human_review': state['needs_human_review'],
        }

        db.update_stage3(state['review_id'], stage3_data)
        return {"db_stage3_saved": True}
    except Exception as e:
        # Copy before appending so the incoming state is never mutated in place
        errors = list(state.get('errors', []))
        errors.append(f"DB Stage 3 save error: {str(e)}")
        return {"errors": errors}
    finally:
        # Single close point covering both success and failure paths
        db.close()
117
+
118
+
119
+ # ============================================================================
120
+ # STAGE 4: BATCH ANALYSIS NODE
121
+ # ============================================================================
122
+
123
def stage4_batch_analysis_node(state: BatchState) -> dict:
    """
    Stage 4 Node: Batch analysis
    Runs after all reviews are processed
    """
    print(f"\n{'='*70}")
    print("📊 STAGE 4: BATCH ANALYSIS")
    print("=" * 70)

    # Flatten each per-review state into the flat schema Stage 4 expects.
    # review_id / review_text / rating are required keys; the rest default to None.
    reviews_for_analysis = []
    for rs in state['all_reviews']:
        flat = {
            'review_id': rs['review_id'],
            'review_text': rs['review_text'],
            'rating': rs['rating'],
        }
        for dest, src in (
            ('stage1_llm1_type', 'classification_type'),
            ('stage1_llm1_department', 'department'),
            ('stage1_llm1_priority', 'priority'),
            ('stage1_llm2_user_type', 'user_type'),
            ('stage1_llm2_emotion', 'emotion'),
            ('stage2_agreement', 'sentiment_agreement'),
            ('stage3_final_sentiment', 'final_sentiment'),
            ('stage3_needs_human_review', 'needs_human_review'),
            ('stage3_reasoning', 'reasoning'),
            ('stage3_action_recommendation', 'action_recommendation'),
        ):
            flat[dest] = rs.get(src)
        reviews_for_analysis.append(flat)

    # Analyze the batch, then persist the aggregate insights
    insights = Stage4BatchAnalysis().analyze_batch(reviews_for_analysis)

    db = EnhancedDatabase()
    db.connect()
    db.save_batch_insights(insights)
    db.close()

    # Project the insights into the BatchState update
    result_keys = (
        'sentiment_distribution',
        'priority_distribution',
        'department_distribution',
        'emotion_distribution',
        'critical_issues',
        'quick_wins',
        'churn_risk',
        'model_agreement_rate',
        'recommendations',
        'batch_completed_at',
    )
    return {key: insights.get(key) for key in result_keys}
175
+
176
+
177
+ # ============================================================================
178
+ # ROUTING FUNCTIONS
179
+ # ============================================================================
180
+
181
def route_after_stage3(state: ReviewState) -> Literal["human_review", "complete"]:
    """
    Conditional routing after Stage 3.
    Escalates to human review when any of the escalation triggers fire;
    otherwise the review is complete.
    """
    escalate = (
        state.get('needs_human_review', False)              # explicitly flagged
        or state.get('final_confidence', 1.0) < 0.5         # low-confidence decision
        or state.get('conflicts_found', 'none') != 'none'   # unresolved conflicts
        or state.get('priority') == 'critical'              # critical priority
    )
    return "human_review" if escalate else "complete"
203
+
204
+
205
def human_review_queue_node(state: ReviewState) -> dict:
    """
    Terminal node for reviews flagged for human review.
    Currently only marks the route in the returned state update; this is the
    hook point for ticketing-system or email-alert integrations later.
    """
    print(" 🚨 FLAGGED for human review")

    update = {"route_to": "human_review"}
    return update
217
+
218
+
219
+ # ============================================================================
220
+ # BUILD REVIEW PROCESSING GRAPH
221
+ # ============================================================================
222
+
223
def build_review_graph():
    """
    Build and compile the per-review processing graph:
    Stage 1 -> save -> Stage 2 -> save -> Stage 3 -> save,
    then conditional routing to the human-review queue or END.
    """
    workflow = StateGraph(ReviewState)

    # Register every node
    for node_name, node_fn in (
        ("stage1_classify", stage1_classification_node),
        ("save_stage1", save_stage1_to_db_node),
        ("stage2_sentiment", stage2_sentiment_node),
        ("save_stage2", save_stage2_to_db_node),
        ("stage3_finalize", stage3_finalization_node),
        ("save_stage3", save_stage3_to_db_node),
        ("human_review_queue", human_review_queue_node),
    ):
        workflow.add_node(node_name, node_fn)

    # Sequential stage pipeline
    pipeline = (
        "stage1_classify",
        "save_stage1",
        "stage2_sentiment",
        "save_stage2",
        "stage3_finalize",
        "save_stage3",
    )
    for src, dst in zip(pipeline, pipeline[1:]):
        workflow.add_edge(src, dst)

    # After the final save, route to human review or finish
    workflow.add_conditional_edges(
        "save_stage3",
        route_after_stage3,
        {
            "human_review": "human_review_queue",
            "complete": END
        }
    )
    workflow.add_edge("human_review_queue", END)

    workflow.set_entry_point("stage1_classify")

    # Compile with in-memory checkpointing (thread_id keys the checkpoints)
    return workflow.compile(checkpointer=MemorySaver())
271
+
272
+
273
+ # ============================================================================
274
+ # BUILD BATCH ANALYSIS GRAPH (Stage 4)
275
+ # ============================================================================
276
+
277
def build_batch_graph():
    """
    Build and compile the batch analysis graph (Stage 4).
    A single-node linear graph that runs after all reviews are processed;
    no checkpointer is needed here.
    """
    workflow = StateGraph(BatchState)

    workflow.add_node("stage4_batch", stage4_batch_analysis_node)
    workflow.set_entry_point("stage4_batch")
    workflow.add_edge("stage4_batch", END)

    return workflow.compile()
296
+
297
+
298
+ if __name__ == "__main__":
299
+ print("\n" + "="*60)
300
+ print("🧪 TESTING LANGGRAPH GRAPH BUILDER")
301
+ print("="*60)
302
+
303
+ # Build review graph
304
+ print("\n📊 Building review processing graph...")
305
+ review_graph = build_review_graph()
306
+ print(" ✅ Review graph built!")
307
+
308
+ # Build batch graph
309
+ print("\n📊 Building batch analysis graph...")
310
+ batch_graph = build_batch_graph()
311
+ print(" ✅ Batch graph built!")
312
+
313
+ print("\n✅ Graph builder test complete!")
langgraph_nodes.py ADDED
@@ -0,0 +1,583 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LangGraph Nodes
3
+ All node functions for the review processing graph
4
+ Implements parallel execution where possible
5
+ """
6
+
7
+ import os
8
+ import json
9
+ import time
10
+ from typing import Dict, Any
11
+ from datetime import datetime
12
+ from concurrent.futures import ThreadPoolExecutor, as_completed
13
+ from huggingface_hub import InferenceClient
14
+ import torch
15
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
16
+ import warnings
17
+ warnings.filterwarnings('ignore')
18
+
19
+ from langgraph_state import ReviewState, BatchState
20
+ from database_enhanced import EnhancedDatabase
21
+
22
# Initialize HF client (singleton)
# Token comes from the environment; when it is missing, hf_client stays None
# and the LLM helper functions below will hit their except-fallbacks instead
# of calling the Inference API.
HF_TOKEN = os.getenv("HUGGINGFACE_API_KEY")
hf_client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else None

# Initialize sentiment models (singleton) - load once
# These module-level slots are populated lazily by load_sentiment_models()
# so the transformer weights are downloaded/loaded at most once per process.
_sentiment_models_loaded = False
_best_tokenizer = None
_best_model = None
_alt_tokenizer = None
_alt_model = None
32
+
33
def load_sentiment_models():
    """Load sentiment models once (singleton pattern).

    Populates the module-level tokenizer/model slots on first call; later
    calls are no-ops.
    """
    global _sentiment_models_loaded, _best_tokenizer, _best_model, _alt_tokenizer, _alt_model

    if _sentiment_models_loaded:
        return

    print(" 📦 Loading Twitter-BERT models (one-time)...")

    best_checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    alt_checkpoint = "finiteautomata/bertweet-base-sentiment-analysis"

    # Best model: Twitter-RoBERTa
    _best_tokenizer = AutoTokenizer.from_pretrained(best_checkpoint)
    _best_model = AutoModelForSequenceClassification.from_pretrained(best_checkpoint)
    _best_model.eval()

    # Alternate model: BERTweet
    _alt_tokenizer = AutoTokenizer.from_pretrained(alt_checkpoint)
    _alt_model = AutoModelForSequenceClassification.from_pretrained(alt_checkpoint)
    _alt_model.eval()

    _sentiment_models_loaded = True
    print(" ✅ Sentiment models loaded!")
54
+
55
+
56
+ # ============================================================================
57
+ # STAGE 1: CLASSIFICATION NODE (Parallel LLM1 + LLM2)
58
+ # ============================================================================
59
+
60
def llm1_classify(review: Dict[str, Any]) -> Dict[str, Any]:
    """LLM1: Type, Department, Priority classification.

    Calls the Qwen instruct model via the HF Inference API and parses its
    JSON reply; any failure yields a low-confidence fallback dict.
    """
    review_text = review.get('review_text', '')
    rating = review.get('rating', 3)

    prompt = f"""You are an expert at classifying customer reviews for theme park and attraction apps.

REVIEW:
Rating: {rating}/5
Text: {review_text}

Classify this review across these dimensions:

1. TYPE (choose ONE):
- complaint: Customer reports a problem
- praise: Customer expresses satisfaction
- suggestion: Customer proposes improvement
- question: Customer asks about something
- bug_report: Technical issue described

2. DEPARTMENT (choose ONE):
- engineering: Technical issues, bugs, crashes
- ux: Design, usability, interface issues
- support: Customer service, help needed
- business: Pricing, policies, marketing

3. PRIORITY (choose ONE):
- critical: Service down, major blocker
- high: Significant problem affecting use
- medium: Inconvenience but not blocking
- low: Minor issue or suggestion

4. CONFIDENCE (0.0-1.0): How confident are you?

5. REASONING: Brief one-sentence explanation

Respond ONLY in valid JSON format:
{{
"type": "complaint/praise/suggestion/question/bug_report",
"department": "engineering/ux/support/business",
"priority": "critical/high/medium/low",
"confidence": 0.0-1.0,
"reasoning": "brief explanation"
}}"""

    def _parse_json_reply(raw: str) -> Dict[str, Any]:
        # Strip optional ``` / ```json fences before parsing
        body = raw.strip()
        if body.startswith('```'):
            body = body.split('```')[1]
            if body.startswith('json'):
                body = body[4:]
        return json.loads(body.strip())

    try:
        reply = hf_client.text_generation(
            prompt,
            model="Qwen/Qwen2.5-72B-Instruct",
            max_new_tokens=200,
            temperature=0.1
        )
        parsed = _parse_json_reply(reply)
        parsed['model'] = 'Qwen/Qwen2.5-72B-Instruct'
        return parsed
    except Exception as e:
        # Safe fallback so the pipeline keeps moving
        return {
            'type': 'unknown',
            'department': 'unknown',
            'priority': 'medium',
            'confidence': 0.0,
            'reasoning': f'Error: {str(e)}',
            'model': 'Qwen/Qwen2.5-72B-Instruct'
        }
134
+
135
+
136
def llm2_analyze(review: Dict[str, Any]) -> Dict[str, Any]:
    """LLM2: User type, Emotion, Context analysis.

    Calls the Mistral instruct model via the HF Inference API and parses its
    JSON reply; any failure yields a low-confidence fallback dict.
    """
    review_text = review.get('review_text', '')
    rating = review.get('rating', 3)

    prompt = f"""You are an expert at understanding customer psychology and emotional context.

REVIEW:
Rating: {rating}/5
Text: {review_text}

Analyze the user and emotional context:

1. USER_TYPE (choose ONE):
- new_user: First-time or new user
- regular_user: Returning customer
- power_user: Heavy user, tech-savvy
- churning_user: Considering leaving

2. EMOTION (choose ONE):
- anger: Angry, hostile tone
- frustration: Frustrated but not angry
- joy: Happy, satisfied
- satisfaction: Content, pleased
- disappointment: Let down, sad
- confusion: Unclear, needs help

3. CONTEXT (brief): What is the underlying issue? 1-2 words

4. CONFIDENCE (0.0-1.0): How confident are you?

5. REASONING: Brief one-sentence explanation

Respond ONLY in valid JSON format:
{{
"user_type": "new_user/regular_user/power_user/churning_user",
"emotion": "anger/frustration/joy/satisfaction/disappointment/confusion",
"context": "brief context",
"confidence": 0.0-1.0,
"reasoning": "brief explanation"
}}"""

    def _parse_json_reply(raw: str) -> Dict[str, Any]:
        # Strip optional ``` / ```json fences before parsing
        body = raw.strip()
        if body.startswith('```'):
            body = body.split('```')[1]
            if body.startswith('json'):
                body = body[4:]
        return json.loads(body.strip())

    try:
        reply = hf_client.text_generation(
            prompt,
            model="mistralai/Mistral-7B-Instruct-v0.3",
            max_new_tokens=200,
            temperature=0.1
        )
        parsed = _parse_json_reply(reply)
        parsed['model'] = 'mistralai/Mistral-7B-Instruct-v0.3'
        return parsed
    except Exception as e:
        # Safe fallback so the pipeline keeps moving
        return {
            'user_type': 'unknown',
            'emotion': 'unknown',
            'context': 'unknown',
            'confidence': 0.0,
            'reasoning': f'Error: {str(e)}',
            'model': 'mistralai/Mistral-7B-Instruct-v0.3'
        }
207
+
208
+
209
def manager_synthesize(llm1_result: Dict, llm2_result: Dict, review: Dict) -> Dict[str, Any]:
    """Manager: Synthesize LLM1 and LLM2 results.

    Calls the Llama instruct model to validate/merge the two analyses; on
    failure, falls back to LLM1/LLM2 values with confidence 0.5.
    """
    review_text = review.get('review_text', '')
    rating = review.get('rating', 3)

    prompt = f"""You are a synthesis manager evaluating two AI analyses of the same review.

REVIEW:
Rating: {rating}/5
Text: {review_text}

LLM1 ANALYSIS (Type/Dept/Priority):
{json.dumps(llm1_result, indent=2)}

LLM2 ANALYSIS (User/Emotion/Context):
{json.dumps(llm2_result, indent=2)}

Your task:
1. Validate both analyses
2. Resolve any conflicts
3. Make final classification decision
4. Provide synthesis reasoning

Respond ONLY in valid JSON format:
{{
"final_type": "from llm1 or adjusted",
"final_department": "from llm1 or adjusted",
"final_priority": "from llm1 or adjusted",
"final_user_type": "from llm2 or adjusted",
"final_emotion": "from llm2 or adjusted",
"confidence": 0.0-1.0,
"reasoning": "synthesis explanation",
"conflicts_found": "any conflicts or 'none'"
}}"""

    def _parse_json_reply(raw: str) -> Dict[str, Any]:
        # Strip optional ``` / ```json fences before parsing
        body = raw.strip()
        if body.startswith('```'):
            body = body.split('```')[1]
            if body.startswith('json'):
                body = body[4:]
        return json.loads(body.strip())

    try:
        reply = hf_client.text_generation(
            prompt,
            model="meta-llama/Llama-3.1-8B-Instruct",
            max_new_tokens=250,
            temperature=0.1
        )
        parsed = _parse_json_reply(reply)
        parsed['model'] = 'meta-llama/Llama-3.1-8B-Instruct'
        return parsed
    except Exception as e:
        # Fallback: adopt LLM1/LLM2 answers directly at reduced confidence
        return {
            'final_type': llm1_result.get('type', 'unknown'),
            'final_department': llm1_result.get('department', 'unknown'),
            'final_priority': llm1_result.get('priority', 'medium'),
            'final_user_type': llm2_result.get('user_type', 'unknown'),
            'final_emotion': llm2_result.get('emotion', 'unknown'),
            'confidence': 0.5,
            'reasoning': f'Manager error, used LLM1 results: {str(e)}',
            'conflicts_found': 'error',
            'model': 'meta-llama/Llama-3.1-8B-Instruct'
        }
277
+
278
+
279
def stage1_classification_node(state: ReviewState) -> Dict[str, Any]:
    """
    Stage 1 Node: Classification with PARALLEL execution.
    LLM1 and LLM2 run concurrently; the manager then synthesizes both results.
    """
    print(f"\n 📝 Review ID: {state['review_id']}")
    print(" ⏳ STAGE 1: Classification (Parallel LLM1 + LLM2)...")

    t0 = time.time()
    raw_review = state['review']

    # Fan out: both classifier LLMs run at the same time
    with ThreadPoolExecutor(max_workers=2) as pool:
        pending = [
            pool.submit(llm1_classify, raw_review),
            pool.submit(llm2_analyze, raw_review),
        ]
        llm1_result, llm2_result = (f.result() for f in pending)

    print(f" ✅ LLM1: {llm1_result.get('type')} → {llm1_result.get('department')} (Priority: {llm1_result.get('priority')})")
    print(f" ✅ LLM2: {llm2_result.get('user_type')}, {llm2_result.get('emotion')}")

    # Manager needs both results, so it runs after the fan-in
    print(" 🤖 Manager synthesizing...")
    merged = manager_synthesize(llm1_result, llm2_result, raw_review)

    elapsed = time.time() - t0
    print(f" ✅ Stage 1 complete ({elapsed:.2f}s)")

    # Assemble the state update; final_* fields are lifted to top-level keys
    update: Dict[str, Any] = {
        "llm1_result": llm1_result,
        "llm2_result": llm2_result,
        "manager_result": merged,
        "stage1_completed": True,
        "stage1_time": elapsed,
        "errors": state.get('errors', []),
    }
    for out_key, src_key in (
        ("classification_type", "final_type"),
        ("department", "final_department"),
        ("priority", "final_priority"),
        ("user_type", "final_user_type"),
        ("emotion", "final_emotion"),
    ):
        update[out_key] = merged.get(src_key)
    return update
321
+
322
+
323
+ # ============================================================================
324
+ # STAGE 2: SENTIMENT NODE (Parallel Best + Alternate)
325
+ # ============================================================================
326
+
327
def analyze_best_sentiment(text: str) -> Dict[str, Any]:
    """Best Model: Twitter-RoBERTa.

    Returns the predicted label plus the full probability triple; on any
    failure returns a NEUTRAL placeholder with zero confidence.
    """
    load_sentiment_models()

    try:
        encoded = _best_tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)

        with torch.no_grad():
            logits = _best_model(**encoded).logits

        scores = torch.nn.functional.softmax(logits, dim=-1)[0]
        winner = int(torch.argmax(scores).item())

        labels = ("NEGATIVE", "NEUTRAL", "POSITIVE")
        return {
            'sentiment': labels[winner],
            'confidence': scores[winner].item(),
            'prob_negative': scores[0].item(),
            'prob_neutral': scores[1].item(),
            'prob_positive': scores[2].item(),
            'model': 'twitter-roberta-base-sentiment-latest'
        }
    except Exception as e:
        # Uniform-ish fallback so downstream math never sees missing keys
        return {
            'sentiment': 'NEUTRAL',
            'confidence': 0.0,
            'prob_negative': 0.33,
            'prob_neutral': 0.34,
            'prob_positive': 0.33,
            'model': 'error',
            'error': str(e)
        }
360
+
361
+
362
def analyze_alt_sentiment(text: str) -> Dict[str, Any]:
    """Alternate Model: BERTweet.

    Returns the predicted label plus the full probability triple; on any
    failure returns a NEUTRAL placeholder with zero confidence.
    """
    load_sentiment_models()

    try:
        encoded = _alt_tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)

        with torch.no_grad():
            logits = _alt_model(**encoded).logits

        scores = torch.nn.functional.softmax(logits, dim=-1)[0]
        winner = int(torch.argmax(scores).item())

        labels = ("NEGATIVE", "NEUTRAL", "POSITIVE")
        return {
            'sentiment': labels[winner],
            'confidence': scores[winner].item(),
            'prob_negative': scores[0].item(),
            'prob_neutral': scores[1].item(),
            'prob_positive': scores[2].item(),
            'model': 'bertweet-base-sentiment-analysis'
        }
    except Exception as e:
        # Uniform-ish fallback so downstream math never sees missing keys
        return {
            'sentiment': 'NEUTRAL',
            'confidence': 0.0,
            'prob_negative': 0.33,
            'prob_neutral': 0.34,
            'prob_positive': 0.33,
            'model': 'error',
            'error': str(e)
        }
395
+
396
+
397
def sentiment_layer(best_result: Dict, alt_result: Dict) -> Dict[str, Any]:
    """Sentiment Layer: combine both model outputs with confidence weighting.

    Returns a dict with the chosen label ('layer_sentiment'), the confidence
    backing it ('combined_confidence'), whether the two models agreed, and a
    STRONG/WEAK agreement-strength tag.
    """
    best_label = best_result.get('sentiment')
    alt_label = alt_result.get('sentiment')
    best_conf = best_result.get('confidence', 0.0)
    alt_conf = alt_result.get('confidence', 0.0)

    models_agree = best_label == alt_label

    if models_agree:
        # Consensus: keep the shared label, backed by the higher confidence.
        chosen_label = best_label
        chosen_conf = max(best_conf, alt_conf)
        strength = "STRONG"
    else:
        # Disagreement: side with whichever model is more confident
        # (a tie goes to the alternate model, matching original behavior).
        chosen_label, chosen_conf = (
            (best_label, best_conf)
            if best_conf > alt_conf
            else (alt_label, alt_conf)
        )
        strength = "WEAK"

    return {
        'layer_sentiment': chosen_label,
        'combined_confidence': chosen_conf,
        'agreement': models_agree,
        'agreement_strength': strength
    }
426
+
427
+
428
def stage2_sentiment_node(state: ReviewState) -> Dict[str, Any]:
    """
    Stage 2 Node: Sentiment with PARALLEL execution
    Runs Best and Alternate models in parallel, then combines

    Returns a partial state update: both raw model results, the layer
    result, the extracted sentiment fields, and stage timing.
    """
    print(f"\n ⏳ STAGE 2: Sentiment Analysis (Parallel Best + Alternate)...")

    started = time.time()
    text = state['review_text']

    # Fan the two model calls out to a 2-worker pool; .result() blocks until
    # both finish, so total wall time is max(best, alt), not their sum.
    with ThreadPoolExecutor(max_workers=2) as pool:
        best_future = pool.submit(analyze_best_sentiment, text)
        alt_future = pool.submit(analyze_alt_sentiment, text)
        best_result = best_future.result()
        alt_result = alt_future.result()

    print(f" ✅ Best: {best_result['sentiment']} ({best_result['confidence']:.3f})")
    print(f" ✅ Alt: {alt_result['sentiment']} ({alt_result['confidence']:.3f})")

    # Combine both opinions into the layer verdict.
    layer_result = sentiment_layer(best_result, alt_result)

    agreement_icon = "✅" if layer_result['agreement'] else "⚠️ "
    print(f" {agreement_icon} Final: {layer_result['layer_sentiment']} (agreement: {layer_result['agreement']})")

    elapsed = time.time() - started
    print(f" ✅ Stage 2 complete ({elapsed:.2f}s)")

    return {
        "best_sentiment_result": best_result,
        "alt_sentiment_result": alt_result,
        "sentiment_layer_result": layer_result,
        "sentiment": layer_result['layer_sentiment'],
        "sentiment_confidence": layer_result['combined_confidence'],
        "sentiment_agreement": layer_result['agreement'],
        "stage2_completed": True,
        "stage2_time": elapsed,
        "errors": state.get('errors', [])
    }
469
+
470
+
471
+ # ============================================================================
472
+ # STAGE 3: FINALIZATION NODE
473
+ # ============================================================================
474
+
475
def stage3_finalization_node(state: ReviewState) -> Dict[str, Any]:
    """
    Stage 3 Node: Final synthesis with LLM3 (Llama 70B)

    Builds a prompt from Stage 1 classification and Stage 2 sentiment,
    asks LLM3 for a final JSON verdict, and degrades gracefully to the
    Stage 2 sentiment (flagged for human review) if the call or parse fails.

    Returns:
        Partial state update with final sentiment/confidence, reasoning,
        action recommendation, review flag, timing and completion markers.
    """
    print(f"\n ⏳ STAGE 3: Finalization (LLM3)...")

    start_time = time.time()

    review_text = state['review_text']
    rating = state['rating']

    # Guard against a skipped/failed Stage 2: a None result becomes an empty
    # dict and a None confidence defaults to 0.0, so the ':.2f' format specs
    # in the prompt below cannot raise TypeError.
    best_res = state.get('best_sentiment_result') or {}
    alt_res = state.get('alt_sentiment_result') or {}

    prompt = f"""You are a final decision-making AI analyzing customer feedback for a theme park/attraction app.

REVIEW DATA:
Rating: {rating}/5
Text: {review_text}

STAGE 1 CLASSIFICATION:
- Type: {state.get('classification_type')}
- Department: {state.get('department')}
- Priority: {state.get('priority')}
- User Type: {state.get('user_type')}
- Emotion: {state.get('emotion')}

STAGE 2 SENTIMENT:
- Best: {best_res.get('sentiment')} ({best_res.get('confidence', 0.0) or 0.0:.2f})
- Alternate: {alt_res.get('sentiment')} ({alt_res.get('confidence', 0.0) or 0.0:.2f})
- Agreement: {state.get('sentiment_agreement')}

YOUR TASK:
1. Review all data from both stages
2. Make FINAL sentiment decision
3. Provide comprehensive reasoning
4. Generate action recommendation
5. Flag if human review needed

Respond ONLY in valid JSON format:
{{
    "final_sentiment": "POSITIVE/NEGATIVE/NEUTRAL",
    "confidence": 0.0-1.0,
    "reasoning": "Comprehensive explanation",
    "validation_notes": "Does classification match sentiment?",
    "conflicts_found": "any conflicts or 'none'",
    "action_recommendation": "Specific action",
    "needs_human_review": true/false
}}"""

    try:
        response = hf_client.text_generation(
            prompt,
            model="meta-llama/Llama-3.1-70B-Instruct",
            max_new_tokens=400,
            temperature=0.1
        )

        # Strip an optional markdown code fence (``` or ```json) before parsing.
        response_clean = response.strip()
        if response_clean.startswith('```'):
            response_clean = response_clean.split('```')[1]
        if response_clean.startswith('json'):
            response_clean = response_clean[4:]
        response_clean = response_clean.strip()

        result = json.loads(response_clean)
        result['model'] = 'meta-llama/Llama-3.1-70B-Instruct'

    except Exception as e:
        # Any failure (network, rate limit, malformed JSON) falls back to the
        # Stage 2 verdict and escalates the review to a human.
        result = {
            'final_sentiment': state.get('sentiment', 'NEUTRAL'),
            'confidence': state.get('sentiment_confidence', 0.5),
            'reasoning': f'Error in LLM3: {str(e)}',
            'validation_notes': 'Error',
            'conflicts_found': 'error',
            'action_recommendation': f"Route to {state.get('department')}",
            'needs_human_review': True,
            'model': 'meta-llama/Llama-3.1-70B-Instruct'
        }

    # The LLM may return syntactically valid JSON that omits keys; fill in
    # defaults instead of letting the lookups below raise KeyError.
    final_sentiment = result.get('final_sentiment', state.get('sentiment', 'NEUTRAL'))
    final_confidence = result.get('confidence', state.get('sentiment_confidence', 0.5))
    needs_human_review = result.get('needs_human_review', False)

    stage3_time = time.time() - start_time

    print(f" ✅ Final: {final_sentiment} ({final_confidence or 0:.3f})")
    print(f" 📋 Needs Review: {needs_human_review}")
    print(f" ✅ Stage 3 complete ({stage3_time:.2f}s)")

    # Calculate total time.  The timing keys exist but hold None until their
    # stage runs, so `or 0` (not a .get default) is required here.
    total_time = (state.get('stage1_time') or 0) + (state.get('stage2_time') or 0) + stage3_time

    return {
        "final_result": result,
        "final_sentiment": final_sentiment,
        "final_confidence": final_confidence,
        "reasoning": result.get('reasoning', ''),
        "action_recommendation": result.get('action_recommendation', ''),
        "conflicts_found": result.get('conflicts_found', 'none'),
        "validation_notes": result.get('validation_notes', ''),
        "needs_human_review": needs_human_review,
        "stage3_completed": True,
        "stage3_time": stage3_time,
        "total_time": total_time,
        "processing_completed_at": datetime.now().isoformat(),
        "errors": state.get('errors', [])
    }
576
+
577
+
578
+ if __name__ == "__main__":
579
+ print("\n✅ LangGraph nodes module loaded!")
580
+ print(" Nodes available:")
581
+ print(" - stage1_classification_node (parallel LLM1+LLM2)")
582
+ print(" - stage2_sentiment_node (parallel Best+Alt)")
583
+ print(" - stage3_finalization_node (LLM3)")
langgraph_state.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LangGraph State Schema
3
+ Defines the state that flows through the graph
4
+ """
5
+
6
+ from typing import TypedDict, Optional, Dict, Any, List
7
+ from datetime import datetime
8
+
9
class ReviewState(TypedDict):
    """
    State schema for review processing graph
    All stages add to this state as it flows through the graph

    Nodes return partial dicts of these keys and the graph merges them into
    the running state; fields are grouped below by the stage that writes them.
    """

    # Input data (populated by create_initial_state)
    review: Dict[str, Any]
    review_id: str
    review_text: str
    rating: int

    # Stage 1: Classification outputs (raw model payloads)
    llm1_result: Optional[Dict[str, Any]]
    llm2_result: Optional[Dict[str, Any]]
    manager_result: Optional[Dict[str, Any]]

    # Stage 1: Extracted fields for easy access
    classification_type: Optional[str]
    department: Optional[str]
    priority: Optional[str]
    user_type: Optional[str]
    emotion: Optional[str]

    # Stage 2: Sentiment outputs (raw payloads from both models + layer)
    best_sentiment_result: Optional[Dict[str, Any]]
    alt_sentiment_result: Optional[Dict[str, Any]]
    sentiment_layer_result: Optional[Dict[str, Any]]

    # Stage 2: Extracted fields
    sentiment: Optional[str]  # POSITIVE, NEGATIVE, NEUTRAL
    sentiment_confidence: Optional[float]
    sentiment_agreement: Optional[bool]  # True when both models gave the same label

    # Stage 3: Finalization outputs
    final_result: Optional[Dict[str, Any]]

    # Stage 3: Extracted fields
    final_sentiment: Optional[str]
    final_confidence: Optional[float]
    reasoning: Optional[str]
    action_recommendation: Optional[str]
    conflicts_found: Optional[str]
    validation_notes: Optional[str]

    # Routing decisions
    needs_human_review: bool
    route_to: Optional[str]  # 'human_review', 'complete', 'batch_analysis'

    # Processing metadata
    stage1_completed: bool
    stage2_completed: bool
    stage3_completed: bool
    processing_started_at: Optional[str]  # ISO-8601 timestamp
    processing_completed_at: Optional[str]  # ISO-8601 timestamp

    # Timing information (seconds; None until the stage has run)
    stage1_time: Optional[float]
    stage2_time: Optional[float]
    stage3_time: Optional[float]
    total_time: Optional[float]

    # Error handling
    errors: List[str]
    retry_count: int

    # Database sync status
    db_stage1_saved: bool
    db_stage2_saved: bool
    db_stage3_saved: bool
79
+
80
+
81
class BatchState(TypedDict):
    """
    State for batch analysis (Stage 4)
    Aggregates results from multiple reviews
    """

    # Input
    all_reviews: List[ReviewState]
    total_count: int

    # Aggregated metrics (label -> count maps)
    sentiment_distribution: Optional[Dict[str, int]]
    priority_distribution: Optional[Dict[str, int]]
    department_distribution: Optional[Dict[str, int]]
    emotion_distribution: Optional[Dict[str, int]]

    # Analysis outputs
    critical_issues: Optional[List[Dict[str, Any]]]
    quick_wins: Optional[List[Dict[str, Any]]]
    churn_risk: Optional[float]  # percentage, 0-100
    model_agreement_rate: Optional[float]  # percentage, 0-100

    # Recommendations
    recommendations: Optional[List[str]]

    # Processing metadata (ISO-8601 timestamps / seconds)
    batch_started_at: Optional[str]
    batch_completed_at: Optional[str]
    batch_processing_time: Optional[float]
110
+
111
+
112
def create_initial_state(review: Dict[str, Any]) -> ReviewState:
    """
    Create initial state for a review

    Input fields are copied (with defaults) from the raw review dict; every
    stage-produced field starts as None, flags as False, counters at zero.
    """
    # Seed with the input payload, defaulting anything missing.
    state: ReviewState = {
        'review': review,
        'review_id': review.get('review_id', 'unknown'),
        'review_text': review.get('review_text', ''),
        'rating': review.get('rating', 3),
    }

    # All stage outputs (raw results, extracted fields, routing target and
    # timings) begin life as None until their producing node runs.
    not_yet_produced = (
        # Stage 1
        'llm1_result', 'llm2_result', 'manager_result',
        'classification_type', 'department', 'priority', 'user_type', 'emotion',
        # Stage 2
        'best_sentiment_result', 'alt_sentiment_result', 'sentiment_layer_result',
        'sentiment', 'sentiment_confidence', 'sentiment_agreement',
        # Stage 3
        'final_result', 'final_sentiment', 'final_confidence',
        'reasoning', 'action_recommendation', 'conflicts_found', 'validation_notes',
        # Routing / metadata / timing
        'route_to', 'processing_completed_at',
        'stage1_time', 'stage2_time', 'stage3_time', 'total_time',
    )
    for key in not_yet_produced:
        state[key] = None

    # Flags, bookkeeping and the start-of-processing timestamp.
    state.update(
        needs_human_review=False,
        stage1_completed=False,
        stage2_completed=False,
        stage3_completed=False,
        processing_started_at=datetime.now().isoformat(),
        errors=[],
        retry_count=0,
        db_stage1_saved=False,
        db_stage2_saved=False,
        db_stage3_saved=False,
    )
    return state
176
+
177
+
178
def create_batch_state(reviews: List[ReviewState]) -> BatchState:
    """
    Create batch state from processed reviews

    Only the inputs and the start timestamp are filled in; every analysis
    field is None until Stage 4 populates it.
    """
    pending_analysis = (
        'sentiment_distribution', 'priority_distribution',
        'department_distribution', 'emotion_distribution',
        'critical_issues', 'quick_wins', 'churn_risk',
        'model_agreement_rate', 'recommendations',
        'batch_completed_at', 'batch_processing_time',
    )
    state: BatchState = {key: None for key in pending_analysis}
    state['all_reviews'] = reviews
    state['total_count'] = len(reviews)
    state['batch_started_at'] = datetime.now().isoformat()
    return state
198
+
199
+
200
+ if __name__ == "__main__":
201
+ # Test state creation
202
+ print("\n" + "="*60)
203
+ print("🧪 TESTING LANGGRAPH STATE")
204
+ print("="*60)
205
+
206
+ test_review = {
207
+ 'review_id': 'test_001',
208
+ 'review_text': 'App crashes!',
209
+ 'rating': 1
210
+ }
211
+
212
+ state = create_initial_state(test_review)
213
+ print(f"\n✅ Initial state created for: {state['review_id']}")
214
+ print(f" Review text: {state['review_text']}")
215
+ print(f" Stage 1 completed: {state['stage1_completed']}")
216
+
217
+ print("\n✅ State schema test complete!")
requirements.txt CHANGED
@@ -1,3 +1,27 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core Dependencies
2
+ python-dotenv==1.0.0
3
+ pandas>=2.2.0
4
+ PyYAML==6.0.1
5
+
6
+ # LangGraph & LangChain
7
+ langgraph>=0.2.0
8
+ langchain>=0.2.0
9
+ langchain-core>=0.2.0
10
+
11
+ # HuggingFace
12
+ huggingface-hub>=0.20.3
13
+ transformers>=4.36.2
14
+ torch>=2.1.2
15
+
16
+ # UI frameworks — app.py is a Streamlit app (it imports streamlit), so it
+ # must be listed here; Gradio is kept for the optional Spaces UI.
+ streamlit>=1.28.0
+ gradio>=4.0.0
18
+
19
+ # Plotly for visualizations
20
+ plotly>=5.18.0
21
+
22
+ # Web Scraping
23
+ google-play-scraper>=1.2.4
24
+ requests>=2.31.0
25
+
26
+ # Database (SQLite is built-in to Python)
27
+ # sqlite3 is included with Python
stage0_scraper.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stage 0: Web Scraping (App Store & Play Store)
3
+ Scrapes reviews and stores in database
4
+ This integrates with your existing scraper or can be used standalone
5
+ """
6
+
7
+ import os
8
+ import sqlite3
9
+ import requests
10
+ import json
11
+ import time
12
+ from datetime import datetime
13
+ from typing import List, Dict, Any
14
+ import re
15
+
16
class Stage0WebScraper:
    """
    Stage 0: Web scraping for App Store and Play Store reviews
    Integrates with existing database structure

    Every method opens (and closes) its own short-lived sqlite3 connection
    against `db_file`, so instances hold no open handles between calls.
    """

    def __init__(self, db_file: str = "review_database.db"):
        # Path to the SQLite database file used by all methods below.
        self.db_file = db_file
        print(f" 📁 Database: {db_file}")

    def create_reviews_table(self) -> None:
        """
        Create reviews table if it doesn't exist
        This is your Stage 0 schema
        """
        conn = sqlite3.connect(self.db_file)
        cursor = conn.cursor()

        # review_id is UNIQUE: save_reviews_to_db relies on this via
        # INSERT OR IGNORE, so re-scraping a feed never duplicates rows.
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS reviews (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                review_id TEXT UNIQUE,
                product_url TEXT,
                platform TEXT,
                app_name TEXT,
                user_name TEXT,
                review_text TEXT,
                rating INTEGER,
                review_date TEXT,
                app_version TEXT,
                scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)

        # Create index for faster lookups
        cursor.execute("""
            CREATE INDEX IF NOT EXISTS idx_review_id
            ON reviews(review_id)
        """)

        cursor.execute("""
            CREATE INDEX IF NOT EXISTS idx_platform
            ON reviews(platform)
        """)

        conn.commit()
        conn.close()

        print(" ✅ Reviews table ready (Stage 0)")

    def scrape_app_store_rss(self, app_id: str, country: str = "us",
                             limit: int = 100) -> List[Dict]:
        """
        Scrape App Store reviews using RSS feed
        This is a simple, free method (no API key needed)

        Args:
            app_id: App Store app ID (e.g., "1234567890")
            country: Country code (e.g., "us", "ae", "uk")
            limit: Number of reviews to fetch (max 500 per request)

        Returns:
            List of normalized review dicts; empty list on any request error.
        """
        print(f" 🍎 Scraping App Store: {app_id} ({country})")

        # App Store RSS feed URL
        url = f"https://itunes.apple.com/{country}/rss/customerreviews/id={app_id}/sortBy=mostRecent/json"

        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            data = response.json()

            reviews = []
            entries = data.get('feed', {}).get('entry', [])

            # Skip first entry (it's the app info, which has no 'author' key)
            if entries and 'author' not in entries[0]:
                entries = entries[1:]

            for entry in entries[:limit]:
                try:
                    # The RSS feed wraps every value as {'label': ...}.
                    review = {
                        'review_id': entry.get('id', {}).get('label', ''),
                        'platform': 'app_store',
                        'app_name': data.get('feed', {}).get('title', {}).get('label', 'Unknown'),
                        'user_name': entry.get('author', {}).get('name', {}).get('label', 'Anonymous'),
                        'review_text': entry.get('content', {}).get('label', ''),
                        'rating': int(entry.get('im:rating', {}).get('label', '3')),
                        'review_date': entry.get('updated', {}).get('label', ''),
                        'app_version': entry.get('im:version', {}).get('label', ''),
                        'product_url': entry.get('link', {}).get('attributes', {}).get('href', '')
                    }
                    reviews.append(review)
                except Exception as e:
                    # A malformed entry is skipped rather than failing the batch.
                    print(f" ⚠️ Error parsing review: {e}")
                    continue

            print(f" ✅ Scraped {len(reviews)} reviews")
            return reviews

        except Exception as e:
            print(f" ❌ Error scraping App Store: {e}")
            return []

    def scrape_play_store_api(self, app_id: str, country: str = "us",
                              limit: int = 100) -> List[Dict]:
        """
        Scrape Google Play Store reviews
        Note: This is a simplified version. For production, use google-play-scraper library

        Args:
            app_id: Play Store package name (e.g., "com.company.app")
            country: Country code
            limit: Number of reviews to fetch

        Returns:
            List of normalized review dicts; empty list if the scraper
            library is missing or the fetch fails.
        """
        print(f" 🤖 Scraping Play Store: {app_id} ({country})")

        try:
            # Using unofficial API endpoint (works without auth)
            # For production, recommend: pip install google-play-scraper
            # Imported lazily so the rest of the class works without it.
            from google_play_scraper import Sort, reviews_all

            # NOTE(review): reviews_all downloads *every* review before the
            # [:limit] slice below — can be slow for popular apps.
            result = reviews_all(
                app_id,
                sleep_milliseconds=0,
                lang='en',
                country=country,
                sort=Sort.NEWEST
            )

            reviews = []
            for r in result[:limit]:
                review = {
                    'review_id': r.get('reviewId', ''),
                    'platform': 'play_store',
                    'app_name': app_id,
                    'user_name': r.get('userName', 'Anonymous'),
                    'review_text': r.get('content', ''),
                    'rating': r.get('score', 3),
                    # 'at' is a datetime when present; guard before isoformat().
                    'review_date': r.get('at', '').isoformat() if r.get('at') else '',
                    'app_version': r.get('reviewCreatedVersion', ''),
                    'product_url': f"https://play.google.com/store/apps/details?id={app_id}"
                }
                reviews.append(review)

            print(f" ✅ Scraped {len(reviews)} reviews")
            return reviews

        except ImportError:
            print(" ⚠️ google-play-scraper not installed")
            print(" Run: pip install google-play-scraper")
            return []
        except Exception as e:
            print(f" ❌ Error scraping Play Store: {e}")
            return []

    def save_reviews_to_db(self, reviews: List[Dict]) -> int:
        """
        Save scraped reviews to database
        Returns number of new reviews saved
        """
        if not reviews:
            return 0

        conn = sqlite3.connect(self.db_file)
        cursor = conn.cursor()

        saved_count = 0

        for review in reviews:
            try:
                # INSERT OR IGNORE + the UNIQUE(review_id) constraint makes
                # this idempotent: already-seen reviews are skipped silently.
                cursor.execute("""
                    INSERT OR IGNORE INTO reviews
                    (review_id, product_url, platform, app_name, user_name,
                     review_text, rating, review_date, app_version)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    review.get('review_id'),
                    review.get('product_url', ''),
                    review.get('platform'),
                    review.get('app_name', ''),
                    review.get('user_name'),
                    review.get('review_text'),
                    review.get('rating'),
                    review.get('review_date', ''),
                    review.get('app_version', '')
                ))

                # rowcount is 1 only when the row was actually inserted.
                if cursor.rowcount > 0:
                    saved_count += 1

            except Exception as e:
                print(f" ⚠️ Error saving review: {e}")
                continue

        conn.commit()
        conn.close()

        print(f" ✅ Saved {saved_count} new reviews to database")
        return saved_count

    def scrape_from_urls_file(self, urls_file: str = "urls.txt") -> int:
        """
        Scrape reviews from URLs listed in a text file

        URLs file format (one per line):
            app_store:1234567890:us
            play_store:com.company.app:us

        Returns:
            Total number of new reviews saved across all listed sources.
        """
        print(f"\n 📄 Reading URLs from: {urls_file}")

        if not os.path.exists(urls_file):
            print(f" ⚠️ File not found: {urls_file}")
            return 0

        total_saved = 0

        # Blank lines and '#' comment lines are ignored.
        with open(urls_file, 'r') as f:
            urls = [line.strip() for line in f if line.strip() and not line.startswith('#')]

        print(f" ✅ Found {len(urls)} URLs")

        for i, url in enumerate(urls, 1):
            print(f"\n [{i}/{len(urls)}] Processing: {url}")

            # Format: <platform>:<app_id>[:<country>], country defaults to 'us'.
            parts = url.split(':')
            if len(parts) < 2:
                print(f" ⚠️ Invalid format: {url}")
                continue

            platform = parts[0].lower()
            app_id = parts[1]
            country = parts[2] if len(parts) > 2 else 'us'

            if platform == 'app_store':
                reviews = self.scrape_app_store_rss(app_id, country)
            elif platform == 'play_store':
                reviews = self.scrape_play_store_api(app_id, country)
            else:
                print(f" ⚠️ Unknown platform: {platform}")
                continue

            saved = self.save_reviews_to_db(reviews)
            total_saved += saved

            # Be nice to servers
            time.sleep(2)

        return total_saved

    def get_review_count(self) -> int:
        """Get total number of reviews in database"""
        conn = sqlite3.connect(self.db_file)
        cursor = conn.cursor()
        cursor.execute("SELECT COUNT(*) FROM reviews")
        count = cursor.fetchone()[0]
        conn.close()
        return count
273
+ return count
274
+
275
+
276
+ if __name__ == "__main__":
277
+ # Run Stage 0 scraper - reads from urls.txt
278
+ print("\n" + "="*70)
279
+ print("🕷️ STAGE 0: WEB SCRAPER")
280
+ print("="*70)
281
+
282
+ scraper = Stage0WebScraper(db_file="review_database.db")
283
+
284
+ # Create table if not exists
285
+ print("\n📁 Ensuring database table exists...")
286
+ scraper.create_reviews_table()
287
+
288
+ # Scrape from urls.txt
289
+ print("\n🔄 Starting scraping from urls.txt...")
290
+ total_saved = scraper.scrape_from_urls_file("urls.txt")
291
+
292
+ # Show results
293
+ total_reviews = scraper.get_review_count()
294
+
295
+ print("\n" + "="*70)
296
+ print("✅ SCRAPING COMPLETE!")
297
+ print("="*70)
298
+ print(f"📊 New reviews saved: {total_saved}")
299
+ print(f"📊 Total reviews in database: {total_reviews}")
300
+ print("\n🎯 Next step: Run the analysis!")
301
+ print(" python main_langgraph.py")
302
+ print("="*70 + "\n")
stage4_batch_analysis.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stage 4: Batch Analysis & Aggregation
3
+ - Aggregate insights across all processed reviews
4
+ - Identify patterns, trends, critical issues
5
+ - Generate actionable recommendations
6
+ """
7
+
8
+ import json
9
+ from typing import Dict, Any, List
10
+ from collections import Counter
11
+
12
+ class Stage4BatchAnalysis:
13
+ """
14
+ Stage 4: Batch-level intelligence and recommendations
15
+ """
16
+
17
    def __init__(self):
        # Stateless analyzer; the print is just a startup breadcrumb.
        print(" 📊 Stage 4: Batch Analysis initialized")
19
+
20
    def analyze_batch(self, reviews: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Analyze a batch of processed reviews

        Aggregates the per-review Stage 1-3 fields into distributions,
        derives critical issues / quick wins / churn risk, and returns a
        single flat insights dict (empty-shaped via _empty_insights when
        no reviews are given).
        """
        if not reviews:
            print(" ⚠️ No reviews to analyze")
            return self._empty_insights()

        print(f"\n 📊 Analyzing batch of {len(reviews)} reviews...")

        # Initialize counters
        total = len(reviews)

        # Sentiment distribution (missing field counts as NEUTRAL)
        sentiment_counts = Counter()
        for review in reviews:
            sentiment = review.get('stage3_final_sentiment', 'NEUTRAL')
            sentiment_counts[sentiment] += 1

        print(f" 📈 Sentiment: "
              f"POS={sentiment_counts.get('POSITIVE', 0)}, "
              f"NEU={sentiment_counts.get('NEUTRAL', 0)}, "
              f"NEG={sentiment_counts.get('NEGATIVE', 0)}")

        # Priority distribution
        priority_counts = Counter()
        for review in reviews:
            priority = review.get('stage1_llm1_priority', 'unknown')
            priority_counts[priority] += 1

        print(f" 🎯 Priority: "
              f"Critical={priority_counts.get('critical', 0)}, "
              f"High={priority_counts.get('high', 0)}, "
              f"Medium={priority_counts.get('medium', 0)}, "
              f"Low={priority_counts.get('low', 0)}")

        # Department routing
        dept_counts = Counter()
        for review in reviews:
            dept = review.get('stage1_llm1_department', 'unknown')
            dept_counts[dept] += 1

        print(f" 🏢 Departments: "
              f"Eng={dept_counts.get('engineering', 0)}, "
              f"UX={dept_counts.get('ux', 0)}, "
              f"Support={dept_counts.get('support', 0)}, "
              f"Business={dept_counts.get('business', 0)}")

        # Emotion distribution
        emotion_counts = Counter()
        for review in reviews:
            emotion = review.get('stage1_llm2_emotion', 'unknown')
            emotion_counts[emotion] += 1

        # Review type distribution
        type_counts = Counter()
        for review in reviews:
            review_type = review.get('stage1_llm1_type', 'unknown')
            type_counts[review_type] += 1

        # Identify critical issues
        critical_issues = self._identify_critical_issues(reviews)
        print(f" 🚨 Critical Issues: {len(critical_issues)}")

        # Identify quick wins
        quick_wins = self._identify_quick_wins(reviews)
        print(f" ⚡ Quick Wins: {len(quick_wins)}")

        # Calculate churn risk
        churn_risk = self._calculate_churn_risk(reviews)
        print(f" ⚠️ Churn Risk: {churn_risk:.1f}%")

        # Model agreement rate (share of reviews where Stage 2 models agreed)
        agreement_count = sum(1 for r in reviews if r.get('stage2_agreement', False))
        agreement_rate = (agreement_count / total * 100) if total > 0 else 0
        print(f" 🤝 Model Agreement: {agreement_rate:.1f}%")

        # Generate recommendations
        recommendations = self._generate_recommendations(
            sentiment_counts, priority_counts, dept_counts,
            critical_issues, quick_wins, churn_risk
        )

        # Compile batch insights (flat scalars plus full distribution dicts)
        insights = {
            'total_reviews': total,

            # Sentiment
            'sentiment_positive': sentiment_counts.get('POSITIVE', 0),
            'sentiment_neutral': sentiment_counts.get('NEUTRAL', 0),
            'sentiment_negative': sentiment_counts.get('NEGATIVE', 0),
            'sentiment_distribution': dict(sentiment_counts),

            # Priority
            'priority_critical': priority_counts.get('critical', 0),
            'priority_high': priority_counts.get('high', 0),
            'priority_medium': priority_counts.get('medium', 0),
            'priority_low': priority_counts.get('low', 0),
            'priority_distribution': dict(priority_counts),

            # Department
            'dept_engineering': dept_counts.get('engineering', 0),
            'dept_ux': dept_counts.get('ux', 0),
            'dept_support': dept_counts.get('support', 0),
            'dept_business': dept_counts.get('business', 0),
            'department_distribution': dict(dept_counts),

            # Additional insights
            'emotion_distribution': dict(emotion_counts),
            'type_distribution': dict(type_counts),
            'model_agreement_rate': agreement_rate,
            'churn_risk': churn_risk,

            # Actionable lists
            'critical_issues': critical_issues,
            'quick_wins': quick_wins,
            'recommendations': recommendations
        }

        return insights
140
+
141
+ def _identify_critical_issues(self, reviews: List[Dict]) -> List[Dict]:
142
+ """Identify critical issues requiring immediate attention"""
143
+ critical = []
144
+
145
+ for review in reviews:
146
+ priority = review.get('stage1_llm1_priority', '')
147
+ sentiment = review.get('stage3_final_sentiment', '')
148
+ needs_review = review.get('stage3_needs_human_review', False)
149
+
150
+ if priority == 'critical' or (sentiment == 'NEGATIVE' and needs_review):
151
+ critical.append({
152
+ 'review_id': review.get('review_id', 'unknown'),
153
+ 'type': review.get('stage1_llm1_type', 'unknown'),
154
+ 'department': review.get('stage1_llm1_department', 'unknown'),
155
+ 'reasoning': review.get('stage3_reasoning', ''),
156
+ 'action': review.get('stage3_action_recommendation', ''),
157
+ 'rating': review.get('rating', 0)
158
+ })
159
+
160
+ # Sort by rating (lowest first)
161
+ critical.sort(key=lambda x: x['rating'])
162
+
163
+ return critical[:10] # Top 10 critical issues
164
+
165
+ def _identify_quick_wins(self, reviews: List[Dict]) -> List[Dict]:
166
+ """Identify easy-to-fix issues for quick wins"""
167
+ quick_wins = []
168
+
169
+ for review in reviews:
170
+ review_type = review.get('stage1_llm1_type', '')
171
+ priority = review.get('stage1_llm1_priority', '')
172
+ sentiment = review.get('stage3_final_sentiment', '')
173
+
174
+ # Suggestions with low priority = quick wins
175
+ if review_type == 'suggestion' and priority in ['low', 'medium']:
176
+ quick_wins.append({
177
+ 'review_id': review.get('review_id', 'unknown'),
178
+ 'suggestion': review.get('review_text', '')[:100],
179
+ 'department': review.get('stage1_llm1_department', 'unknown'),
180
+ 'action': review.get('stage3_action_recommendation', ''),
181
+ 'rating': review.get('rating', 0)
182
+ })
183
+
184
+ return quick_wins[:10] # Top 10 quick wins
185
+
186
+ def _calculate_churn_risk(self, reviews: List[Dict]) -> float:
187
+ """Calculate overall churn risk percentage"""
188
+ if not reviews:
189
+ return 0.0
190
+
191
+ churn_indicators = 0
192
+
193
+ for review in reviews:
194
+ user_type = review.get('stage1_llm2_user_type', '')
195
+ sentiment = review.get('stage3_final_sentiment', '')
196
+ rating = review.get('rating', 3)
197
+
198
+ # Churn indicators
199
+ if user_type == 'churning_user':
200
+ churn_indicators += 2
201
+ elif sentiment == 'NEGATIVE' and rating <= 2:
202
+ churn_indicators += 1
203
+ elif rating == 1:
204
+ churn_indicators += 1
205
+
206
+ # Calculate percentage
207
+ max_possible = len(reviews) * 2
208
+ churn_risk = (churn_indicators / max_possible * 100) if max_possible > 0 else 0.0
209
+
210
+ return min(churn_risk, 100.0)
211
+
212
+ def _generate_recommendations(self, sentiment_counts, priority_counts,
213
+ dept_counts, critical_issues, quick_wins,
214
+ churn_risk) -> List[str]:
215
+ """Generate actionable recommendations"""
216
+ recommendations = []
217
+
218
+ # Sentiment-based
219
+ total = sum(sentiment_counts.values())
220
+ if total > 0:
221
+ neg_pct = (sentiment_counts.get('NEGATIVE', 0) / total * 100)
222
+ if neg_pct > 40:
223
+ recommendations.append(
224
+ f"🚨 HIGH: {neg_pct:.0f}% negative sentiment. Immediate investigation needed."
225
+ )
226
+ elif neg_pct > 25:
227
+ recommendations.append(
228
+ f"⚠️ MEDIUM: {neg_pct:.0f}% negative sentiment. Monitor closely."
229
+ )
230
+
231
+ # Priority-based
232
+ if priority_counts.get('critical', 0) > 0:
233
+ recommendations.append(
234
+ f"🔥 URGENT: {priority_counts['critical']} critical issues require immediate attention."
235
+ )
236
+
237
+ # Department-based
238
+ if dept_counts:
239
+ top_dept = max(dept_counts, key=dept_counts.get)
240
+ top_count = dept_counts[top_dept]
241
+ recommendations.append(
242
+ f"🎯 FOCUS: {top_count} issues routed to {top_dept} department."
243
+ )
244
+
245
+ # Churn risk
246
+ if churn_risk > 30:
247
+ recommendations.append(
248
+ f"⚠️ CHURN: {churn_risk:.0f}% churn risk detected. Implement retention strategy."
249
+ )
250
+
251
+ # Quick wins
252
+ if quick_wins:
253
+ recommendations.append(
254
+ f"⚡ OPPORTUNITY: {len(quick_wins)} quick wins available for easy improvements."
255
+ )
256
+
257
+ return recommendations
258
+
259
+ def _empty_insights(self) -> Dict[str, Any]:
260
+ """Return empty insights structure"""
261
+ return {
262
+ 'total_reviews': 0,
263
+ 'sentiment_positive': 0,
264
+ 'sentiment_neutral': 0,
265
+ 'sentiment_negative': 0,
266
+ 'priority_critical': 0,
267
+ 'priority_high': 0,
268
+ 'priority_medium': 0,
269
+ 'priority_low': 0,
270
+ 'dept_engineering': 0,
271
+ 'dept_ux': 0,
272
+ 'dept_support': 0,
273
+ 'dept_business': 0,
274
+ 'critical_issues': [],
275
+ 'quick_wins': [],
276
+ 'recommendations': []
277
+ }
278
+
279
+
280
+ if __name__ == "__main__":
281
+ # Test Stage 4
282
+ print("\n" + "="*60)
283
+ print("🧪 TESTING STAGE 4 BATCH ANALYSIS")
284
+ print("="*60)
285
+
286
+ # Sample processed reviews
287
+ sample_reviews = [
288
+ {
289
+ 'review_id': '001',
290
+ 'review_text': 'App crashes!',
291
+ 'rating': 1,
292
+ 'stage1_llm1_type': 'bug_report',
293
+ 'stage1_llm1_department': 'engineering',
294
+ 'stage1_llm1_priority': 'critical',
295
+ 'stage1_llm2_user_type': 'power_user',
296
+ 'stage1_llm2_emotion': 'frustration',
297
+ 'stage2_agreement': True,
298
+ 'stage3_final_sentiment': 'NEGATIVE',
299
+ 'stage3_needs_human_review': True,
300
+ 'stage3_reasoning': 'Critical bug',
301
+ 'stage3_action_recommendation': 'Fix immediately'
302
+ },
303
+ {
304
+ 'review_id': '002',
305
+ 'review_text': 'Great app!',
306
+ 'rating': 5,
307
+ 'stage1_llm1_type': 'praise',
308
+ 'stage1_llm1_department': 'ux',
309
+ 'stage1_llm1_priority': 'low',
310
+ 'stage1_llm2_user_type': 'regular_user',
311
+ 'stage1_llm2_emotion': 'joy',
312
+ 'stage2_agreement': True,
313
+ 'stage3_final_sentiment': 'POSITIVE',
314
+ 'stage3_needs_human_review': False
315
+ }
316
+ ]
317
+
318
+ stage4 = Stage4BatchAnalysis()
319
+ insights = stage4.analyze_batch(sample_reviews)
320
+
321
+ print("\n📊 BATCH INSIGHTS:")
322
+ print(json.dumps(insights, indent=2))
323
+ print("\n✅ Stage 4 test complete!")