riazmo commited on
Commit
087ac11
·
verified ·
1 Parent(s): 771474c

Upload 11 files

Browse files
app.py ADDED
@@ -0,0 +1,751 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HuggingFace Spaces - Review Intelligence System (Streamlit)
3
+ Complete app with URL input, progress tracking, and interactive dashboard
4
+ """
5
+
6
+ import streamlit as st
7
+ import pandas as pd
8
+ import plotly.express as px
9
+ import plotly.graph_objects as go
10
+ import os
11
+ from datetime import datetime
12
+ from typing import List, Dict, Optional
13
+ import time
14
+
15
+ from gradio_pipeline import GradioPipeline
16
+
17
+
18
# ============================================================================
# PAGE CONFIGURATION
# ============================================================================

# Must run before any other st.* call in the script.
st.set_page_config(
    page_title="Review Intelligence System",
    page_icon="🎯",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS: metric cards, large text helper, and the gradient "success" banner
# used by the results dashboard header.
st.markdown("""
<style>
    .main {
        padding: 0rem 1rem;
    }
    .stMetric {
        background-color: #f0f2f6;
        padding: 15px;
        border-radius: 5px;
    }
    .big-font {
        font-size: 24px !important;
        font-weight: bold;
    }
    .success-box {
        padding: 20px;
        border-radius: 10px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        margin: 20px 0;
    }
</style>
""", unsafe_allow_html=True)


# ============================================================================
# SESSION STATE INITIALIZATION
# ============================================================================
# Streamlit reruns the whole script on every interaction; these guards make
# sure the keys exist without clobbering values from a previous run.

# True once a batch of reviews has been scraped + processed successfully.
if 'processing_complete' not in st.session_state:
    st.session_state.processing_complete = False

# Per-review processed states (list of dicts) from the LangGraph pipeline.
if 'results' not in st.session_state:
    st.session_state.results = None

# Batch-level aggregates (sentiment/priority/department distributions, etc.).
if 'insights' not in st.session_state:
    st.session_state.insights = None

# How many reviews were saved to the DB during the scraping stage.
if 'scraped_count' not in st.session_state:
    st.session_state.scraped_count = 0
71
+
72
# ============================================================================
# PROCESSING FUNCTIONS
# ============================================================================

def process_reviews_streamlit(app_store_urls: str, play_store_urls: str,
                              hf_api_key: str, review_limit: int) -> bool:
    """
    Scrape and process reviews end-to-end, with Streamlit progress tracking.

    Args:
        app_store_urls: Newline-separated App Store app IDs.
        play_store_urls: Newline-separated Play Store package names.
        hf_api_key: HuggingFace API key (exported to the environment for the
            pipeline's model calls).
        review_limit: Maximum reviews to scrape per app and to process overall.

    Returns:
        True on success (results stored in st.session_state), False on any
        validation failure or error.
    """

    # Validate inputs
    if not hf_api_key or not hf_api_key.strip():
        st.error("❌ Please provide your HuggingFace API key")
        return False

    if not app_store_urls.strip() and not play_store_urls.strip():
        st.error("❌ Please provide at least one App Store or Play Store URL")
        return False

    try:
        # Set API key (the pipeline presumably reads it from the environment
        # — NOTE(review): confirm against GradioPipeline)
        os.environ['HUGGINGFACE_API_KEY'] = hf_api_key.strip()

        # Progress indicators
        progress_bar = st.progress(0)
        status_text = st.empty()

        # Initialize pipeline
        status_text.text("🚀 Initializing pipeline...")
        progress_bar.progress(5)
        pipeline = GradioPipeline(review_limit=review_limit)

        # Parse URLs: one identifier per line, blank lines ignored
        app_urls = [url.strip() for url in app_store_urls.split('\n') if url.strip()]
        play_urls = [url.strip() for url in play_store_urls.split('\n') if url.strip()]

        # Stage 0: Scraping (progress 10% -> 30%)
        status_text.text("🕷️ Scraping reviews from stores...")
        progress_bar.progress(10)

        scraped_count = 0
        total_apps = len(app_urls) + len(play_urls)

        for i, app_id in enumerate(app_urls, 1):
            status_text.text(f"🍎 Scraping App Store ({i}/{total_apps}): {app_id}")
            reviews = pipeline.scraper.scrape_app_store_rss(app_id, country="ae", limit=review_limit)
            saved = pipeline.scraper.save_reviews_to_db(reviews)
            scraped_count += saved
            progress_bar.progress(10 + int(20 * i / total_apps))
            # brief pause between store requests (rate-limit courtesy)
            time.sleep(1)

        for i, package in enumerate(play_urls, 1):
            status_text.text(f"🤖 Scraping Play Store ({i}/{total_apps}): {package}")
            reviews = pipeline.scraper.scrape_play_store_api(package, country="ae", limit=review_limit)
            saved = pipeline.scraper.save_reviews_to_db(reviews)
            scraped_count += saved
            progress_bar.progress(10 + int(20 * (len(app_urls) + i) / total_apps))
            time.sleep(1)

        if scraped_count == 0:
            st.warning("⚠️ No reviews scraped. Please check your URLs and try again.")
            progress_bar.empty()
            status_text.empty()
            return False

        st.session_state.scraped_count = scraped_count

        # Stage 1-3: Per-review processing via the LangGraph graph
        # (progress 30% -> 90%)
        status_text.text("🤖 Processing reviews with AI models...")
        progress_bar.progress(30)

        reviews = pipeline.db.get_pending_reviews(limit=review_limit)
        total_reviews = len(reviews)

        processed_states = []

        for i, review in enumerate(reviews, 1):
            review_id = review.get('review_id', 'unknown')[:20]
            status_text.text(f"🤖 Processing review {i}/{total_reviews}: {review_id}...")
            progress_bar.progress(30 + int(60 * i / total_reviews))

            try:
                from langgraph_state import create_initial_state
                state = create_initial_state(review)
                # One checkpoint thread per review so graph state is isolated
                config = {"configurable": {"thread_id": f"review_{review.get('review_id')}"}}
                final_state = pipeline.review_graph.invoke(state, config=config)
                processed_states.append(dict(final_state))
            except Exception as e:
                # Best-effort: skip a failing review rather than abort the batch
                st.warning(f"⚠️ Error processing review: {str(e)}")
                continue

        if len(processed_states) == 0:
            st.error("❌ No reviews were processed successfully.")
            progress_bar.empty()
            status_text.empty()
            return False

        # Stage 4: Batch Analysis (progress 90% -> 100%)
        status_text.text("📊 Generating batch insights...")
        progress_bar.progress(90)

        insights = pipeline.analyze_batch(processed_states)

        # Store in session state so the dashboard survives reruns
        st.session_state.results = processed_states
        st.session_state.insights = insights
        st.session_state.processing_complete = True

        # Complete
        progress_bar.progress(100)
        status_text.text("✅ Analysis complete!")
        time.sleep(1)
        progress_bar.empty()
        status_text.empty()

        return True

    except Exception as e:
        # Top-level boundary: surface the error + traceback in the UI
        st.error(f"❌ Error during processing: {str(e)}")
        import traceback
        st.code(traceback.format_exc())
        return False
194
+
195
+
196
# ============================================================================
# VISUALIZATION FUNCTIONS
# ============================================================================

def create_summary_section(scraped_count: int, results: List[Dict], insights: Dict):
    """Render the dashboard summary: banner, headline metrics, recommendations.

    Args:
        scraped_count: Number of reviews saved during the scraping stage.
        results: Per-review processed states.
        insights: Batch aggregates (sentiment/priority distributions,
            churn_risk percentage, recommendations list).
    """

    total = len(results)
    sentiment_dist = insights.get('sentiment_distribution', {})
    positive = sentiment_dist.get('POSITIVE', 0)
    negative = sentiment_dist.get('NEGATIVE', 0)
    critical = insights.get('priority_distribution', {}).get('critical', 0)
    churn_risk = insights.get('churn_risk', 0)

    # Success header
    st.markdown(
        f"""
        <div class="success-box">
            <h1 style="margin: 0;">✅ Analysis Complete!</h1>
            <p style="margin: 10px 0 0 0; font-size: 1.2em; opacity: 0.9;">
                Review Intelligence System Results
            </p>
        </div>
        """,
        unsafe_allow_html=True
    )

    # Headline metrics (guard against division by zero on empty batches)
    col1, col2, col3, col4, col5 = st.columns(5)

    with col1:
        st.metric("Total Reviews", total, f"Scraped: {scraped_count}")

    with col2:
        pos_pct = (positive / total * 100) if total > 0 else 0
        st.metric("Positive", positive, f"{pos_pct:.1f}%")

    with col3:
        neg_pct = (negative / total * 100) if total > 0 else 0
        st.metric("Negative", negative, f"{neg_pct:.1f}%")

    with col4:
        st.metric("Critical Issues", critical, "🚨" if critical > 0 else "✅")

    with col5:
        # Fix: the original computed delta_color but never passed it to
        # st.metric, so high churn risk was not highlighted.
        delta_color = "inverse" if churn_risk > 30 else "normal"
        st.metric("Churn Risk", f"{churn_risk:.1f}%",
                  "⚠️ High" if churn_risk > 30 else "✅ Low",
                  delta_color=delta_color)

    # Recommendations
    st.markdown("### 💡 Key Recommendations")
    for rec in insights.get('recommendations', []):
        st.info(rec)
249
+
250
+
251
def create_sentiment_chart(insights: Dict):
    """Create sentiment distribution donut chart.

    Colors are mapped per sentiment label. The original used a fixed color
    list that assumed the dict keys arrived in POSITIVE/NEUTRAL/NEGATIVE
    order, which could paint NEGATIVE green when a label was missing or
    ordered differently.
    """
    sentiment_dist = insights.get('sentiment_distribution', {})

    labels = list(sentiment_dist.keys())
    values = list(sentiment_dist.values())
    # Explicit label -> color mapping; unknown labels fall back to gray.
    color_map = {'POSITIVE': '#2ca02c', 'NEUTRAL': '#ff7f0e', 'NEGATIVE': '#d62728'}
    colors = [color_map.get(label, '#7f7f7f') for label in labels]

    fig = go.Figure(data=[go.Pie(
        labels=labels,
        values=values,
        hole=0.5,
        marker_colors=colors,
        textinfo='label+percent',
        textposition='outside',
        textfont_size=14
    )])

    fig.update_layout(
        title="😊 Sentiment Distribution",
        showlegend=True,
        height=400
    )

    return fig
276
+
277
+
278
def create_priority_chart(insights: Dict):
    """Create priority distribution bar chart.

    Colors are mapped per priority name. The original sliced a fixed color
    list (`colors[:len(labels)]`), so when a level was absent (e.g. no
    'critical' reviews) the remaining bars shifted onto the wrong colors —
    'high' would be drawn in critical red.
    """
    priority_dist = insights.get('priority_distribution', {})

    # Stable severity ordering for the x-axis.
    priority_order = ['critical', 'high', 'medium', 'low']
    color_map = {
        'critical': '#d62728',
        'high': '#ff7f0e',
        'medium': '#1f77b4',
        'low': '#2ca02c',
    }
    labels = [p for p in priority_order if p in priority_dist]
    values = [priority_dist.get(p, 0) for p in labels]
    colors = [color_map[p] for p in labels]

    fig = go.Figure(data=[go.Bar(
        x=labels,
        y=values,
        marker_color=colors,
        text=values,
        textposition='auto'
    )])

    fig.update_layout(
        title="🎯 Priority Levels",
        xaxis_title="Priority",
        yaxis_title="Count",
        height=400
    )

    return fig
303
+
304
+
305
def create_department_chart(insights: Dict):
    """Build a horizontal bar chart of issue counts routed to each department."""
    dept_counts = insights.get('department_distribution', {})

    departments = list(dept_counts.keys())
    counts = list(dept_counts.values())

    # Horizontal bars: counts on x, department names on y.
    trace = go.Bar(
        x=counts,
        y=departments,
        orientation='h',
        marker_color='#667eea',
        text=counts,
        textposition='auto'
    )
    fig = go.Figure(data=[trace])

    fig.update_layout(
        title="🏢 Department Routing",
        xaxis_title="Number of Issues",
        yaxis_title="Department",
        height=400
    )

    return fig
329
+
330
+
331
def create_emotion_chart(insights: Dict):
    """Build a bar chart of review counts per detected emotion, shaded by count."""
    emotion_counts = insights.get('emotion_distribution', {})

    emotions = list(emotion_counts.keys())
    counts = list(emotion_counts.values())

    # Continuous Viridis scale colors each bar by its height.
    fig = px.bar(
        x=emotions,
        y=counts,
        labels={'x': 'Emotion', 'y': 'Count'},
        color=counts,
        color_continuous_scale='Viridis'
    )

    fig.update_layout(
        title="😊 Emotional Analysis",
        xaxis_title="Emotion Type",
        yaxis_title="Number of Reviews",
        height=300,
        showlegend=False
    )

    return fig
355
+
356
+
357
def create_reviews_dataframe(results: List[Dict]) -> pd.DataFrame:
    """Build the summary DataFrame backing the reviews table and CSV export.

    Args:
        results: Per-review processed states from the pipeline.

    Returns:
        One row per review with truncated ID/text and the key
        classification fields.

    Fixes over the original:
    - '...' was appended unconditionally, even to short or empty text;
      now it is added only when the text is actually truncated.
    - a None 'review_id' value crashed the slice; now treated as 'N/A'.
    """
    df_data = []
    for review in results:
        text = review.get('review_text') or ''
        # Truncate long review text to 100 chars with an ellipsis marker.
        snippet = text[:100] + '...' if len(text) > 100 else (text or 'N/A')
        review_id = review.get('review_id') or 'N/A'
        df_data.append({
            'Review ID': review_id[:20],
            'Rating': review.get('rating', 0),
            'Review': snippet,
            'Sentiment': review.get('stage3_final_sentiment', 'N/A'),
            'Type': review.get('stage1_llm1_type', 'N/A'),
            'Department': review.get('stage1_llm1_department', 'N/A'),
            'Priority': review.get('stage1_llm1_priority', 'N/A'),
            'Emotion': review.get('stage1_llm2_emotion', 'N/A'),
            'Needs Review': '🚨 Yes' if review.get('stage3_needs_human_review') else '✅ No'
        })

    return pd.DataFrame(df_data)
375
+
376
+
377
# ============================================================================
# MAIN APP
# ============================================================================

def main():
    """Top-level Streamlit entry point: header, sidebar, then one of two views."""
    # Page header
    st.title("🎯 Review Intelligence System")
    st.markdown("### Multi-Stage AI Analysis for App Store & Play Store Reviews")
    st.markdown("Powered by **LangGraph** + **HuggingFace** • 4-Stage Processing Pipeline")
    st.markdown("---")

    # Sidebar control panel: reset button after a run, hint before one.
    with st.sidebar:
        st.header("🎛️ Control Panel")
        if st.session_state.processing_complete:
            st.success("✅ Analysis Complete!")
            if st.button("🔄 Start New Analysis", use_container_width=True):
                # Clear cached results and rerun in input mode.
                st.session_state.processing_complete = False
                st.session_state.results = None
                st.session_state.insights = None
                st.rerun()
        else:
            st.info("👈 Enter URLs below to start")

    # Route the main area: results dashboard once processing is done,
    # otherwise the input form.
    if st.session_state.processing_complete:
        show_results_dashboard()
    else:
        show_input_form()
412
+
413
def show_input_form():
    """Render the input view: store URL fields, API key, review limit, and
    the Start button that kicks off process_reviews_streamlit()."""

    st.markdown("### 📝 Step 1: Enter Store URLs")

    # Two side-by-side input areas: App Store IDs and Play Store packages.
    col1, col2 = st.columns(2)

    with col1:
        st.markdown("#### 🍎 App Store IDs")
        st.markdown(
            """
            **Format:** Just paste the app ID
            - Example: `1158907446` (UAE)
            - Example: `1234567890` (US)
            """
        )
        app_store_urls = st.text_area(
            "App Store IDs (one per line)",
            placeholder="1158907446\n1234567890",
            height=150,
            key="app_urls"
        )

    with col2:
        st.markdown("#### 🤖 Play Store Packages")
        st.markdown(
            """
            **Format:** Package name
            - Example: `com.yas.app`
            - Example: `com.company.app`
            """
        )
        play_store_urls = st.text_area(
            "Play Store Package Names (one per line)",
            placeholder="com.yas.app\ncom.company.app",
            height=150,
            key="play_urls"
        )

    st.markdown("---")
    st.markdown("### 🔑 Step 2: Configure Settings")

    col1, col2 = st.columns([2, 1])

    with col1:
        # Password-masked; the key is only exported to the environment later.
        hf_api_key = st.text_input(
            "🔑 HuggingFace API Key",
            type="password",
            placeholder="hf_...",
            help="Get your key from: https://huggingface.co/settings/tokens",
            key="hf_key"
        )

    with col2:
        review_limit = st.slider(
            "📊 Reviews per App",
            min_value=5,
            max_value=100,
            value=20,
            step=5,
            help="More reviews = longer processing time",
            key="review_limit"
        )

    st.markdown("---")

    # Submit button (middle column keeps it centered)
    col1, col2, col3 = st.columns([1, 1, 1])

    with col2:
        if st.button("🚀 Start Analysis", use_container_width=True, type="primary"):
            with st.spinner("Processing..."):
                success = process_reviews_streamlit(
                    app_store_urls,
                    play_store_urls,
                    hf_api_key,
                    review_limit
                )

            # On success, rerun so main() switches to the results dashboard.
            if success:
                st.balloons()
                st.rerun()

    # Documentation
    with st.expander("📚 How to Use"):
        st.markdown("""
        ### 📖 Quick Guide

        **1. Get HuggingFace API Key:**
        - Visit: https://huggingface.co/settings/tokens
        - Create new token (Read access)
        - Copy token (starts with `hf_`)

        **2. Enter URLs:**
        - **App Store**: Just the ID number (e.g., `1234567890`)
        - **Play Store**: Package name (e.g., `com.company.app`)
        - One per line

        **3. Click Start:**
        - Watch progress bar
        - Wait for completion (~7 sec per review)
        - View results automatically

        ### 🏗️ What Happens:
        - 🕷️ **Stage 0**: Scrapes reviews from stores
        - 🤖 **Stage 1**: Classifies with 3 AI models (Type, Department, Priority)
        - 😊 **Stage 2**: Analyzes sentiment with dual BERT models
        - 📊 **Stage 3**: Synthesizes insights and recommendations
        - 💡 **Stage 4**: Generates batch analytics

        ### ⚡ Performance:
        - ~7 seconds per review
        - 7 AI models working together
        - Parallel execution for speed
        """)
528
+
529
+
530
def show_results_dashboard():
    """Render the results view: summary metrics plus four tabs
    (charts, critical issues, filterable table, exports).

    Reads results/insights/scraped_count from st.session_state, which
    process_reviews_streamlit() populated on the previous run.
    """

    results = st.session_state.results
    insights = st.session_state.insights
    scraped_count = st.session_state.scraped_count

    # Summary section (banner + headline metrics + recommendations)
    create_summary_section(scraped_count, results, insights)

    st.markdown("---")

    # Tabs for different views
    tab1, tab2, tab3, tab4 = st.tabs([
        "📊 Sentiment Analysis",
        "🚨 Critical Issues",
        "📋 All Reviews",
        "📥 Export"
    ])

    # TAB 1: Sentiment Analysis — four batch-level charts
    with tab1:
        st.header("📊 Sentiment Analysis Overview")

        col1, col2 = st.columns(2)

        with col1:
            fig_sentiment = create_sentiment_chart(insights)
            st.plotly_chart(fig_sentiment, use_container_width=True)

        with col2:
            fig_priority = create_priority_chart(insights)
            st.plotly_chart(fig_priority, use_container_width=True)

        st.markdown("### 🏢 Department Routing")
        fig_dept = create_department_chart(insights)
        st.plotly_chart(fig_dept, use_container_width=True)

        st.markdown("### 😊 Emotional Analysis")
        fig_emotion = create_emotion_chart(insights)
        st.plotly_chart(fig_emotion, use_container_width=True)

    # TAB 2: Critical Issues — one expander per flagged review
    with tab2:
        st.header("🚨 Critical Issues Requiring Attention")

        # A review is "critical" if any of: critical priority, flagged for
        # human review, or negative sentiment with a rating of 2 or below.
        critical_reviews = [
            r for r in results
            if (r.get('stage1_llm1_priority') == 'critical' or
                r.get('stage3_needs_human_review') or
                (r.get('stage3_final_sentiment') == 'NEGATIVE' and r.get('rating', 5) <= 2))
        ]

        if len(critical_reviews) == 0:
            st.success("✅ No critical issues found! All reviews are in good shape.")
        else:
            st.warning(f"Found {len(critical_reviews)} critical issues")

            for review in critical_reviews:
                with st.expander(
                    f"⚠️ {review.get('review_id', 'Unknown')[:30]} - "
                    f"Rating: {review.get('rating', 'N/A')}/5"
                ):
                    col1, col2 = st.columns([2, 1])

                    with col1:
                        st.markdown("**Review Text:**")
                        st.write(review.get('review_text', 'No text available'))

                        st.markdown("**Reasoning:**")
                        st.info(review.get('stage3_reasoning', 'No reasoning available'))

                    with col2:
                        st.markdown("**Classification:**")
                        st.write(f"📌 Type: {review.get('stage1_llm1_type', 'N/A')}")
                        st.write(f"🏢 Department: {review.get('stage1_llm1_department', 'N/A')}")
                        st.write(f"🎯 Priority: {review.get('stage1_llm1_priority', 'N/A')}")
                        st.write(f"😔 Emotion: {review.get('stage1_llm2_emotion', 'N/A')}")
                        st.write(f"💭 Sentiment: {review.get('stage3_final_sentiment', 'N/A')}")

                        st.markdown("**Action:**")
                        st.error(review.get('stage3_action_recommendation', 'No action specified'))

    # TAB 3: All Reviews — filterable table
    with tab3:
        st.header("📋 Detailed Review Analysis")

        # Create DataFrame
        df = create_reviews_dataframe(results)

        # Filters (multiselects default to "everything selected")
        col1, col2, col3 = st.columns(3)

        with col1:
            sentiment_filter = st.multiselect(
                "Filter by Sentiment",
                options=df['Sentiment'].unique(),
                default=df['Sentiment'].unique()
            )

        with col2:
            dept_filter = st.multiselect(
                "Filter by Department",
                options=df['Department'].unique(),
                default=df['Department'].unique()
            )

        with col3:
            priority_filter = st.multiselect(
                "Filter by Priority",
                options=df['Priority'].unique(),
                default=df['Priority'].unique()
            )

        # Apply filters (logical AND across the three dimensions)
        filtered_df = df[
            (df['Sentiment'].isin(sentiment_filter)) &
            (df['Department'].isin(dept_filter)) &
            (df['Priority'].isin(priority_filter))
        ]

        st.info(f"Showing {len(filtered_df)} of {len(df)} reviews")

        # Display table
        st.dataframe(
            filtered_df,
            use_container_width=True,
            height=600
        )

    # TAB 4: Export — CSV / JSON downloads plus summary stats
    with tab4:
        st.header("📥 Export Results")

        st.markdown("### Download Options")

        col1, col2 = st.columns(2)

        with col1:
            st.markdown("#### 📊 CSV Export")
            st.write("Download complete analysis with all classifications")

            df = create_reviews_dataframe(results)
            csv = df.to_csv(index=False)

            st.download_button(
                label="📥 Download CSV Report",
                data=csv,
                file_name=f"review_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                mime="text/csv",
                use_container_width=True
            )

        with col2:
            st.markdown("#### 📋 JSON Export")
            st.write("Download raw data with all details")

            import json
            # Raw dump: full per-review states + batch insights + metadata
            json_data = json.dumps({
                'results': results,
                'insights': insights,
                'scraped_count': scraped_count,
                'export_date': datetime.now().isoformat()
            }, indent=2)

            st.download_button(
                label="📥 Download JSON Data",
                data=json_data,
                file_name=f"review_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                mime="application/json",
                use_container_width=True
            )

        st.markdown("---")
        st.markdown("### 📊 Summary Statistics")

        col1, col2, col3 = st.columns(3)

        with col1:
            st.metric("Total Reviews Analyzed", len(results))

        with col2:
            positive = insights.get('sentiment_distribution', {}).get('POSITIVE', 0)
            total = len(results)
            pct = (positive / total * 100) if total > 0 else 0
            st.metric("Positive Rate", f"{pct:.1f}%")

        with col3:
            critical = insights.get('priority_distribution', {}).get('critical', 0)
            st.metric("Critical Issues", critical)
721
+
722
+
723
# ============================================================================
# FOOTER
# ============================================================================

def show_footer():
    """Show footer with credits (static HTML banner under the app body)."""
    st.markdown("---")
    st.markdown(
        """
        <div style='text-align: center'>
            <p>🤖 Powered by Multi-Stage AI Pipeline |
            Stage 1: Classification (Qwen, Mistral, Llama) |
            Stage 2: Sentiment (Twitter-BERT) |
            Stage 3: Finalization (Llama 70B) |
            Stage 4: Batch Analysis</p>
            <p>Built with ❤️ using LangGraph + HuggingFace + Streamlit</p>
        </div>
        """,
        unsafe_allow_html=True
    )
743
+
744
+
745
# ============================================================================
# RUN APP
# ============================================================================

# Streamlit executes this script top-to-bottom on every rerun; the guard
# keeps main()/show_footer() from firing if the module is merely imported.
if __name__ == "__main__":
    main()
    show_footer()
config.yaml ADDED
@@ -0,0 +1,508 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🎯 Review Intelligence System Configuration
2
+ # Edit this file to customize agent behavior, prompts, and models
3
+
4
+ # =============================================================================
5
+ # MODELS CONFIGURATION
6
+ # =============================================================================
7
+ models:
8
+ # Stage 1: Classification Models
9
+ stage1:
10
+ llm1:
11
+ name: "Qwen/Qwen2.5-72B-Instruct"
12
+ temperature: 0.1
13
+ max_tokens: 200
14
+ role: "Type, Department, Priority classifier"
15
+
16
+ llm2:
17
+ name: "mistralai/Mistral-7B-Instruct-v0.3"
18
+ temperature: 0.1
19
+ max_tokens: 200
20
+ role: "User type, Emotion, Context analyzer"
21
+
22
+ manager:
23
+ name: "meta-llama/Llama-3.1-8B-Instruct"
24
+ temperature: 0.1
25
+ max_tokens: 250
26
+ role: "Synthesis manager"
27
+
28
+ # Stage 2: Sentiment Models (Local BERT)
29
+ stage2:
30
+ best_model:
31
+ name: "cardiffnlp/twitter-roberta-base-sentiment-latest"
32
+ type: "local"
33
+ description: "Twitter-trained RoBERTa (124M tweets)"
34
+
35
+ alternate_model:
36
+ name: "finiteautomata/bertweet-base-sentiment-analysis"
37
+ type: "local"
38
+ description: "BERTweet (850M tweets)"
39
+
40
+ # Stage 3: Finalization Model
41
+ stage3:
42
+ llm3:
43
+ name: "meta-llama/Llama-3.1-70B-Instruct"
44
+ temperature: 0.1
45
+ max_tokens: 400
46
+ role: "Final synthesis and reasoning"
47
+
48
+ # =============================================================================
49
+ # AGENT PERSONAS & PROMPTS
50
+ # =============================================================================
51
+ personas:
52
+ # LLM1: Classification Expert
53
+ llm1:
54
+ name: "Classification Specialist"
55
+ expertise: "Expert at classifying customer reviews for theme park and attraction apps"
56
+ personality: "Analytical, precise, focused on categorization"
57
+ tone: "Professional and systematic"
58
+
59
+ system_prompt: |
60
+ You are an expert at classifying customer reviews for theme park and attraction apps.
61
+ Your job is to analyze reviews and categorize them across multiple dimensions.
62
+ Be precise, analytical, and consistent in your classifications.
63
+
64
+ categories:
65
+ type:
66
+ - complaint: "Customer reports a problem"
67
+ - praise: "Customer expresses satisfaction"
68
+ - suggestion: "Customer proposes improvement"
69
+ - question: "Customer asks about something"
70
+ - bug_report: "Technical issue described"
71
+
72
+ department:
73
+ - engineering: "Technical issues, bugs, crashes"
74
+ - ux: "Design, usability, interface issues"
75
+ - support: "Customer service, help needed"
76
+ - business: "Pricing, policies, marketing"
77
+
78
+ priority:
79
+ - critical: "Service down, major blocker"
80
+ - high: "Significant problem affecting use"
81
+ - medium: "Inconvenience but not blocking"
82
+ - low: "Minor issue or suggestion"
83
+
84
+ # LLM2: Psychology Expert
85
+ llm2:
86
+ name: "User Psychology Analyst"
87
+ expertise: "Expert at understanding customer psychology and emotional context"
88
+ personality: "Empathetic, insightful, human-centered"
89
+ tone: "Warm yet professional"
90
+
91
+ system_prompt: |
92
+ You are an expert at understanding customer psychology and emotional context.
93
+ Your job is to analyze the human behind the review - their emotions, user type, and context.
94
+ Be empathetic, insightful, and focus on the human experience.
95
+
96
+ categories:
97
+ user_type:
98
+ - new_user: "First-time or new user"
99
+ - regular_user: "Returning customer"
100
+ - power_user: "Heavy user, tech-savvy"
101
+ - churning_user: "Considering leaving"
102
+
103
+ emotion:
104
+ - anger: "Angry, hostile tone"
105
+ - frustration: "Frustrated but not angry"
106
+ - joy: "Happy, satisfied"
107
+ - satisfaction: "Content, pleased"
108
+ - disappointment: "Let down, sad"
109
+ - confusion: "Unclear, needs help"
110
+
111
+ # Manager: Synthesis Expert
112
+ manager:
113
+ name: "Synthesis Manager"
114
+ expertise: "Expert at reconciling multiple AI analyses and making final decisions"
115
+ personality: "Balanced, fair, decisive"
116
+ tone: "Authoritative yet collaborative"
117
+
118
+ system_prompt: |
119
+ You are a synthesis manager evaluating two AI analyses of the same review.
120
+ Your job is to validate both analyses, resolve conflicts, and make final classification decisions.
121
+ Be thorough, fair, and provide clear reasoning for your decisions.
122
+
123
+ # LLM3: Strategic Analyst
124
+ llm3:
125
+ name: "Strategic Decision Maker"
126
+ expertise: "Expert at synthesizing complex data and providing actionable recommendations"
127
+ personality: "Strategic, comprehensive, business-focused"
128
+ tone: "Executive-level, actionable"
129
+
130
+ system_prompt: |
131
+ You are a final decision-making AI analyzing customer feedback for a theme park/attraction app.
132
+ Your job is to synthesize all previous analysis stages and provide comprehensive, actionable insights.
133
+ Think strategically about business impact, user satisfaction, and operational priorities.
134
+ Your recommendations should be clear, specific, and immediately actionable.
135
+
136
+ # =============================================================================
137
+ # CLASSIFICATION RULES
138
+ # =============================================================================
139
+ classification_rules:
140
+ # Priority escalation rules
141
+ priority_escalation:
142
+ keywords_critical:
143
+ - "crash"
144
+ - "doesn't work"
145
+ - "broken"
146
+ - "can't use"
147
+ - "completely unusable"
148
+ - "emergency"
149
+ - "urgent"
150
+
151
+ keywords_high:
152
+ - "bug"
153
+ - "error"
154
+ - "problem"
155
+ - "issue"
156
+ - "not working"
157
+ - "frustrated"
158
+
159
+ rating_thresholds:
160
+ critical: 1 # 1-star reviews are critical
161
+ high: 2 # 2-star reviews are high priority
162
+
163
+ # Department routing rules
164
+ department_keywords:
165
+ engineering:
166
+ - "crash"
167
+ - "bug"
168
+ - "error"
169
+ - "not loading"
170
+ - "freeze"
171
+ - "slow"
172
+ - "technical"
173
+
174
+ ux:
175
+ - "confusing"
176
+ - "hard to use"
177
+ - "can't find"
178
+ - "design"
179
+ - "layout"
180
+ - "interface"
181
+ - "navigation"
182
+
183
+ support:
184
+ - "help"
185
+ - "contact"
186
+ - "customer service"
187
+ - "support"
188
+ - "assistance"
189
+ - "question"
190
+
191
+ business:
192
+ - "price"
193
+ - "refund"
194
+ - "subscription"
195
+ - "billing"
196
+ - "expensive"
197
+ - "policy"
198
+
199
+ # Churn risk indicators
200
+ churn_indicators:
201
+ high_risk:
202
+ - "switching to"
203
+ - "deleted the app"
204
+ - "uninstalling"
205
+ - "terrible experience"
206
+ - "never again"
207
+ - "disappointed"
208
+
209
+ medium_risk:
210
+ - "might switch"
211
+ - "considering alternatives"
212
+ - "getting worse"
213
+ - "used to be better"
214
+
215
+ # =============================================================================
216
+ # SENTIMENT ANALYSIS SETTINGS
217
+ # =============================================================================
218
+ sentiment:
219
+ # Agreement thresholds
220
+ agreement:
221
+ strong_threshold: 0.9 # Both models >0.9 confidence
222
+ weak_threshold: 0.6 # One model <0.6 confidence
223
+
224
+ # Confidence weighting
225
+ confidence:
226
+ minimum_acceptable: 0.5
227
+ high_confidence: 0.8
228
+ very_high_confidence: 0.95
229
+
230
+ # Override rules
231
+ override_rules:
232
+ # If rating is 1-star but sentiment is positive, flag for review
233
+ rating_sentiment_mismatch:
234
+ enabled: true
235
+ flag_threshold: 2 # 2-star difference
236
+
237
+ # =============================================================================
238
+ # BATCH ANALYSIS SETTINGS
239
+ # =============================================================================
240
+ batch_analysis:
241
+ # Critical issues detection
242
+ critical_issues:
243
+ max_display: 10
244
+ criteria:
245
+ - priority: "critical"
246
+ - sentiment: "NEGATIVE"
247
+ - rating: "<=2"
248
+ - needs_human_review: true
249
+
250
+ # Quick wins detection
251
+ quick_wins:
252
+ max_display: 10
253
+ criteria:
254
+ - type: "suggestion"
255
+ - priority: ["low", "medium"]
256
+ - feasibility: "easy"
257
+
258
+ # Churn risk calculation
259
+ churn_risk:
260
+ weights:
261
+ churning_user: 2.0
262
+ negative_low_rating: 1.5
263
+ rating_1_star: 1.0
264
+
265
+ thresholds:
266
+ high: 30 # >30% is high risk
267
+ medium: 15 # 15-30% is medium risk
268
+ low: 0 # <15% is low risk
269
+
270
+ # =============================================================================
271
+ # PROMPT TEMPLATES
272
+ # =============================================================================
273
+ prompt_templates:
274
+ # Stage 1 LLM1 Prompt
275
+ stage1_llm1: |
276
+ You are an expert at classifying customer reviews for theme park and attraction apps.
277
+
278
+ REVIEW:
279
+ Rating: {rating}/5
280
+ Text: {review_text}
281
+
282
+ Classify this review across these dimensions:
283
+
284
+ 1. TYPE (choose ONE): {type_options}
285
+ 2. DEPARTMENT (choose ONE): {department_options}
286
+ 3. PRIORITY (choose ONE): {priority_options}
287
+ 4. CONFIDENCE (0.0-1.0): How confident are you in this classification?
288
+ 5. REASONING: Brief one-sentence explanation
289
+
290
+ Respond ONLY in valid JSON format:
291
+ {{
292
+ "type": "complaint/praise/suggestion/question/bug_report",
293
+ "department": "engineering/ux/support/business",
294
+ "priority": "critical/high/medium/low",
295
+ "confidence": 0.0-1.0,
296
+ "reasoning": "brief explanation"
297
+ }}
298
+
299
+ # Stage 1 LLM2 Prompt
300
+ stage1_llm2: |
301
+ You are an expert at understanding customer psychology and emotional context.
302
+
303
+ REVIEW:
304
+ Rating: {rating}/5
305
+ Text: {review_text}
306
+
307
+ Analyze the user and emotional context:
308
+
309
+ 1. USER_TYPE (choose ONE): {user_type_options}
310
+ 2. EMOTION (choose ONE): {emotion_options}
311
+ 3. CONTEXT (brief): What is the underlying issue or situation? 1-2 words summary
312
+ 4. CONFIDENCE (0.0-1.0): How confident are you?
313
+ 5. REASONING: Brief one-sentence explanation
314
+
315
+ Respond ONLY in valid JSON format:
316
+ {{
317
+ "user_type": "new_user/regular_user/power_user/churning_user",
318
+ "emotion": "anger/frustration/joy/satisfaction/disappointment/confusion",
319
+ "context": "brief context",
320
+ "confidence": 0.0-1.0,
321
+ "reasoning": "brief explanation"
322
+ }}
323
+
324
+ # Stage 1 Manager Prompt
325
+ stage1_manager: |
326
+ You are a synthesis manager evaluating two AI analyses of the same review.
327
+
328
+ REVIEW:
329
+ Rating: {rating}/5
330
+ Text: {review_text}
331
+
332
+ LLM1 ANALYSIS (Type/Dept/Priority):
333
+ {llm1_result}
334
+
335
+ LLM2 ANALYSIS (User/Emotion/Context):
336
+ {llm2_result}
337
+
338
+ Your task:
339
+ 1. Validate both analyses
340
+ 2. Resolve any conflicts
341
+ 3. Make final classification decision
342
+ 4. Provide synthesis reasoning
343
+
344
+ Respond ONLY in valid JSON format:
345
+ {{
346
+ "final_type": "from llm1 or adjusted",
347
+ "final_department": "from llm1 or adjusted",
348
+ "final_priority": "from llm1 or adjusted",
349
+ "final_user_type": "from llm2 or adjusted",
350
+ "final_emotion": "from llm2 or adjusted",
351
+ "confidence": 0.0-1.0,
352
+ "reasoning": "synthesis explanation",
353
+ "conflicts_found": "any conflicts between LLM1 and LLM2, or 'none'"
354
+ }}
355
+
356
+ # Stage 3 LLM3 Prompt
357
+ stage3_llm3: |
358
+ You are a final decision-making AI analyzing customer feedback for a theme park/attraction app.
359
+
360
+ REVIEW DATA:
361
+ Rating: {rating}/5
362
+ Text: {review_text}
363
+
364
+ STAGE 1 CLASSIFICATION:
365
+ - Review Type: {type}
366
+ - Department: {department}
367
+ - Priority: {priority}
368
+ - User Type: {user_type}
369
+ - Emotion: {emotion}
370
+
371
+ STAGE 2 SENTIMENT ANALYSIS:
372
+ - Best Model: {best_sentiment} (confidence: {best_confidence})
373
+ - Alternate Model: {alt_sentiment} (confidence: {alt_confidence})
374
+ - Models Agreement: {agreement}
375
+
376
+ YOUR TASK:
377
+ 1. Review all data from both stages
378
+ 2. Make FINAL sentiment decision (POSITIVE, NEGATIVE, or NEUTRAL)
379
+ 3. Validate that classification and sentiment align
380
+ 4. Provide comprehensive reasoning
381
+ 5. Identify any conflicts between stages
382
+ 6. Generate action recommendation
383
+ 7. Flag if human review is needed
384
+
385
+ Respond ONLY in valid JSON format:
386
+ {{
387
+ "final_sentiment": "POSITIVE/NEGATIVE/NEUTRAL",
388
+ "confidence": 0.0-1.0,
389
+ "reasoning": "Comprehensive explanation synthesizing all stages",
390
+ "validation_notes": "Does classification match sentiment?",
391
+ "conflicts_found": "any conflicts or 'none'",
392
+ "action_recommendation": "Specific action to take",
393
+ "needs_human_review": true/false
394
+ }}
395
+
396
+ # =============================================================================
397
+ # PROCESSING SETTINGS
398
+ # =============================================================================
399
+ processing:
400
+ # Batch settings
401
+ batch_size: 10
402
+ max_workers: 3
403
+ timeout_seconds: 30
404
+ retry_attempts: 3
405
+
406
+ # Rate limiting (for HF API)
407
+ rate_limit:
408
+ requests_per_minute: 60
409
+ requests_per_day: 10000 # HF Pro limit
410
+
411
+ # Logging
412
+ logging:
413
+ level: "INFO" # DEBUG, INFO, WARNING, ERROR
414
+ save_logs: true
415
+ log_file: "processing.log"
416
+
417
+ # Checkpointing
418
+ checkpoint:
419
+ enabled: true
420
+ save_after_each_stage: true
421
+ auto_resume: true
422
+
423
+ # =============================================================================
424
+ # DASHBOARD SETTINGS
425
+ # =============================================================================
426
+ dashboard:
427
+ # UI Configuration
428
+ ui:
429
+ title: "Review Intelligence System"
430
+ icon: "🎯"
431
+ layout: "wide"
432
+ theme: "light" # light or dark
433
+
434
+ # Chart colors
435
+ colors:
436
+ positive: "#2ca02c"
437
+ neutral: "#ff7f0e"
438
+ negative: "#d62728"
439
+ critical: "#d62728"
440
+ high: "#ff7f0e"
441
+ medium: "#1f77b4"
442
+ low: "#2ca02c"
443
+
444
+ # Filters
445
+ filters:
446
+ enable_sentiment: true
447
+ enable_department: true
448
+ enable_priority: true
449
+ enable_date_range: false # Future feature
450
+
451
+ # Display limits
452
+ display:
453
+ max_critical_issues: 20
454
+ max_quick_wins: 15
455
+ reviews_per_page: 50
456
+ auto_refresh_seconds: 60
457
+
458
+ # =============================================================================
459
+ # DOMAIN-SPECIFIC CUSTOMIZATION (Theme Parks / Attractions)
460
+ # =============================================================================
461
+ domain:
462
+ name: "Theme Parks & Attractions"
463
+
464
+ # Common features to look for
465
+ features:
466
+ - "ticket booking"
467
+ - "queue times"
468
+ - "express pass"
469
+ - "meal plans"
470
+ - "park maps"
471
+ - "show times"
472
+ - "photo pass"
473
+ - "virtual queue"
474
+ - "ride reservations"
475
+ - "mobile ordering"
476
+
477
+ # Pain points to prioritize
478
+ pain_points:
479
+ high_impact:
480
+ - "can't book tickets"
481
+ - "app crashes during booking"
482
+ - "payment fails"
483
+ - "queue times wrong"
484
+ - "can't access tickets"
485
+
486
+ medium_impact:
487
+ - "map doesn't load"
488
+ - "slow performance"
489
+ - "confusing navigation"
490
+ - "notifications not working"
491
+
492
+ # Positive signals
493
+ positive_signals:
494
+ - "easy booking"
495
+ - "fast check-in"
496
+ - "helpful features"
497
+ - "saved time"
498
+ - "convenient"
499
+ - "great experience"
500
+
501
+ # =============================================================================
502
+ # NOTES
503
+ # =============================================================================
504
+ # - Edit this file to customize agent behavior
505
+ # - Prompts support variables in {curly_braces}
506
+ # - Model names must match HuggingFace model IDs
507
+ # - Temperature: 0.0 = deterministic, 1.0 = creative
508
+ # - Changes take effect on next run (no restart needed for some settings)
config_loader.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration Loader
3
+ Loads settings from config.yaml for agent personas and prompts
4
+ """
5
+
6
+ import yaml
7
+ import os
8
+ from typing import Dict, Any
9
+
10
class Config:
    """
    Configuration manager for the Review Intelligence System.

    Loads settings from a YAML file and exposes accessor methods for each
    configuration section (models, personas, prompt templates, rules, ...).
    Falls back to a built-in default configuration when the file is missing,
    unreadable, empty, or does not contain a top-level mapping.
    """

    def __init__(self, config_file: str = "config.yaml"):
        self.config_file = config_file
        # Parsed configuration mapping; never None (defaults used on failure).
        self.config = self._load_config()

    def _load_config(self) -> Dict[str, Any]:
        """Load configuration from the YAML file.

        Returns:
            The parsed configuration mapping, or the default configuration
            when the file is absent, cannot be parsed, or does not parse
            to a mapping.
        """
        if not os.path.exists(self.config_file):
            print(f"⚠️ Config file not found: {self.config_file}")
            print("   Using default configuration")
            return self._default_config()

        try:
            with open(self.config_file, 'r', encoding='utf-8') as f:
                config = yaml.safe_load(f)
        except Exception as e:
            print(f"⚠️ Error loading config: {e}")
            print("   Using default configuration")
            return self._default_config()

        # BUG FIX: yaml.safe_load returns None for an empty file (and may
        # return a scalar or list for unexpected input); the old code stored
        # that directly, so every accessor then crashed on .get().
        if not isinstance(config, dict):
            print(f"⚠️ Config file is empty or not a mapping: {self.config_file}")
            print("   Using default configuration")
            return self._default_config()

        print(f"✅ Configuration loaded from {self.config_file}")
        return config

    def _default_config(self) -> Dict[str, Any]:
        """Return the default configuration used when YAML is not available."""
        return {
            'models': {
                'stage1': {
                    'llm1': {'name': 'Qwen/Qwen2.5-72B-Instruct', 'temperature': 0.1},
                    'llm2': {'name': 'mistralai/Mistral-7B-Instruct-v0.3', 'temperature': 0.1},
                    'manager': {'name': 'meta-llama/Llama-3.1-8B-Instruct', 'temperature': 0.1}
                },
                'stage2': {
                    'best_model': {'name': 'cardiffnlp/twitter-roberta-base-sentiment-latest'},
                    'alternate_model': {'name': 'finiteautomata/bertweet-base-sentiment-analysis'}
                },
                'stage3': {
                    'llm3': {'name': 'meta-llama/Llama-3.1-70B-Instruct', 'temperature': 0.1}
                }
            }
        }

    def get_model(self, stage: str, model_key: str) -> Dict[str, Any]:
        """Get model configuration for a specific stage (empty dict if absent)."""
        return self.config.get('models', {}).get(stage, {}).get(model_key, {})

    def get_persona(self, agent: str) -> Dict[str, Any]:
        """Get persona configuration for an agent (empty dict if absent)."""
        return self.config.get('personas', {}).get(agent, {})

    def get_prompt_template(self, template_name: str) -> str:
        """Get a prompt template string (empty string if absent)."""
        return self.config.get('prompt_templates', {}).get(template_name, '')

    def get_classification_rules(self) -> Dict[str, Any]:
        """Get classification rules."""
        return self.config.get('classification_rules', {})

    def get_sentiment_settings(self) -> Dict[str, Any]:
        """Get sentiment analysis settings."""
        return self.config.get('sentiment', {})

    def get_batch_settings(self) -> Dict[str, Any]:
        """Get batch analysis settings."""
        return self.config.get('batch_analysis', {})

    def get_processing_settings(self) -> Dict[str, Any]:
        """Get processing settings."""
        return self.config.get('processing', {})

    def get_dashboard_settings(self) -> Dict[str, Any]:
        """Get dashboard settings."""
        return self.config.get('dashboard', {})
87
+
88
+
89
# Module-level singleton holder for the shared configuration object.
_config_instance = None


def get_config(config_file: str = "config.yaml") -> Config:
    """Return the shared Config singleton, creating it on first use.

    NOTE: ``config_file`` is only honoured on the very first call; later
    calls return the already-created instance unchanged.
    """
    global _config_instance
    instance = _config_instance
    if instance is None:
        instance = Config(config_file)
        _config_instance = instance
    return instance
98
+
99
+
100
if __name__ == "__main__":
    # Smoke-test the configuration loader from the command line.
    banner = "=" * 60
    print(f"\n{banner}\n🧪 TESTING CONFIG LOADER\n{banner}\n")

    config = get_config()

    # Model lookup
    llm1_config = config.get_model('stage1', 'llm1')
    print(f"LLM1 Model: {llm1_config.get('name', 'Not found')}")

    # Persona lookup
    llm1_persona = config.get_persona('llm1')
    print(f"LLM1 Persona: {llm1_persona.get('name', 'Not found')}")

    # Prompt-template lookup
    prompt = config.get_prompt_template('stage1_llm1')
    print(f"Prompt template loaded: {len(prompt)} characters")

    print("\n✅ Config loader test complete!")
database_enhanced.py ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Enhanced Database Schema for Multi-Stage Review Analysis
3
+ Adds Stage 1-4 columns to existing reviews table
4
+ """
5
+
6
+ import sqlite3
7
+ from datetime import datetime
8
+ from typing import Dict, List, Any, Optional
9
+ import json
10
+
11
class EnhancedDatabase:
    """
    Manages the enhanced database schema with Stage 1-4 columns.

    Adds classification (stage 1), sentiment (stage 2), finalization
    (stage 3) and processing-metadata columns to the existing ``reviews``
    table, plus an LLM-decision audit-log table and a batch-insights table.
    All schema changes are non-destructive: existing data is kept.
    """

    def __init__(self, db_file: str = "review_database.db"):
        self.db_file = db_file
        self.conn = None  # set by connect()
        print(f"📁 Database: {db_file}")

    def connect(self):
        """Open the SQLite connection with dict-like row access.

        Any previously opened connection is closed first, so repeated
        calls do not leak handles (fix).

        Returns:
            The live sqlite3 connection.
        """
        if self.conn is not None:
            self.conn.close()
        self.conn = sqlite3.connect(self.db_file)
        self.conn.row_factory = sqlite3.Row
        print("✅ Connected to database")
        return self.conn

    def close(self):
        """Close the database connection (no-op when not connected)."""
        if self.conn:
            self.conn.close()
            self.conn = None  # fix: drop the stale handle after closing
            print("✅ Database connection closed")

    def enhance_schema(self):
        """
        Add Stage 1-3 and metadata columns to the existing reviews table.

        Non-destructive: keeps all existing data and skips columns that
        already exist. Also creates the llm_decision_logs and
        batch_insights tables.

        Returns:
            Number of columns actually added.
        """
        print("\n" + "="*60)
        print("🔧 ENHANCING DATABASE SCHEMA")
        print("="*60)

        cursor = self.conn.cursor()

        # Discover which columns already exist so the ALTERs are idempotent.
        cursor.execute("PRAGMA table_info(reviews)")
        existing_columns = [row[1] for row in cursor.fetchall()]
        print(f"📋 Existing columns: {len(existing_columns)}")

        # Stage 1: Classification columns
        stage1_columns = [
            ("stage1_llm1_type", "TEXT"),
            ("stage1_llm1_department", "TEXT"),
            ("stage1_llm1_priority", "TEXT"),
            ("stage1_llm1_confidence", "REAL"),
            ("stage1_llm1_reasoning", "TEXT"),
            ("stage1_llm2_user_type", "TEXT"),
            ("stage1_llm2_emotion", "TEXT"),
            ("stage1_llm2_context", "TEXT"),
            ("stage1_llm2_confidence", "REAL"),
            ("stage1_llm2_reasoning", "TEXT"),
            ("stage1_manager_classification", "TEXT"),
            ("stage1_manager_reasoning", "TEXT"),
            ("stage1_completed_at", "TIMESTAMP"),
        ]

        # Stage 2: Sentiment columns
        stage2_columns = [
            ("stage2_best_sentiment", "TEXT"),
            ("stage2_best_confidence", "REAL"),
            ("stage2_best_prob_positive", "REAL"),
            ("stage2_best_prob_neutral", "REAL"),
            ("stage2_best_prob_negative", "REAL"),
            ("stage2_alt_sentiment", "TEXT"),
            ("stage2_alt_confidence", "REAL"),
            ("stage2_alt_prob_positive", "REAL"),
            ("stage2_alt_prob_neutral", "REAL"),
            ("stage2_alt_prob_negative", "REAL"),
            ("stage2_agreement", "BOOLEAN"),
            ("stage2_layer_sentiment", "TEXT"),
            ("stage2_completed_at", "TIMESTAMP"),
        ]

        # Stage 3: Finalization columns
        stage3_columns = [
            ("stage3_final_sentiment", "TEXT"),
            ("stage3_confidence", "REAL"),
            ("stage3_reasoning", "TEXT"),
            ("stage3_validation_notes", "TEXT"),
            ("stage3_conflicts_found", "TEXT"),
            ("stage3_action_recommendation", "TEXT"),
            ("stage3_needs_human_review", "BOOLEAN"),
            ("stage3_completed_at", "TIMESTAMP"),
        ]

        # Processing metadata
        metadata_columns = [
            ("processing_status", "TEXT DEFAULT 'pending'"),
            ("processing_version", "TEXT DEFAULT 'v1.0'"),
            ("processing_started_at", "TIMESTAMP"),
            ("processing_completed_at", "TIMESTAMP"),
        ]

        all_new_columns = (
            stage1_columns +
            stage2_columns +
            stage3_columns +
            metadata_columns
        )

        # Add only the columns that don't exist yet. Column names/types are
        # internal constants, so f-string DDL is safe here.
        added_count = 0
        for col_name, col_type in all_new_columns:
            if col_name not in existing_columns:
                try:
                    cursor.execute(f"ALTER TABLE reviews ADD COLUMN {col_name} {col_type}")
                    added_count += 1
                    print(f"   ✅ Added column: {col_name}")
                except sqlite3.OperationalError as e:
                    # Duplicate-column races are benign; report anything else.
                    if "duplicate column" not in str(e).lower():
                        print(f"   ⚠️ Error adding {col_name}: {e}")

        self.conn.commit()
        print(f"\n✅ Schema enhanced: {added_count} new columns added")

        # Create logs table for LLM decisions
        self._create_logs_table(cursor)

        # Create batch insights table
        self._create_batch_insights_table(cursor)

        return added_count

    def _create_logs_table(self, cursor):
        """Create the llm_decision_logs audit table (idempotent)."""
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS llm_decision_logs (
                log_id INTEGER PRIMARY KEY AUTOINCREMENT,
                review_id TEXT NOT NULL,
                stage TEXT NOT NULL,
                model_name TEXT NOT NULL,
                input_prompt TEXT,
                output_response TEXT,
                confidence REAL,
                reasoning TEXT,
                processing_time_seconds REAL,
                timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (review_id) REFERENCES reviews(review_id)
            )
        """)

        cursor.execute("""
            CREATE INDEX IF NOT EXISTS idx_logs_review_id
            ON llm_decision_logs(review_id)
        """)

        cursor.execute("""
            CREATE INDEX IF NOT EXISTS idx_logs_stage
            ON llm_decision_logs(stage)
        """)

        self.conn.commit()
        print("   ✅ Created llm_decision_logs table")

    def _create_batch_insights_table(self, cursor):
        """Create the batch_insights analytics table (idempotent)."""
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS batch_insights (
                batch_id INTEGER PRIMARY KEY AUTOINCREMENT,
                analysis_date DATE,
                total_reviews INTEGER,
                sentiment_positive INTEGER,
                sentiment_neutral INTEGER,
                sentiment_negative INTEGER,
                priority_critical INTEGER,
                priority_high INTEGER,
                priority_medium INTEGER,
                priority_low INTEGER,
                dept_engineering INTEGER,
                dept_ux INTEGER,
                dept_support INTEGER,
                dept_business INTEGER,
                critical_issues TEXT,
                quick_wins TEXT,
                recommendations TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)

        self.conn.commit()
        print("   ✅ Created batch_insights table")

    def get_pending_reviews(self, limit: Optional[int] = None) -> List[Dict]:
        """Return reviews not processed yet (status NULL or 'pending').

        Args:
            limit: Optional maximum number of rows (falsy values mean
                "no limit", matching the original behavior).
        """
        cursor = self.conn.cursor()

        query = """
            SELECT * FROM reviews
            WHERE processing_status IS NULL OR processing_status = 'pending'
            ORDER BY scraped_at DESC
        """

        # FIX: bind LIMIT as a query parameter instead of f-string
        # interpolation into the SQL text.
        params = ()
        if limit:
            query += " LIMIT ?"
            params = (limit,)

        cursor.execute(query, params)
        rows = cursor.fetchall()

        return [dict(row) for row in rows]

    def update_stage1(self, review_id: str, data: Dict[str, Any]):
        """Persist Stage 1 classification data and mark the review
        'stage1_complete'. Missing keys in ``data`` are stored as NULL."""
        cursor = self.conn.cursor()

        cursor.execute("""
            UPDATE reviews SET
                stage1_llm1_type = ?,
                stage1_llm1_department = ?,
                stage1_llm1_priority = ?,
                stage1_llm1_confidence = ?,
                stage1_llm1_reasoning = ?,
                stage1_llm2_user_type = ?,
                stage1_llm2_emotion = ?,
                stage1_llm2_context = ?,
                stage1_llm2_confidence = ?,
                stage1_llm2_reasoning = ?,
                stage1_manager_classification = ?,
                stage1_manager_reasoning = ?,
                stage1_completed_at = ?,
                processing_status = 'stage1_complete'
            WHERE review_id = ?
        """, (
            data.get('llm1_type'),
            data.get('llm1_department'),
            data.get('llm1_priority'),
            data.get('llm1_confidence'),
            data.get('llm1_reasoning'),
            data.get('llm2_user_type'),
            data.get('llm2_emotion'),
            data.get('llm2_context'),
            data.get('llm2_confidence'),
            data.get('llm2_reasoning'),
            data.get('manager_classification'),
            data.get('manager_reasoning'),
            datetime.now().isoformat(),
            review_id
        ))

        self.conn.commit()

    def update_stage2(self, review_id: str, data: Dict[str, Any]):
        """Persist Stage 2 sentiment data and mark the review
        'stage2_complete'. Missing keys in ``data`` are stored as NULL."""
        cursor = self.conn.cursor()

        cursor.execute("""
            UPDATE reviews SET
                stage2_best_sentiment = ?,
                stage2_best_confidence = ?,
                stage2_best_prob_positive = ?,
                stage2_best_prob_neutral = ?,
                stage2_best_prob_negative = ?,
                stage2_alt_sentiment = ?,
                stage2_alt_confidence = ?,
                stage2_alt_prob_positive = ?,
                stage2_alt_prob_neutral = ?,
                stage2_alt_prob_negative = ?,
                stage2_agreement = ?,
                stage2_layer_sentiment = ?,
                stage2_completed_at = ?,
                processing_status = 'stage2_complete'
            WHERE review_id = ?
        """, (
            data.get('best_sentiment'),
            data.get('best_confidence'),
            data.get('best_prob_positive'),
            data.get('best_prob_neutral'),
            data.get('best_prob_negative'),
            data.get('alt_sentiment'),
            data.get('alt_confidence'),
            data.get('alt_prob_positive'),
            data.get('alt_prob_neutral'),
            data.get('alt_prob_negative'),
            data.get('agreement'),
            data.get('layer_sentiment'),
            datetime.now().isoformat(),
            review_id
        ))

        self.conn.commit()

    def update_stage3(self, review_id: str, data: Dict[str, Any]):
        """Persist Stage 3 finalization data and mark the review
        'complete' with a completion timestamp."""
        cursor = self.conn.cursor()

        cursor.execute("""
            UPDATE reviews SET
                stage3_final_sentiment = ?,
                stage3_confidence = ?,
                stage3_reasoning = ?,
                stage3_validation_notes = ?,
                stage3_conflicts_found = ?,
                stage3_action_recommendation = ?,
                stage3_needs_human_review = ?,
                stage3_completed_at = ?,
                processing_status = 'complete',
                processing_completed_at = ?
            WHERE review_id = ?
        """, (
            data.get('final_sentiment'),
            data.get('confidence'),
            data.get('reasoning'),
            data.get('validation_notes'),
            data.get('conflicts_found'),
            data.get('action_recommendation'),
            data.get('needs_human_review'),
            datetime.now().isoformat(),
            datetime.now().isoformat(),
            review_id
        ))

        self.conn.commit()

    def log_llm_decision(self, review_id: str, stage: str, model_name: str,
                         input_prompt: str, output_response: str,
                         confidence: float, reasoning: str, processing_time: float):
        """Append one LLM decision to the audit trail (llm_decision_logs)."""
        cursor = self.conn.cursor()

        cursor.execute("""
            INSERT INTO llm_decision_logs
            (review_id, stage, model_name, input_prompt, output_response,
             confidence, reasoning, processing_time_seconds)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """, (
            review_id, stage, model_name, input_prompt, output_response,
            confidence, reasoning, processing_time
        ))

        self.conn.commit()

    def get_all_processed_reviews(self) -> List[Dict]:
        """Return all fully processed reviews, newest completion first."""
        cursor = self.conn.cursor()

        cursor.execute("""
            SELECT * FROM reviews
            WHERE processing_status = 'complete'
            ORDER BY processing_completed_at DESC
        """)

        rows = cursor.fetchall()
        return [dict(row) for row in rows]

    def save_batch_insights(self, insights: Dict[str, Any]):
        """Save one batch-analytics snapshot; list-valued fields are stored
        as JSON text."""
        cursor = self.conn.cursor()

        cursor.execute("""
            INSERT INTO batch_insights
            (analysis_date, total_reviews, sentiment_positive, sentiment_neutral,
             sentiment_negative, priority_critical, priority_high, priority_medium,
             priority_low, dept_engineering, dept_ux, dept_support, dept_business,
             critical_issues, quick_wins, recommendations)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, (
            datetime.now().date(),
            insights.get('total_reviews', 0),
            insights.get('sentiment_positive', 0),
            insights.get('sentiment_neutral', 0),
            insights.get('sentiment_negative', 0),
            insights.get('priority_critical', 0),
            insights.get('priority_high', 0),
            insights.get('priority_medium', 0),
            insights.get('priority_low', 0),
            insights.get('dept_engineering', 0),
            insights.get('dept_ux', 0),
            insights.get('dept_support', 0),
            insights.get('dept_business', 0),
            json.dumps(insights.get('critical_issues', [])),
            json.dumps(insights.get('quick_wins', [])),
            json.dumps(insights.get('recommendations', []))
        ))

        self.conn.commit()
        print("   ✅ Batch insights saved to database")
386
+
387
+
388
if __name__ == "__main__":
    # Smoke-test: enhance the schema, then count pending reviews.
    banner = "=" * 60
    print(f"\n{banner}\n🧪 TESTING DATABASE ENHANCEMENT\n{banner}\n")

    db = EnhancedDatabase()
    db.connect()
    db.enhance_schema()

    pending = db.get_pending_reviews(limit=5)
    print(f"\n📋 Found {len(pending)} pending reviews")

    db.close()
    print("\n✅ Database enhancement test complete!")
gradio_pipeline.py ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio Pipeline - Streamlined processing for HuggingFace Spaces
3
+ Integrates scraping, classification, sentiment, and batch analysis with progress tracking
4
+ """
5
+
6
+ import os
7
+ import sqlite3
8
+ import time
9
+ from typing import List, Dict, Any, Optional, Callable
10
+ from datetime import datetime
11
+ import json
12
+
13
+ # Import existing modules
14
+ from stage0_scraper import Stage0WebScraper
15
+ from langgraph_state import ReviewState, create_initial_state
16
+ from langgraph_graph import build_review_graph, build_batch_graph
17
+ from database_enhanced import EnhancedDatabase
18
+ from stage4_batch_analysis import Stage4BatchAnalysis
19
+
20
+
21
+ class GradioPipeline:
22
+ """
23
+ Streamlined pipeline for Gradio interface
24
+ Handles scraping, processing, and analysis with progress callbacks
25
+ """
26
+
27
+ def __init__(self, db_file: str = "review_database.db", review_limit: int = 20):
28
+ self.db_file = db_file
29
+ self.review_limit = review_limit
30
+
31
+ # Initialize database
32
+ self.db = EnhancedDatabase(db_file)
33
+ self.db.connect()
34
+ self.db.enhance_schema()
35
+
36
+ # Initialize scraper
37
+ self.scraper = Stage0WebScraper(db_file)
38
+ self.scraper.create_reviews_table()
39
+
40
+ # Build graphs
41
+ self.review_graph = build_review_graph()
42
+ self.batch_graph = build_batch_graph()
43
+
44
+ print("✅ Gradio Pipeline initialized")
45
+
46
+ def scrape_reviews(
47
+ self,
48
+ app_store_ids: List[str],
49
+ play_store_packages: List[str],
50
+ progress_callback: Optional[Callable] = None
51
+ ) -> int:
52
+ """
53
+ Scrape reviews from App Store and Play Store
54
+
55
+ Args:
56
+ app_store_ids: List of App Store IDs
57
+ play_store_packages: List of Play Store package names
58
+ progress_callback: Optional Gradio progress callback
59
+
60
+ Returns:
61
+ Total number of reviews scraped
62
+ """
63
+ total_scraped = 0
64
+ total_apps = len(app_store_ids) + len(play_store_packages)
65
+
66
+ if total_apps == 0:
67
+ return 0
68
+
69
+ current_app = 0
70
+
71
+ # Scrape App Store
72
+ for app_id in app_store_ids:
73
+ current_app += 1
74
+ if progress_callback:
75
+ progress_val = 0.1 + (0.2 * current_app / total_apps)
76
+ progress_callback(
77
+ progress_val,
78
+ desc=f"🍎 Scraping App Store ({current_app}/{total_apps}): {app_id}"
79
+ )
80
+
81
+ try:
82
+ reviews = self.scraper.scrape_app_store_rss(
83
+ app_id,
84
+ country="ae",
85
+ limit=self.review_limit
86
+ )
87
+ saved = self.scraper.save_reviews_to_db(reviews)
88
+ total_scraped += saved
89
+ print(f" ✅ App Store {app_id}: {saved} reviews")
90
+ except Exception as e:
91
+ print(f" ❌ App Store {app_id} error: {e}")
92
+ continue
93
+
94
+ time.sleep(1) # Rate limiting
95
+
96
+ # Scrape Play Store
97
+ for package in play_store_packages:
98
+ current_app += 1
99
+ if progress_callback:
100
+ progress_val = 0.1 + (0.2 * current_app / total_apps)
101
+ progress_callback(
102
+ progress_val,
103
+ desc=f"🤖 Scraping Play Store ({current_app}/{total_apps}): {package}"
104
+ )
105
+
106
+ try:
107
+ reviews = self.scraper.scrape_play_store_api(
108
+ package,
109
+ country="ae",
110
+ limit=self.review_limit
111
+ )
112
+ saved = self.scraper.save_reviews_to_db(reviews)
113
+ total_scraped += saved
114
+ print(f" ✅ Play Store {package}: {saved} reviews")
115
+ except Exception as e:
116
+ print(f" ❌ Play Store {package} error: {e}")
117
+ continue
118
+
119
+ time.sleep(1) # Rate limiting
120
+
121
+ print(f"\n✅ Total scraped: {total_scraped} reviews")
122
+ return total_scraped
123
+
124
+ def process_reviews(
125
+ self,
126
+ progress_callback: Optional[Callable] = None
127
+ ) -> List[Dict[str, Any]]:
128
+ """
129
+ Process reviews through Stages 1-3
130
+
131
+ Args:
132
+ progress_callback: Optional Gradio progress callback
133
+
134
+ Returns:
135
+ List of processed review dictionaries
136
+ """
137
+ # Get pending reviews
138
+ reviews = self.db.get_pending_reviews(limit=self.review_limit)
139
+ total_reviews = len(reviews)
140
+
141
+ if total_reviews == 0:
142
+ print("⚠️ No pending reviews to process")
143
+ return []
144
+
145
+ print(f"\n📊 Processing {total_reviews} reviews...")
146
+
147
+ processed_states = []
148
+
149
+ for i, review in enumerate(reviews, 1):
150
+ review_id = review.get('review_id', 'unknown')
151
+
152
+ if progress_callback:
153
+ progress_val = 0.3 + (0.6 * i / total_reviews)
154
+ progress_callback(
155
+ progress_val,
156
+ desc=f"🤖 Processing review {i}/{total_reviews}: {review_id[:20]}..."
157
+ )
158
+
159
+ try:
160
+ # Create initial state
161
+ state = create_initial_state(review)
162
+
163
+ # Run through LangGraph
164
+ config = {"configurable": {"thread_id": f"review_{review_id}"}}
165
+ final_state = self.review_graph.invoke(state, config=config)
166
+
167
+ # Convert state to dict for easier handling
168
+ processed_states.append(dict(final_state))
169
+
170
+ print(f" ✅ Review {i}/{total_reviews} processed")
171
+
172
+ except Exception as e:
173
+ print(f" ❌ Error processing review {review_id}: {e}")
174
+ continue
175
+
176
+ print(f"\n✅ Processed {len(processed_states)}/{total_reviews} reviews")
177
+ return processed_states
178
+
179
+ def analyze_batch(
180
+ self,
181
+ processed_reviews: List[Dict[str, Any]]
182
+ ) -> Dict[str, Any]:
183
+ """
184
+ Run Stage 4: Batch Analysis
185
+
186
+ Args:
187
+ processed_reviews: List of processed review states
188
+
189
+ Returns:
190
+ Batch insights dictionary
191
+ """
192
+ if not processed_reviews:
193
+ return {}
194
+
195
+ print(f"\n📊 Running batch analysis on {len(processed_reviews)} reviews...")
196
+
197
+ # Convert states to review dicts for Stage 4
198
+ reviews_for_analysis = []
199
+ for state in processed_reviews:
200
+ review_dict = {
201
+ 'review_id': state.get('review_id'),
202
+ 'review_text': state.get('review_text'),
203
+ 'rating': state.get('rating'),
204
+ 'stage1_llm1_type': state.get('classification_type'),
205
+ 'stage1_llm1_department': state.get('department'),
206
+ 'stage1_llm1_priority': state.get('priority'),
207
+ 'stage1_llm2_user_type': state.get('user_type'),
208
+ 'stage1_llm2_emotion': state.get('emotion'),
209
+ 'stage2_agreement': state.get('sentiment_agreement'),
210
+ 'stage3_final_sentiment': state.get('final_sentiment'),
211
+ 'stage3_needs_human_review': state.get('needs_human_review'),
212
+ 'stage3_reasoning': state.get('reasoning'),
213
+ 'stage3_action_recommendation': state.get('action_recommendation'),
214
+ }
215
+ reviews_for_analysis.append(review_dict)
216
+
217
+ # Run Stage 4
218
+ stage4 = Stage4BatchAnalysis()
219
+ insights = stage4.analyze_batch(reviews_for_analysis)
220
+
221
+ # Save to database
222
+ self.db.save_batch_insights(insights)
223
+
224
+ print("✅ Batch analysis complete")
225
+ return insights
226
+
227
+ def get_all_processed_reviews(self) -> List[Dict[str, Any]]:
228
+ """Get all processed reviews from database"""
229
+ return self.db.get_all_processed_reviews()
230
+
231
+ def close(self):
232
+ """Clean up"""
233
+ self.db.close()
234
+
235
+
236
+ # ============================================================================
237
+ # HELPER FUNCTIONS FOR GRADIO
238
+ # ============================================================================
239
+
240
def parse_app_store_url(url: str) -> Optional[str]:
    """
    Extract an App Store ID from a URL, or return the input as-is when it is
    already a bare numeric ID.

    Examples:
        - "1234567890" -> "1234567890"
        - "https://apps.apple.com/us/app/name/id1234567890" -> "1234567890"

    Returns None when no plausible ID can be found.
    """
    candidate = url.strip()

    # Already a bare numeric ID
    if candidate.isdigit():
        return candidate

    # Canonical App Store URL: the ID immediately follows "/id"
    if 'apps.apple.com' in candidate:
        _, sep, tail = candidate.partition('/id')
        if sep:
            digits = tail.split('?')[0].split('/')[0]
            if digits.isdigit():
                return digits

    # Last resort: take the longest run of digits anywhere in the string
    import re
    runs = re.findall(r'\d+', candidate)
    if runs:
        return max(runs, key=len)

    return None
270
+
271
+
272
def parse_play_store_url(url: str) -> Optional[str]:
    """
    Extract a package name from a Play Store URL, or return the input as-is
    when it is already a package name.

    Examples:
        - "com.company.app" -> "com.company.app"
        - "https://play.google.com/store/apps/details?id=com.company.app" -> "com.company.app"

    Returns:
        The package name, or None when no package could be extracted.
    """
    url = url.strip()

    # Already a package name (dotted identifier, not a URL)
    if '.' in url and not url.startswith('http'):
        return url

    # Play Store URLs carry the package in the "id" query parameter
    if 'play.google.com' in url and 'id=' in url:
        return url.split('id=')[1].split('&')[0]

    # Fix: previously any http URL containing a dot (e.g. a Play Store URL
    # without an "id=" parameter) fell through and was returned verbatim as
    # if it were a package name. A URL without an extractable id is a failure.
    if url.startswith('http'):
        return None

    return url if '.' in url else None
293
+
294
+
295
+ if __name__ == "__main__":
296
+ print("\n" + "="*60)
297
+ print("🧪 TESTING GRADIO PIPELINE")
298
+ print("="*60)
299
+
300
+ # Test URL parsing
301
+ print("\n📱 Testing URL parsing:")
302
+
303
+ test_app_urls = [
304
+ "1234567890",
305
+ "https://apps.apple.com/us/app/name/id1234567890",
306
+ ]
307
+
308
+ for url in test_app_urls:
309
+ app_id = parse_app_store_url(url)
310
+ print(f" {url} -> {app_id}")
311
+
312
+ test_play_urls = [
313
+ "com.company.app",
314
+ "https://play.google.com/store/apps/details?id=com.company.app",
315
+ ]
316
+
317
+ for url in test_play_urls:
318
+ package = parse_play_store_url(url)
319
+ print(f" {url} -> {package}")
320
+
321
+ print("\n✅ Gradio pipeline test complete!")
langgraph_graph.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LangGraph Graph Definition
3
+ Defines the review processing workflow with conditional routing
4
+ """
5
+
6
+ from langgraph.graph import StateGraph, END
7
+ from langgraph.checkpoint.memory import MemorySaver
8
+ from typing import Literal
9
+
10
+ from langgraph_state import ReviewState, BatchState, create_initial_state
11
+ from langgraph_nodes import (
12
+ stage1_classification_node,
13
+ stage2_sentiment_node,
14
+ stage3_finalization_node
15
+ )
16
+ from stage4_batch_analysis import Stage4BatchAnalysis
17
+ from database_enhanced import EnhancedDatabase
18
+
19
+
20
+ # ============================================================================
21
+ # DATABASE SYNC NODES
22
+ # ============================================================================
23
+
24
def save_stage1_to_db_node(state: ReviewState) -> dict:
    """Save Stage 1 results to database.

    Persists the LLM1 / LLM2 / manager outputs for this review. On failure the
    exception is recorded in the state's ``errors`` list instead of being
    raised, so the graph keeps running.

    Fixes over the previous version: the connection is now closed in a single
    ``finally`` (instead of duplicated close calls per branch), and the
    incoming state's ``errors`` list is copied rather than mutated in place.
    """
    db = EnhancedDatabase()
    db.connect()

    try:
        llm1 = state['llm1_result']
        llm2 = state['llm2_result']
        manager = state['manager_result']

        stage1_data = {
            'llm1_type': llm1.get('type'),
            'llm1_department': llm1.get('department'),
            'llm1_priority': llm1.get('priority'),
            'llm1_confidence': llm1.get('confidence'),
            'llm1_reasoning': llm1.get('reasoning'),

            'llm2_user_type': llm2.get('user_type'),
            'llm2_emotion': llm2.get('emotion'),
            'llm2_context': llm2.get('context'),
            'llm2_confidence': llm2.get('confidence'),
            'llm2_reasoning': llm2.get('reasoning'),

            # Stored as its string form; schema expects a text column here
            'manager_classification': str(manager),
            'manager_reasoning': manager.get('reasoning'),
        }

        db.update_stage1(state['review_id'], stage1_data)
        return {"db_stage1_saved": True}
    except Exception as e:
        # Copy before appending so the incoming state is never mutated in place
        errors = list(state.get('errors', []))
        errors.append(f"DB Stage 1 save error: {str(e)}")
        return {"errors": errors}
    finally:
        # Single close point covering both success and failure paths
        db.close()
56
+
57
+
58
def save_stage2_to_db_node(state: ReviewState) -> dict:
    """Save Stage 2 results to database.

    Persists both sentiment models' outputs plus the combined layer result.
    Failures are captured into the state's ``errors`` list so the graph keeps
    running.

    Fixes over the previous version: the connection is closed in a single
    ``finally`` block, and the incoming ``errors`` list is copied rather than
    mutated in place.
    """
    db = EnhancedDatabase()
    db.connect()

    try:
        best = state['best_sentiment_result']
        alt = state['alt_sentiment_result']

        stage2_data = {
            'best_sentiment': best['sentiment'],
            'best_confidence': best['confidence'],
            'best_prob_positive': best['prob_positive'],
            'best_prob_neutral': best['prob_neutral'],
            'best_prob_negative': best['prob_negative'],

            'alt_sentiment': alt['sentiment'],
            'alt_confidence': alt['confidence'],
            'alt_prob_positive': alt['prob_positive'],
            'alt_prob_neutral': alt['prob_neutral'],
            'alt_prob_negative': alt['prob_negative'],

            'agreement': state['sentiment_agreement'],
            'layer_sentiment': state['sentiment'],
        }

        db.update_stage2(state['review_id'], stage2_data)
        return {"db_stage2_saved": True}
    except Exception as e:
        # Copy before appending so the incoming state is never mutated in place
        errors = list(state.get('errors', []))
        errors.append(f"DB Stage 2 save error: {str(e)}")
        return {"errors": errors}
    finally:
        # Single close point covering both success and failure paths
        db.close()
90
+
91
+
92
def save_stage3_to_db_node(state: ReviewState) -> dict:
    """Save Stage 3 results to database.

    Persists the final sentiment decision and its audit fields. Failures are
    captured into the state's ``errors`` list so the graph keeps running.

    Fixes over the previous version: the connection is closed in a single
    ``finally`` block, and the incoming ``errors`` list is copied rather than
    mutated in place.
    """
    db = EnhancedDatabase()
    db.connect()

    try:
        stage3_data = {
            'final_sentiment': state['final_sentiment'],
            'confidence': state['final_confidence'],
            'reasoning': state['reasoning'],
            'validation_notes': state['validation_notes'],
            'conflicts_found': state['conflicts_found'],
            'action_recommendation': state['action_recommendation'],
            'needs_human_review': state['needs_human_review'],
        }

        db.update_stage3(state['review_id'], stage3_data)
        return {"db_stage3_saved": True}
    except Exception as e:
        # Copy before appending so the incoming state is never mutated in place
        errors = list(state.get('errors', []))
        errors.append(f"DB Stage 3 save error: {str(e)}")
        return {"errors": errors}
    finally:
        # Single close point covering both success and failure paths
        db.close()
117
+
118
+
119
+ # ============================================================================
120
+ # STAGE 4: BATCH ANALYSIS NODE
121
+ # ============================================================================
122
+
123
def stage4_batch_analysis_node(state: BatchState) -> dict:
    """
    Stage 4 Node: Batch analysis
    Runs after all reviews are processed
    """
    print(f"\n{'='*70}")
    print("📊 STAGE 4: BATCH ANALYSIS")
    print("=" * 70)

    # Flatten each per-review state into the flat schema Stage 4 expects.
    # review_id / review_text / rating are required keys; the rest default to None.
    reviews_for_analysis = []
    for rs in state['all_reviews']:
        flat = {
            'review_id': rs['review_id'],
            'review_text': rs['review_text'],
            'rating': rs['rating'],
        }
        for dest, src in (
            ('stage1_llm1_type', 'classification_type'),
            ('stage1_llm1_department', 'department'),
            ('stage1_llm1_priority', 'priority'),
            ('stage1_llm2_user_type', 'user_type'),
            ('stage1_llm2_emotion', 'emotion'),
            ('stage2_agreement', 'sentiment_agreement'),
            ('stage3_final_sentiment', 'final_sentiment'),
            ('stage3_needs_human_review', 'needs_human_review'),
            ('stage3_reasoning', 'reasoning'),
            ('stage3_action_recommendation', 'action_recommendation'),
        ):
            flat[dest] = rs.get(src)
        reviews_for_analysis.append(flat)

    # Analyze the batch, then persist the aggregate insights
    insights = Stage4BatchAnalysis().analyze_batch(reviews_for_analysis)

    db = EnhancedDatabase()
    db.connect()
    db.save_batch_insights(insights)
    db.close()

    # Project the insights into the BatchState update
    result_keys = (
        'sentiment_distribution',
        'priority_distribution',
        'department_distribution',
        'emotion_distribution',
        'critical_issues',
        'quick_wins',
        'churn_risk',
        'model_agreement_rate',
        'recommendations',
        'batch_completed_at',
    )
    return {key: insights.get(key) for key in result_keys}
175
+
176
+
177
+ # ============================================================================
178
+ # ROUTING FUNCTIONS
179
+ # ============================================================================
180
+
181
def route_after_stage3(state: ReviewState) -> Literal["human_review", "complete"]:
    """
    Conditional routing after Stage 3.
    Escalates to human review when any of the escalation triggers fire;
    otherwise the review is complete.
    """
    escalate = (
        state.get('needs_human_review', False)              # explicitly flagged
        or state.get('final_confidence', 1.0) < 0.5         # low-confidence decision
        or state.get('conflicts_found', 'none') != 'none'   # unresolved conflicts
        or state.get('priority') == 'critical'              # critical priority
    )
    return "human_review" if escalate else "complete"
203
+
204
+
205
def human_review_queue_node(state: ReviewState) -> dict:
    """
    Terminal node for reviews flagged for human review.
    Currently only marks the route in the returned state update; this is the
    hook point for ticketing-system or email-alert integrations later.
    """
    print(" 🚨 FLAGGED for human review")

    update = {"route_to": "human_review"}
    return update
217
+
218
+
219
+ # ============================================================================
220
+ # BUILD REVIEW PROCESSING GRAPH
221
+ # ============================================================================
222
+
223
def build_review_graph():
    """
    Build and compile the per-review processing graph:
    Stage 1 -> save -> Stage 2 -> save -> Stage 3 -> save,
    then conditional routing to the human-review queue or END.
    """
    workflow = StateGraph(ReviewState)

    # Register every node
    for node_name, node_fn in (
        ("stage1_classify", stage1_classification_node),
        ("save_stage1", save_stage1_to_db_node),
        ("stage2_sentiment", stage2_sentiment_node),
        ("save_stage2", save_stage2_to_db_node),
        ("stage3_finalize", stage3_finalization_node),
        ("save_stage3", save_stage3_to_db_node),
        ("human_review_queue", human_review_queue_node),
    ):
        workflow.add_node(node_name, node_fn)

    # Sequential stage pipeline
    pipeline = (
        "stage1_classify",
        "save_stage1",
        "stage2_sentiment",
        "save_stage2",
        "stage3_finalize",
        "save_stage3",
    )
    for src, dst in zip(pipeline, pipeline[1:]):
        workflow.add_edge(src, dst)

    # After the final save, route to human review or finish
    workflow.add_conditional_edges(
        "save_stage3",
        route_after_stage3,
        {
            "human_review": "human_review_queue",
            "complete": END
        }
    )
    workflow.add_edge("human_review_queue", END)

    workflow.set_entry_point("stage1_classify")

    # Compile with in-memory checkpointing (thread_id keys the checkpoints)
    return workflow.compile(checkpointer=MemorySaver())
271
+
272
+
273
+ # ============================================================================
274
+ # BUILD BATCH ANALYSIS GRAPH (Stage 4)
275
+ # ============================================================================
276
+
277
def build_batch_graph():
    """
    Build and compile the batch analysis graph (Stage 4).
    A single-node linear graph that runs after all reviews are processed;
    no checkpointer is needed here.
    """
    workflow = StateGraph(BatchState)

    workflow.add_node("stage4_batch", stage4_batch_analysis_node)
    workflow.set_entry_point("stage4_batch")
    workflow.add_edge("stage4_batch", END)

    return workflow.compile()
296
+
297
+
298
+ if __name__ == "__main__":
299
+ print("\n" + "="*60)
300
+ print("🧪 TESTING LANGGRAPH GRAPH BUILDER")
301
+ print("="*60)
302
+
303
+ # Build review graph
304
+ print("\n📊 Building review processing graph...")
305
+ review_graph = build_review_graph()
306
+ print(" ✅ Review graph built!")
307
+
308
+ # Build batch graph
309
+ print("\n📊 Building batch analysis graph...")
310
+ batch_graph = build_batch_graph()
311
+ print(" ✅ Batch graph built!")
312
+
313
+ print("\n✅ Graph builder test complete!")
langgraph_nodes.py ADDED
@@ -0,0 +1,583 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LangGraph Nodes
3
+ All node functions for the review processing graph
4
+ Implements parallel execution where possible
5
+ """
6
+
7
+ import os
8
+ import json
9
+ import time
10
+ from typing import Dict, Any
11
+ from datetime import datetime
12
+ from concurrent.futures import ThreadPoolExecutor, as_completed
13
+ from huggingface_hub import InferenceClient
14
+ import torch
15
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
16
+ import warnings
17
+ warnings.filterwarnings('ignore')
18
+
19
+ from langgraph_state import ReviewState, BatchState
20
+ from database_enhanced import EnhancedDatabase
21
+
22
# Initialize HF client (singleton)
# Token comes from the environment; when it is missing, hf_client stays None
# and the LLM helper functions below will hit their except-fallbacks instead
# of calling the Inference API.
HF_TOKEN = os.getenv("HUGGINGFACE_API_KEY")
hf_client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else None

# Initialize sentiment models (singleton) - load once
# These module-level slots are populated lazily by load_sentiment_models()
# so the transformer weights are downloaded/loaded at most once per process.
_sentiment_models_loaded = False
_best_tokenizer = None
_best_model = None
_alt_tokenizer = None
_alt_model = None
32
+
33
def load_sentiment_models():
    """Load sentiment models once (singleton pattern).

    Populates the module-level tokenizer/model slots on first call; later
    calls are no-ops.
    """
    global _sentiment_models_loaded, _best_tokenizer, _best_model, _alt_tokenizer, _alt_model

    if _sentiment_models_loaded:
        return

    print(" 📦 Loading Twitter-BERT models (one-time)...")

    best_checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    alt_checkpoint = "finiteautomata/bertweet-base-sentiment-analysis"

    # Best model: Twitter-RoBERTa
    _best_tokenizer = AutoTokenizer.from_pretrained(best_checkpoint)
    _best_model = AutoModelForSequenceClassification.from_pretrained(best_checkpoint)
    _best_model.eval()

    # Alternate model: BERTweet
    _alt_tokenizer = AutoTokenizer.from_pretrained(alt_checkpoint)
    _alt_model = AutoModelForSequenceClassification.from_pretrained(alt_checkpoint)
    _alt_model.eval()

    _sentiment_models_loaded = True
    print(" ✅ Sentiment models loaded!")
54
+
55
+
56
+ # ============================================================================
57
+ # STAGE 1: CLASSIFICATION NODE (Parallel LLM1 + LLM2)
58
+ # ============================================================================
59
+
60
def llm1_classify(review: Dict[str, Any]) -> Dict[str, Any]:
    """LLM1: Type, Department, Priority classification.

    Calls the Qwen instruct model via the HF Inference API and parses its
    JSON reply; any failure yields a low-confidence fallback dict.
    """
    review_text = review.get('review_text', '')
    rating = review.get('rating', 3)

    prompt = f"""You are an expert at classifying customer reviews for theme park and attraction apps.

REVIEW:
Rating: {rating}/5
Text: {review_text}

Classify this review across these dimensions:

1. TYPE (choose ONE):
- complaint: Customer reports a problem
- praise: Customer expresses satisfaction
- suggestion: Customer proposes improvement
- question: Customer asks about something
- bug_report: Technical issue described

2. DEPARTMENT (choose ONE):
- engineering: Technical issues, bugs, crashes
- ux: Design, usability, interface issues
- support: Customer service, help needed
- business: Pricing, policies, marketing

3. PRIORITY (choose ONE):
- critical: Service down, major blocker
- high: Significant problem affecting use
- medium: Inconvenience but not blocking
- low: Minor issue or suggestion

4. CONFIDENCE (0.0-1.0): How confident are you?

5. REASONING: Brief one-sentence explanation

Respond ONLY in valid JSON format:
{{
"type": "complaint/praise/suggestion/question/bug_report",
"department": "engineering/ux/support/business",
"priority": "critical/high/medium/low",
"confidence": 0.0-1.0,
"reasoning": "brief explanation"
}}"""

    def _parse_json_reply(raw: str) -> Dict[str, Any]:
        # Strip optional ``` / ```json fences before parsing
        body = raw.strip()
        if body.startswith('```'):
            body = body.split('```')[1]
            if body.startswith('json'):
                body = body[4:]
        return json.loads(body.strip())

    try:
        reply = hf_client.text_generation(
            prompt,
            model="Qwen/Qwen2.5-72B-Instruct",
            max_new_tokens=200,
            temperature=0.1
        )
        parsed = _parse_json_reply(reply)
        parsed['model'] = 'Qwen/Qwen2.5-72B-Instruct'
        return parsed
    except Exception as e:
        # Safe fallback so the pipeline keeps moving
        return {
            'type': 'unknown',
            'department': 'unknown',
            'priority': 'medium',
            'confidence': 0.0,
            'reasoning': f'Error: {str(e)}',
            'model': 'Qwen/Qwen2.5-72B-Instruct'
        }
134
+
135
+
136
def llm2_analyze(review: Dict[str, Any]) -> Dict[str, Any]:
    """LLM2: User type, Emotion, Context analysis.

    Calls the Mistral instruct model via the HF Inference API and parses its
    JSON reply; any failure yields a low-confidence fallback dict.
    """
    review_text = review.get('review_text', '')
    rating = review.get('rating', 3)

    prompt = f"""You are an expert at understanding customer psychology and emotional context.

REVIEW:
Rating: {rating}/5
Text: {review_text}

Analyze the user and emotional context:

1. USER_TYPE (choose ONE):
- new_user: First-time or new user
- regular_user: Returning customer
- power_user: Heavy user, tech-savvy
- churning_user: Considering leaving

2. EMOTION (choose ONE):
- anger: Angry, hostile tone
- frustration: Frustrated but not angry
- joy: Happy, satisfied
- satisfaction: Content, pleased
- disappointment: Let down, sad
- confusion: Unclear, needs help

3. CONTEXT (brief): What is the underlying issue? 1-2 words

4. CONFIDENCE (0.0-1.0): How confident are you?

5. REASONING: Brief one-sentence explanation

Respond ONLY in valid JSON format:
{{
"user_type": "new_user/regular_user/power_user/churning_user",
"emotion": "anger/frustration/joy/satisfaction/disappointment/confusion",
"context": "brief context",
"confidence": 0.0-1.0,
"reasoning": "brief explanation"
}}"""

    def _parse_json_reply(raw: str) -> Dict[str, Any]:
        # Strip optional ``` / ```json fences before parsing
        body = raw.strip()
        if body.startswith('```'):
            body = body.split('```')[1]
            if body.startswith('json'):
                body = body[4:]
        return json.loads(body.strip())

    try:
        reply = hf_client.text_generation(
            prompt,
            model="mistralai/Mistral-7B-Instruct-v0.3",
            max_new_tokens=200,
            temperature=0.1
        )
        parsed = _parse_json_reply(reply)
        parsed['model'] = 'mistralai/Mistral-7B-Instruct-v0.3'
        return parsed
    except Exception as e:
        # Safe fallback so the pipeline keeps moving
        return {
            'user_type': 'unknown',
            'emotion': 'unknown',
            'context': 'unknown',
            'confidence': 0.0,
            'reasoning': f'Error: {str(e)}',
            'model': 'mistralai/Mistral-7B-Instruct-v0.3'
        }
207
+
208
+
209
def manager_synthesize(llm1_result: Dict, llm2_result: Dict, review: Dict) -> Dict[str, Any]:
    """Manager: Synthesize LLM1 and LLM2 results.

    Calls the Llama instruct model to validate/merge the two analyses; on
    failure, falls back to LLM1/LLM2 values with confidence 0.5.
    """
    review_text = review.get('review_text', '')
    rating = review.get('rating', 3)

    prompt = f"""You are a synthesis manager evaluating two AI analyses of the same review.

REVIEW:
Rating: {rating}/5
Text: {review_text}

LLM1 ANALYSIS (Type/Dept/Priority):
{json.dumps(llm1_result, indent=2)}

LLM2 ANALYSIS (User/Emotion/Context):
{json.dumps(llm2_result, indent=2)}

Your task:
1. Validate both analyses
2. Resolve any conflicts
3. Make final classification decision
4. Provide synthesis reasoning

Respond ONLY in valid JSON format:
{{
"final_type": "from llm1 or adjusted",
"final_department": "from llm1 or adjusted",
"final_priority": "from llm1 or adjusted",
"final_user_type": "from llm2 or adjusted",
"final_emotion": "from llm2 or adjusted",
"confidence": 0.0-1.0,
"reasoning": "synthesis explanation",
"conflicts_found": "any conflicts or 'none'"
}}"""

    def _parse_json_reply(raw: str) -> Dict[str, Any]:
        # Strip optional ``` / ```json fences before parsing
        body = raw.strip()
        if body.startswith('```'):
            body = body.split('```')[1]
            if body.startswith('json'):
                body = body[4:]
        return json.loads(body.strip())

    try:
        reply = hf_client.text_generation(
            prompt,
            model="meta-llama/Llama-3.1-8B-Instruct",
            max_new_tokens=250,
            temperature=0.1
        )
        parsed = _parse_json_reply(reply)
        parsed['model'] = 'meta-llama/Llama-3.1-8B-Instruct'
        return parsed
    except Exception as e:
        # Fallback: adopt LLM1/LLM2 answers directly at reduced confidence
        return {
            'final_type': llm1_result.get('type', 'unknown'),
            'final_department': llm1_result.get('department', 'unknown'),
            'final_priority': llm1_result.get('priority', 'medium'),
            'final_user_type': llm2_result.get('user_type', 'unknown'),
            'final_emotion': llm2_result.get('emotion', 'unknown'),
            'confidence': 0.5,
            'reasoning': f'Manager error, used LLM1 results: {str(e)}',
            'conflicts_found': 'error',
            'model': 'meta-llama/Llama-3.1-8B-Instruct'
        }
277
+
278
+
279
def stage1_classification_node(state: ReviewState) -> Dict[str, Any]:
    """
    Stage 1 Node: Classification with PARALLEL execution.
    LLM1 and LLM2 run concurrently; the manager then synthesizes both results.
    """
    print(f"\n 📝 Review ID: {state['review_id']}")
    print(" ⏳ STAGE 1: Classification (Parallel LLM1 + LLM2)...")

    t0 = time.time()
    raw_review = state['review']

    # Fan out: both classifier LLMs run at the same time
    with ThreadPoolExecutor(max_workers=2) as pool:
        pending = [
            pool.submit(llm1_classify, raw_review),
            pool.submit(llm2_analyze, raw_review),
        ]
        llm1_result, llm2_result = (f.result() for f in pending)

    print(f" ✅ LLM1: {llm1_result.get('type')} → {llm1_result.get('department')} (Priority: {llm1_result.get('priority')})")
    print(f" ✅ LLM2: {llm2_result.get('user_type')}, {llm2_result.get('emotion')}")

    # Manager needs both results, so it runs after the fan-in
    print(" 🤖 Manager synthesizing...")
    merged = manager_synthesize(llm1_result, llm2_result, raw_review)

    elapsed = time.time() - t0
    print(f" ✅ Stage 1 complete ({elapsed:.2f}s)")

    # Assemble the state update; final_* fields are lifted to top-level keys
    update: Dict[str, Any] = {
        "llm1_result": llm1_result,
        "llm2_result": llm2_result,
        "manager_result": merged,
        "stage1_completed": True,
        "stage1_time": elapsed,
        "errors": state.get('errors', []),
    }
    for out_key, src_key in (
        ("classification_type", "final_type"),
        ("department", "final_department"),
        ("priority", "final_priority"),
        ("user_type", "final_user_type"),
        ("emotion", "final_emotion"),
    ):
        update[out_key] = merged.get(src_key)
    return update
321
+
322
+
323
+ # ============================================================================
324
+ # STAGE 2: SENTIMENT NODE (Parallel Best + Alternate)
325
+ # ============================================================================
326
+
327
def analyze_best_sentiment(text: str) -> Dict[str, Any]:
    """Best Model: Twitter-RoBERTa.

    Returns the predicted label plus the full probability triple; on any
    failure returns a NEUTRAL placeholder with zero confidence.
    """
    load_sentiment_models()

    try:
        encoded = _best_tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)

        with torch.no_grad():
            logits = _best_model(**encoded).logits

        scores = torch.nn.functional.softmax(logits, dim=-1)[0]
        winner = int(torch.argmax(scores).item())

        labels = ("NEGATIVE", "NEUTRAL", "POSITIVE")
        return {
            'sentiment': labels[winner],
            'confidence': scores[winner].item(),
            'prob_negative': scores[0].item(),
            'prob_neutral': scores[1].item(),
            'prob_positive': scores[2].item(),
            'model': 'twitter-roberta-base-sentiment-latest'
        }
    except Exception as e:
        # Uniform-ish fallback so downstream math never sees missing keys
        return {
            'sentiment': 'NEUTRAL',
            'confidence': 0.0,
            'prob_negative': 0.33,
            'prob_neutral': 0.34,
            'prob_positive': 0.33,
            'model': 'error',
            'error': str(e)
        }
360
+
361
+
362
def analyze_alt_sentiment(text: str) -> Dict[str, Any]:
    """Alternate Model: BERTweet.

    Returns the predicted label plus the full probability triple; on any
    failure returns a NEUTRAL placeholder with zero confidence.
    """
    load_sentiment_models()

    try:
        encoded = _alt_tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)

        with torch.no_grad():
            logits = _alt_model(**encoded).logits

        scores = torch.nn.functional.softmax(logits, dim=-1)[0]
        winner = int(torch.argmax(scores).item())

        labels = ("NEGATIVE", "NEUTRAL", "POSITIVE")
        return {
            'sentiment': labels[winner],
            'confidence': scores[winner].item(),
            'prob_negative': scores[0].item(),
            'prob_neutral': scores[1].item(),
            'prob_positive': scores[2].item(),
            'model': 'bertweet-base-sentiment-analysis'
        }
    except Exception as e:
        # Uniform-ish fallback so downstream math never sees missing keys
        return {
            'sentiment': 'NEUTRAL',
            'confidence': 0.0,
            'prob_negative': 0.33,
            'prob_neutral': 0.34,
            'prob_positive': 0.33,
            'model': 'error',
            'error': str(e)
        }
395
+
396
+
397
def sentiment_layer(best_result: Dict, alt_result: Dict) -> Dict[str, Any]:
    """Sentiment Layer: combine both model outputs with confidence weighting.

    Returns a dict with the chosen label ('layer_sentiment'), the confidence
    backing it ('combined_confidence'), whether the two models agreed, and a
    STRONG/WEAK agreement-strength tag.
    """
    best_label = best_result.get('sentiment')
    alt_label = alt_result.get('sentiment')
    best_conf = best_result.get('confidence', 0.0)
    alt_conf = alt_result.get('confidence', 0.0)

    models_agree = best_label == alt_label

    if models_agree:
        # Consensus: keep the shared label, backed by the higher confidence.
        chosen_label = best_label
        chosen_conf = max(best_conf, alt_conf)
        strength = "STRONG"
    else:
        # Disagreement: side with whichever model is more confident
        # (a tie goes to the alternate model, matching original behavior).
        chosen_label, chosen_conf = (
            (best_label, best_conf)
            if best_conf > alt_conf
            else (alt_label, alt_conf)
        )
        strength = "WEAK"

    return {
        'layer_sentiment': chosen_label,
        'combined_confidence': chosen_conf,
        'agreement': models_agree,
        'agreement_strength': strength
    }
426
+
427
+
428
def stage2_sentiment_node(state: ReviewState) -> Dict[str, Any]:
    """
    Stage 2 Node: Sentiment with PARALLEL execution
    Runs Best and Alternate models in parallel, then combines

    Returns a partial state update: both raw model results, the layer
    result, the extracted sentiment fields, and stage timing.
    """
    print(f"\n ⏳ STAGE 2: Sentiment Analysis (Parallel Best + Alternate)...")

    started = time.time()
    text = state['review_text']

    # Fan the two model calls out to a 2-worker pool; .result() blocks until
    # both finish, so total wall time is max(best, alt), not their sum.
    with ThreadPoolExecutor(max_workers=2) as pool:
        best_future = pool.submit(analyze_best_sentiment, text)
        alt_future = pool.submit(analyze_alt_sentiment, text)
        best_result = best_future.result()
        alt_result = alt_future.result()

    print(f" ✅ Best: {best_result['sentiment']} ({best_result['confidence']:.3f})")
    print(f" ✅ Alt: {alt_result['sentiment']} ({alt_result['confidence']:.3f})")

    # Combine both opinions into the layer verdict.
    layer_result = sentiment_layer(best_result, alt_result)

    agreement_icon = "✅" if layer_result['agreement'] else "⚠️ "
    print(f" {agreement_icon} Final: {layer_result['layer_sentiment']} (agreement: {layer_result['agreement']})")

    elapsed = time.time() - started
    print(f" ✅ Stage 2 complete ({elapsed:.2f}s)")

    return {
        "best_sentiment_result": best_result,
        "alt_sentiment_result": alt_result,
        "sentiment_layer_result": layer_result,
        "sentiment": layer_result['layer_sentiment'],
        "sentiment_confidence": layer_result['combined_confidence'],
        "sentiment_agreement": layer_result['agreement'],
        "stage2_completed": True,
        "stage2_time": elapsed,
        "errors": state.get('errors', [])
    }
469
+
470
+
471
+ # ============================================================================
472
+ # STAGE 3: FINALIZATION NODE
473
+ # ============================================================================
474
+
475
def stage3_finalization_node(state: ReviewState) -> Dict[str, Any]:
    """
    Stage 3 Node: Final synthesis with LLM3 (Llama 70B)

    Builds a prompt from Stage 1 classification and Stage 2 sentiment,
    asks LLM3 for a final JSON verdict, and degrades gracefully to the
    Stage 2 sentiment (flagged for human review) if the call or parse fails.

    Returns:
        Partial state update with final sentiment/confidence, reasoning,
        action recommendation, review flag, timing and completion markers.
    """
    print(f"\n ⏳ STAGE 3: Finalization (LLM3)...")

    start_time = time.time()

    review_text = state['review_text']
    rating = state['rating']

    # Guard against a skipped/failed Stage 2: a None result becomes an empty
    # dict and a None confidence defaults to 0.0, so the ':.2f' format specs
    # in the prompt below cannot raise TypeError.
    best_res = state.get('best_sentiment_result') or {}
    alt_res = state.get('alt_sentiment_result') or {}

    prompt = f"""You are a final decision-making AI analyzing customer feedback for a theme park/attraction app.

REVIEW DATA:
Rating: {rating}/5
Text: {review_text}

STAGE 1 CLASSIFICATION:
- Type: {state.get('classification_type')}
- Department: {state.get('department')}
- Priority: {state.get('priority')}
- User Type: {state.get('user_type')}
- Emotion: {state.get('emotion')}

STAGE 2 SENTIMENT:
- Best: {best_res.get('sentiment')} ({best_res.get('confidence', 0.0) or 0.0:.2f})
- Alternate: {alt_res.get('sentiment')} ({alt_res.get('confidence', 0.0) or 0.0:.2f})
- Agreement: {state.get('sentiment_agreement')}

YOUR TASK:
1. Review all data from both stages
2. Make FINAL sentiment decision
3. Provide comprehensive reasoning
4. Generate action recommendation
5. Flag if human review needed

Respond ONLY in valid JSON format:
{{
    "final_sentiment": "POSITIVE/NEGATIVE/NEUTRAL",
    "confidence": 0.0-1.0,
    "reasoning": "Comprehensive explanation",
    "validation_notes": "Does classification match sentiment?",
    "conflicts_found": "any conflicts or 'none'",
    "action_recommendation": "Specific action",
    "needs_human_review": true/false
}}"""

    try:
        response = hf_client.text_generation(
            prompt,
            model="meta-llama/Llama-3.1-70B-Instruct",
            max_new_tokens=400,
            temperature=0.1
        )

        # Strip an optional markdown code fence (``` or ```json) before parsing.
        response_clean = response.strip()
        if response_clean.startswith('```'):
            response_clean = response_clean.split('```')[1]
        if response_clean.startswith('json'):
            response_clean = response_clean[4:]
        response_clean = response_clean.strip()

        result = json.loads(response_clean)
        result['model'] = 'meta-llama/Llama-3.1-70B-Instruct'

    except Exception as e:
        # Any failure (network, rate limit, malformed JSON) falls back to the
        # Stage 2 verdict and escalates the review to a human.
        result = {
            'final_sentiment': state.get('sentiment', 'NEUTRAL'),
            'confidence': state.get('sentiment_confidence', 0.5),
            'reasoning': f'Error in LLM3: {str(e)}',
            'validation_notes': 'Error',
            'conflicts_found': 'error',
            'action_recommendation': f"Route to {state.get('department')}",
            'needs_human_review': True,
            'model': 'meta-llama/Llama-3.1-70B-Instruct'
        }

    # The LLM may return syntactically valid JSON that omits keys; fill in
    # defaults instead of letting the lookups below raise KeyError.
    final_sentiment = result.get('final_sentiment', state.get('sentiment', 'NEUTRAL'))
    final_confidence = result.get('confidence', state.get('sentiment_confidence', 0.5))
    needs_human_review = result.get('needs_human_review', False)

    stage3_time = time.time() - start_time

    print(f" ✅ Final: {final_sentiment} ({final_confidence or 0:.3f})")
    print(f" 📋 Needs Review: {needs_human_review}")
    print(f" ✅ Stage 3 complete ({stage3_time:.2f}s)")

    # Calculate total time.  The timing keys exist but hold None until their
    # stage runs, so `or 0` (not a .get default) is required here.
    total_time = (state.get('stage1_time') or 0) + (state.get('stage2_time') or 0) + stage3_time

    return {
        "final_result": result,
        "final_sentiment": final_sentiment,
        "final_confidence": final_confidence,
        "reasoning": result.get('reasoning', ''),
        "action_recommendation": result.get('action_recommendation', ''),
        "conflicts_found": result.get('conflicts_found', 'none'),
        "validation_notes": result.get('validation_notes', ''),
        "needs_human_review": needs_human_review,
        "stage3_completed": True,
        "stage3_time": stage3_time,
        "total_time": total_time,
        "processing_completed_at": datetime.now().isoformat(),
        "errors": state.get('errors', [])
    }
576
+
577
+
578
+ if __name__ == "__main__":
579
+ print("\n✅ LangGraph nodes module loaded!")
580
+ print(" Nodes available:")
581
+ print(" - stage1_classification_node (parallel LLM1+LLM2)")
582
+ print(" - stage2_sentiment_node (parallel Best+Alt)")
583
+ print(" - stage3_finalization_node (LLM3)")
langgraph_state.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LangGraph State Schema
3
+ Defines the state that flows through the graph
4
+ """
5
+
6
+ from typing import TypedDict, Optional, Dict, Any, List
7
+ from datetime import datetime
8
+
9
class ReviewState(TypedDict):
    """
    State schema for review processing graph
    All stages add to this state as it flows through the graph

    Nodes return partial dicts of these keys and the graph merges them into
    the running state; fields are grouped below by the stage that writes them.
    """

    # Input data (populated by create_initial_state)
    review: Dict[str, Any]
    review_id: str
    review_text: str
    rating: int

    # Stage 1: Classification outputs (raw model payloads)
    llm1_result: Optional[Dict[str, Any]]
    llm2_result: Optional[Dict[str, Any]]
    manager_result: Optional[Dict[str, Any]]

    # Stage 1: Extracted fields for easy access
    classification_type: Optional[str]
    department: Optional[str]
    priority: Optional[str]
    user_type: Optional[str]
    emotion: Optional[str]

    # Stage 2: Sentiment outputs (raw payloads from both models + layer)
    best_sentiment_result: Optional[Dict[str, Any]]
    alt_sentiment_result: Optional[Dict[str, Any]]
    sentiment_layer_result: Optional[Dict[str, Any]]

    # Stage 2: Extracted fields
    sentiment: Optional[str]  # POSITIVE, NEGATIVE, NEUTRAL
    sentiment_confidence: Optional[float]
    sentiment_agreement: Optional[bool]  # True when both models gave the same label

    # Stage 3: Finalization outputs
    final_result: Optional[Dict[str, Any]]

    # Stage 3: Extracted fields
    final_sentiment: Optional[str]
    final_confidence: Optional[float]
    reasoning: Optional[str]
    action_recommendation: Optional[str]
    conflicts_found: Optional[str]
    validation_notes: Optional[str]

    # Routing decisions
    needs_human_review: bool
    route_to: Optional[str]  # 'human_review', 'complete', 'batch_analysis'

    # Processing metadata
    stage1_completed: bool
    stage2_completed: bool
    stage3_completed: bool
    processing_started_at: Optional[str]  # ISO-8601 timestamp
    processing_completed_at: Optional[str]  # ISO-8601 timestamp

    # Timing information (seconds; None until the stage has run)
    stage1_time: Optional[float]
    stage2_time: Optional[float]
    stage3_time: Optional[float]
    total_time: Optional[float]

    # Error handling
    errors: List[str]
    retry_count: int

    # Database sync status
    db_stage1_saved: bool
    db_stage2_saved: bool
    db_stage3_saved: bool
79
+
80
+
81
class BatchState(TypedDict):
    """
    State for batch analysis (Stage 4)
    Aggregates results from multiple reviews
    """

    # Input
    all_reviews: List[ReviewState]
    total_count: int

    # Aggregated metrics (label -> count maps)
    sentiment_distribution: Optional[Dict[str, int]]
    priority_distribution: Optional[Dict[str, int]]
    department_distribution: Optional[Dict[str, int]]
    emotion_distribution: Optional[Dict[str, int]]

    # Analysis outputs
    critical_issues: Optional[List[Dict[str, Any]]]
    quick_wins: Optional[List[Dict[str, Any]]]
    churn_risk: Optional[float]  # percentage, 0-100
    model_agreement_rate: Optional[float]  # percentage, 0-100

    # Recommendations
    recommendations: Optional[List[str]]

    # Processing metadata (ISO-8601 timestamps / seconds)
    batch_started_at: Optional[str]
    batch_completed_at: Optional[str]
    batch_processing_time: Optional[float]
110
+
111
+
112
def create_initial_state(review: Dict[str, Any]) -> ReviewState:
    """
    Create initial state for a review

    Input fields are copied (with defaults) from the raw review dict; every
    stage-produced field starts as None, flags as False, counters at zero.
    """
    # Seed with the input payload, defaulting anything missing.
    state: ReviewState = {
        'review': review,
        'review_id': review.get('review_id', 'unknown'),
        'review_text': review.get('review_text', ''),
        'rating': review.get('rating', 3),
    }

    # All stage outputs (raw results, extracted fields, routing target and
    # timings) begin life as None until their producing node runs.
    not_yet_produced = (
        # Stage 1
        'llm1_result', 'llm2_result', 'manager_result',
        'classification_type', 'department', 'priority', 'user_type', 'emotion',
        # Stage 2
        'best_sentiment_result', 'alt_sentiment_result', 'sentiment_layer_result',
        'sentiment', 'sentiment_confidence', 'sentiment_agreement',
        # Stage 3
        'final_result', 'final_sentiment', 'final_confidence',
        'reasoning', 'action_recommendation', 'conflicts_found', 'validation_notes',
        # Routing / metadata / timing
        'route_to', 'processing_completed_at',
        'stage1_time', 'stage2_time', 'stage3_time', 'total_time',
    )
    for key in not_yet_produced:
        state[key] = None

    # Flags, bookkeeping and the start-of-processing timestamp.
    state.update(
        needs_human_review=False,
        stage1_completed=False,
        stage2_completed=False,
        stage3_completed=False,
        processing_started_at=datetime.now().isoformat(),
        errors=[],
        retry_count=0,
        db_stage1_saved=False,
        db_stage2_saved=False,
        db_stage3_saved=False,
    )
    return state
176
+
177
+
178
def create_batch_state(reviews: List[ReviewState]) -> BatchState:
    """
    Create batch state from processed reviews

    Only the inputs and the start timestamp are filled in; every analysis
    field is None until Stage 4 populates it.
    """
    pending_analysis = (
        'sentiment_distribution', 'priority_distribution',
        'department_distribution', 'emotion_distribution',
        'critical_issues', 'quick_wins', 'churn_risk',
        'model_agreement_rate', 'recommendations',
        'batch_completed_at', 'batch_processing_time',
    )
    state: BatchState = {key: None for key in pending_analysis}
    state['all_reviews'] = reviews
    state['total_count'] = len(reviews)
    state['batch_started_at'] = datetime.now().isoformat()
    return state
198
+
199
+
200
+ if __name__ == "__main__":
201
+ # Test state creation
202
+ print("\n" + "="*60)
203
+ print("🧪 TESTING LANGGRAPH STATE")
204
+ print("="*60)
205
+
206
+ test_review = {
207
+ 'review_id': 'test_001',
208
+ 'review_text': 'App crashes!',
209
+ 'rating': 1
210
+ }
211
+
212
+ state = create_initial_state(test_review)
213
+ print(f"\n✅ Initial state created for: {state['review_id']}")
214
+ print(f" Review text: {state['review_text']}")
215
+ print(f" Stage 1 completed: {state['stage1_completed']}")
216
+
217
+ print("\n✅ State schema test complete!")
requirements.txt CHANGED
@@ -1,3 +1,27 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core Dependencies
2
+ python-dotenv==1.0.0
3
+ pandas>=2.2.0
4
+ PyYAML==6.0.1
5
+
6
+ # LangGraph & LangChain
7
+ langgraph>=0.2.0
8
+ langchain>=0.2.0
9
+ langchain-core>=0.2.0
10
+
11
+ # HuggingFace
12
+ huggingface-hub>=0.20.3
13
+ transformers>=4.36.2
14
+ torch>=2.1.2
15
+
16
+ # UI frameworks — app.py is a Streamlit app (it imports streamlit), so it
+ # must be listed here; Gradio is kept for the optional Spaces UI.
+ streamlit>=1.28.0
+ gradio>=4.0.0
18
+
19
+ # Plotly for visualizations
20
+ plotly>=5.18.0
21
+
22
+ # Web Scraping
23
+ google-play-scraper>=1.2.4
24
+ requests>=2.31.0
25
+
26
+ # Database (SQLite is built-in to Python)
27
+ # sqlite3 is included with Python
stage0_scraper.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stage 0: Web Scraping (App Store & Play Store)
3
+ Scrapes reviews and stores in database
4
+ This integrates with your existing scraper or can be used standalone
5
+ """
6
+
7
+ import os
8
+ import sqlite3
9
+ import requests
10
+ import json
11
+ import time
12
+ from datetime import datetime
13
+ from typing import List, Dict, Any
14
+ import re
15
+
16
class Stage0WebScraper:
    """
    Stage 0: Web scraping for App Store and Play Store reviews
    Integrates with existing database structure

    Every method opens (and closes) its own short-lived sqlite3 connection
    against `db_file`, so instances hold no open handles between calls.
    """

    def __init__(self, db_file: str = "review_database.db"):
        # Path to the SQLite database file used by all methods below.
        self.db_file = db_file
        print(f" 📁 Database: {db_file}")

    def create_reviews_table(self) -> None:
        """
        Create reviews table if it doesn't exist
        This is your Stage 0 schema
        """
        conn = sqlite3.connect(self.db_file)
        cursor = conn.cursor()

        # review_id is UNIQUE: save_reviews_to_db relies on this via
        # INSERT OR IGNORE, so re-scraping a feed never duplicates rows.
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS reviews (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                review_id TEXT UNIQUE,
                product_url TEXT,
                platform TEXT,
                app_name TEXT,
                user_name TEXT,
                review_text TEXT,
                rating INTEGER,
                review_date TEXT,
                app_version TEXT,
                scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)

        # Create index for faster lookups
        cursor.execute("""
            CREATE INDEX IF NOT EXISTS idx_review_id
            ON reviews(review_id)
        """)

        cursor.execute("""
            CREATE INDEX IF NOT EXISTS idx_platform
            ON reviews(platform)
        """)

        conn.commit()
        conn.close()

        print(" ✅ Reviews table ready (Stage 0)")

    def scrape_app_store_rss(self, app_id: str, country: str = "us",
                             limit: int = 100) -> List[Dict]:
        """
        Scrape App Store reviews using RSS feed
        This is a simple, free method (no API key needed)

        Args:
            app_id: App Store app ID (e.g., "1234567890")
            country: Country code (e.g., "us", "ae", "uk")
            limit: Number of reviews to fetch (max 500 per request)

        Returns:
            List of normalized review dicts; empty list on any request error.
        """
        print(f" 🍎 Scraping App Store: {app_id} ({country})")

        # App Store RSS feed URL
        url = f"https://itunes.apple.com/{country}/rss/customerreviews/id={app_id}/sortBy=mostRecent/json"

        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            data = response.json()

            reviews = []
            entries = data.get('feed', {}).get('entry', [])

            # Skip first entry (it's the app info, which has no 'author' key)
            if entries and 'author' not in entries[0]:
                entries = entries[1:]

            for entry in entries[:limit]:
                try:
                    # The RSS feed wraps every value as {'label': ...}.
                    review = {
                        'review_id': entry.get('id', {}).get('label', ''),
                        'platform': 'app_store',
                        'app_name': data.get('feed', {}).get('title', {}).get('label', 'Unknown'),
                        'user_name': entry.get('author', {}).get('name', {}).get('label', 'Anonymous'),
                        'review_text': entry.get('content', {}).get('label', ''),
                        'rating': int(entry.get('im:rating', {}).get('label', '3')),
                        'review_date': entry.get('updated', {}).get('label', ''),
                        'app_version': entry.get('im:version', {}).get('label', ''),
                        'product_url': entry.get('link', {}).get('attributes', {}).get('href', '')
                    }
                    reviews.append(review)
                except Exception as e:
                    # A malformed entry is skipped rather than failing the batch.
                    print(f" ⚠️ Error parsing review: {e}")
                    continue

            print(f" ✅ Scraped {len(reviews)} reviews")
            return reviews

        except Exception as e:
            print(f" ❌ Error scraping App Store: {e}")
            return []

    def scrape_play_store_api(self, app_id: str, country: str = "us",
                              limit: int = 100) -> List[Dict]:
        """
        Scrape Google Play Store reviews
        Note: This is a simplified version. For production, use google-play-scraper library

        Args:
            app_id: Play Store package name (e.g., "com.company.app")
            country: Country code
            limit: Number of reviews to fetch

        Returns:
            List of normalized review dicts; empty list if the scraper
            library is missing or the fetch fails.
        """
        print(f" 🤖 Scraping Play Store: {app_id} ({country})")

        try:
            # Using unofficial API endpoint (works without auth)
            # For production, recommend: pip install google-play-scraper
            # Imported lazily so the rest of the class works without it.
            from google_play_scraper import Sort, reviews_all

            # NOTE(review): reviews_all downloads *every* review before the
            # [:limit] slice below — can be slow for popular apps.
            result = reviews_all(
                app_id,
                sleep_milliseconds=0,
                lang='en',
                country=country,
                sort=Sort.NEWEST
            )

            reviews = []
            for r in result[:limit]:
                review = {
                    'review_id': r.get('reviewId', ''),
                    'platform': 'play_store',
                    'app_name': app_id,
                    'user_name': r.get('userName', 'Anonymous'),
                    'review_text': r.get('content', ''),
                    'rating': r.get('score', 3),
                    # 'at' is a datetime when present; guard before isoformat().
                    'review_date': r.get('at', '').isoformat() if r.get('at') else '',
                    'app_version': r.get('reviewCreatedVersion', ''),
                    'product_url': f"https://play.google.com/store/apps/details?id={app_id}"
                }
                reviews.append(review)

            print(f" ✅ Scraped {len(reviews)} reviews")
            return reviews

        except ImportError:
            print(" ⚠️ google-play-scraper not installed")
            print(" Run: pip install google-play-scraper")
            return []
        except Exception as e:
            print(f" ❌ Error scraping Play Store: {e}")
            return []

    def save_reviews_to_db(self, reviews: List[Dict]) -> int:
        """
        Save scraped reviews to database
        Returns number of new reviews saved
        """
        if not reviews:
            return 0

        conn = sqlite3.connect(self.db_file)
        cursor = conn.cursor()

        saved_count = 0

        for review in reviews:
            try:
                # INSERT OR IGNORE + the UNIQUE(review_id) constraint makes
                # this idempotent: already-seen reviews are skipped silently.
                cursor.execute("""
                    INSERT OR IGNORE INTO reviews
                    (review_id, product_url, platform, app_name, user_name,
                     review_text, rating, review_date, app_version)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    review.get('review_id'),
                    review.get('product_url', ''),
                    review.get('platform'),
                    review.get('app_name', ''),
                    review.get('user_name'),
                    review.get('review_text'),
                    review.get('rating'),
                    review.get('review_date', ''),
                    review.get('app_version', '')
                ))

                # rowcount is 1 only when the row was actually inserted.
                if cursor.rowcount > 0:
                    saved_count += 1

            except Exception as e:
                print(f" ⚠️ Error saving review: {e}")
                continue

        conn.commit()
        conn.close()

        print(f" ✅ Saved {saved_count} new reviews to database")
        return saved_count

    def scrape_from_urls_file(self, urls_file: str = "urls.txt") -> int:
        """
        Scrape reviews from URLs listed in a text file

        URLs file format (one per line):
            app_store:1234567890:us
            play_store:com.company.app:us

        Returns:
            Total number of new reviews saved across all listed sources.
        """
        print(f"\n 📄 Reading URLs from: {urls_file}")

        if not os.path.exists(urls_file):
            print(f" ⚠️ File not found: {urls_file}")
            return 0

        total_saved = 0

        # Blank lines and '#' comment lines are ignored.
        with open(urls_file, 'r') as f:
            urls = [line.strip() for line in f if line.strip() and not line.startswith('#')]

        print(f" ✅ Found {len(urls)} URLs")

        for i, url in enumerate(urls, 1):
            print(f"\n [{i}/{len(urls)}] Processing: {url}")

            # Format: <platform>:<app_id>[:<country>], country defaults to 'us'.
            parts = url.split(':')
            if len(parts) < 2:
                print(f" ⚠️ Invalid format: {url}")
                continue

            platform = parts[0].lower()
            app_id = parts[1]
            country = parts[2] if len(parts) > 2 else 'us'

            if platform == 'app_store':
                reviews = self.scrape_app_store_rss(app_id, country)
            elif platform == 'play_store':
                reviews = self.scrape_play_store_api(app_id, country)
            else:
                print(f" ⚠️ Unknown platform: {platform}")
                continue

            saved = self.save_reviews_to_db(reviews)
            total_saved += saved

            # Be nice to servers
            time.sleep(2)

        return total_saved

    def get_review_count(self) -> int:
        """Get total number of reviews in database"""
        conn = sqlite3.connect(self.db_file)
        cursor = conn.cursor()
        cursor.execute("SELECT COUNT(*) FROM reviews")
        count = cursor.fetchone()[0]
        conn.close()
        return count
273
+ return count
274
+
275
+
276
+ if __name__ == "__main__":
277
+ # Run Stage 0 scraper - reads from urls.txt
278
+ print("\n" + "="*70)
279
+ print("🕷️ STAGE 0: WEB SCRAPER")
280
+ print("="*70)
281
+
282
+ scraper = Stage0WebScraper(db_file="review_database.db")
283
+
284
+ # Create table if not exists
285
+ print("\n📁 Ensuring database table exists...")
286
+ scraper.create_reviews_table()
287
+
288
+ # Scrape from urls.txt
289
+ print("\n🔄 Starting scraping from urls.txt...")
290
+ total_saved = scraper.scrape_from_urls_file("urls.txt")
291
+
292
+ # Show results
293
+ total_reviews = scraper.get_review_count()
294
+
295
+ print("\n" + "="*70)
296
+ print("✅ SCRAPING COMPLETE!")
297
+ print("="*70)
298
+ print(f"📊 New reviews saved: {total_saved}")
299
+ print(f"📊 Total reviews in database: {total_reviews}")
300
+ print("\n🎯 Next step: Run the analysis!")
301
+ print(" python main_langgraph.py")
302
+ print("="*70 + "\n")
stage4_batch_analysis.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stage 4: Batch Analysis & Aggregation
3
+ - Aggregate insights across all processed reviews
4
+ - Identify patterns, trends, critical issues
5
+ - Generate actionable recommendations
6
+ """
7
+
8
+ import json
9
+ from typing import Dict, Any, List
10
+ from collections import Counter
11
+
12
+ class Stage4BatchAnalysis:
13
+ """
14
+ Stage 4: Batch-level intelligence and recommendations
15
+ """
16
+
17
    def __init__(self):
        # Stateless analyzer; the print is just a startup breadcrumb.
        print(" 📊 Stage 4: Batch Analysis initialized")
19
+
20
    def analyze_batch(self, reviews: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Analyze a batch of processed reviews

        Aggregates the per-review Stage 1-3 fields into distributions,
        derives critical issues / quick wins / churn risk, and returns a
        single flat insights dict (empty-shaped via _empty_insights when
        no reviews are given).
        """
        if not reviews:
            print(" ⚠️ No reviews to analyze")
            return self._empty_insights()

        print(f"\n 📊 Analyzing batch of {len(reviews)} reviews...")

        # Initialize counters
        total = len(reviews)

        # Sentiment distribution (missing field counts as NEUTRAL)
        sentiment_counts = Counter()
        for review in reviews:
            sentiment = review.get('stage3_final_sentiment', 'NEUTRAL')
            sentiment_counts[sentiment] += 1

        print(f" 📈 Sentiment: "
              f"POS={sentiment_counts.get('POSITIVE', 0)}, "
              f"NEU={sentiment_counts.get('NEUTRAL', 0)}, "
              f"NEG={sentiment_counts.get('NEGATIVE', 0)}")

        # Priority distribution
        priority_counts = Counter()
        for review in reviews:
            priority = review.get('stage1_llm1_priority', 'unknown')
            priority_counts[priority] += 1

        print(f" 🎯 Priority: "
              f"Critical={priority_counts.get('critical', 0)}, "
              f"High={priority_counts.get('high', 0)}, "
              f"Medium={priority_counts.get('medium', 0)}, "
              f"Low={priority_counts.get('low', 0)}")

        # Department routing
        dept_counts = Counter()
        for review in reviews:
            dept = review.get('stage1_llm1_department', 'unknown')
            dept_counts[dept] += 1

        print(f" 🏢 Departments: "
              f"Eng={dept_counts.get('engineering', 0)}, "
              f"UX={dept_counts.get('ux', 0)}, "
              f"Support={dept_counts.get('support', 0)}, "
              f"Business={dept_counts.get('business', 0)}")

        # Emotion distribution
        emotion_counts = Counter()
        for review in reviews:
            emotion = review.get('stage1_llm2_emotion', 'unknown')
            emotion_counts[emotion] += 1

        # Review type distribution
        type_counts = Counter()
        for review in reviews:
            review_type = review.get('stage1_llm1_type', 'unknown')
            type_counts[review_type] += 1

        # Identify critical issues
        critical_issues = self._identify_critical_issues(reviews)
        print(f" 🚨 Critical Issues: {len(critical_issues)}")

        # Identify quick wins
        quick_wins = self._identify_quick_wins(reviews)
        print(f" ⚡ Quick Wins: {len(quick_wins)}")

        # Calculate churn risk
        churn_risk = self._calculate_churn_risk(reviews)
        print(f" ⚠️ Churn Risk: {churn_risk:.1f}%")

        # Model agreement rate (share of reviews where Stage 2 models agreed)
        agreement_count = sum(1 for r in reviews if r.get('stage2_agreement', False))
        agreement_rate = (agreement_count / total * 100) if total > 0 else 0
        print(f" 🤝 Model Agreement: {agreement_rate:.1f}%")

        # Generate recommendations
        recommendations = self._generate_recommendations(
            sentiment_counts, priority_counts, dept_counts,
            critical_issues, quick_wins, churn_risk
        )

        # Compile batch insights (flat scalars plus full distribution dicts)
        insights = {
            'total_reviews': total,

            # Sentiment
            'sentiment_positive': sentiment_counts.get('POSITIVE', 0),
            'sentiment_neutral': sentiment_counts.get('NEUTRAL', 0),
            'sentiment_negative': sentiment_counts.get('NEGATIVE', 0),
            'sentiment_distribution': dict(sentiment_counts),

            # Priority
            'priority_critical': priority_counts.get('critical', 0),
            'priority_high': priority_counts.get('high', 0),
            'priority_medium': priority_counts.get('medium', 0),
            'priority_low': priority_counts.get('low', 0),
            'priority_distribution': dict(priority_counts),

            # Department
            'dept_engineering': dept_counts.get('engineering', 0),
            'dept_ux': dept_counts.get('ux', 0),
            'dept_support': dept_counts.get('support', 0),
            'dept_business': dept_counts.get('business', 0),
            'department_distribution': dict(dept_counts),

            # Additional insights
            'emotion_distribution': dict(emotion_counts),
            'type_distribution': dict(type_counts),
            'model_agreement_rate': agreement_rate,
            'churn_risk': churn_risk,

            # Actionable lists
            'critical_issues': critical_issues,
            'quick_wins': quick_wins,
            'recommendations': recommendations
        }

        return insights
140
+
141
+ def _identify_critical_issues(self, reviews: List[Dict]) -> List[Dict]:
142
+ """Identify critical issues requiring immediate attention"""
143
+ critical = []
144
+
145
+ for review in reviews:
146
+ priority = review.get('stage1_llm1_priority', '')
147
+ sentiment = review.get('stage3_final_sentiment', '')
148
+ needs_review = review.get('stage3_needs_human_review', False)
149
+
150
+ if priority == 'critical' or (sentiment == 'NEGATIVE' and needs_review):
151
+ critical.append({
152
+ 'review_id': review.get('review_id', 'unknown'),
153
+ 'type': review.get('stage1_llm1_type', 'unknown'),
154
+ 'department': review.get('stage1_llm1_department', 'unknown'),
155
+ 'reasoning': review.get('stage3_reasoning', ''),
156
+ 'action': review.get('stage3_action_recommendation', ''),
157
+ 'rating': review.get('rating', 0)
158
+ })
159
+
160
+ # Sort by rating (lowest first)
161
+ critical.sort(key=lambda x: x['rating'])
162
+
163
+ return critical[:10] # Top 10 critical issues
164
+
165
+ def _identify_quick_wins(self, reviews: List[Dict]) -> List[Dict]:
166
+ """Identify easy-to-fix issues for quick wins"""
167
+ quick_wins = []
168
+
169
+ for review in reviews:
170
+ review_type = review.get('stage1_llm1_type', '')
171
+ priority = review.get('stage1_llm1_priority', '')
172
+ sentiment = review.get('stage3_final_sentiment', '')
173
+
174
+ # Suggestions with low priority = quick wins
175
+ if review_type == 'suggestion' and priority in ['low', 'medium']:
176
+ quick_wins.append({
177
+ 'review_id': review.get('review_id', 'unknown'),
178
+ 'suggestion': review.get('review_text', '')[:100],
179
+ 'department': review.get('stage1_llm1_department', 'unknown'),
180
+ 'action': review.get('stage3_action_recommendation', ''),
181
+ 'rating': review.get('rating', 0)
182
+ })
183
+
184
+ return quick_wins[:10] # Top 10 quick wins
185
+
186
+ def _calculate_churn_risk(self, reviews: List[Dict]) -> float:
187
+ """Calculate overall churn risk percentage"""
188
+ if not reviews:
189
+ return 0.0
190
+
191
+ churn_indicators = 0
192
+
193
+ for review in reviews:
194
+ user_type = review.get('stage1_llm2_user_type', '')
195
+ sentiment = review.get('stage3_final_sentiment', '')
196
+ rating = review.get('rating', 3)
197
+
198
+ # Churn indicators
199
+ if user_type == 'churning_user':
200
+ churn_indicators += 2
201
+ elif sentiment == 'NEGATIVE' and rating <= 2:
202
+ churn_indicators += 1
203
+ elif rating == 1:
204
+ churn_indicators += 1
205
+
206
+ # Calculate percentage
207
+ max_possible = len(reviews) * 2
208
+ churn_risk = (churn_indicators / max_possible * 100) if max_possible > 0 else 0.0
209
+
210
+ return min(churn_risk, 100.0)
211
+
212
+ def _generate_recommendations(self, sentiment_counts, priority_counts,
213
+ dept_counts, critical_issues, quick_wins,
214
+ churn_risk) -> List[str]:
215
+ """Generate actionable recommendations"""
216
+ recommendations = []
217
+
218
+ # Sentiment-based
219
+ total = sum(sentiment_counts.values())
220
+ if total > 0:
221
+ neg_pct = (sentiment_counts.get('NEGATIVE', 0) / total * 100)
222
+ if neg_pct > 40:
223
+ recommendations.append(
224
+ f"🚨 HIGH: {neg_pct:.0f}% negative sentiment. Immediate investigation needed."
225
+ )
226
+ elif neg_pct > 25:
227
+ recommendations.append(
228
+ f"⚠️ MEDIUM: {neg_pct:.0f}% negative sentiment. Monitor closely."
229
+ )
230
+
231
+ # Priority-based
232
+ if priority_counts.get('critical', 0) > 0:
233
+ recommendations.append(
234
+ f"🔥 URGENT: {priority_counts['critical']} critical issues require immediate attention."
235
+ )
236
+
237
+ # Department-based
238
+ if dept_counts:
239
+ top_dept = max(dept_counts, key=dept_counts.get)
240
+ top_count = dept_counts[top_dept]
241
+ recommendations.append(
242
+ f"🎯 FOCUS: {top_count} issues routed to {top_dept} department."
243
+ )
244
+
245
+ # Churn risk
246
+ if churn_risk > 30:
247
+ recommendations.append(
248
+ f"⚠️ CHURN: {churn_risk:.0f}% churn risk detected. Implement retention strategy."
249
+ )
250
+
251
+ # Quick wins
252
+ if quick_wins:
253
+ recommendations.append(
254
+ f"⚡ OPPORTUNITY: {len(quick_wins)} quick wins available for easy improvements."
255
+ )
256
+
257
+ return recommendations
258
+
259
+ def _empty_insights(self) -> Dict[str, Any]:
260
+ """Return empty insights structure"""
261
+ return {
262
+ 'total_reviews': 0,
263
+ 'sentiment_positive': 0,
264
+ 'sentiment_neutral': 0,
265
+ 'sentiment_negative': 0,
266
+ 'priority_critical': 0,
267
+ 'priority_high': 0,
268
+ 'priority_medium': 0,
269
+ 'priority_low': 0,
270
+ 'dept_engineering': 0,
271
+ 'dept_ux': 0,
272
+ 'dept_support': 0,
273
+ 'dept_business': 0,
274
+ 'critical_issues': [],
275
+ 'quick_wins': [],
276
+ 'recommendations': []
277
+ }
278
+
279
+
280
+ if __name__ == "__main__":
281
+ # Test Stage 4
282
+ print("\n" + "="*60)
283
+ print("🧪 TESTING STAGE 4 BATCH ANALYSIS")
284
+ print("="*60)
285
+
286
+ # Sample processed reviews
287
+ sample_reviews = [
288
+ {
289
+ 'review_id': '001',
290
+ 'review_text': 'App crashes!',
291
+ 'rating': 1,
292
+ 'stage1_llm1_type': 'bug_report',
293
+ 'stage1_llm1_department': 'engineering',
294
+ 'stage1_llm1_priority': 'critical',
295
+ 'stage1_llm2_user_type': 'power_user',
296
+ 'stage1_llm2_emotion': 'frustration',
297
+ 'stage2_agreement': True,
298
+ 'stage3_final_sentiment': 'NEGATIVE',
299
+ 'stage3_needs_human_review': True,
300
+ 'stage3_reasoning': 'Critical bug',
301
+ 'stage3_action_recommendation': 'Fix immediately'
302
+ },
303
+ {
304
+ 'review_id': '002',
305
+ 'review_text': 'Great app!',
306
+ 'rating': 5,
307
+ 'stage1_llm1_type': 'praise',
308
+ 'stage1_llm1_department': 'ux',
309
+ 'stage1_llm1_priority': 'low',
310
+ 'stage1_llm2_user_type': 'regular_user',
311
+ 'stage1_llm2_emotion': 'joy',
312
+ 'stage2_agreement': True,
313
+ 'stage3_final_sentiment': 'POSITIVE',
314
+ 'stage3_needs_human_review': False
315
+ }
316
+ ]
317
+
318
+ stage4 = Stage4BatchAnalysis()
319
+ insights = stage4.analyze_batch(sample_reviews)
320
+
321
+ print("\n📊 BATCH INSIGHTS:")
322
+ print(json.dumps(insights, indent=2))
323
+ print("\n✅ Stage 4 test complete!")