Update app.py
Browse files
app.py
CHANGED
|
@@ -35,122 +35,223 @@ st.markdown("""
|
|
| 35 |
# ------------------------------------------------------------------
|
| 36 |
@st.cache_resource
|
| 37 |
def load_models():
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
-
#
|
| 50 |
-
|
|
|
|
|
|
|
| 51 |
|
| 52 |
# ------------------------------------------------------------------
|
| 53 |
# HELPER FUNCTIONS
|
| 54 |
# ------------------------------------------------------------------
|
| 55 |
def clean_text(text):
|
| 56 |
text = text.lower()
|
| 57 |
-
text = re.sub(r'http\S+', '', text)
|
| 58 |
-
text = re.sub(r'[^\w\s]', '', text)
|
| 59 |
return text
|
| 60 |
|
| 61 |
def get_wordcloud(text):
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
# ------------------------------------------------------------------
|
| 70 |
# CORE ANALYSIS LOGIC
|
| 71 |
# ------------------------------------------------------------------
|
| 72 |
|
| 73 |
def analyze_english(text):
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
}
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
def analyze_multilingual(text):
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
|
| 144 |
# ------------------------------------------------------------------
|
| 145 |
# UI LAYOUT
|
| 146 |
# ------------------------------------------------------------------
|
| 147 |
|
| 148 |
# Sidebar
|
| 149 |
-
st.sidebar.title("Configuration")
|
| 150 |
language = st.sidebar.selectbox("Select Language", ["English", "Hindi (เคนเคฟเคจเฅเคฆเฅ)", "Hinglish (Mixed)"])
|
| 151 |
-
mode = st.sidebar.selectbox("Mode", ["Real-time Analysis", "Batch Processing"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
-
st.title("๐ง Sentiment Analytics
|
|
|
|
| 154 |
st.markdown("---")
|
| 155 |
|
| 156 |
if mode == "Real-time Analysis":
|
|
@@ -163,33 +264,41 @@ if mode == "Real-time Analysis":
|
|
| 163 |
placeholder_text = "Type in Hinglish (e.g., Product bahut achha hai but delivery slow thi)"
|
| 164 |
label_text = "Enter Hinglish Text:"
|
| 165 |
else:
|
| 166 |
-
placeholder_text = "Type your text here..."
|
| 167 |
label_text = "Enter English Text:"
|
| 168 |
|
| 169 |
user_input = st.text_area(label_text, height=150, placeholder=placeholder_text)
|
| 170 |
|
| 171 |
-
if st.button("Analyze Sentiment", type="primary"):
|
| 172 |
if not user_input.strip():
|
| 173 |
-
st.warning("Please enter some text first.")
|
| 174 |
else:
|
| 175 |
-
with st.spinner("
|
| 176 |
start_time = time.time()
|
| 177 |
|
| 178 |
# Routing Logic
|
| 179 |
if language == "English":
|
| 180 |
result = analyze_english(user_input)
|
| 181 |
else:
|
| 182 |
-
# Both Hindi and Hinglish use the Multilingual Model
|
| 183 |
result = analyze_multilingual(user_input)
|
| 184 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
latency = time.time() - start_time
|
| 186 |
|
| 187 |
# 1. Main Verdict Display
|
| 188 |
-
st.markdown("### Analysis Results")
|
| 189 |
col1, col2, col3 = st.columns(3)
|
| 190 |
|
| 191 |
-
color_map = {
|
| 192 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
|
| 194 |
with col1:
|
| 195 |
st.markdown(f"""
|
|
@@ -215,85 +324,145 @@ if mode == "Real-time Analysis":
|
|
| 215 |
</div>
|
| 216 |
""", unsafe_allow_html=True)
|
| 217 |
|
| 218 |
-
# 2. Detailed Breakdown
|
| 219 |
st.markdown("---")
|
| 220 |
c1, c2 = st.columns([1, 1])
|
| 221 |
|
| 222 |
with c1:
|
| 223 |
st.subheader("๐ Model Consensus")
|
| 224 |
-
if language == "English":
|
| 225 |
-
|
| 226 |
-
|
|
|
|
|
|
|
| 227 |
st.table(df_breakdown)
|
| 228 |
|
| 229 |
if result['verdict'] == 'ambiguous':
|
| 230 |
st.error("โ ๏ธ Conflict Detected: Models disagree. Human review recommended.")
|
| 231 |
else:
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
st.caption("Note: XLM-RoBERTa is optimized for 100+ languages including Hindi & Code-mixed text.")
|
| 235 |
|
| 236 |
with c2:
|
| 237 |
-
# Confidence Chart
|
| 238 |
st.subheader("๐ Confidence Scores")
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
|
| 243 |
-
# 3.
|
| 244 |
-
if
|
| 245 |
st.subheader("โ๏ธ Contextual Word Cloud")
|
| 246 |
try:
|
| 247 |
cleaned = clean_text(user_input)
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
|
| 253 |
-
# 4. Human Feedback Loop
|
| 254 |
st.markdown("---")
|
| 255 |
-
with st.expander("๐
|
| 256 |
-
st.write("
|
| 257 |
-
feedback = st.radio("
|
|
|
|
|
|
|
| 258 |
|
| 259 |
-
if st.button("Submit
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
"user_correction": feedback,
|
| 265 |
-
"timestamp": datetime.now().isoformat()
|
| 266 |
-
}
|
| 267 |
-
# In production, this would go to a database
|
| 268 |
-
st.success("โ
Feedback Logged. This sample has been added to the retraining queue.")
|
| 269 |
|
| 270 |
elif mode == "Batch Processing":
|
| 271 |
-
st.info("Upload a CSV file
|
| 272 |
-
uploaded_file = st.file_uploader("
|
| 273 |
|
| 274 |
-
if uploaded_file:
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
if
|
| 278 |
-
|
| 279 |
-
|
|
|
|
| 280 |
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
res = analyze_english(txt)
|
| 286 |
-
else:
|
| 287 |
-
res = analyze_multilingual(txt)
|
| 288 |
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
# ------------------------------------------------------------------
|
| 36 |
@st.cache_resource
def load_models():
    """Load and cache every sentiment model used by the app.

    Returns:
        tuple: ``(roberta, distilbert, vader, multilingual)``. Any slot may
        be ``None`` if that model failed to load; VADER is the only member
        every fallback path tries to keep alive.
    """
    try:
        st.info("🔄 Loading AI models... This may take a few minutes on first run.")

        # English Models (Ensemble)
        roberta = pipeline(
            "sentiment-analysis",
            model="cardiffnlp/twitter-roberta-base-sentiment-latest",
            tokenizer="cardiffnlp/twitter-roberta-base-sentiment-latest"
        )

        distilbert = pipeline(
            "sentiment-analysis",
            model="distilbert-base-uncased-finetuned-sst-2-english"
        )

        vader = SentimentIntensityAnalyzer()

        # Use a more stable multilingual model
        multilingual = pipeline(
            "sentiment-analysis",
            model="nlptown/bert-base-multilingual-uncased-sentiment"
        )

        st.success("✅ All models loaded successfully!")
        return roberta, distilbert, vader, multilingual

    except Exception as e:
        st.error(f"❌ Error loading models: {str(e)}")
        # Return fallback models: try the default English pipeline first,
        # and if even that fails keep only rule-based VADER.
        try:
            vader = SentimentIntensityAnalyzer()
            distilbert = pipeline("sentiment-analysis")
            return None, distilbert, vader, None
        except Exception:
            # BUGFIX: was a bare `except:`, which would also swallow
            # SystemExit/KeyboardInterrupt. Narrowed to Exception.
            return None, None, SentimentIntensityAnalyzer(), None
| 73 |
+
|
| 74 |
+
# Load models with progress indication
|
| 75 |
+
with st.spinner("Initializing AI models..."):
|
| 76 |
+
roberta_model, distilbert_model, vader_model, multi_model = load_models()
|
| 77 |
|
| 78 |
+
# Check if essential models loaded
|
| 79 |
+
if vader_model is None:
|
| 80 |
+
st.error("โ Critical error: Failed to load essential models. Please refresh the page.")
|
| 81 |
+
st.stop()
|
| 82 |
|
| 83 |
# ------------------------------------------------------------------
|
| 84 |
# HELPER FUNCTIONS
|
| 85 |
# ------------------------------------------------------------------
|
| 86 |
def clean_text(text):
    """Normalize raw input: lowercase it, then strip URLs and punctuation."""
    lowered = text.lower()
    without_urls = re.sub(r'http\S+', '', lowered)
    return re.sub(r'[^\w\s]', '', without_urls)
| 91 |
|
| 92 |
def get_wordcloud(text):
    """Render *text* as a word-cloud matplotlib figure.

    Returns the figure, or None (with an error shown in the UI) if
    generation or plotting fails.
    """
    try:
        cloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            max_words=100,
            colormap='viridis',
        ).generate(text)
        figure, axis = plt.subplots(figsize=(10, 5))
        axis.imshow(cloud, interpolation='bilinear')
        axis.axis('off')
        return figure
    except Exception as e:
        st.error(f"WordCloud error: {e}")
        return None
| 108 |
|
| 109 |
# ------------------------------------------------------------------
|
| 110 |
# CORE ANALYSIS LOGIC
|
| 111 |
# ------------------------------------------------------------------
|
| 112 |
|
| 113 |
def analyze_english(text):
    """Ensemble sentiment analysis for English text.

    Runs RoBERTa and DistilBERT (when loaded) plus VADER, then takes a
    majority vote. Returns a dict with keys 'verdict', 'confidence',
    'breakdown' and 'scores', or None on unexpected failure.
    """
    try:
        # Ensure text is not empty
        if not text.strip():
            return {
                'verdict': 'neutral',
                'confidence': 'Low (No text)',
                'breakdown': {'Error': 'No text provided'},
                'scores': {'Error': 0.0}
            }

        results = {}

        # 1. RoBERTa (if available). Input is truncated to 512 chars as a
        # rough guard against the model's sequence limit.
        if roberta_model is not None:
            try:
                rob_out = roberta_model(text[:512])[0]
                # BUGFIX: the "-latest" cardiffnlp checkpoint emits
                # human-readable labels ("negative"/"neutral"/"positive"),
                # not "LABEL_0".."LABEL_2". The old equality checks against
                # 'LABEL_0'/'LABEL_1' never matched, so every prediction
                # fell through to 'positive'. Accept both label schemes.
                rob_label = rob_out['label'].lower()
                if rob_label in ('label_0', 'negative'):
                    rob_sent = 'negative'
                elif rob_label in ('label_1', 'neutral'):
                    rob_sent = 'neutral'
                else:
                    rob_sent = 'positive'
                results['roberta'] = (rob_sent, rob_out['score'])
            except Exception as e:
                st.warning(f"RoBERTa model unavailable: {e}")

        # 2. VADER (always available). Standard compound thresholds:
        # >= 0.05 positive, <= -0.05 negative, else neutral.
        vader_out = vader_model.polarity_scores(text)
        compound = vader_out['compound']
        if compound >= 0.05:
            vader_sent = 'positive'
        elif compound <= -0.05:
            vader_sent = 'negative'
        else:
            vader_sent = 'neutral'
        results['vader'] = (vader_sent, abs(compound))

        # 3. DistilBERT (if available)
        if distilbert_model is not None:
            try:
                bert_out = distilbert_model(text[:512])[0]
                bert_sent = bert_out['label'].lower()
                results['distilbert'] = (bert_sent, bert_out['score'])
            except Exception as e:
                st.warning(f"DistilBERT model unavailable: {e}")

        # If only VADER is available, skip voting entirely.
        if len(results) == 1 and 'vader' in results:
            return {
                'verdict': vader_sent,
                'confidence': 'Medium (VADER only)',
                'breakdown': {'VADER': vader_sent},
                'scores': {'VADER': abs(compound)}
            }

        # Consensus Logic (Voting)
        votes = [sent for sent, score in results.values()]
        count = Counter(votes)
        winner, vote_count = count.most_common(1)[0]

        # Conflict Detection: every model voted differently -> ambiguous.
        if len(count) == len(results) or vote_count == 1:
            final_verdict = "ambiguous"
            confidence = f"Low ({vote_count}/{len(results)} agreement)"
        else:
            final_verdict = winner
            confidence = "High" if vote_count == len(results) else "Medium"

        return {
            'verdict': final_verdict,
            'confidence': confidence,
            'breakdown': {model: sent for model, (sent, score) in results.items()},
            'scores': {model: score for model, (sent, score) in results.items()}
        }

    except Exception as e:
        st.error(f"Analysis error: {e}")
        return None
| 194 |
|
| 195 |
def analyze_multilingual(text):
    """Sentiment analysis for Hindi / code-mixed text via a multilingual model.

    Returns the same result-dict shape as analyze_english(): keys
    'verdict', 'confidence', 'breakdown', 'scores'. Falls back to
    analyze_english() when the multilingual model is missing or raises.
    """
    try:
        if not text.strip():
            # Empty input short-circuits to a neutral, low-confidence result.
            return {
                'verdict': 'neutral',
                'confidence': 'Low (No text)',
                'breakdown': {'Error': 'No text provided'},
                'scores': {'Error': 0.0}
            }

        # Use multilingual model if available, otherwise fallback to English analysis
        if multi_model is not None:
            # Truncated to 512 chars as a rough guard against the model's
            # sequence limit (chars, not tokens — intentional approximation).
            result = multi_model(text[:512])[0]
            label_raw = str(result['label'])
            score = result['score']

            # Map star ratings to sentiment (nlptown model uses 1-5 stars,
            # e.g. "4 stars"): 1-2 negative, 3 neutral, 4-5 positive.
            if '1' in label_raw or '2' in label_raw:
                sentiment = "negative"
            elif '3' in label_raw:
                sentiment = "neutral"
            else:  # 4 or 5 stars
                sentiment = "positive"

            return {
                'verdict': sentiment,
                'confidence': f"{score:.2f}",
                'breakdown': {'Multilingual BERT': f"{sentiment.title()} ({score:.2f})"},
                'scores': {'Model Confidence': score}
            }
        else:
            # Fallback to English analysis
            st.info("🔄 Multilingual model unavailable, using English analysis...")
            return analyze_english(text)

    except Exception as e:
        st.error(f"Multilingual analysis error: {e}")
        # Fallback to English analysis
        return analyze_english(text)
| 234 |
|
| 235 |
# ------------------------------------------------------------------
|
| 236 |
# UI LAYOUT
|
| 237 |
# ------------------------------------------------------------------
|
| 238 |
|
| 239 |
# Sidebar
st.sidebar.title("⚙️ Configuration")
language = st.sidebar.selectbox("Select Language", ["English", "Hindi (हिन्दी)", "Hinglish (Mixed)"])
mode = st.sidebar.selectbox("Analysis Mode", ["Real-time Analysis", "Batch Processing"])

st.sidebar.markdown("---")
# BUGFIX: this was a plain (non-f) triple-quoted string, so the
# {'✅' if ...} placeholders were displayed verbatim instead of being
# evaluated. It must be an f-string for the status flags to interpolate.
st.sidebar.info(f"""
**Model Status:**
- ✅ VADER: Available
- 🤖 RoBERTa: {'✅' if roberta_model else '❌'}
- 📝 DistilBERT: {'✅' if distilbert_model else '❌'}
- 🌍 Multilingual: {'✅' if multi_model else '❌'}
""")

st.title("🧠 Sentiment Analytics Pro")
st.markdown("Advanced AI-powered sentiment analysis across multiple languages")
st.markdown("---")
| 256 |
|
| 257 |
if mode == "Real-time Analysis":
|
|
|
|
| 264 |
placeholder_text = "Type in Hinglish (e.g., Product bahut achha hai but delivery slow thi)"
|
| 265 |
label_text = "Enter Hinglish Text:"
|
| 266 |
else:
|
| 267 |
+
placeholder_text = "Type your text here... (e.g., I love this product! Amazing quality.)"
|
| 268 |
label_text = "Enter English Text:"
|
| 269 |
|
| 270 |
user_input = st.text_area(label_text, height=150, placeholder=placeholder_text)
|
| 271 |
|
| 272 |
+
if st.button("๐ Analyze Sentiment", type="primary", use_container_width=True):
|
| 273 |
if not user_input.strip():
|
| 274 |
+
st.warning("โ ๏ธ Please enter some text first.")
|
| 275 |
else:
|
| 276 |
+
with st.spinner("๐ฎ Analyzing sentiment with AI models..."):
|
| 277 |
start_time = time.time()
|
| 278 |
|
| 279 |
# Routing Logic
|
| 280 |
if language == "English":
|
| 281 |
result = analyze_english(user_input)
|
| 282 |
else:
|
|
|
|
| 283 |
result = analyze_multilingual(user_input)
|
| 284 |
|
| 285 |
+
if result is None:
|
| 286 |
+
st.error("โ Analysis failed. Please try again with different text.")
|
| 287 |
+
st.stop()
|
| 288 |
+
|
| 289 |
latency = time.time() - start_time
|
| 290 |
|
| 291 |
# 1. Main Verdict Display
|
| 292 |
+
st.markdown("### ๐ Analysis Results")
|
| 293 |
col1, col2, col3 = st.columns(3)
|
| 294 |
|
| 295 |
+
color_map = {
|
| 296 |
+
'positive': '#10B981',
|
| 297 |
+
'negative': '#EF4444',
|
| 298 |
+
'neutral': '#F59E0B',
|
| 299 |
+
'ambiguous': '#6B7280'
|
| 300 |
+
}
|
| 301 |
+
verdict_color = color_map.get(result['verdict'], '#3B82F6')
|
| 302 |
|
| 303 |
with col1:
|
| 304 |
st.markdown(f"""
|
|
|
|
| 324 |
</div>
|
| 325 |
""", unsafe_allow_html=True)
|
| 326 |
|
| 327 |
+
# 2. Detailed Breakdown
|
| 328 |
st.markdown("---")
|
| 329 |
c1, c2 = st.columns([1, 1])
|
| 330 |
|
| 331 |
with c1:
|
| 332 |
st.subheader("๐ Model Consensus")
|
| 333 |
+
if language == "English" and len(result['breakdown']) > 1:
|
| 334 |
+
df_breakdown = pd.DataFrame(
|
| 335 |
+
list(result['breakdown'].items()),
|
| 336 |
+
columns=['Model', 'Prediction']
|
| 337 |
+
)
|
| 338 |
st.table(df_breakdown)
|
| 339 |
|
| 340 |
if result['verdict'] == 'ambiguous':
|
| 341 |
st.error("โ ๏ธ Conflict Detected: Models disagree. Human review recommended.")
|
| 342 |
else:
|
| 343 |
+
for model, prediction in result['breakdown'].items():
|
| 344 |
+
st.info(f"**{model}**: {prediction}")
|
|
|
|
| 345 |
|
| 346 |
with c2:
|
|
|
|
| 347 |
st.subheader("๐ Confidence Scores")
|
| 348 |
+
if result['scores']:
|
| 349 |
+
df_scores = pd.DataFrame(
|
| 350 |
+
list(result['scores'].items()),
|
| 351 |
+
columns=['Source', 'Score']
|
| 352 |
+
)
|
| 353 |
+
fig = px.bar(
|
| 354 |
+
df_scores,
|
| 355 |
+
x='Source',
|
| 356 |
+
y='Score',
|
| 357 |
+
range_y=[0,1],
|
| 358 |
+
color='Score',
|
| 359 |
+
color_continuous_scale='Blues'
|
| 360 |
+
)
|
| 361 |
+
fig.update_layout(showlegend=False)
|
| 362 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 363 |
|
| 364 |
+
# 3. Word Cloud
|
| 365 |
+
if len(user_input) > 10:
|
| 366 |
st.subheader("โ๏ธ Contextual Word Cloud")
|
| 367 |
try:
|
| 368 |
cleaned = clean_text(user_input)
|
| 369 |
+
if len(cleaned.split()) >= 3: # Only generate if enough words
|
| 370 |
+
fig_wc = get_wordcloud(cleaned)
|
| 371 |
+
if fig_wc:
|
| 372 |
+
st.pyplot(fig_wc)
|
| 373 |
+
else:
|
| 374 |
+
st.info("๐ Word cloud not available for this text.")
|
| 375 |
+
else:
|
| 376 |
+
st.info("๐ Add more text for word cloud visualization.")
|
| 377 |
+
except Exception as e:
|
| 378 |
+
st.info("๐ Word cloud not available for this text type.")
|
| 379 |
|
| 380 |
+
# 4. Human Feedback Loop
|
| 381 |
st.markdown("---")
|
| 382 |
+
with st.expander("๐ Help Improve Accuracy (Report Incorrect Results)"):
|
| 383 |
+
st.write("Your feedback helps train better AI models!")
|
| 384 |
+
feedback = st.radio("What should the correct sentiment be?",
|
| 385 |
+
["Positive", "Negative", "Neutral"],
|
| 386 |
+
horizontal=True)
|
| 387 |
|
| 388 |
+
if st.button("Submit Correction"):
|
| 389 |
+
st.success("""
|
| 390 |
+
โ
Thank you! Your feedback has been recorded.
|
| 391 |
+
This helps improve the AI model for everyone.
|
| 392 |
+
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
|
| 394 |
elif mode == "Batch Processing":
    st.info("📁 Upload a CSV file with a 'text' column for batch analysis")
    uploaded_file = st.file_uploader("Choose CSV file", type=['csv'])

    if uploaded_file is not None:
        try:
            df = pd.read_csv(uploaded_file)
            if 'text' not in df.columns:
                st.error("❌ CSV file must contain a column named 'text'")
            else:
                st.success(f"✅ Loaded {len(df)} records")

                if st.button("🔮 Process Batch Analysis", type="primary", use_container_width=True):
                    results = []
                    progress_bar = st.progress(0)
                    status_text = st.empty()

                    for i, row in df.iterrows():
                        status_text.text(f"Processing {i+1}/{len(df)}...")
                        txt = str(row['text'])

                        # Route each row to the language-appropriate analyzer
                        # selected in the sidebar.
                        if language == "English":
                            res = analyze_english(txt)
                        else:
                            res = analyze_multilingual(txt)

                        # Analyzers return None on failure; record a marker
                        # so the output row count always matches the input.
                        if res:
                            results.append(res['verdict'])
                        else:
                            results.append('analysis_error')

                        # NOTE(review): assumes the default RangeIndex from
                        # read_csv; a re-indexed frame would break this
                        # progress math — confirm if df is ever re-indexed.
                        progress_bar.progress((i + 1) / len(df))

                    status_text.text("✅ Analysis complete!")

                    # Add results to dataframe
                    df['sentiment'] = results

                    # Show results
                    st.subheader("📊 Analysis Results")
                    st.dataframe(df, use_container_width=True)

                    # Show summary
                    st.subheader("📈 Summary Statistics")
                    sentiment_counts = df['sentiment'].value_counts()
                    col1, col2, col3 = st.columns(3)

                    with col1:
                        st.metric("Total Records", len(df))
                    with col2:
                        st.metric("Positive", sentiment_counts.get('positive', 0))
                    with col3:
                        st.metric("Negative", sentiment_counts.get('negative', 0))

                    # Download
                    csv = df.to_csv(index=False).encode('utf-8')
                    st.download_button(
                        "💾 Download Results CSV",
                        csv,
                        "sentiment_analysis_results.csv",
                        "text/csv",
                        use_container_width=True
                    )

        except Exception as e:
            st.error(f"❌ Error processing file: {str(e)}")
| 460 |
+
|
| 461 |
+
# Footer
|
| 462 |
+
st.markdown("---")
|
| 463 |
+
st.markdown(
|
| 464 |
+
"<div style='text-align: center; color: #6B7280;'>"
|
| 465 |
+
"Built with โค๏ธ using Streamlit & Hugging Face Transformers"
|
| 466 |
+
"</div>",
|
| 467 |
+
unsafe_allow_html=True
|
| 468 |
+
)
|