Spaces:

ChiragKaushikCK
/

Sentiment_Analyzer_Pro

Sleeping

App Files Files Community

ChiragKaushikCK committed on Nov 30, 2025

Commit

4c354eb

verified ·

1 Parent(s): 49af197

Create app.py

Browse files

Files changed (1) hide show

app.py +386 -0

app.py ADDED Viewed

	@@ -0,0 +1,386 @@

+import streamlit as st
+import pandas as pd
+import torch
+from transformers import pipeline
+from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+import plotly.express as px
+import plotly.graph_objects as go
+import numpy as np
+from datetime import datetime
+import time
+import re
+import string
+import nltk
+from nltk.corpus import stopwords
+from collections import Counter
+from wordcloud import WordCloud
+import matplotlib.pyplot as plt
+# ==========================================
+# 1. SETUP & CONFIGURATION
+# ==========================================
# Page-level Streamlit settings, gathered in one mapping so the values are
# easy to scan before being handed to set_page_config.
_page_settings = {
    "page_title": "Sentiment Intelligence Engine",
    "page_icon": "🧠",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_page_settings)
+# NLTK Data Download (Cached to prevent re-downloading)
@st.cache_resource
def download_nltk_data():
    """Make sure the NLTK stop-word corpus is available, downloading it if absent."""
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        corpus_missing = True
    else:
        corpus_missing = False

    # Only hit the network when the local lookup failed.
    if corpus_missing:
        nltk.download('stopwords')


download_nltk_data()
+# Custom CSS for Professional UI
# Stylesheet for the header, sub-header and metric cards used further down.
# It is injected as raw HTML, hence unsafe_allow_html=True.
custom_css = """
<style>
    .main-header {
        font-size: 2.5rem;
        font-weight: 700;
        color: #1E88E5;
        text-align: center;
        margin-bottom: 1rem;
    }
    .sub-header {
        font-size: 1.1rem;
        color: #555;
        text-align: center;
        margin-bottom: 2rem;
    }
    .metric-card {
        background-color: #ffffff;
        padding: 1.5rem;
        border-radius: 12px;
        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
        text-align: center;
        border-top: 5px solid #1E88E5;
    }
    .stTab {
        font-weight: bold;
    }
</style>
"""
st.markdown(custom_css, unsafe_allow_html=True)
+# ==========================================
+# 2. PREPROCESSING & ANALYTICS LOGIC (YOUR CODE)
+# ==========================================
class TextPreprocessor:
    """
    Normalise raw text before keyword extraction and word-cloud rendering.

    Cleaning lower-cases the text, removes URLs and digits, replaces ASCII
    punctuation with spaces and finally drops English stop words.
    """

    # Compiled once at class-definition time instead of on every call.
    _URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+')
    _DIGIT_PATTERN = re.compile(r'\d+')
    # Punctuation is mapped to a space rather than deleted: "don't" becomes
    # "don t", and both fragments are then checked against the stop-word set.
    # Deleting the apostrophe would produce "dont", which does not equal the
    # stop word "don't" and would survive filtering.
    _PUNCT_TO_SPACE = str.maketrans({ch: ' ' for ch in string.punctuation})

    def __init__(self):
        # Set for O(1) membership tests while filtering tokens.
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text):
        """
        Return a cleaned, space-joined version of ``text``.

        Args:
            text: Raw input. ``None`` is treated as empty; any other non-string
                value is converted with ``str()``.

        Returns:
            Lower-cased text with URLs, digits, punctuation and stop words
            removed, tokens separated by single spaces ("" if nothing is left).
        """
        if text is None:
            return ""
        if not isinstance(text, str):
            text = str(text)

        text = text.lower()
        # URLs go first, while their punctuation is still intact to match on.
        text = self._URL_PATTERN.sub('', text)
        text = self._DIGIT_PATTERN.sub('', text)
        text = text.translate(self._PUNCT_TO_SPACE)

        # split() without arguments also collapses the extra whitespace
        # introduced by the substitutions above.
        kept = [word for word in text.split() if word not in self.stop_words]
        return " ".join(kept)

    def get_keywords(self, text, top_n=10):
        """
        Return the ``top_n`` most frequent cleaned tokens of ``text``.

        Returns:
            A list of ``(token, count)`` tuples, most frequent first.
        """
        tokens = self.clean_text(text).split()
        return Counter(tokens).most_common(top_n)
+# ==========================================
+# 3. SENTIMENT ANALYZER ENGINE
+# ==========================================
class SentimentAnalyzer:
    """
    Ensemble sentiment engine built from three models.

    * ``roberta``    - Hugging Face pipeline (cardiffnlp twitter-roberta).
    * ``vader``      - rule-based ``SentimentIntensityAnalyzer``.
    * ``distilbert`` - Hugging Face pipeline (DistilBERT fine-tuned on SST-2).

    The final verdict is a majority vote; when no two models agree the VADER
    verdict is used.
    """

    # The transformer pipelines only receive this many leading characters.
    _MAX_CHARS = 512

    def __init__(self):
        # Defined up front so a failed load leaves a well-formed (but inert)
        # instance instead of one that is missing attributes.
        self.models = {}
        self.preprocessor = None
        try:
            loaded_models = {
                'roberta': pipeline('sentiment-analysis',
                                    model='cardiffnlp/twitter-roberta-base-sentiment-latest'),
                'vader': SentimentIntensityAnalyzer(),
                'distilbert': pipeline('sentiment-analysis',
                                       model='distilbert-base-uncased-finetuned-sst-2-english')
            }
            loaded_preprocessor = TextPreprocessor()
        except Exception as e:
            # UI boundary: report the problem rather than crash the page.
            st.error(f"Error loading models: {e}")
        else:
            # Publish both together so the instance is never half-initialised.
            self.models = loaded_models
            self.preprocessor = loaded_preprocessor

    def analyze_text(self, text):
        """
        Run all three models on ``text`` and combine their verdicts.

        Args:
            text: The string to analyse.

        Returns:
            A dict with the keys ``roberta``, ``vader``, ``distilbert``,
            ``final_verdict`` and ``metrics``, or ``None`` when the models are
            not loaded or any step fails (the error is shown via ``st.error``).
        """
        if not self.models or self.preprocessor is None:
            st.error("Analysis Error: sentiment models are not loaded.")
            return None

        # perf_counter is monotonic, so the measured latency cannot be skewed
        # by system clock adjustments.
        start_time = time.perf_counter()
        results = {}
        try:
            snippet = text[:self._MAX_CHARS]

            # 1. RoBERTa Analysis (Deep Learning)
            roberta_result = self.models['roberta'](snippet)[0]
            results['roberta'] = {
                'label': roberta_result['label'],
                'score': roberta_result['score'],
                'sentiment': self._map_roberta_sentiment(roberta_result['label'])
            }

            # 2. VADER Analysis (Rule Based) - scored on the full text.
            compound = self.models['vader'].polarity_scores(text)['compound']
            if compound >= 0.05:
                vader_sentiment = 'positive'
            elif compound <= -0.05:
                vader_sentiment = 'negative'
            else:
                vader_sentiment = 'neutral'
            results['vader'] = {'compound': compound, 'sentiment': vader_sentiment}

            # 3. DistilBERT Analysis (Transformer)
            distil_result = self.models['distilbert'](snippet)[0]
            results['distilbert'] = {
                'score': distil_result['score'],
                'sentiment': distil_result['label'].lower()
            }

            # 4. Ensemble decision
            results['final_verdict'] = self._ensemble_decision(results)

            # 5. Metrics and cleaned text
            results['metrics'] = {
                'time_taken': time.perf_counter() - start_time,
                'char_count': len(text),
                'clean_text': self.preprocessor.clean_text(text)
            }
        except Exception as e:
            # UI boundary: surface the failure and let the caller skip this item.
            st.error(f"Analysis Error: {e}")
            return None
        return results

    def _map_roberta_sentiment(self, label):
        """Translate ``LABEL_n`` ids to sentiment names; other labels are lower-cased."""
        mapping = {'LABEL_0': 'negative', 'LABEL_1': 'neutral', 'LABEL_2': 'positive'}
        return mapping.get(label, label.lower())

    def _ensemble_decision(self, results):
        """
        Majority-vote over the three per-model sentiments.

        Args:
            results: Dict holding a ``'sentiment'`` entry under each of the
                keys ``'roberta'``, ``'vader'`` and ``'distilbert'``.

        Returns:
            Dict with ``sentiment`` (the winner), ``confidence`` (``'High'``
            when at least two models agree, otherwise ``'Medium'``) and
            ``agreement`` (``"<votes>/3 Models"``).
        """
        votes = {
            name: results[name]['sentiment']
            for name in ('roberta', 'vader', 'distilbert')
        }
        top_sentiment, top_votes = Counter(votes.values()).most_common(1)[0]

        # With three voters, a top count of 1 means all three disagree.
        # most_common() would then return the first model inserted (RoBERTa),
        # so apply the intended tie-break explicitly: default to VADER.
        if top_votes < 2:
            top_sentiment = votes['vader']

        return {
            'sentiment': top_sentiment,
            'confidence': 'High' if top_votes >= 2 else 'Medium',
            'agreement': f"{top_votes}/3 Models"
        }

    def batch_analyze(self, texts):
        """Analyse each text in order; failed items appear as ``None``."""
        return [self.analyze_text(text) for text in texts]
+# Initialize Application
@st.cache_resource
def load_analyzer():
    """Build the SentimentAnalyzer; st.cache_resource caches the returned instance."""
    engine = SentimentAnalyzer()
    return engine


# Module-level objects used by the UI code below.
analyzer = load_analyzer()
preprocessor = TextPreprocessor()
+# ==========================================
+# 4. VISUALIZATION HELPERS
+# ==========================================
def create_wordcloud(text):
    """
    Render ``text`` as a word-cloud figure.

    Args:
        text: Space-separated words to plot (typically already cleaned).

    Returns:
        A matplotlib ``Figure`` showing the cloud, or ``None`` when ``text`` is
        empty/blank or contains no word that WordCloud can plot.
    """
    if not text or not text.strip():
        return None

    # Imported here so the blank-input path above needs no plotting library.
    from matplotlib.figure import Figure

    try:
        wordcloud = WordCloud(width=800, height=400, background_color='white',
                              colormap='viridis').generate(text)
    except ValueError:
        # generate() raises ValueError when no usable word is left to plot.
        # Callers already treat None as "not enough text".
        return None

    # A stand-alone Figure is not registered with pyplot's global figure
    # manager, so it is garbage-collected once the caller drops it.
    # plt.subplots() figures stay alive until explicitly closed and would
    # pile up on every rerun.
    fig = Figure(figsize=(10, 5))
    ax = fig.subplots()
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    return fig
+# ==========================================
+# 5. USER INTERFACE
+# ==========================================
+# Sidebar
# Sidebar: module selector plus a short architecture blurb.
sidebar = st.sidebar
sidebar.title("⚙️ Control Panel")
sidebar.markdown("---")
analysis_mode = sidebar.radio(
    "Select Module:",
    ["Single Text Analysis", "Batch Processor", "File Upload"],
)
sidebar.markdown("---")
sidebar.info("💡 **System Architecture:**\n\nUses a Hybrid Ensemble approach combining Transformer models (RoBERTa, BERT) with Lexicon-based Logic (VADER) for robust accuracy.")

# Main header: title and subtitle, styled by the CSS classes injected earlier.
for css_class, caption in (
    ("main-header", "Sentiment Intelligence Engine"),
    ("sub-header", "Advanced NLP Analytics with Ensemble Learning"),
):
    st.markdown(f'<div class="{css_class}">{caption}</div>', unsafe_allow_html=True)
+# ----------------------------
+# MODULE 1: SINGLE TEXT
+# ----------------------------
if analysis_mode == "Single Text Analysis":
    text_input = st.text_area("Input Text:", height=150, placeholder="Type a review, tweet, or feedback here...")

    # Read the button state on its own so an empty submission can be reported
    # instead of being silently ignored.
    run_clicked = st.button("Run Analysis", type="primary")
    if run_clicked and not text_input.strip():
        # Whitespace-only input carries no sentiment; don't send it to the models.
        st.warning("Please enter some text to analyze.")
    elif run_clicked:
        with st.spinner("Processing through NLP Pipeline..."):
            result = analyzer.analyze_text(text_input)
            # analyze_text returns None (and reports the error itself) on failure.
            if result:
                # Top Summary Cards
                st.markdown("---")
                col1, col2, col3 = st.columns(3)
                # Colors for sentiment
                color_map = {'positive': '#2ecc71', 'negative': '#e74c3c', 'neutral': '#f39c12'}
                sent = result['final_verdict']['sentiment']
                with col1:
                    st.markdown(f"""
                    <div class="metric-card">
                        <h3 style="color:{color_map.get(sent, 'black')}">{sent.upper()}</h3>
                        <p>Ensemble Verdict</p>
                    </div>
                    """, unsafe_allow_html=True)
                with col2:
                    st.markdown(f"""
                    <div class="metric-card">
                        <h3>{result['final_verdict']['agreement']}</h3>
                        <p>Model Consensus</p>
                    </div>
                    """, unsafe_allow_html=True)
                with col3:
                    st.markdown(f"""
                    <div class="metric-card">
                        <h3>{result['metrics']['time_taken']:.4f}s</h3>
                        <p>Inference Latency</p>
                    </div>
                    """, unsafe_allow_html=True)

                st.markdown("### 📊 Analysis Dashboard")
                # Tabbed View for detailed analysis
                tab1, tab2, tab3 = st.tabs(["🧠 Model Internals", "🔍 Linguistics & Keywords", "📈 Confidence Metrics"])

                with tab1:
                    st.markdown("#### Model-wise Predictions")
                    m_col1, m_col2, m_col3 = st.columns(3)
                    m_col1.info(f"**RoBERTa:** {result['roberta']['sentiment'].upper()} ({result['roberta']['score']:.3f})")
                    m_col2.info(f"**VADER:** {result['vader']['sentiment'].upper()} ({result['vader']['compound']:.3f})")
                    m_col3.info(f"**DistilBERT:** {result['distilbert']['sentiment'].upper()} ({result['distilbert']['score']:.3f})")

                with tab2:
                    st.markdown("#### Key Drivers of Sentiment")
                    k_col1, k_col2 = st.columns([2, 1])
                    with k_col1:
                        st.caption("Word Cloud (Stopwords Removed)")
                        wc_fig = create_wordcloud(result['metrics']['clean_text'])
                        if wc_fig:
                            st.pyplot(wc_fig)
                            # Release the figure once rendered; pyplot-managed
                            # figures otherwise accumulate across reruns.
                            plt.close(wc_fig)
                        else:
                            st.warning("Not enough text data for Word Cloud.")
                    with k_col2:
                        st.caption("Top Impact Keywords")
                        keywords = preprocessor.get_keywords(text_input)
                        df_kw = pd.DataFrame(keywords, columns=['Token', 'Frequency'])
                        st.dataframe(df_kw, use_container_width=True, hide_index=True)

                with tab3:
                    # Visualization of confidence. VADER's compound score is
                    # signed, so its absolute value is plotted beside the
                    # transformer scores.
                    conf_data = pd.DataFrame({
                        'Model': ['RoBERTa', 'VADER (Abs)', 'DistilBERT'],
                        'Confidence': [
                            result['roberta']['score'],
                            abs(result['vader']['compound']),
                            result['distilbert']['score']
                        ]
                    })
                    fig = px.bar(conf_data, x='Model', y='Confidence',
                                 title="Model Confidence Benchmarking",
                                 color='Confidence', color_continuous_scale='Blues')
                    st.plotly_chart(fig, use_container_width=True)
+# ----------------------------
+# MODULE 2 & 3: BATCH & FILE
+# ----------------------------
+elif analysis_mode in ["Batch Processor", "File Upload"]:
+    texts = []
+    if analysis_mode == "Batch Processor":
+        batch_input = st.text_area("Enter multiple texts (one per line):", height=200)
+        if st.button("Analyze Batch"):
+            texts = [line.strip() for line in batch_input.split('\n') if line.strip()]
+    else: # File Upload
+        uploaded_file = st.file_uploader("Upload CSV/TXT", type=['csv', 'txt'])
+        if uploaded_file:
+            if uploaded_file.type == "text/plain":
+                texts = [line.strip() for line in uploaded_file.getvalue().decode("utf-8").split('\n') if line.strip()]
+            else:
+                df = pd.read_csv(uploaded_file)
+                texts = df.iloc[:, 0].astype(str).tolist()
+            st.success(f"Loaded {len(texts)} entries.")
+    if texts:
+        with st.spinner("Running Batch Processing..."):
+            # Progress bar
+            progress_bar = st.progress(0)
+            results_list = []
+            for i, text in enumerate(texts):
+                res = analyzer.analyze_text(text)
+                if res:
+                    flat_res = {
+                        'Text': text,
+                        'Sentiment': res['final_verdict']['sentiment'],
+                        'Confidence': res['final_verdict']['confidence'],
+                        'RoBERTa': res['roberta']['sentiment'],
+                        'VADER': res['vader']['sentiment'],
+                        'Latency (s)': res['metrics']['time_taken']
+                    }
+                    results_list.append(flat_res)
+                progress_bar.progress((i + 1) / len(texts))
+            df_results = pd.DataFrame(results_list)
+            # Global Dashboard
+            st.markdown("### 📈 Aggregate Analytics")
+            # 1. Pie Chart
+            col1, col2 = st.columns([1, 1])
+            with col1:
+                fig_pie = px.pie(df_results, names='Sentiment', title='Overall Sentiment Distribution',
+                                color_discrete_map={'positive':'#2ecc71', 'negative':'#e74c3c', 'neutral':'#f39c12'})
+                st.plotly_chart(fig_pie, use_container_width=True)
+            # 2. Performance Stats
+            with col2:
+                avg_time = df_results['Latency (s)'].mean()
+                total_time = df_results['Latency (s)'].sum()
+                st.metric("Average Inference Time", f"{avg_time:.4f} s")
+                st.metric("Total Processing Time", f"{total_time:.4f} s")
+            # 3. Aggregate Word Cloud (The "Bonus" Feature)
+            st.markdown("#### ☁️ Collective Word Cloud")
+            all_text = " ".join(df_results['Text'].tolist())
+            clean_all_text = preprocessor.clean_text(all_text)
+            wc_fig = create_wordcloud(clean_all_text)
+            if wc_fig:
+                st.pyplot(wc_fig)
+            # Data Table
+            st.markdown("### 📋 Detailed Report")
+            st.dataframe(df_results, use_container_width=True)
+            # Download
+            csv = df_results.to_csv(index=False)
+            st.download_button("Download Report CSV", data=csv, file_name="sentiment_report.csv", mime="text/csv")
+# Footer
# Footer: a horizontal rule followed by a centred credit line.
footer_html = "<div style='text-align: center; color: grey;'>Developed using Streamlit, Transformers & NLTK</div>"
st.markdown("---")
st.markdown(footer_html, unsafe_allow_html=True)