ChiragKaushikCK committed on
Commit
4c354eb
·
verified ·
1 Parent(s): 49af197

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +386 -0
app.py ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import torch
4
+ from transformers import pipeline
5
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
6
+ import plotly.express as px
7
+ import plotly.graph_objects as go
8
+ import numpy as np
9
+ from datetime import datetime
10
+ import time
11
+ import re
12
+ import string
13
+ import nltk
14
+ from nltk.corpus import stopwords
15
+ from collections import Counter
16
+ from wordcloud import WordCloud
17
+ import matplotlib.pyplot as plt
18
+
19
+ # ==========================================
20
+ # 1. SETUP & CONFIGURATION
21
+ # ==========================================
22
# Page-level settings: browser-tab title and icon, wide layout, sidebar open.
# The icon is the brain emoji; the previous value was a mis-encoded byte
# sequence of that emoji.
st.set_page_config(
    page_title="Sentiment Intelligence Engine",
    page_icon="🧠",
    layout="wide",
    initial_sidebar_state="expanded",
)
28
+
29
+ # NLTK Data Download (Cached to prevent re-downloading)
30
@st.cache_resource
def download_nltk_data():
    """Make sure the NLTK stopwords corpus is present locally.

    Looks the corpus up first and only triggers a download when the lookup
    fails with ``LookupError``. Returns nothing.
    """
    corpus_path = 'corpora/stopwords'
    try:
        nltk.data.find(corpus_path)
    except LookupError:
        # Corpus not found locally: fetch it.
        nltk.download('stopwords')


download_nltk_data()
38
+
39
+ # Custom CSS for Professional UI
40
# Stylesheet injected into the page. The .main-header, .sub-header and
# .metric-card classes are the ones used by the HTML snippets rendered by
# the UI code in this script.
CUSTOM_CSS = """
<style>
    .main-header {
        font-size: 2.5rem;
        font-weight: 700;
        color: #1E88E5;
        text-align: center;
        margin-bottom: 1rem;
    }
    .sub-header {
        font-size: 1.1rem;
        color: #555;
        text-align: center;
        margin-bottom: 2rem;
    }
    .metric-card {
        background-color: #ffffff;
        padding: 1.5rem;
        border-radius: 12px;
        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
        text-align: center;
        border-top: 5px solid #1E88E5;
    }
    .stTab {
        font-weight: bold;
    }
</style>
"""
st.markdown(CUSTOM_CSS, unsafe_allow_html=True)
68
+
69
+ # ==========================================
70
+ # 2. PREPROCESSING & ANALYTICS LOGIC (YOUR CODE)
71
+ # ==========================================
72
class TextPreprocessor:
    """Normalise raw text for keyword counting and word-cloud rendering.

    The pipeline lower-cases the text, removes URLs, digits, punctuation and
    English stopwords, and returns the surviving tokens joined by spaces.

    Attributes:
        stop_words: set of lower-case stopwords used by ``clean_text``.
    """

    # ``https...`` needs no alternative of its own: ``http\S+`` already matches it.
    _URL_PATTERN = re.compile(r'http\S+|www\S+')
    _DIGIT_PATTERN = re.compile(r'\d+')
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text):
        """Return ``text`` lower-cased and stripped of noise tokens.

        Args:
            text: raw input string; ``None`` or an empty string yields ``""``.

        Returns:
            The cleaned tokens joined by single spaces.
        """
        if not text:
            return ""

        text = text.lower()
        text = self._URL_PATTERN.sub('', text)
        text = self._DIGIT_PATTERN.sub('', text)

        kept = []
        for raw_token in text.split():
            token = raw_token.translate(self._PUNCT_TABLE)
            if not token:
                # The token consisted solely of punctuation.
                continue
            # Stopwords such as "don't" carry an apostrophe. Checking only the
            # punctuation-free form ("dont") would let them slip through, so
            # the form with inner punctuation intact is checked as well.
            if token in self.stop_words:
                continue
            if raw_token.strip(string.punctuation) in self.stop_words:
                continue
            kept.append(token)
        return " ".join(kept)

    def get_keywords(self, text, top_n=10):
        """Return the ``top_n`` most frequent cleaned tokens.

        Args:
            text: raw input string.
            top_n: maximum number of entries to return.

        Returns:
            A list of ``(token, count)`` tuples, most frequent first.
        """
        return Counter(self.clean_text(text).split()).most_common(top_n)
99
+
100
+ # ==========================================
101
+ # 3. SENTIMENT ANALYZER ENGINE
102
+ # ==========================================
103
class SentimentAnalyzer:
    """Ensemble sentiment engine built from RoBERTa, VADER and DistilBERT.

    Each text is scored by the three models independently; the final verdict
    is a majority vote, falling back to VADER when all three disagree.

    Attributes:
        models: dict with the keys ``'roberta'``, ``'vader'`` and
            ``'distilbert'``; empty when loading failed.
        preprocessor: ``TextPreprocessor`` used for the cleaned-text metric,
            or ``None`` when loading failed.
    """

    # Upper bound, in tokens, passed to the transformer tokenizers.
    MAX_TOKENS = 512
    # Compound-score cut-off separating neutral from positive/negative.
    VADER_THRESHOLD = 0.05

    def __init__(self):
        # Defaults keep both attributes defined when loading fails, so that
        # analyze_text can report the problem instead of raising AttributeError.
        self.models = {}
        self.preprocessor = None
        try:
            self.models = {
                'roberta': pipeline('sentiment-analysis',
                                    model='cardiffnlp/twitter-roberta-base-sentiment-latest'),
                'vader': SentimentIntensityAnalyzer(),
                'distilbert': pipeline('sentiment-analysis',
                                       model='distilbert-base-uncased-finetuned-sst-2-english'),
            }
            self.preprocessor = TextPreprocessor()
        except Exception as e:
            # Best effort: surface the failure in the UI and stay "not loaded".
            self.models = {}
            self.preprocessor = None
            st.error(f"Error loading models: {e}")

    def analyze_text(self, text):
        """Run all three models on ``text`` and combine their verdicts.

        Args:
            text: the string to analyse.

        Returns:
            A dict with the keys ``'roberta'``, ``'vader'``, ``'distilbert'``,
            ``'final_verdict'`` and ``'metrics'``, or ``None`` when anything
            fails (the error is reported through ``st.error``).
        """
        # perf_counter is monotonic, unlike time.time, so the measured
        # latency cannot be skewed by wall-clock adjustments.
        started = time.perf_counter()
        results = {}

        try:
            if not self.models or self.preprocessor is None:
                raise RuntimeError("sentiment models are not loaded")

            # Truncate by tokens, not characters: a 512-character slice can
            # still encode to more tokens than the models accept.
            token_limit = {'truncation': True, 'max_length': self.MAX_TOKENS}

            # 1. RoBERTa Analysis (Deep Learning)
            roberta_result = self.models['roberta'](text, **token_limit)[0]
            results['roberta'] = {
                'label': roberta_result['label'],
                'score': roberta_result['score'],
                'sentiment': self._map_roberta_sentiment(roberta_result['label']),
            }

            # 2. VADER Analysis (Rule Based)
            compound = self.models['vader'].polarity_scores(text)['compound']
            if compound >= self.VADER_THRESHOLD:
                vader_sentiment = 'positive'
            elif compound <= -self.VADER_THRESHOLD:
                vader_sentiment = 'negative'
            else:
                vader_sentiment = 'neutral'
            results['vader'] = {'compound': compound, 'sentiment': vader_sentiment}

            # 3. DistilBERT Analysis (Transformer)
            distil_result = self.models['distilbert'](text, **token_limit)[0]
            results['distilbert'] = {
                'score': distil_result['score'],
                'sentiment': distil_result['label'].lower(),
            }

            # 4. Ensemble decision
            results['final_verdict'] = self._ensemble_decision(results)

            # 5. Metrics and cleaned text
            results['metrics'] = {
                'time_taken': time.perf_counter() - started,
                'char_count': len(text),
                'clean_text': self.preprocessor.clean_text(text),
            }
        except Exception as e:
            st.error(f"Analysis Error: {e}")
            return None

        return results

    def _map_roberta_sentiment(self, label):
        """Translate a RoBERTa label into 'negative'/'neutral'/'positive'.

        Generic ``LABEL_n`` ids are mapped explicitly; any other label is
        returned lower-cased.
        """
        mapping = {'LABEL_0': 'negative', 'LABEL_1': 'neutral', 'LABEL_2': 'positive'}
        return mapping.get(label, label.lower())

    def _ensemble_decision(self, results):
        """Combine the three per-model sentiments into one verdict.

        Args:
            results: dict holding a ``'sentiment'`` entry under each of the
                keys ``'roberta'``, ``'vader'`` and ``'distilbert'``.

        Returns:
            A dict with ``'sentiment'``, ``'confidence'`` ('High' when at
            least two models agree, otherwise 'Medium') and ``'agreement'``.
        """
        votes = Counter(
            results[name]['sentiment'] for name in ('roberta', 'vader', 'distilbert')
        )
        sentiment, support = votes.most_common(1)[0]

        # No majority (all three models disagree): default to VADER rather
        # than whichever label happened to be counted first.
        if support < 2:
            sentiment = results['vader']['sentiment']

        return {
            'sentiment': sentiment,
            'confidence': 'High' if support >= 2 else 'Medium',
            'agreement': f"{support}/3 Models",
        }

    def batch_analyze(self, texts):
        """Analyse every text in ``texts``; failed items appear as ``None``."""
        return [self.analyze_text(text) for text in texts]
186
+
187
+ # Initialize Application
188
@st.cache_resource
def load_analyzer():
    """Construct and return the SentimentAnalyzer used by the app.

    Decorated with ``st.cache_resource``, so Streamlit is expected to hand
    back the same instance instead of rebuilding the models each time.
    """
    engine = SentimentAnalyzer()
    return engine


# Module-level objects used by the UI code of this script.
analyzer = load_analyzer()
preprocessor = TextPreprocessor()
194
+
195
+ # ==========================================
196
+ # 4. VISUALIZATION HELPERS
197
+ # ==========================================
198
def create_wordcloud(text):
    """Render ``text`` as a word cloud on a new Matplotlib figure.

    Args:
        text: whitespace-separated words to plot.

    Returns:
        The Matplotlib figure, or ``None`` when there is nothing to draw
        (``None``, empty or blank text, or text in which WordCloud finds no
        usable words). The caller owns the figure and should close it after
        rendering.
    """
    if not text or not text.strip():
        return None

    try:
        cloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            colormap='viridis',
        ).generate(text)
    except ValueError:
        # WordCloud applies its own stopword list and raises ValueError when
        # no words remain; treat that the same as empty input.
        return None

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    return fig
206
+
207
+ # ==========================================
208
+ # 5. USER INTERFACE
209
+ # ==========================================
210
+
211
+ # Sidebar
212
# Sidebar: module selector plus a short architecture note. The emoji in the
# title and the note were mis-encoded byte sequences and have been restored.
st.sidebar.title("⚙️ Control Panel")
st.sidebar.markdown("---")
analysis_mode = st.sidebar.radio(
    "Select Module:",
    ["Single Text Analysis", "Batch Processor", "File Upload"],
)
st.sidebar.markdown("---")
st.sidebar.info(
    "💡 **System Architecture:**\n\n"
    "Uses a Hybrid Ensemble approach combining Transformer models (RoBERTa, BERT) "
    "with Lexicon-based Logic (VADER) for robust accuracy."
)

# Main Header
st.markdown('<div class="main-header">Sentiment Intelligence Engine</div>', unsafe_allow_html=True)
st.markdown('<div class="sub-header">Advanced NLP Analytics with Ensemble Learning</div>', unsafe_allow_html=True)
221
+
222
+ # ----------------------------
223
+ # MODULE 1: SINGLE TEXT
224
+ # ----------------------------
225
# Colours shared by the verdict card and the batch pie chart.
SENTIMENT_COLORS = {'positive': '#2ecc71', 'negative': '#e74c3c', 'neutral': '#f39c12'}


def _metric_card(title, caption, color=None):
    """Return the HTML for one summary card using the .metric-card CSS class.

    The markup is a single line so that Markdown's handling of indented
    lines cannot affect how it is rendered.
    """
    style = f' style="color:{color}"' if color else ''
    return f'<div class="metric-card"><h3{style}>{title}</h3><p>{caption}</p></div>'


def _show_wordcloud(text, empty_message=None):
    """Draw the word cloud for ``text``, or warn when there is nothing to draw."""
    wc_fig = create_wordcloud(text)
    if wc_fig:
        st.pyplot(wc_fig)
        # pyplot keeps every figure alive until it is closed explicitly;
        # without this, figures pile up across script reruns.
        plt.close(wc_fig)
    elif empty_message:
        st.warning(empty_message)


if analysis_mode == "Single Text Analysis":
    text_input = st.text_area("Input Text:", height=150, placeholder="Type a review, tweet, or feedback here...")

    # Whitespace-only input is treated as empty and not sent to the models.
    if st.button("Run Analysis", type="primary") and text_input and text_input.strip():
        with st.spinner("Processing through NLP Pipeline..."):
            result = analyzer.analyze_text(text_input)

        if result:
            # Top Summary Cards
            st.markdown("---")
            col1, col2, col3 = st.columns(3)

            verdict = result['final_verdict']
            sent = verdict['sentiment']

            with col1:
                st.markdown(
                    _metric_card(sent.upper(), "Ensemble Verdict",
                                 color=SENTIMENT_COLORS.get(sent, 'black')),
                    unsafe_allow_html=True,
                )
            with col2:
                st.markdown(
                    _metric_card(verdict['agreement'], "Model Consensus"),
                    unsafe_allow_html=True,
                )
            with col3:
                st.markdown(
                    _metric_card(f"{result['metrics']['time_taken']:.4f}s", "Inference Latency"),
                    unsafe_allow_html=True,
                )

            st.markdown("### 📊 Analysis Dashboard")

            # Tabbed View for detailed analysis
            tab1, tab2, tab3 = st.tabs(["🧠 Model Internals", "🔍 Linguistics & Keywords", "📈 Confidence Metrics"])

            with tab1:
                st.markdown("#### Model-wise Predictions")
                m_col1, m_col2, m_col3 = st.columns(3)
                m_col1.info(f"**RoBERTa:** {result['roberta']['sentiment'].upper()} ({result['roberta']['score']:.3f})")
                m_col2.info(f"**VADER:** {result['vader']['sentiment'].upper()} ({result['vader']['compound']:.3f})")
                m_col3.info(f"**DistilBERT:** {result['distilbert']['sentiment'].upper()} ({result['distilbert']['score']:.3f})")

            with tab2:
                st.markdown("#### Key Drivers of Sentiment")
                k_col1, k_col2 = st.columns([2, 1])

                with k_col1:
                    st.caption("Word Cloud (Stopwords Removed)")
                    _show_wordcloud(
                        result['metrics']['clean_text'],
                        empty_message="Not enough text data for Word Cloud.",
                    )

                with k_col2:
                    st.caption("Top Impact Keywords")
                    keywords = preprocessor.get_keywords(text_input)
                    df_kw = pd.DataFrame(keywords, columns=['Token', 'Frequency'])
                    st.dataframe(df_kw, use_container_width=True, hide_index=True)

            with tab3:
                # Per-model confidence; VADER contributes the absolute value
                # of its compound score.
                conf_data = pd.DataFrame({
                    'Model': ['RoBERTa', 'VADER (Abs)', 'DistilBERT'],
                    'Confidence': [
                        result['roberta']['score'],
                        abs(result['vader']['compound']),
                        result['distilbert']['score'],
                    ],
                })
                fig = px.bar(conf_data, x='Model', y='Confidence',
                             title="Model Confidence Benchmarking",
                             color='Confidence', color_continuous_scale='Blues')
                st.plotly_chart(fig, use_container_width=True)

# ----------------------------
# MODULE 2 & 3: BATCH & FILE
# ----------------------------
elif analysis_mode in ["Batch Processor", "File Upload"]:
    texts = []

    if analysis_mode == "Batch Processor":
        batch_input = st.text_area("Enter multiple texts (one per line):", height=200)
        if st.button("Analyze Batch"):
            texts = [line.strip() for line in batch_input.split('\n') if line.strip()]

    else:  # File Upload
        uploaded_file = st.file_uploader("Upload CSV/TXT", type=['csv', 'txt'])
        if uploaded_file:
            is_plain_text = (
                uploaded_file.type == "text/plain"
                or uploaded_file.name.lower().endswith('.txt')
            )
            try:
                if is_plain_text:
                    raw_text = uploaded_file.getvalue().decode("utf-8")
                    texts = [line.strip() for line in raw_text.split('\n') if line.strip()]
                else:
                    df = pd.read_csv(uploaded_file)
                    # Drop empty cells first: astype(str) would otherwise
                    # turn them into the literal text "nan".
                    first_column = df.iloc[:, 0].dropna().astype(str)
                    texts = [value.strip() for value in first_column if value.strip()]
            except (UnicodeDecodeError, pd.errors.EmptyDataError,
                    pd.errors.ParserError, IndexError) as e:
                # Unreadable upload: report it instead of crashing the page.
                st.error(f"Could not read the uploaded file: {e}")
                texts = []
            else:
                st.success(f"Loaded {len(texts)} entries.")

    if texts:
        results_list = []
        with st.spinner("Running Batch Processing..."):
            progress_bar = st.progress(0)

            for i, text in enumerate(texts):
                res = analyzer.analyze_text(text)
                if res:
                    results_list.append({
                        'Text': text,
                        'Sentiment': res['final_verdict']['sentiment'],
                        'Confidence': res['final_verdict']['confidence'],
                        'RoBERTa': res['roberta']['sentiment'],
                        'VADER': res['vader']['sentiment'],
                        'Latency (s)': res['metrics']['time_taken'],
                    })
                progress_bar.progress((i + 1) / len(texts))

        if not results_list:
            # Every analysis failed: an empty frame has none of the columns
            # the dashboard reads, so stop here with a message.
            st.warning("None of the entries could be analyzed.")
        else:
            df_results = pd.DataFrame(results_list)

            # Global Dashboard
            st.markdown("### 📈 Aggregate Analytics")

            # 1. Pie Chart
            col1, col2 = st.columns([1, 1])
            with col1:
                # color_discrete_map is keyed on the column given as `color`,
                # so the column has to be named for the map to take effect.
                fig_pie = px.pie(df_results, names='Sentiment', color='Sentiment',
                                 title='Overall Sentiment Distribution',
                                 color_discrete_map=SENTIMENT_COLORS)
                st.plotly_chart(fig_pie, use_container_width=True)

            # 2. Performance Stats
            with col2:
                avg_time = df_results['Latency (s)'].mean()
                total_time = df_results['Latency (s)'].sum()
                st.metric("Average Inference Time", f"{avg_time:.4f} s")
                st.metric("Total Processing Time", f"{total_time:.4f} s")

            # 3. Aggregate Word Cloud
            st.markdown("#### ☁️ Collective Word Cloud")
            all_text = " ".join(df_results['Text'].tolist())
            _show_wordcloud(preprocessor.clean_text(all_text))

            # Data Table
            st.markdown("### 📋 Detailed Report")
            st.dataframe(df_results, use_container_width=True)

            # Download
            csv = df_results.to_csv(index=False)
            st.download_button("Download Report CSV", data=csv, file_name="sentiment_report.csv", mime="text/csv")
383
+
384
+ # Footer
385
# Horizontal rule followed by a centred, grey credit line.
FOOTER_HTML = "<div style='text-align: center; color: grey;'>Developed using Streamlit, Transformers & NLTK</div>"
st.markdown("---")
st.markdown(FOOTER_HTML, unsafe_allow_html=True)