Soundaryasos committed on
Commit
5a4767a
·
verified ·
1 Parent(s): 7051a7b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -264
app.py CHANGED
@@ -1,10 +1,9 @@
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
- from datetime import datetime, timedelta
5
  import plotly.express as px
6
  import plotly.graph_objects as go
7
- from plotly.subplots import make_subplots
8
  from sklearn.linear_model import LinearRegression
9
  from sklearn.ensemble import RandomForestRegressor
10
  from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
@@ -18,308 +17,114 @@ from nltk.corpus import stopwords
18
  from nltk.tokenize import word_tokenize
19
  from nltk.stem import WordNetLemmatizer
20
  import re
21
- import json
22
  from textblob import TextBlob
23
 
24
- # Page configuration
 
 
 
 
 
 
 
 
25
  st.set_page_config(
26
  page_title="SentiMind Pro - Advanced Sentiment Analysis",
27
  page_icon="📊",
28
- layout="wide",
29
- initial_sidebar_state="expanded"
30
  )
31
 
32
- # Download necessary NLTK data
33
- try:
34
- nltk.data.find('tokenizers/punkt')
35
- nltk.data.find('corpora/stopwords')
36
- nltk.data.find('corpora/wordnet')
37
- except LookupError:
38
- st.info("Downloading NLTK resources...")
39
- nltk.download('punkt')
40
- nltk.download('stopwords')
41
- nltk.download('wordnet')
42
-
43
- # Custom CSS
44
- st.markdown("""
45
- <style>
46
- .main-header {
47
- font-size: 2.5rem;
48
- color: #1E88E5;
49
- text-align: center;
50
- margin-bottom: 1rem;
51
- font-weight: bold;
52
- }
53
- .sub-header {
54
- font-size: 1.5rem;
55
- color: #0D47A1;
56
- margin-top: 2rem;
57
- margin-bottom: 1rem;
58
- font-weight: bold;
59
- }
60
- .description {
61
- font-size: 1rem;
62
- color: #424242;
63
- margin-bottom: 2rem;
64
- }
65
- .results-container {
66
- background-color: #f5f5f5;
67
- padding: 1.5rem;
68
- border-radius: 10px;
69
- margin-bottom: 2rem;
70
- }
71
- .metric-card {
72
- background-color: white;
73
- padding: 1rem;
74
- border-radius: 10px;
75
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
76
- text-align: center;
77
  }
78
- .metric-value {
79
- font-size: 1.8rem;
80
- font-weight: bold;
81
- color: #1E88E5;
82
- }
83
- .metric-label {
84
- font-size: 0.9rem;
85
- color: #616161;
86
- }
87
- .footer {
88
- text-align: center;
89
- margin-top: 3rem;
90
- color: #616161;
91
- }
92
- </style>
93
- """, unsafe_allow_html=True)
94
-
95
- # Session state initialization
96
- if 'initialized' not in st.session_state:
97
- st.session_state.initialized = False
98
- st.session_state.user_input = ""
99
- st.session_state.analysis_done = False
100
- st.session_state.historical_inputs = []
101
- st.session_state.historical_results = []
102
 
103
- # ----------- HELPER FUNCTIONS -----------
104
 
 
105
  def preprocess_text(text):
106
- """Preprocess text for sentiment analysis"""
107
  text = text.lower()
108
- text = re.sub(r'http\S+|www\S+|https\S+', '', text) # Remove URLs
109
  text = re.sub(r'@\w+|#\w+', '', text) # Remove mentions and hashtags
110
  text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
111
- text = re.sub(r'\s+', ' ', text).strip() # Remove extra whitespace
112
 
113
- tokens = word_tokenize(text) # Tokenize
114
  stop_words = set(stopwords.words('english'))
115
- tokens = [word for word in tokens if word not in stop_words] # Remove stopwords
116
 
117
  lemmatizer = WordNetLemmatizer()
118
- tokens = [lemmatizer.lemmatize(word) for word in tokens] # Lemmatize
119
 
120
  return ' '.join(tokens)
121
 
122
- def initialize_models():
123
- """Initialize sentiment analysis models with loading spinner"""
124
- with st.spinner('Initializing sentiment analysis models...'):
125
- st.session_state.sentiment_models = {
126
- 'vader': SentimentIntensityAnalyzer(),
127
- 'textblob': TextBlob
128
- }
129
-
130
- # BERT Sentiment Analysis
131
- try:
132
- model_name = "distilbert-base-uncased-finetuned-sst-2-english"
133
- tokenizer = AutoTokenizer.from_pretrained(model_name)
134
- model = AutoModelForSequenceClassification.from_pretrained(model_name)
135
- st.session_state.sentiment_models['bert'] = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
136
- except Exception as e:
137
- st.error(f"Error loading BERT model: {e}")
138
- st.session_state.sentiment_models['bert'] = pipeline("sentiment-analysis")
139
-
140
- def generate_sample_data():
141
- """Generate realistic sample data for demonstration"""
142
- end_date = datetime.today()
143
- start_date = end_date - timedelta(days=30)
144
- dates = pd.date_range(start=start_date, end=end_date, freq='D')
145
-
146
- weekday_effect = np.array([0.1 if d.weekday() >= 5 else 0 for d in dates])
147
- trend = np.linspace(-0.2, 0.3, len(dates))
148
- seasonal = np.array([-0.15 if d.weekday() == 0 else 0.05 if d.weekday() == 4 else 0 for d in dates])
149
- noise = np.random.normal(0, 0.2, len(dates))
150
-
151
- sentiment_scores = np.clip(weekday_effect + trend + seasonal + noise, -1, 1)
152
 
153
- df = pd.DataFrame({
154
- "Date": dates,
155
- "Sentiment Score": sentiment_scores,
156
- "Volume": np.random.randint(50, 500, len(dates)) # Simulated volume
157
- })
158
 
159
- df['Day'] = df['Date'].dt.dayofweek
160
- df['Hour'] = np.random.randint(0, 24, len(df))
161
- df['Weekday'] = df['Date'].dt.day_name()
162
- df['Month'] = df['Date'].dt.month_name()
163
 
164
- return df
 
 
 
 
 
165
 
166
- def generate_wordcloud(text, sentiment_score):
167
- """Generate a wordcloud colored by sentiment"""
168
- text = preprocess_text(text)
169
- stopwords = set(STOPWORDS)
170
-
171
- def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
172
- if sentiment_score > 0.5:
173
- return "rgb(0, 128, 0)" # Green
174
- elif sentiment_score > 0:
175
- return "rgb(0, 255, 0)" # Light green
176
- elif sentiment_score > -0.5:
177
- return "rgb(255, 165, 0)" # Orange
178
- else:
179
- return "rgb(255, 0, 0)" # Red
180
-
181
- wc = WordCloud(
182
- width=800,
183
- height=400,
184
- background_color='white',
185
- max_words=100,
186
- stopwords=stopwords,
187
- contour_width=3,
188
- contour_color='steelblue'
189
- )
190
-
191
- wordcloud = wc.generate(text)
192
- wordcloud.recolor(color_func=color_func)
193
 
194
  img = BytesIO()
195
  plt.figure(figsize=(10, 5))
196
  plt.imshow(wordcloud, interpolation='bilinear')
197
  plt.axis('off')
198
- plt.tight_layout()
199
  plt.savefig(img, format='PNG', bbox_inches='tight')
200
  plt.close()
201
 
202
  return base64.b64encode(img.getvalue()).decode()
203
 
204
- def analyze_sentiment(text):
205
- """Perform sentiment analysis using multiple models"""
206
- processed_text = preprocess_text(text)
207
-
208
- vader_result = st.session_state.sentiment_models['vader'].polarity_scores(text)
209
- vader_score = vader_result['compound']
210
-
211
- bert_result = st.session_state.sentiment_models['bert'](text)[0]
212
- bert_score = bert_result['score'] if bert_result['label'] == 'POSITIVE' else -bert_result['score']
213
-
214
- blob = st.session_state.sentiment_models['textblob'](text)
215
- textblob_score = blob.sentiment.polarity
216
-
217
- combined_score = (0.4 * vader_score + 0.4 * bert_score + 0.2 * textblob_score)
218
-
219
- key_phrases = extract_key_phrases(text)
220
- emotions = analyze_emotions(text)
221
-
222
- sentiment_results = {
223
- 'raw_text': text,
224
- 'processed_text': processed_text,
225
- 'vader': {
226
- 'score': vader_score,
227
- 'breakdown': vader_result
228
- },
229
- 'bert': {
230
- 'score': bert_score,
231
- 'label': bert_result['label'],
232
- 'confidence': bert_result['score']
233
- },
234
- 'textblob': {
235
- 'score': textblob_score,
236
- 'subjectivity': blob.sentiment.subjectivity
237
- },
238
- 'combined_score': combined_score,
239
- 'key_phrases': key_phrases,
240
- 'emotions': emotions,
241
- 'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
242
- }
243
-
244
- return sentiment_results
245
-
246
- def extract_key_phrases(text, num_phrases=5):
247
- """Extract key phrases from text"""
248
- blob = TextBlob(text)
249
- noun_phrases = blob.noun_phrases
250
-
251
- if len(noun_phrases) < num_phrases:
252
- tokens = word_tokenize(text.lower())
253
- bigrams = list(nltk.bigrams(tokens))
254
- bigram_phrases = [' '.join(bigram) for bigram in bigrams]
255
-
256
- all_phrases = list(noun_phrases) + bigram_phrases
257
-
258
- stop_words = set(stopwords.words('english'))
259
- filtered_phrases = [
260
- phrase for phrase in all_phrases
261
- if not all(word in stop_words for word in phrase.split())
262
- ]
263
-
264
- return list(set(filtered_phrases))[:num_phrases]
265
-
266
- return list(set(noun_phrases))[:num_phrases]
267
-
268
- def analyze_emotions(text):
269
- """Analyze emotions in text"""
270
- emotion_dict = {
271
- 'joy': ['happy', 'delighted', 'pleased', 'glad', 'joy', 'love', 'excellent', 'wonderful'],
272
- 'sadness': ['sad', 'unhappy', 'sorrow', 'depressed', 'down', 'gloomy'],
273
- 'anger': ['angry', 'mad', 'furious', 'irritated', 'annoyed'],
274
- 'fear': ['afraid', 'scared', 'fearful', 'terrified', 'worried'],
275
- 'surprise': ['surprised', 'amazed', 'astonished', 'shocked'],
276
- }
277
-
278
- emotions = {emotion: 0 for emotion in emotion_dict.keys()}
279
-
280
- for word in text.split():
281
- for emotion, keywords in emotion_dict.items():
282
- if word in keywords:
283
- emotions[emotion] += 1
284
-
285
- return emotions
286
-
287
- # Main application logic
288
  def main():
289
- st.title("SentiMind Pro - Advanced Sentiment Analysis")
290
-
291
- if not st.session_state.initialized:
292
- initialize_models()
293
- st.session_state.initialized = True
294
 
295
- st.subheader("Enter Text for Sentiment Analysis")
296
- user_input = st.text_area("Input Text", height=150)
297
 
298
- if st.button("Analyze Sentiment"):
299
- if user_input:
300
  sentiment_results = analyze_sentiment(user_input)
301
- st.session_state.historical_inputs.append(user_input)
302
- st.session_state.historical_results.append(sentiment_results)
303
- st.session_state.analysis_done = True
304
 
305
- # Display results
306
- st.markdown("### Sentiment Analysis Results")
307
- st.json(sentiment_results)
 
308
 
309
- # Generate Word Cloud
310
- wordcloud_image = generate_wordcloud(user_input, sentiment_results['combined_score'])
311
- st.image(f"data:image/png;base64,{wordcloud_image}", use_column_width=True)
312
- else:
313
- st.warning("Please enter some text for analysis.")
314
 
315
- if st.session_state.analysis_done:
316
- st.subheader("Historical Analysis")
317
- if st.session_state.historical_results:
318
- for i, result in enumerate(st.session_state.historical_results):
319
- st.markdown(f"**Input Text {i + 1}:** {st.session_state.historical_inputs[i]}")
320
- st.json(result)
321
-
322
- st.markdown("<footer class='footer'>© 2023 SentiMind Pro. All rights reserved.</footer>", unsafe_allow_html=True)
323
-
324
  if __name__ == "__main__":
325
- main()
 
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
+ from datetime import datetime
5
  import plotly.express as px
6
  import plotly.graph_objects as go
 
7
  from sklearn.linear_model import LinearRegression
8
  from sklearn.ensemble import RandomForestRegressor
9
  from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
 
17
  from nltk.tokenize import word_tokenize
18
  from nltk.stem import WordNetLemmatizer
19
  import re
 
20
  from textblob import TextBlob
21
 
22
# Ensure necessary NLTK resources are downloaded.
# nltk.data.find() needs the resource's category directory: 'punkt' is a
# tokenizer model, not a corpus, so looking it up under 'corpora/' never
# succeeds and would trigger a download attempt on every start.
NLTK_RESOURCE_PATHS = {
    'punkt': 'tokenizers/punkt',
    'stopwords': 'corpora/stopwords',
    'wordnet': 'corpora/wordnet',
}
nltk_resources = list(NLTK_RESOURCE_PATHS)
for resource in nltk_resources:
    try:
        nltk.data.find(NLTK_RESOURCE_PATHS[resource])
    except LookupError:
        nltk.download(resource)

# Streamlit Page Configuration
st.set_page_config(
    page_title="SentiMind Pro - Advanced Sentiment Analysis",
    page_icon="📊",
    layout="wide",
)
36
 
37
# Initialize Sentiment Analysis Models
@st.cache_resource()
def load_models():
    """Build the sentiment back-ends used by the app.

    Returns:
        dict with three entries:
        'vader'    -- a SentimentIntensityAnalyzer instance,
        'textblob' -- the TextBlob class itself (instantiated per text by callers),
        'bert'     -- a transformers sentiment-analysis pipeline, or None when
                      loading it failed (a Streamlit warning is shown in that case).
    """
    vader_analyzer = SentimentIntensityAnalyzer()

    bert_pipeline = None
    try:
        checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
        bert_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        bert_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
        bert_pipeline = pipeline("sentiment-analysis", model=bert_model, tokenizer=bert_tokenizer)
    except Exception as e:
        # Best effort: the app keeps working with VADER and TextBlob only.
        st.warning(f"Could not load BERT model: {e}")
        bert_pipeline = None

    return {
        'vader': vader_analyzer,
        'textblob': TextBlob,
        'bert': bert_pipeline,
    }


models = load_models()
57
 
58
# Text Preprocessing Function
def preprocess_text(text):
    """Normalise raw text into a space-joined string of cleaned tokens.

    The text is lower-cased; URLs, @mentions, #hashtags and punctuation are
    stripped; whitespace is collapsed; the result is tokenized, English
    stopwords are dropped and the remaining tokens are lemmatized.

    Args:
        text: Raw input string.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    cleaned = text.lower()
    # Order matters: URLs and mentions must go before punctuation is removed,
    # otherwise their markers ('://', '@', '#') would already be gone.
    substitutions = (
        (r'http\S+|www\S+', ''),   # URLs
        (r'@\w+|#\w+', ''),        # mentions and hashtags
        (r'[^\w\s]', ''),          # punctuation
        (r'\s+', ' '),             # runs of whitespace
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.strip()

    words = word_tokenize(cleaned)
    english_stopwords = set(stopwords.words('english'))
    lemmatize = WordNetLemmatizer().lemmatize

    return ' '.join(lemmatize(word) for word in words if word not in english_stopwords)
74
 
75
# Sentiment Analysis Function
def analyze_sentiment(text):
    """Score *text* with VADER, BERT and TextBlob and blend the results.

    Args:
        text: Raw user text. It is passed to the models unmodified.

    Returns:
        dict with the keys:
        'vader'    -- VADER compound score,
        'bert'     -- pipeline confidence, negated when the label is not
                      'POSITIVE'; 0 when the BERT pipeline is unavailable,
        'textblob' -- TextBlob polarity,
        'combined' -- 0.4 * vader + 0.4 * bert + 0.2 * textblob.
    """
    vader_score = models['vader'].polarity_scores(text)['compound']

    bert_pipeline = models['bert']
    if bert_pipeline is not None:
        # truncation=True keeps inputs longer than the model's maximum
        # sequence length from raising inside the pipeline.
        bert_result = bert_pipeline(text, truncation=True)[0]
        confidence = bert_result['score']
        bert_score = confidence if bert_result['label'] == 'POSITIVE' else -confidence
    else:
        # BERT failed to load; it contributes a neutral score.
        bert_score = 0

    textblob_score = models['textblob'](text).sentiment.polarity
    combined_score = 0.4 * vader_score + 0.4 * bert_score + 0.2 * textblob_score

    return {
        'vader': vader_score,
        'bert': bert_score,
        'textblob': textblob_score,
        'combined': combined_score,
    }
95
 
96
# Word Cloud Generation
def generate_wordcloud(text):
    """Render a word cloud for *text* and return it as a base64 PNG string.

    Args:
        text: Text whose words are plotted; WordCloud's built-in STOPWORDS
            are excluded.

    Returns:
        The PNG image bytes, base64-encoded and decoded to ``str``.

    Raises:
        ValueError: presumably propagated from ``WordCloud.generate`` when no
            plottable word remains in *text* -- confirm against the installed
            wordcloud version.
    """
    stopwords_set = set(STOPWORDS)
    wordcloud = WordCloud(
        width=800,
        height=400,
        stopwords=stopwords_set,
        background_color='white',
    ).generate(text)

    img = BytesIO()
    # Draw on an explicit figure/axes instead of pyplot's implicit "current
    # figure", and always close the figure so a failed render does not leak it.
    fig = plt.figure(figsize=(10, 5))
    try:
        ax = fig.add_subplot(111)
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        fig.savefig(img, format='PNG', bbox_inches='tight')
    finally:
        plt.close(fig)

    return base64.b64encode(img.getvalue()).decode()
109
 
110
+ # Streamlit UI
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
def main():
    """Render the Streamlit page: text input, per-model scores and a word cloud.

    Nothing is analysed until the "Analyze Sentiment" button is pressed. Blank
    input produces a warning instead of being silently ignored, and a text
    from which no word cloud can be built produces an info message instead of
    an unhandled exception.
    """
    st.title("📊 SentiMind Pro - Advanced Sentiment Analysis")
    st.subheader("Analyze text sentiment using multiple models!")

    user_input = st.text_area("Enter your text for sentiment analysis:")

    if not st.button("Analyze Sentiment"):
        return

    # Whitespace-only input is truthy but carries nothing to analyse.
    if not user_input or not user_input.strip():
        st.warning("Please enter some text for analysis.")
        return

    with st.spinner("Analyzing..."):
        sentiment_results = analyze_sentiment(user_input)

    st.metric("VADER Sentiment", f"{sentiment_results['vader']:.2f}")
    st.metric("BERT Sentiment", f"{sentiment_results['bert']:.2f}")
    st.metric("TextBlob Sentiment", f"{sentiment_results['textblob']:.2f}")
    st.metric("Combined Sentiment Score", f"{sentiment_results['combined']:.2f}")

    try:
        wordcloud_img = generate_wordcloud(user_input)
    except ValueError:
        # WordCloud refuses to plot when no word survives stopword removal
        # (e.g. the input was just "the"); the scores above remain valid.
        st.info("Not enough distinct words to build a word cloud.")
    else:
        st.image(f"data:image/png;base64,{wordcloud_img}", caption="Word Cloud", use_column_width=True)


if __name__ == "__main__":
    main()