File size: 18,081 Bytes
b9fbeb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8af3937
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b9fbeb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8af3937
 
 
 
 
 
 
 
b9fbeb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8af3937
b9fbeb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8af3937
 
 
b9fbeb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8af3937
 
 
 
 
b9fbeb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8af3937
 
 
 
 
 
 
b9fbeb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8af3937
 
 
 
 
 
 
b9fbeb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
import streamlit as st
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import pickle
import plotly.express as px
import os

# Download NLTK data
@st.cache_resource
def download_nltk_data():
    """Fetch the NLTK resources used by the text preprocessing code.

    The function is wrapped in ``st.cache_resource`` and is called once at
    module level right below. Every download runs with ``quiet=True``.
    """
    # 'punkt' serves older NLTK releases; newer releases (3.9+) make
    # word_tokenize look up 'punkt_tab' instead. 'omw-1.4' is requested by the
    # WordNet corpus reader on some NLTK versions. Requesting an identifier
    # the installed NLTK does not know about is expected to be a quiet no-op.
    resources = ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4')
    for resource in resources:
        nltk.download(resource, quiet=True)

download_nltk_data()

class DataPreprocessor:
    """Text clean-up helpers applied to a review before it is vectorized.

    ``clean_text`` normalises the raw string; ``tokenize_and_lemmatize``
    tokenizes the cleaned string, filters tokens and lemmatizes the rest.
    """

    def __init__(self):
        """Create the WordNet lemmatizer and load the English stop-word set."""
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text):
        """Return ``text`` lowercased with only letters and single spaces left.

        Args:
            text: Any value; it is converted with ``str()`` before cleaning.

        Returns:
            The cleaned string, or ``""`` when ``text`` is a missing value
            (``None``, float NaN, ``pd.NA``, ...).
        """
        # pd.isna covers None, NaN, NaT and pd.NA. The previous `text != text`
        # test raised TypeError on pd.NA because the comparison yields NA,
        # which has no truth value. The is_scalar guard keeps list-like input
        # on the str() path instead of producing an element-wise result.
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ""

        # Convert to lowercase
        text = str(text).lower()

        # Remove special characters and digits
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def tokenize_and_lemmatize(self, text):
        """Tokenize ``text`` and return the kept tokens, lemmatized, space-joined.

        Tokens that are in the stop-word set or have two characters or fewer
        are dropped before lemmatization.
        """
        tokens = word_tokenize(text)
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens
                  if token not in self.stop_words and len(token) > 2]
        return ' '.join(tokens)

class SentimentAnalyzerApp:
    def __init__(self):
        """Start with no data, model or vectorizer and build the preprocessor."""
        # These three stay None until the load_* methods fill them in.
        self.df = None
        self.vectorizer = None
        self.model = None
        self.preprocessor = DataPreprocessor()
        
    def load_sample_data(self):
        """Create sample data for demo purposes"""
        try:
            sample_data = {
                'date': ['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05'],
                'review': [
                    'This app is absolutely amazing and very helpful!',
                    'The application works okay but could be better.',
                    'I am very disappointed with the performance.',
                    'Excellent features and great user interface.',
                    'Not what I expected, needs improvement.'
                ],
                'rating': [5, 3, 1, 5, 2],
                'platform': ['Web', 'Mobile', 'Web', 'Mobile', 'Web'],
                'language': ['en', 'en', 'en', 'en', 'en'],
                'location': ['USA', 'UK', 'Canada', 'Australia', 'India'],
                'verified_purchase': ['Yes', 'No', 'Yes', 'Yes', 'No'],
                'helpful_votes': [10, 2, 5, 8, 1]
            }
            self.df = pd.DataFrame(sample_data)
            self.df['date'] = pd.to_datetime(self.df['date'])
            
            # Create sentiment labels
            def get_sentiment(rating):
                if rating >= 4:
                    return 'Positive'
                elif rating == 3:
                    return 'Neutral'
                else:
                    return 'Negative'
            
            self.df['sentiment'] = self.df['rating'].apply(get_sentiment)
            return True
        except Exception as e:
            st.error(f"Error creating sample data: {e}")
            return False
    
    def load_real_data(self):
        """Try to load real data from file"""
        try:
            data_path = 'data/chatgpt_style_reviews_dataset.csv'
            if os.path.exists(data_path):
                self.df = pd.read_csv(data_path)
                self.df['date'] = pd.to_datetime(self.df['date'], errors='coerce')
                
                # Create sentiment labels
                def get_sentiment(rating):
                    if rating >= 4:
                        return 'Positive'
                    elif rating == 3:
                        return 'Neutral'
                    else:
                        return 'Negative'
                
                self.df['sentiment'] = self.df['rating'].apply(get_sentiment)
                return True
            return False
        except Exception as e:
            st.error(f"Error loading real data: {e}")
            return False
    
    def load_model(self):
        """Try to load model, but use simulated predictions if not available"""
        try:
            model_path = 'models/sentiment_model.pkl'
            if os.path.exists(model_path):
                with open(model_path, 'rb') as f:
                    model_data = pickle.load(f)
                self.model = model_data['model']
                self.vectorizer = model_data['vectorizer']
                return True
            else:
                st.info("πŸ€– Using simulated sentiment analysis for demo. Upload a trained model for accurate predictions.")
                return False
        except Exception as e:
            st.warning(f"Model loading failed: {e}. Using simulated mode.")
            return False
    
    def ensure_data_loaded(self):
        """Ensure data is loaded, use sample if real data not available"""
        if self.df is None:
            # First try to load real data
            if not self.load_real_data():
                # If real data fails, load sample data
                self.load_sample_data()
    
    def predict_sentiment(self, text):
        """Predict sentiment for new text"""
        if self.model is not None and self.vectorizer is not None:
            # Use actual model
            cleaned_text = self.preprocessor.clean_text(text)
            processed_text = self.preprocessor.tokenize_and_lemmatize(cleaned_text)
            text_vector = self.vectorizer.transform([processed_text])
            prediction = self.model.predict(text_vector)[0]
            probability = self.model.predict_proba(text_vector)[0]
            return prediction, dict(zip(self.model.classes_, probability))
        else:
            # Simulate prediction
            positive_words = ['good', 'great', 'excellent', 'amazing', 'love', 'awesome', 'perfect', 'fantastic', 'wonderful', 'outstanding']
            negative_words = ['bad', 'terrible', 'awful', 'hate', 'worst', 'disappointed', 'poor', 'horrible', 'waste', 'useless']
            
            text_lower = text.lower()
            positive_count = sum(1 for word in positive_words if word in text_lower)
            negative_count = sum(1 for word in negative_words if word in text_lower)
            
            if positive_count > negative_count:
                prediction = "Positive"
                confidence = min(0.8 + (positive_count * 0.05), 0.95)
            elif negative_count > positive_count:
                prediction = "Negative" 
                confidence = min(0.8 + (negative_count * 0.05), 0.95)
            else:
                prediction = "Neutral"
                confidence = 0.6
            
            # Simulate probabilities
            if prediction == "Positive":
                probabilities = {'Positive': confidence, 'Neutral': (1-confidence)/2, 'Negative': (1-confidence)/2}
            elif prediction == "Negative":
                probabilities = {'Positive': (1-confidence)/2, 'Neutral': (1-confidence)/2, 'Negative': confidence}
            else:
                probabilities = {'Positive': 0.2, 'Neutral': confidence, 'Negative': 0.2}
            
            return prediction, probabilities
    
    def run(self):
        """Configure the page, load data and model, and render the chosen page.

        Draws the header, makes sure a dataset is loaded, restores or loads
        the model, then dispatches to the page selected in the sidebar.
        """
        st.set_page_config(
            page_title="AI Echo - Sentiment Analysis",
            page_icon="πŸ€–",
            layout="wide",
            initial_sidebar_state="expanded"
        )

        # Custom CSS
        st.markdown("""
        <style>
        .main-header {
            font-size: 2.5rem;
            color: #1f77b4;
            text-align: center;
            margin-bottom: 2rem;
        }
        .metric-card {
            background-color: #f0f2f6;
            padding: 1rem;
            border-radius: 10px;
            border-left: 4px solid #1f77b4;
        }
        </style>
        """, unsafe_allow_html=True)

        st.markdown('<h1 class="main-header">πŸ€– AI Echo: Sentiment Analysis</h1>', unsafe_allow_html=True)
        st.markdown("### Customer Review Sentiment Analysis Dashboard")

        # Initialize and load data
        self.ensure_data_loaded()

        # The script entry point builds a new app object on every execution,
        # and Streamlit re-executes the script on each interaction. The loaded
        # model therefore has to live in session state; otherwise every rerun
        # after the first would silently fall back to simulated predictions.
        if 'model_loaded' not in st.session_state:
            st.session_state.model_loaded = self.load_model()
            st.session_state.model = self.model
            st.session_state.vectorizer = self.vectorizer
        else:
            self.model = st.session_state.get('model')
            self.vectorizer = st.session_state.get('vectorizer')

        # Sidebar: one mapping is the single source of truth for both the
        # selectbox labels and the page handlers.
        pages = {
            "πŸ“Š Overview": self.show_overview,
            "πŸ€– Model Demo": self.show_model_demo,
            "πŸ“ˆ Analysis": self.show_analysis,
            "πŸ’‘ Insights": self.show_insights,
        }
        st.sidebar.title("Navigation")
        page = st.sidebar.selectbox("Choose a page:", list(pages))

        # Page routing (any unexpected value falls back to the insights page)
        pages.get(page, self.show_insights)()
    
    def show_overview(self):
        """Render the overview page: headline metrics, two charts, data source.

        Shows an error and returns early when no dataset could be loaded,
        matching the behaviour of the analysis and insights pages.
        """
        st.header("πŸ“Š Project Overview")

        # Ensure data is loaded
        self.ensure_data_loaded()

        # Both loaders can fail and leave self.df as None; without this guard
        # the metric code below would raise on len(None).
        if self.df is None:
            st.error("No data available for overview.")
            return

        # Key metrics
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            total_reviews = len(self.df)
            st.metric("Total Reviews", total_reviews)

        with col2:
            avg_rating = self.df['rating'].mean()
            st.metric("Average Rating", f"{avg_rating:.2f} ⭐")

        with col3:
            positive_pct = (self.df['sentiment'] == 'Positive').mean() * 100
            st.metric("Positive Reviews", f"{positive_pct:.1f}%")

        with col4:
            helpful_reviews = self.df['helpful_votes'].sum()
            st.metric("Total Helpful Votes", helpful_reviews)

        st.markdown("---")

        # Visualizations
        col1, col2 = st.columns(2)

        with col1:
            st.subheader("Review Rating Distribution")
            rating_counts = self.df['rating'].value_counts().sort_index()
            fig = px.bar(rating_counts, x=rating_counts.index, y=rating_counts.values,
                        labels={'x': 'Rating', 'y': 'Count'},
                        title='Distribution of Ratings')
            st.plotly_chart(fig, use_container_width=True)

        with col2:
            st.subheader("Sentiment Distribution")
            sentiment_counts = self.df['sentiment'].value_counts()
            fig = px.pie(values=sentiment_counts.values, names=sentiment_counts.index,
                        title='Sentiment Distribution')
            st.plotly_chart(fig, use_container_width=True)

        # Show data source info. The attribute may never have been set on
        # this instance, hence the getattr default.
        if getattr(self, 'using_real_data', False):
            st.success("βœ… Using real dataset from file")
        else:
            st.info("πŸ’‘ Using sample data for demo. Upload your dataset to the 'data' folder for real analysis.")
    
    def show_model_demo(self):
        """Render the interactive demo: text input, prediction and examples.

        The text area is bound to a session-state key so the example buttons
        can load their text into it through an ``on_click`` callback.
        """
        st.header("πŸ€– Sentiment Analysis Demo")

        st.markdown("""
        Enter your own review text below to analyze its sentiment.
        The model will predict whether the sentiment is **Positive**, **Neutral**, or **Negative**.
        """)

        # Seed the widget through session state rather than `value=` so a
        # value written by an example button is what the text area shows on
        # the following run.
        text_key = 'demo_review_text'
        if text_key not in st.session_state:
            st.session_state[text_key] = "I love this application! It's incredibly useful and well-designed."

        # Text input
        user_text = st.text_area(
            "Enter your review text:",
            height=150,
            placeholder="Type your review here... Example: 'This app is amazing and very helpful!'",
            key=text_key
        )

        if user_text:
            with st.spinner("Analyzing sentiment..."):
                prediction, probabilities = self.predict_sentiment(user_text)

            # Display results
            st.subheader("🎯 Prediction Results")

            col1, col2 = st.columns([1, 2])

            with col1:
                sentiment_colors = {
                    'Positive': '🟒',
                    'Neutral': '🟑',
                    'Negative': 'πŸ”΄'
                }

                st.metric(
                    "Predicted Sentiment",
                    f"{sentiment_colors.get(prediction, 'βšͺ')} {prediction}"
                )

            with col2:
                st.subheader("Confidence Scores")

                for sentiment, prob in probabilities.items():
                    st.write(f"**{sentiment}**: {prob:.1%}")
                    # float() so a NumPy scalar from a real model is passed
                    # to the progress bar as a plain Python float.
                    st.progress(float(prob))

            if self.model is None:
                st.info("πŸ”¬ Currently using simulated analysis. Upload a trained model file for more accurate predictions.")

        # Example reviews
        st.markdown("---")
        st.subheader("πŸ’‘ Try these examples:")

        examples = [
            "This app is absolutely fantastic! It helps me so much with my work.",
            "The application is okay, but it could use some improvements.",
            "I'm very disappointed with the performance and customer service.",
            "Outstanding features and excellent user experience!",
            "It's mediocre, nothing special about it."
        ]

        def use_example(example_text):
            # Button callback: store the example under the text area's key.
            # The click itself triggers the rerun, so no explicit st.rerun()
            # call is needed.
            st.session_state[text_key] = example_text

        cols = st.columns(3)
        for i, example in enumerate(examples):
            with cols[i % 3]:
                st.button(
                    f"'{example[:30]}...'",
                    key=f"demo_example_{i}",
                    use_container_width=True,
                    on_click=use_example,
                    args=(example,)
                )

    def show_analysis(self):
        """Render the analysis page: platform charts and sentiment word clouds.

        Shows an error and returns early when no dataset is available.
        """
        st.header("πŸ“ˆ Data Analysis")

        # Ensure data is loaded
        self.ensure_data_loaded()

        if self.df is None:
            st.error("No data available for analysis.")
            return

        # Platform analysis
        st.subheader("Platform Comparison")
        platform_counts = self.df['platform'].value_counts()
        fig = px.bar(platform_counts, x=platform_counts.index, y=platform_counts.values,
                    labels={'x': 'Platform', 'y': 'Number of Reviews'},
                    title='Reviews by Platform')
        st.plotly_chart(fig, use_container_width=True)

        # Sentiment by platform
        platform_sentiment = pd.crosstab(self.df['platform'], self.df['sentiment'], normalize='index') * 100
        fig = px.bar(platform_sentiment, barmode='stack',
                    title='Sentiment Distribution by Platform (%)')
        st.plotly_chart(fig, use_container_width=True)

        # Word clouds
        st.subheader("πŸ“ Word Clouds")

        def joined_reviews(label):
            # Missing reviews read from a CSV are float NaN, which would make
            # str.join raise; drop them and coerce the rest to str.
            reviews = self.df.loc[self.df['sentiment'] == label, 'review']
            return ' '.join(reviews.dropna().astype(str))

        positive_text = joined_reviews('Positive')
        negative_text = joined_reviews('Negative')

        col1, col2 = st.columns(2)

        with col1:
            self._render_wordcloud("**Positive Reviews**", positive_text,
                                   "No positive reviews available")

        with col2:
            self._render_wordcloud("**Negative Reviews**", negative_text,
                                   "No negative reviews available")

    def _render_wordcloud(self, heading, text, empty_message):
        """Draw one titled word cloud, or ``empty_message`` if none can be built."""
        st.markdown(heading)
        if not text.strip():
            st.info(empty_message)
            return

        try:
            wordcloud = WordCloud(width=400, height=300, background_color='white').generate(text)
        except ValueError:
            # Raised by WordCloud when no word is left to plot (for example
            # when the text consists of stop words only).
            st.info(empty_message)
            return

        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            ax.imshow(wordcloud, interpolation='bilinear')
            ax.axis('off')
            st.pyplot(fig)
        finally:
            # Figures made through pyplot stay registered until closed; close
            # each one so they do not accumulate across reruns.
            plt.close(fig)

    def show_insights(self):
        """Render the insights page: three metrics, recommendations, setup notes."""
        st.header("πŸ’‘ Business Insights & Recommendations")

        # A dataset is required for the metrics below
        self.ensure_data_loaded()
        if self.df is None:
            st.error("No data available for insights.")
            return

        # Headline numbers
        share_positive = (self.df['sentiment'] == 'Positive').mean() * 100
        mean_rating = self.df['rating'].mean()

        left, middle, right = st.columns(3)
        left.metric("Overall Satisfaction", f"{share_positive:.1f}%")
        middle.metric("Average Rating", f"{mean_rating:.2f} ⭐")
        share_verified = (self.df['verified_purchase'] == 'Yes').mean() * 100
        right.metric("Verified Reviews", f"{share_verified:.1f}%")

        st.markdown("---")

        # Static list of suggested actions, rendered as a numbered list
        st.subheader("🎯 Actionable Recommendations")
        tips = (
            "**Monitor Negative Reviews**: Regularly analyze 1-2 star reviews for common issues and pain points",
            "**Platform Optimization**: Ensure consistent user experience across all platforms (Web, Mobile, etc.)",
            "**Feature Development**: Prioritize features frequently mentioned in positive reviews",
            "**Customer Support**: Implement sentiment-based routing for support tickets",
            "**Regional Strategy**: Analyze location-based sentiment for market-specific improvements",
            "**Version Tracking**: Monitor sentiment changes across different application versions",
        )
        for number, tip in enumerate(tips, start=1):
            st.markdown(f"{number}. {tip}")

        st.markdown("---")

        # How to plug in real data and a trained model
        st.subheader("πŸ”§ Technical Setup")
        st.info("""
        **To use with your own data:**
        1. Upload your CSV file to the `data/` folder
        2. Train and save your model as `models/sentiment_model.pkl`
        3. The app will automatically detect and use your files
        
        **Current mode:** Using sample data with simulated sentiment analysis
        """)

# Run the app
# Entry point: build the application object and render it when this file is
# executed as a script; the __main__ guard keeps a plain import from starting
# the UI.
if __name__ == "__main__":
    app = SentimentAnalyzerApp()
    app.run()