import streamlit as st
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import pickle
import plotly.express as px
import os
# Download NLTK data
@st.cache_resource
def download_nltk_data():
    """Fetch required NLTK corpora once per Streamlit process.

    Wrapped in st.cache_resource so the downloads run a single time per
    server session; nltk.download is a quiet no-op when data already exists.
    """
    nltk.download('punkt', quiet=True)
    # Bug fix: NLTK >= 3.8.2 tokenizes via the 'punkt_tab' tables, so
    # word_tokenize raises LookupError without this extra resource.
    nltk.download('punkt_tab', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)

download_nltk_data()
class DataPreprocessor:
    """Text-normalization utilities: cleaning, stop-word removal, lemmatization."""

    def __init__(self):
        # NLP resources shared across calls.
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text):
        """Lowercase *text*, keep only letters/whitespace, collapse spaces.

        None and NaN inputs yield an empty string.
        """
        if text is None or text != text:  # NaN is the only value unequal to itself
            return ""
        lowered = str(text).lower()
        letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
        return re.sub(r'\s+', ' ', letters_only).strip()

    def tokenize_and_lemmatize(self, text):
        """Tokenize *text*, drop stop words and tokens of length <= 2,
        lemmatize the rest, and re-join with single spaces."""
        kept = []
        for token in word_tokenize(text):
            if token in self.stop_words or len(token) <= 2:
                continue
            kept.append(self.lemmatizer.lemmatize(token))
        return ' '.join(kept)
class SentimentAnalyzerApp:
    """Streamlit dashboard for customer-review sentiment analysis.

    Prefers a real dataset (data/chatgpt_style_reviews_dataset.csv) and a
    trained model (models/sentiment_model.pkl) when present on disk, and
    falls back to bundled sample reviews plus a keyword-based simulated
    classifier otherwise.
    """

    def __init__(self):
        self.preprocessor = DataPreprocessor()
        self.model = None          # trained classifier (None -> simulated mode)
        self.vectorizer = None     # text vectorizer paired with the model
        self.df = None             # reviews DataFrame, loaded lazily
        self.using_real_data = False  # True once the on-disk CSV is loaded

    @staticmethod
    def _rating_to_sentiment(rating):
        """Map a 1-5 star rating onto a coarse sentiment label.

        Extracted helper: this mapping was previously duplicated as an inner
        function in both data loaders.
        """
        if rating >= 4:
            return 'Positive'
        if rating == 3:
            return 'Neutral'
        return 'Negative'

    def load_sample_data(self):
        """Create sample data for demo purposes. Returns True on success."""
        try:
            sample_data = {
                'date': ['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05'],
                'review': [
                    'This app is absolutely amazing and very helpful!',
                    'The application works okay but could be better.',
                    'I am very disappointed with the performance.',
                    'Excellent features and great user interface.',
                    'Not what I expected, needs improvement.'
                ],
                'rating': [5, 3, 1, 5, 2],
                'platform': ['Web', 'Mobile', 'Web', 'Mobile', 'Web'],
                'language': ['en', 'en', 'en', 'en', 'en'],
                'location': ['USA', 'UK', 'Canada', 'Australia', 'India'],
                'verified_purchase': ['Yes', 'No', 'Yes', 'Yes', 'No'],
                'helpful_votes': [10, 2, 5, 8, 1]
            }
            self.df = pd.DataFrame(sample_data)
            self.df['date'] = pd.to_datetime(self.df['date'])
            self.df['sentiment'] = self.df['rating'].apply(self._rating_to_sentiment)
            self.using_real_data = False
            return True
        except Exception as e:
            st.error(f"Error creating sample data: {e}")
            return False

    def load_real_data(self):
        """Try to load real data from file. Returns True on success."""
        try:
            data_path = 'data/chatgpt_style_reviews_dataset.csv'
            if os.path.exists(data_path):
                self.df = pd.read_csv(data_path)
                self.df['date'] = pd.to_datetime(self.df['date'], errors='coerce')
                self.df['sentiment'] = self.df['rating'].apply(self._rating_to_sentiment)
                # Bug fix: this flag was never set anywhere, so show_overview's
                # check always reported "sample data" even with a real CSV.
                self.using_real_data = True
                return True
            return False
        except Exception as e:
            st.error(f"Error loading real data: {e}")
            return False

    def load_model(self):
        """Try to load model, but use simulated predictions if not available.

        Returns True when a pickled {'model', 'vectorizer'} pair is loaded.
        """
        try:
            model_path = 'models/sentiment_model.pkl'
            if os.path.exists(model_path):
                # NOTE(review): pickle.load can execute arbitrary code — only
                # load model files from trusted sources.
                with open(model_path, 'rb') as f:
                    model_data = pickle.load(f)
                self.model = model_data['model']
                self.vectorizer = model_data['vectorizer']
                return True
            st.info("🤖 Using simulated sentiment analysis for demo. Upload a trained model for accurate predictions.")
            return False
        except Exception as e:
            st.warning(f"Model loading failed: {e}. Using simulated mode.")
            return False

    def ensure_data_loaded(self):
        """Ensure data is loaded, use sample if real data not available."""
        if self.df is None:
            # Real data first; fall back to the bundled sample set.
            if not self.load_real_data():
                self.load_sample_data()

    def predict_sentiment(self, text):
        """Predict sentiment for *text*.

        Returns (label, {label: probability}). Uses the trained model when
        one is loaded; otherwise a keyword-count heuristic that always
        produces probabilities summing to 1.
        """
        if self.model is not None and self.vectorizer is not None:
            # Real model path: clean -> lemmatize -> vectorize -> predict.
            cleaned_text = self.preprocessor.clean_text(text)
            processed_text = self.preprocessor.tokenize_and_lemmatize(cleaned_text)
            text_vector = self.vectorizer.transform([processed_text])
            prediction = self.model.predict(text_vector)[0]
            probability = self.model.predict_proba(text_vector)[0]
            return prediction, dict(zip(self.model.classes_, probability))
        # Simulated path: compare counts of hard-coded keyword lists.
        positive_words = ['good', 'great', 'excellent', 'amazing', 'love', 'awesome', 'perfect', 'fantastic', 'wonderful', 'outstanding']
        negative_words = ['bad', 'terrible', 'awful', 'hate', 'worst', 'disappointed', 'poor', 'horrible', 'waste', 'useless']
        text_lower = text.lower()
        positive_count = sum(1 for word in positive_words if word in text_lower)
        negative_count = sum(1 for word in negative_words if word in text_lower)
        if positive_count > negative_count:
            prediction = "Positive"
            confidence = min(0.8 + (positive_count * 0.05), 0.95)
        elif negative_count > positive_count:
            prediction = "Negative"
            confidence = min(0.8 + (negative_count * 0.05), 0.95)
        else:
            prediction = "Neutral"
            confidence = 0.6
        # Spread the remaining mass over the non-predicted classes.
        if prediction == "Positive":
            probabilities = {'Positive': confidence, 'Neutral': (1-confidence)/2, 'Negative': (1-confidence)/2}
        elif prediction == "Negative":
            probabilities = {'Positive': (1-confidence)/2, 'Neutral': (1-confidence)/2, 'Negative': confidence}
        else:
            probabilities = {'Positive': 0.2, 'Neutral': confidence, 'Negative': 0.2}
        return prediction, probabilities

    def run(self):
        """Main application: page config, data/model bootstrap, routing."""
        st.set_page_config(
            page_title="AI Echo - Sentiment Analysis",
            page_icon="🤖",
            layout="wide",
            initial_sidebar_state="expanded"
        )
        # Custom CSS hook (currently empty; kept for future styling).
        st.markdown("""
        """, unsafe_allow_html=True)
        # Bug fix: the title markup was a broken single-quoted string literal
        # spanning multiple lines; rebuilt as one HTML string.
        st.markdown(
            "<h1 style='text-align: center;'>🤖 AI Echo: Sentiment Analysis</h1>",
            unsafe_allow_html=True
        )
        st.markdown("### Customer Review Sentiment Analysis Dashboard")
        # Initialize and load data; cache model-load result across reruns.
        self.ensure_data_loaded()
        if 'model_loaded' not in st.session_state:
            st.session_state.model_loaded = self.load_model()
        # Sidebar navigation.
        st.sidebar.title("Navigation")
        page = st.sidebar.selectbox(
            "Choose a page:",
            ["📊 Overview", "🤖 Model Demo", "📈 Analysis", "💡 Insights"]
        )
        # Page routing.
        if page == "📊 Overview":
            self.show_overview()
        elif page == "🤖 Model Demo":
            self.show_model_demo()
        elif page == "📈 Analysis":
            self.show_analysis()
        else:
            self.show_insights()

    def show_overview(self):
        """Overview page: headline metrics plus rating/sentiment charts."""
        st.header("📊 Project Overview")
        self.ensure_data_loaded()
        # Key metrics.
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            total_reviews = len(self.df)
            st.metric("Total Reviews", total_reviews)
        with col2:
            avg_rating = self.df['rating'].mean()
            st.metric("Average Rating", f"{avg_rating:.2f} ⭐")
        with col3:
            positive_pct = (self.df['sentiment'] == 'Positive').mean() * 100
            st.metric("Positive Reviews", f"{positive_pct:.1f}%")
        with col4:
            # int() avoids displaying a numpy scalar type.
            helpful_reviews = int(self.df['helpful_votes'].sum())
            st.metric("Total Helpful Votes", helpful_reviews)
        st.markdown("---")
        # Visualizations.
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Review Rating Distribution")
            rating_counts = self.df['rating'].value_counts().sort_index()
            fig = px.bar(rating_counts, x=rating_counts.index, y=rating_counts.values,
                         labels={'x': 'Rating', 'y': 'Count'},
                         title='Distribution of Ratings')
            st.plotly_chart(fig, use_container_width=True)
        with col2:
            st.subheader("Sentiment Distribution")
            sentiment_counts = self.df['sentiment'].value_counts()
            fig = px.pie(values=sentiment_counts.values, names=sentiment_counts.index,
                         title='Sentiment Distribution')
            st.plotly_chart(fig, use_container_width=True)
        # Data-source banner (flag is now actually maintained by the loaders).
        if getattr(self, 'using_real_data', False):
            st.success("✅ Using real dataset from file")
        else:
            st.info("💡 Using sample data for demo. Upload your dataset to the 'data' folder for real analysis.")

    def show_model_demo(self):
        """Interactive model demo: free-text input plus clickable examples."""
        st.header("🤖 Sentiment Analysis Demo")
        st.markdown("""
        Enter your own review text below to analyze its sentiment.
        The model will predict whether the sentiment is **Positive**, **Neutral**, or **Negative**.
        """)
        # Seed the text box once; the example buttons below overwrite this key.
        if 'demo_text' not in st.session_state:
            st.session_state.demo_text = "I love this application! It's incredibly useful and well-designed."
        user_text = st.text_area(
            "Enter your review text:",
            height=150,
            placeholder="Type your review here... Example: 'This app is amazing and very helpful!'",
            value=st.session_state.demo_text
        )
        if user_text:
            with st.spinner("Analyzing sentiment..."):
                prediction, probabilities = self.predict_sentiment(user_text)
            # Display results.
            st.subheader("🎯 Prediction Results")
            col1, col2 = st.columns([1, 2])
            with col1:
                sentiment_colors = {
                    'Positive': '🟢',
                    'Neutral': '🟡',
                    'Negative': '🔴'
                }
                st.metric(
                    "Predicted Sentiment",
                    f"{sentiment_colors.get(prediction, '⚪')} {prediction}"
                )
            with col2:
                st.subheader("Confidence Scores")
                for sentiment, prob in probabilities.items():
                    st.write(f"**{sentiment}**: {prob:.1%}")
                    st.progress(prob)
            if self.model is None:
                st.info("🔬 Currently using simulated analysis. Upload a trained model file for more accurate predictions.")
        # Example reviews.
        st.markdown("---")
        st.subheader("💡 Try these examples:")
        examples = [
            "This app is absolutely fantastic! It helps me so much with my work.",
            "The application is okay, but it could use some improvements.",
            "I'm very disappointed with the performance and customer service.",
            "Outstanding features and excellent user experience!",
            "It's mediocre, nothing special about it."
        ]
        cols = st.columns(3)
        for i, example in enumerate(examples):
            with cols[i % 3]:
                if st.button(f"'{example[:30]}...'", use_container_width=True):
                    # Bug fix: the click previously triggered st.rerun() without
                    # recording which example was chosen, so nothing changed.
                    st.session_state.demo_text = example
                    st.rerun()

    def show_analysis(self):
        """Analysis page: platform breakdowns and per-sentiment word clouds."""
        st.header("📈 Data Analysis")
        self.ensure_data_loaded()
        if self.df is None:
            st.error("No data available for analysis.")
            return
        # Platform analysis.
        st.subheader("Platform Comparison")
        platform_counts = self.df['platform'].value_counts()
        fig = px.bar(platform_counts, x=platform_counts.index, y=platform_counts.values,
                     labels={'x': 'Platform', 'y': 'Number of Reviews'},
                     title='Reviews by Platform')
        st.plotly_chart(fig, use_container_width=True)
        # Sentiment by platform (row-normalized percentages).
        platform_sentiment = pd.crosstab(self.df['platform'], self.df['sentiment'], normalize='index') * 100
        fig = px.bar(platform_sentiment, barmode='stack',
                     title='Sentiment Distribution by Platform (%)')
        st.plotly_chart(fig, use_container_width=True)
        # Word clouds. Robustness fix: dropna/astype so NaN reviews in a real
        # CSV cannot crash the ' '.join calls.
        st.subheader("📝 Word Clouds")
        positive_text = ' '.join(self.df[self.df['sentiment'] == 'Positive']['review'].dropna().astype(str))
        negative_text = ' '.join(self.df[self.df['sentiment'] == 'Negative']['review'].dropna().astype(str))
        col1, col2 = st.columns(2)
        with col1:
            st.markdown("**Positive Reviews**")
            if positive_text.strip():
                wordcloud = WordCloud(width=400, height=300, background_color='white').generate(positive_text)
                fig, ax = plt.subplots(figsize=(10, 6))
                ax.imshow(wordcloud, interpolation='bilinear')
                ax.axis('off')
                st.pyplot(fig)
            else:
                st.info("No positive reviews available")
        with col2:
            st.markdown("**Negative Reviews**")
            if negative_text.strip():
                wordcloud = WordCloud(width=400, height=300, background_color='white').generate(negative_text)
                fig, ax = plt.subplots(figsize=(10, 6))
                ax.imshow(wordcloud, interpolation='bilinear')
                ax.axis('off')
                st.pyplot(fig)
            else:
                st.info("No negative reviews available")

    def show_insights(self):
        """Insights page: summary metrics and static recommendations."""
        st.header("💡 Business Insights & Recommendations")
        self.ensure_data_loaded()
        if self.df is None:
            st.error("No data available for insights.")
            return
        # Key metrics.
        positive_pct = (self.df['sentiment'] == 'Positive').mean() * 100
        avg_rating = self.df['rating'].mean()
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Overall Satisfaction", f"{positive_pct:.1f}%")
        with col2:
            st.metric("Average Rating", f"{avg_rating:.2f} ⭐")
        with col3:
            verified_ratio = (self.df['verified_purchase'] == 'Yes').mean() * 100
            st.metric("Verified Reviews", f"{verified_ratio:.1f}%")
        st.markdown("---")
        # Recommendations.
        st.subheader("🎯 Actionable Recommendations")
        recommendations = [
            "**Monitor Negative Reviews**: Regularly analyze 1-2 star reviews for common issues and pain points",
            "**Platform Optimization**: Ensure consistent user experience across all platforms (Web, Mobile, etc.)",
            "**Feature Development**: Prioritize features frequently mentioned in positive reviews",
            "**Customer Support**: Implement sentiment-based routing for support tickets",
            "**Regional Strategy**: Analyze location-based sentiment for market-specific improvements",
            "**Version Tracking**: Monitor sentiment changes across different application versions"
        ]
        for i, recommendation in enumerate(recommendations, 1):
            st.markdown(f"{i}. {recommendation}")
        st.markdown("---")
        # Technical setup.
        st.subheader("🔧 Technical Setup")
        st.info("""
        **To use with your own data:**
        1. Upload your CSV file to the `data/` folder
        2. Train and save your model as `models/sentiment_model.pkl`
        3. The app will automatically detect and use your files
        **Current mode:** Using sample data with simulated sentiment analysis
        """)
if __name__ == "__main__":
app = SentimentAnalyzerApp()
app.run()