|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
import matplotlib.pyplot as plt |
|
|
import seaborn as sns |
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
from sklearn.cluster import KMeans |
|
|
from sklearn.model_selection import train_test_split |
|
|
from sklearn.naive_bayes import MultinomialNB |
|
|
from sklearn.metrics import classification_report, confusion_matrix |
|
|
|
|
|
|
|
|
# Load the review data once at module level; every page function below reads
# (and in places adds columns to) this shared DataFrame.
REVIEWS_CSV = "amazon_reviews.csv"
df = pd.read_csv(REVIEWS_CSV)
|
|
|
|
|
|
|
|
def page_intro():
    """Render the introduction page.

    Shows the app title, a markdown overview of the dataset and of what the
    app does, and a preview of the first rows of the module-level ``df``.
    """
    overview_md = """
    ## Dataset Overview
    This dataset contains Amazon product reviews, including the text and corresponding star ratings.

    ### What This App Does:
    1. **Unsupervised Learning**: We'll use **TF-IDF** + **K-Means** clustering to discover hidden topics in the reviews.
    2. **Supervised Learning**: We'll apply **Naive Bayes** to classify sentiment based on the review text.

    ### How to Use:
    - Navigate using the sidebar to each section.
    - Interact with charts and view model outputs.
    - Explore results and gain insights into customer sentiment and topics.
    """

    st.title("π Amazon Reviews ML App")
    st.markdown(overview_md)

    # Data preview section.
    st.markdown("### Dataset Preview")
    preview_rows = df.head()
    st.dataframe(preview_rows)
|
|
|
|
|
|
|
|
def page_unsupervised():
    """Render the topic-clustering page.

    Vectorizes ``df['reviewText']`` with TF-IDF, clusters the vectors with
    K-Means, writes the resulting labels to ``df['Cluster']``, and displays
    the cluster size distribution followed by sample reviews for each
    cluster. The number of clusters, the TF-IDF vocabulary size, and the
    number of samples per cluster are chosen with sliders.
    """
    st.title("π Unsupervised Learning: Topic Clustering")

    # User-tunable parameters.
    num_clusters = st.slider("Select number of clusters", 2, 10, 5)
    max_features = st.slider("Max TF-IDF features", 100, 3000, 1000, step=100)

    # Cast to str so non-string cells (e.g. missing reviews) do not break
    # the vectorizer.
    tfidf = TfidfVectorizer(stop_words='english', max_features=max_features)
    tfidf_matrix = tfidf.fit_transform(df['reviewText'].astype(str))

    # Fixed seed keeps cluster assignments stable across reruns.
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    df['Cluster'] = kmeans.fit_predict(tfidf_matrix)

    st.markdown("### Cluster Distribution")
    cluster_counts = df['Cluster'].value_counts().sort_index()
    st.bar_chart(cluster_counts)

    st.markdown("### Sample Reviews Per Cluster")
    num_samples = st.slider("Number of sample reviews per cluster", 1, 5, 2)
    for i in range(num_clusters):
        st.subheader(f"Cluster {i}")
        cluster_reviews = df.loc[df['Cluster'] == i, 'reviewText']
        # Series.sample raises ValueError when asked for more rows than the
        # cluster holds, so cap the request at the cluster size.
        sample_size = min(num_samples, len(cluster_reviews))
        samples = cluster_reviews.sample(sample_size, random_state=42).tolist()
        for s in samples:
            st.markdown(f"- {s}")
|
|
|
|
|
|
|
|
def page_supervised():
    """Render the sentiment-classification page.

    Derives a three-way sentiment label from ``df['overall']`` (stored in
    ``df['Sentiment']``), splits the reviews into train and test sets,
    trains a Multinomial Naive Bayes model on TF-IDF features, and displays
    the classification report and the confusion matrix. The TF-IDF
    vocabulary size and the test-set fraction are chosen with sliders.
    """
    st.title("🧠 Supervised Learning: Sentiment Classification")

    def convert_sentiment(star):
        """Map a star rating to 'negative' (<= 2), 'neutral' (3) or 'positive'."""
        # NOTE: a NaN rating fails both comparisons and is labelled
        # 'positive'.
        if star <= 2:
            return 'negative'
        elif star == 3:
            return 'neutral'
        else:
            return 'positive'

    df['Sentiment'] = df['overall'].apply(convert_sentiment)

    max_features = st.slider("Max TF-IDF features (for classification)", 100, 3000, 1000, step=100)
    # The slider works in percent; train_test_split expects a fraction.
    test_size = st.slider("Test set size (%)", 10, 50, 20, step=5) / 100

    texts = df['reviewText'].astype(str)
    y = df['Sentiment']

    # Split the raw text BEFORE vectorizing so that the vocabulary and IDF
    # weights are learned from the training reviews only; fitting TF-IDF on
    # the full corpus would leak test-set information into the features.
    text_train, text_test, y_train, y_test = train_test_split(
        texts, y, stratify=y, test_size=test_size, random_state=42
    )

    tfidf = TfidfVectorizer(stop_words='english', max_features=max_features)
    X_train = tfidf.fit_transform(text_train)
    X_test = tfidf.transform(text_test)

    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    st.markdown("### Classification Report")
    report = classification_report(y_test, y_pred, output_dict=True)
    st.dataframe(pd.DataFrame(report).transpose())

    st.markdown("### Confusion Matrix")
    labels = ['positive', 'neutral', 'negative']
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    fig, ax = plt.subplots()
    # Draw on the axes created above rather than the implicit current axes.
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    # confusion_matrix puts true labels on rows and predictions on columns.
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    st.pyplot(fig)
    # Release the figure; pyplot otherwise keeps every figure alive and the
    # script is re-executed on each interaction.
    plt.close(fig)
|
|
|
|
|
|
|
|
def page_results():
    """Render the closing page: a static summary, insights and next steps."""
    conclusion_md = """
    ### Summary
    - **Unsupervised Learning** helped group reviews into coherent topics using text similarity.
    - **Supervised Learning** effectively predicted sentiment with Naive Bayes and TF-IDF.

    ### Insights
    - Clustering allows business teams to explore latent structures without labels.
    - Sentiment prediction can automate review analysis at scale.

    ### Next Steps
    - Try BERT for more accurate sentiment classification.
    - Explore topic labeling using BERTopic.
    - Add interactivity to select specific reviews or visualize clusters in 2D.
    """

    st.title("π Results & Conclusion")
    st.markdown(conclusion_md)
|
|
|
|
|
|
|
|
# Sidebar navigation: each menu label maps to the function that renders
# the corresponding page.
pages = {
    "1 - Introduction": page_intro,
    "2 - Unsupervised Learning": page_unsupervised,
    "3 - Supervised Learning": page_supervised,
    "4 - Results & Conclusion": page_results,
}

st.sidebar.title("Navigation")
# Iterating a dict yields its keys in insertion order, so the menu follows
# the order in which the pages are declared above.
page = st.sidebar.radio("Go to", list(pages))

# Render the page the user selected.
pages[page]()
|
|
|