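# NOTE: the next three lines are residue from the web page this file was
# copied from; they are kept as comments so the module parses as Python.
# mrciomnl's picture
# initial commit
# 5e61364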
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
# Read the reviews CSV into the module-level frame that the page functions
# below use. The path is relative to the process's working directory.
DATA_PATH = "amazon_reviews.csv"
df = pd.read_csv(DATA_PATH)
# --- Page 1: Introduction ---
def page_intro():
    """Render the introduction page.

    Shows the app title, a markdown overview of what the app does and how to
    use it, and the first rows of the module-level ``df`` as a preview table.
    """
    # The title emoji had been stored as mis-decoded UTF-8 bytes; it is written
    # as an escape (U+1F4D8, blue book) so it survives any file re-encoding.
    st.title("\U0001F4D8 Amazon Reviews ML App")
    st.markdown("""
## Dataset Overview
This dataset contains Amazon product reviews, including the text and corresponding star ratings.
### What This App Does:
1. **Unsupervised Learning**: We'll use **TF-IDF** + **K-Means** clustering to discover hidden topics in the reviews.
2. **Supervised Learning**: We'll apply **Naive Bayes** to classify sentiment based on the review text.
### How to Use:
- Navigate using the sidebar to each section.
- Interact with charts and view model outputs.
- Explore results and gain insights into customer sentiment and topics.
""")
    st.markdown("### Dataset Preview")
    st.dataframe(df.head())
# --- Page 2: Unsupervised Learning ---
def page_unsupervised():
    """Render the topic-clustering page (TF-IDF features + K-Means).

    Sliders select the number of clusters, the TF-IDF vocabulary cap, and how
    many example reviews to list per cluster. The ``reviewText`` column of the
    module-level ``df`` is vectorised, clustered, and the resulting labels are
    written back to ``df['Cluster']``. A bar chart of cluster sizes and a few
    sample reviews per cluster are then displayed.
    """
    # The title emoji had been stored as mis-decoded UTF-8 with its last byte
    # lost; U+1F50D (magnifying glass) is the presumed original -- TODO confirm.
    st.title("\U0001F50D Unsupervised Learning: Topic Clustering")
    num_clusters = st.slider("Select number of clusters", 2, 10, 5)
    max_features = st.slider("Max TF-IDF features", 100, 3000, 1000, step=100)

    # Cast to str so non-string cells (e.g. missing values) can be vectorised.
    tfidf = TfidfVectorizer(stop_words='english', max_features=max_features)
    tfidf_matrix = tfidf.fit_transform(df['reviewText'].astype(str))

    # Fixed random_state keeps cluster assignments stable across reruns.
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    df['Cluster'] = kmeans.fit_predict(tfidf_matrix)

    st.markdown("### Cluster Distribution")
    cluster_counts = df['Cluster'].value_counts().sort_index()
    st.bar_chart(cluster_counts)

    st.markdown("### Sample Reviews Per Cluster")
    num_samples = st.slider("Number of sample reviews per cluster", 1, 5, 2)
    for i in range(num_clusters):
        st.subheader(f"Cluster {i}")
        cluster_reviews = df.loc[df['Cluster'] == i, 'reviewText']
        # Series.sample raises ValueError when asked for more rows than the
        # cluster holds, so cap the request at the cluster's size.
        take = min(num_samples, len(cluster_reviews))
        samples = cluster_reviews.sample(take, random_state=42).tolist()
        for s in samples:
            st.markdown(f"- {s}")
# --- Page 3: Supervised Learning ---
def page_supervised():
    """Render the sentiment-classification page (TF-IDF + Multinomial Naive Bayes).

    Star ratings in ``df['overall']`` are mapped to 'negative' / 'neutral' /
    'positive' labels and stored in ``df['Sentiment']``. Sliders select the
    TF-IDF vocabulary cap and the test-set fraction. A stratified split is
    made, the classifier is trained, and the classification report and a
    confusion-matrix heatmap are displayed.
    """
    # The title emoji had been stored as mis-decoded UTF-8 bytes; it is written
    # as an escape (U+1F9E0, brain) so it survives any file re-encoding.
    st.title("\U0001F9E0 Supervised Learning: Sentiment Classification")

    def convert_sentiment(star):
        """Map a star rating to a sentiment label; None for a missing rating."""
        # A NaN rating fails both comparisons below and would otherwise fall
        # through to 'positive', silently mislabelling unrated reviews.
        if pd.isna(star):
            return None
        if star <= 2:
            return 'negative'
        elif star == 3:
            return 'neutral'
        else:
            return 'positive'

    df['Sentiment'] = df['overall'].apply(convert_sentiment)
    # Train and evaluate only on reviews that actually carry a rating.
    labelled = df[df['Sentiment'].notna()]

    max_features = st.slider("Max TF-IDF features (for classification)", 100, 3000, 1000, step=100)
    # The slider is in percent; train_test_split expects a fraction.
    test_size = st.slider("Test set size (%)", 10, 50, 20, step=5) / 100

    tfidf = TfidfVectorizer(stop_words='english', max_features=max_features)
    X = tfidf.fit_transform(labelled['reviewText'].astype(str))
    y = labelled['Sentiment']
    # Stratify so each sentiment class keeps its proportion in both splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=test_size, random_state=42
    )

    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    st.markdown("### Classification Report")
    report = classification_report(y_test, y_pred, output_dict=True)
    st.dataframe(pd.DataFrame(report).transpose())

    st.markdown("### Confusion Matrix")
    labels = ['positive', 'neutral', 'negative']
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    fig, ax = plt.subplots()
    # Draw on the axes created above rather than on pyplot's implicit current axes.
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels, ax=ax)
    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until closed; close it so
    # figures do not accumulate each time the page is rendered.
    plt.close(fig)
# --- Page 4: Results & Conclusion ---
def page_results():
    """Render the closing page: a static markdown summary, insights and next steps."""
    # The title emoji had been stored as mis-decoded UTF-8 bytes; it is written
    # as an escape (U+1F4CA, bar chart) so it survives any file re-encoding.
    st.title("\U0001F4CA Results & Conclusion")
    st.markdown("""
### Summary
- **Unsupervised Learning** helped group reviews into coherent topics using text similarity.
- **Supervised Learning** effectively predicted sentiment with Naive Bayes and TF-IDF.
### Insights
- Clustering allows business teams to explore latent structures without labels.
- Sentiment prediction can automate review analysis at scale.
### Next Steps
- Try BERT for more accurate sentiment classification.
- Explore topic labeling using BERTopic.
- Add interactivity to select specific reviews or visualize clusters in 2D.
""")
# --- Sidebar Navigation ---
# Each sidebar label maps to the function that renders that page; the dict's
# insertion order is the order in which the options appear in the radio group.
pages = {
    "1 - Introduction": page_intro,
    "2 - Unsupervised Learning": page_unsupervised,
    "3 - Supervised Learning": page_supervised,
    "4 - Results & Conclusion": page_results,
}

st.sidebar.title("Navigation")
page = st.sidebar.radio("Go to", list(pages))

# Look up the renderer for the selected label and draw that page.
render_selected = pages[page]
render_selected()