"""Streamlit app exploring an Amazon reviews dataset.

The app has four pages selected from the sidebar: an introduction with a
dataset preview, TF-IDF + K-Means topic clustering, TF-IDF + Naive Bayes
sentiment classification, and a closing summary.

The CSV is expected to provide a ``reviewText`` column (review body) and an
``overall`` column (star rating); both are read by the page functions below.
"""

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# Load the dataset
df = pd.read_csv("amazon_reviews.csv")

# Display/ordering of the sentiment classes in the confusion matrix.
SENTIMENT_LABELS = ['positive', 'neutral', 'negative']

# Markdown bodies are kept at column 0 so that no line carries leading
# indentation that a markdown renderer could interpret as a code block.
_INTRO_MARKDOWN = """
## Dataset Overview
This dataset contains Amazon product reviews, including the text and corresponding star ratings.

### What This App Does:
1. **Unsupervised Learning**: We'll use **TF-IDF** + **K-Means** clustering to discover hidden topics in the reviews.
2. **Supervised Learning**: We'll apply **Naive Bayes** to classify sentiment based on the review text.

### How to Use:
- Navigate using the sidebar to each section.
- Interact with charts and view model outputs.
- Explore results and gain insights into customer sentiment and topics.
"""

_RESULTS_MARKDOWN = """
### Summary
- **Unsupervised Learning** helped group reviews into coherent topics using text similarity.
- **Supervised Learning** effectively predicted sentiment with Naive Bayes and TF-IDF.

### Insights
- Clustering allows business teams to explore latent structures without labels.
- Sentiment prediction can automate review analysis at scale.

### Next Steps
- Try BERT for more accurate sentiment classification.
- Explore topic labeling using BERTopic.
- Add interactivity to select specific reviews or visualize clusters in 2D.
"""


# --- Page 1: Introduction ---
def page_intro():
    """Render the landing page: app description and the first rows of ``df``."""
    st.title("📘 Amazon Reviews ML App")
    st.markdown(_INTRO_MARKDOWN)
    st.markdown("### Dataset Preview")
    st.dataframe(df.head())


# --- Page 2: Unsupervised Learning ---
def page_unsupervised():
    """Cluster the reviews with TF-IDF + K-Means and show the result.

    Sliders control the number of clusters, the TF-IDF vocabulary size and
    how many sample reviews are listed per cluster.
    """
    st.title("🔍 Unsupervised Learning: Topic Clustering")

    num_clusters = st.slider("Select number of clusters", 2, 10, 5)
    max_features = st.slider("Max TF-IDF features", 100, 3000, 1000, step=100)

    # Missing reviews become empty documents; a bare astype(str) would turn
    # NaN into the literal token "nan" and feed it to the vectorizer.
    texts = df['reviewText'].fillna("").astype(str)

    tfidf = TfidfVectorizer(stop_words='english', max_features=max_features)
    tfidf_matrix = tfidf.fit_transform(texts)

    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    # Keep the assignments in a local Series aligned with df's index rather
    # than writing a new column into the module-level frame.
    clusters = pd.Series(
        kmeans.fit_predict(tfidf_matrix), index=df.index, name='Cluster'
    )

    st.markdown("### Cluster Distribution")
    cluster_counts = clusters.value_counts().sort_index()
    st.bar_chart(cluster_counts)

    st.markdown("### Sample Reviews Per Cluster")
    num_samples = st.slider("Number of sample reviews per cluster", 1, 5, 2)

    for cluster_id in range(num_clusters):
        st.subheader(f"Cluster {cluster_id}")
        members = df.loc[clusters == cluster_id, 'reviewText']
        # Series.sample raises ValueError when asked for more rows than
        # exist, so cap the request at the cluster's size.
        sample_size = min(num_samples, len(members))
        samples = members.sample(sample_size, random_state=42).tolist()
        for review in samples:
            st.markdown(f"- {review}")


# --- Page 3: Supervised Learning ---
def page_supervised():
    """Train a Naive Bayes sentiment classifier and display its metrics.

    Star ratings are mapped to negative / neutral / positive labels, reviews
    are vectorized with TF-IDF, and a stratified train/test split is used.
    The page shows the classification report and a confusion-matrix heatmap.
    """
    st.title("🧠 Supervised Learning: Sentiment Classification")

    def convert_sentiment(star):
        """Map a star rating to 'negative' (<=2), 'neutral' (3) or 'positive'."""
        if star <= 2:
            return 'negative'
        elif star == 3:
            return 'neutral'
        else:
            return 'positive'

    # Rows without a rating are excluded: NaN fails both comparisons in
    # convert_sentiment and would otherwise be labeled 'positive'.
    rated = df.dropna(subset=['overall'])

    max_features = st.slider(
        "Max TF-IDF features (for classification)", 100, 3000, 1000, step=100
    )
    test_size = st.slider("Test set size (%)", 10, 50, 20, step=5) / 100

    tfidf = TfidfVectorizer(stop_words='english', max_features=max_features)
    X = tfidf.fit_transform(rated['reviewText'].fillna("").astype(str))
    y = rated['overall'].apply(convert_sentiment)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=test_size, random_state=42
    )

    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    st.markdown("### Classification Report")
    # zero_division=0 reports 0 for a class that is never predicted, without
    # emitting an undefined-metric warning.
    report = classification_report(
        y_test, y_pred, output_dict=True, zero_division=0
    )
    st.dataframe(pd.DataFrame(report).transpose())

    st.markdown("### Confusion Matrix")
    cm = confusion_matrix(y_test, y_pred, labels=SENTIMENT_LABELS)
    fig, ax = plt.subplots()
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=SENTIMENT_LABELS,
        yticklabels=SENTIMENT_LABELS,
        ax=ax,
    )
    # confusion_matrix puts true labels on rows and predictions on columns.
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    st.pyplot(fig)
    # Release the figure so pyplot does not keep it registered after display.
    plt.close(fig)


# --- Page 4: Results & Conclusion ---
def page_results():
    """Render the static summary, insights and next-steps page."""
    st.title("📊 Results & Conclusion")
    st.markdown(_RESULTS_MARKDOWN)


# --- Sidebar Navigation ---
# Maps each sidebar label to the function that renders that page.
pages = {
    "1 - Introduction": page_intro,
    "2 - Unsupervised Learning": page_unsupervised,
    "3 - Supervised Learning": page_supervised,
    "4 - Results & Conclusion": page_results,
}

st.sidebar.title("Navigation")
page = st.sidebar.radio("Go to", list(pages.keys()))
pages[page]()