Spaces:

mrciomnl
/

text_topic_clustering_and_sentiment_classification

Runtime error

App Files Files Community

text_topic_clustering_and_sentiment_classification / app.py

mrciomnl

initial commit

5e61364 9 months ago

raw

history blame contribute delete

4.67 kB

	import streamlit as st
	import pandas as pd
	import matplotlib.pyplot as plt
	import seaborn as sns
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.cluster import KMeans
	from sklearn.model_selection import train_test_split
	from sklearn.naive_bayes import MultinomialNB
	from sklearn.metrics import classification_report, confusion_matrix

	# Read the review data once at import time. The page functions below read
	# this module-level DataFrame and add derived columns to it. The relative
	# path is resolved against the process's working directory.
	df = pd.read_csv(filepath_or_buffer="amazon_reviews.csv")

	# --- Page 1: Introduction ---
	def page_intro():
	    """Render the introduction page: title, overview text and a data preview."""
	    # Static markdown describing the dataset and how to use the app.
	    overview_md = """
	## Dataset Overview
	This dataset contains Amazon product reviews, including the text and corresponding star ratings.

	### What This App Does:
	1. Unsupervised Learning: We'll use TF-IDF + K-Means clustering to discover hidden topics in the reviews.
	2. Supervised Learning: We'll apply Naive Bayes to classify sentiment based on the review text.

	### How to Use:
	- Navigate using the sidebar to each section.
	- Interact with charts and view model outputs.
	- Explore results and gain insights into customer sentiment and topics.
	"""

	    st.title("📘 Amazon Reviews ML App")
	    st.markdown(overview_md)

	    # Show only the first rows of the module-level DataFrame.
	    st.markdown("### Dataset Preview")
	    preview_rows = df.head()
	    st.dataframe(preview_rows)

	# --- Page 2: Unsupervised Learning ---
	def page_unsupervised():
	    """Render the topic-clustering page.

	    Vectorizes the ``reviewText`` column with TF-IDF, clusters the result with
	    K-Means, writes each row's cluster label into ``df['Cluster']`` (the
	    module-level DataFrame is modified in place), and displays the cluster
	    sizes together with a few sample reviews per cluster.
	    """
	    st.title("🔍 Unsupervised Learning: Topic Clustering")

	    num_clusters = st.slider("Select number of clusters", 2, 10, 5)
	    max_features = st.slider("Max TF-IDF features", 100, 3000, 1000, step=100)

	    # astype(str) turns missing reviews into the text "nan" so the
	    # vectorizer always receives strings.
	    tfidf = TfidfVectorizer(stop_words='english', max_features=max_features)
	    tfidf_matrix = tfidf.fit_transform(df['reviewText'].astype(str))

	    # Fixed random_state keeps the cluster assignment stable across reruns.
	    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
	    df['Cluster'] = kmeans.fit_predict(tfidf_matrix)

	    st.markdown("### Cluster Distribution")
	    cluster_counts = df['Cluster'].value_counts().sort_index()
	    st.bar_chart(cluster_counts)

	    st.markdown("### Sample Reviews Per Cluster")
	    num_samples = st.slider("Number of sample reviews per cluster", 1, 5, 2)
	    for i in range(num_clusters):
	        st.subheader(f"Cluster {i}")
	        cluster_reviews = df.loc[df['Cluster'] == i, 'reviewText']
	        # Series.sample raises ValueError when asked for more rows than the
	        # cluster holds, so never request more than are available.
	        sample_size = min(num_samples, len(cluster_reviews))
	        samples = cluster_reviews.sample(sample_size, random_state=42).tolist()
	        for s in samples:
	            st.markdown(f"- {s}")

	# --- Page 3: Supervised Learning ---
	def page_supervised():
	    """Render the sentiment-classification page.

	    Maps the ``overall`` star rating to a three-class label stored in
	    ``df['Sentiment']`` (the module-level DataFrame is modified in place),
	    trains a Multinomial Naive Bayes model on TF-IDF features of
	    ``reviewText``, and shows the classification report and confusion matrix
	    for a stratified held-out split.
	    """
	    st.title("🧠 Supervised Learning: Sentiment Classification")

	    def convert_sentiment(star):
	        """Map a star rating to 'negative' (<= 2), 'neutral' (3) or 'positive'."""
	        # NOTE(review): a missing rating (NaN) fails both comparisons and is
	        # therefore labeled 'positive' - confirm that this is intended.
	        if star <= 2:
	            return 'negative'
	        elif star == 3:
	            return 'neutral'
	        else:
	            return 'positive'

	    df['Sentiment'] = df['overall'].apply(convert_sentiment)

	    max_features = st.slider("Max TF-IDF features (for classification)", 100, 3000, 1000, step=100)
	    # The slider works in whole percent; train_test_split expects a fraction.
	    test_size = st.slider("Test set size (%)", 10, 50, 20, step=5) / 100

	    tfidf = TfidfVectorizer(stop_words='english', max_features=max_features)
	    X = tfidf.fit_transform(df['reviewText'].astype(str))
	    y = df['Sentiment']

	    # Stratify so both splits keep the same class proportions.
	    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=test_size, random_state=42)

	    model = MultinomialNB()
	    model.fit(X_train, y_train)
	    y_pred = model.predict(X_test)

	    st.markdown("### Classification Report")
	    # zero_division=0 reports 0 for a class that is never predicted, the same
	    # value as the default, without emitting an undefined-metric warning.
	    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
	    st.dataframe(pd.DataFrame(report).transpose())

	    st.markdown("### Confusion Matrix")
	    # One shared label order keeps matrix rows/columns and tick labels aligned.
	    labels = ['positive', 'neutral', 'negative']
	    cm = confusion_matrix(y_test, y_pred, labels=labels)
	    fig, ax = plt.subplots()
	    # Draw on the axes created above instead of the implicit current axes.
	    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels, ax=ax)
	    # confusion_matrix puts true labels on rows and predictions on columns.
	    ax.set_xlabel("Predicted")
	    ax.set_ylabel("Actual")
	    st.pyplot(fig)
	    # pyplot keeps every figure alive until it is closed; without this a new
	    # figure would accumulate on each Streamlit rerun.
	    plt.close(fig)

	# --- Page 4: Results & Conclusion ---
	def page_results():
	    """Render the closing page with the static summary, insights and next steps."""
	    # Static markdown; nothing on this page depends on the dataset or widgets.
	    conclusion_md = """
	### Summary
	- Unsupervised Learning helped group reviews into coherent topics using text similarity.
	- Supervised Learning effectively predicted sentiment with Naive Bayes and TF-IDF.

	### Insights
	- Clustering allows business teams to explore latent structures without labels.
	- Sentiment prediction can automate review analysis at scale.

	### Next Steps
	- Try BERT for more accurate sentiment classification.
	- Explore topic labeling using BERTopic.
	- Add interactivity to select specific reviews or visualize clusters in 2D.
	"""

	    st.title("📊 Results & Conclusion")
	    st.markdown(conclusion_md)

	# --- Sidebar Navigation ---
	# Sidebar label -> function that renders the corresponding page. The dict's
	# insertion order is the order in which the options are listed.
	pages = {
	    "1 - Introduction": page_intro,
	    "2 - Unsupervised Learning": page_unsupervised,
	    "3 - Supervised Learning": page_supervised,
	    "4 - Results & Conclusion": page_results,
	}

	st.sidebar.title("Navigation")
	# Iterating a dict yields its keys, so list(pages) is the list of labels.
	page = st.sidebar.radio("Go to", list(pages))

	# Look up the renderer for the chosen label and draw that page.
	render_selected_page = pages[page]
	render_selected_page()