Spaces:

stanlys96
/

NLP-Sentiment-Analysis

Sleeping

App Files Files Community

NLP-Sentiment-Analysis / eda.py

stanlys96

Upload 6 files

7865840 verified about 1 year ago

raw

history blame contribute delete

3.02 kB

	import streamlit as st
	import pandas as pd
	import matplotlib.pyplot as plt
	import seaborn as sns
	import nltk
	nltk.download('stopwords')
	from nltk.corpus import stopwords
	from wordcloud import WordCloud

	def app():
	    """Render the exploratory data analysis (EDA) page.

	    Reads ``data.csv`` (semicolon-delimited), drops duplicate rows and
	    shows the chart selected with a sidebar radio button. Only the
	    ``text`` and ``feeling`` columns of the file are read here.

	    Every chart is drawn on its own explicit Matplotlib figure, which is
	    passed to ``st.pyplot`` and closed right after rendering. Going
	    through the global ``pyplot`` state without closing anything keeps
	    old figures alive between reruns, and a chart that opens no figure of
	    its own (the stopword boxplot) would be painted over whichever figure
	    happened to be current.
	    """
	    df = pd.read_csv("data.csv", delimiter=";").drop_duplicates()

	    st.header('Exploratory Data Analysis', divider='rainbow')
	    eda_list = [
	        "Text Length Distribution",
	        "Sentiment Distribution",
	        "Word Clouds",
	        "Stopwords Boxplot Distributions",
	    ]
	    val = st.sidebar.radio("Choose plot to show", eda_list)
	    stop_words = set(stopwords.words('english'))

	    def show(fig):
	        """Render *fig* on the page, then release it."""
	        st.pyplot(fig)
	        # st.pyplot leaves an explicitly passed figure untouched by
	        # default, so close it here to avoid accumulating open figures.
	        plt.close(fig)

	    def plot_wordcloud(sentiment):
	        """Show a word cloud built from all texts labelled *sentiment*."""
	        text = ' '.join(df.loc[df['feeling'] == sentiment, 'text'])
	        wordcloud = WordCloud(
	            stopwords=stop_words, background_color='white'
	        ).generate(text)
	        fig, ax = plt.subplots(figsize=(10, 6))
	        ax.imshow(wordcloud, interpolation='bilinear')
	        ax.axis('off')
	        ax.set_title(f"Word Cloud for {sentiment} Sentiment")
	        show(fig)

	    if val == "Text Length Distribution":
	        st.header('Text Length Distribution')
	        # Character count per text; only this branch needs it, so it is
	        # computed here instead of on every rerun.
	        text_length = df['text'].apply(len)
	        fig, ax = plt.subplots(figsize=(10, 6))
	        ax.hist(text_length, bins=30, color='skyblue')
	        ax.set_title('Text Length Distribution')
	        ax.set_xlabel('Text Length (characters)')
	        ax.set_ylabel('Frequency')
	        show(fig)
	        st.write("Insight: text length distribution of our data appears to be right-skewed.")
	    elif val == "Sentiment Distribution":
	        sentiment_counts = df['feeling'].value_counts()
	        st.header('Sentiment Distribution')

	        # Share of each sentiment as a pie chart.
	        fig, ax = plt.subplots(figsize=(8, 6))
	        ax.pie(
	            sentiment_counts,
	            labels=sentiment_counts.index,
	            autopct='%1.1f%%',
	            startangle=140,
	            colors=sns.color_palette('viridis', len(sentiment_counts)),
	        )
	        ax.set_title('Sentiment Distribution')
	        ax.axis('equal')
	        show(fig)
	        st.write("Insight: joy and sadness dominate the sentiment dataset, with joy taking the first place in 33.8%")

	        # Absolute number of rows per sentiment as a bar chart.
	        fig, ax = plt.subplots(figsize=(8, 6))
	        sns.countplot(data=df, x='feeling', palette='viridis', ax=ax)
	        ax.set_title('Sentiment Distribution')
	        ax.set_xlabel('Sentiment')
	        ax.set_ylabel('Count')
	        show(fig)
	        st.write("Insight: surprise sentiment has the lowest value of around 900 data")
	    elif val == "Word Clouds":
	        st.header('Word Clouds')
	        for sentiment in ('joy', 'sadness', 'anger', 'love', 'surprise', 'fear'):
	            plot_wordcloud(sentiment)
	    elif val == "Stopwords Boxplot Distributions":
	        temp_b = df.copy()
	        # Number of whitespace-separated tokens per text that are English
	        # stopwords (case-insensitive).
	        temp_b['stopword_count'] = temp_b['text'].apply(
	            lambda x: sum(1 for word in x.split() if word.lower() in stop_words)
	        )
	        st.header('Stopword Count Boxplot')
	        # A fresh figure is required: without it the boxplot lands on
	        # whatever figure pyplot last considered current.
	        fig, ax = plt.subplots()
	        sns.boxplot(data=temp_b, x='feeling', y='stopword_count', ax=ax)
	        ax.set_title('Stopword Count by Sentiment')
	        show(fig)
	        st.write("Insight: each sentiment has quite a number of stop words outliers")