# stanlys96's picture
# Upload 6 files
# 7865840 verified
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from wordcloud import WordCloud
def _show_and_close(fig):
    """Render a Matplotlib figure in Streamlit, then close it.

    pyplot keeps every figure it creates registered until it is explicitly
    closed, so closing right after rendering stops figures from piling up
    and guarantees the next plot starts from a clean canvas.
    """
    st.pyplot(fig)
    plt.close(fig)


def app():
    """Render the exploratory data analysis page.

    Reads ``data.csv`` (semicolon-delimited), drops duplicate rows, and
    draws the plot chosen in the sidebar radio. The code reads a ``text``
    column and a ``feeling`` column (used as the sentiment label) from the
    loaded data.
    """
    df = pd.read_csv("data.csv", delimiter=";").drop_duplicates()

    st.header('Exploratory Data Analysis', divider='rainbow')
    eda_list = [
        "Text Length Distribution",
        "Sentiment Distribution",
        "Word Clouds",
        "Stopwords Boxplot Distributions",
    ]
    val = st.sidebar.radio("Choose plot to show", eda_list)
    stop_words = set(stopwords.words('english'))

    def plot_wordcloud(sentiment):
        """Draw a word cloud built from every text labelled *sentiment*."""
        text = ' '.join(df.loc[df['feeling'] == sentiment, 'text'])
        cloud = WordCloud(stopwords=stop_words, background_color='white').generate(text)
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.imshow(cloud, interpolation='bilinear')
        ax.axis('off')
        ax.set_title(f"Word Cloud for {sentiment} Sentiment")
        _show_and_close(fig)

    if val == "Text Length Distribution":
        st.header('Text Length Distribution')
        # Only this view needs the lengths, so compute them here instead of
        # copying the whole frame up front.
        text_lengths = df['text'].apply(len)
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.hist(text_lengths, bins=30, color='skyblue')
        ax.set_title('Text Length Distribution')
        ax.set_xlabel('Text Length (characters)')
        ax.set_ylabel('Frequency')
        _show_and_close(fig)
        st.write("Insight: text length distribution of our data appears to be right-skewed.")
    elif val == "Sentiment Distribution":
        sentiment_counts = df['feeling'].value_counts()
        st.header('Sentiment Distribution')

        # Pie chart: share of each sentiment.
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.pie(
            sentiment_counts,
            labels=sentiment_counts.index,
            autopct='%1.1f%%',
            startangle=140,
            colors=sns.color_palette('viridis', len(sentiment_counts)),
        )
        ax.set_title('Sentiment Distribution')
        ax.axis('equal')  # keep the pie circular
        _show_and_close(fig)
        st.write("Insight: joy and sadness dominate the sentiment dataset, with joy taking the first place in 33.8%")

        # Bar chart: absolute count of each sentiment.
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.countplot(data=df, x='feeling', palette='viridis', ax=ax)
        ax.set_title('Sentiment Distribution')
        ax.set_xlabel('Sentiment')
        ax.set_ylabel('Count')
        _show_and_close(fig)
        st.write("Insight: surprise sentiment has the lowest value of around 900 data")
    elif val == "Word Clouds":
        st.header('Word Clouds')
        for sentiment in ('joy', 'sadness', 'anger', 'love', 'surprise', 'fear'):
            plot_wordcloud(sentiment)
    elif val == "Stopwords Boxplot Distributions":
        # assign() returns a new frame, so df itself is left untouched.
        with_counts = df.assign(
            stopword_count=df['text'].apply(
                lambda x: sum(1 for word in x.split() if word.lower() in stop_words)
            )
        )
        st.header('Stopword Count Boxplot')
        # Draw on a dedicated figure; without one the boxplot would land on
        # whatever pyplot figure happened to be current.
        fig, ax = plt.subplots()
        sns.boxplot(data=with_counts, x='feeling', y='stopword_count', ax=ax)
        ax.set_title('Stopword Count by Sentiment')
        _show_and_close(fig)
        st.write("Insight: each sentiment has quite a number of stop words outliers")