Spaces:
Sleeping
Sleeping
File size: 3,019 Bytes
f65d9db 21a9c98 0281523 f65d9db 682fd81 f65d9db 0281523 f65d9db 63226c9 f65d9db 7865840 0281523 682fd81 7865840 682fd81 63226c9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from wordcloud import WordCloud
def app():
    """Render the exploratory-data-analysis page of the Streamlit app.

    Reads ``data.csv`` (semicolon-delimited), drops duplicate rows and shows
    the chart(s) selected through the sidebar radio button. The code relies
    on two columns of the file: ``text`` and ``feeling``.

    Each chart is drawn on its own explicit Matplotlib ``Figure`` which is
    passed to ``st.pyplot`` and closed right after rendering. This avoids
    depending on pyplot's global "current figure" (shared between Streamlit
    sessions and reruns) and keeps figures from piling up in memory.
    """
    df = pd.read_csv("data.csv", delimiter=";").drop_duplicates()

    st.header('Exploratory Data Analysis', divider='rainbow')
    eda_list = [
        "Text Length Distribution",
        "Sentiment Distribution",
        "Word Clouds",
        "Stopwords Boxplot Distributions",
    ]
    val = st.sidebar.radio("Choose plot to show", eda_list)
    stop_words = set(stopwords.words('english'))

    def render(fig):
        """Send *fig* to the page, then release it."""
        st.pyplot(fig)
        # st.pyplot does not clear an explicitly passed figure by default,
        # so close it here; otherwise every rerun leaves a figure behind.
        plt.close(fig)

    def plot_wordcloud(sentiment):
        """Draw a word cloud built from all texts labelled *sentiment*."""
        text = ' '.join(df.loc[df['feeling'] == sentiment, 'text'])
        if not text.strip():
            # WordCloud.generate() raises when it is given no words at all.
            st.warning(f"No text available for {sentiment} sentiment.")
            return
        wordcloud = WordCloud(stopwords=stop_words, background_color='white').generate(text)
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        ax.set_title(f"Word Cloud for {sentiment} Sentiment")
        render(fig)

    if val == "Text Length Distribution":
        st.header('Text Length Distribution')
        # Histogram of the number of characters per text.
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.hist(df['text'].apply(len), bins=30, color='skyblue')
        ax.set_title('Text Length Distribution')
        ax.set_xlabel('Text Length (characters)')
        ax.set_ylabel('Frequency')
        render(fig)
        st.write("Insight: text length distribution of our data appears to be right-skewed.")

    elif val == "Sentiment Distribution":
        sentiment_counts = df['feeling'].value_counts()
        st.header('Sentiment Distribution')

        # Pie chart: share of each sentiment label.
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.pie(
            sentiment_counts,
            labels=sentiment_counts.index,
            autopct='%1.1f%%',
            startangle=140,
            colors=sns.color_palette('viridis', len(sentiment_counts)),
        )
        ax.set_title('Sentiment Distribution')
        ax.axis('equal')
        render(fig)
        st.write("Insight: joy and sadness dominate the sentiment dataset, with joy taking the first place in 33.8%")

        # Bar chart: absolute number of rows per sentiment label.
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.countplot(data=df, x='feeling', palette='viridis', ax=ax)
        ax.set_title('Sentiment Distribution')
        ax.set_xlabel('Sentiment')
        ax.set_ylabel('Count')
        render(fig)
        st.write("Insight: surprise sentiment has the lowest value of around 900 data")

    elif val == "Word Clouds":
        st.header('Word Clouds')
        for sentiment in ('joy', 'sadness', 'anger', 'love', 'surprise', 'fear'):
            plot_wordcloud(sentiment)

    elif val == "Stopwords Boxplot Distributions":
        # Per-text count of whitespace-separated tokens that are English
        # stopwords (compared case-insensitively).
        with_counts = df.assign(
            stopword_count=df['text'].apply(
                lambda x: sum(1 for word in x.split() if word.lower() in stop_words)
            )
        )
        st.header('Stopword Count Boxplot')
        # A fresh figure is required here: drawing on the implicit current
        # figure could overlay a chart left over from an earlier rerun.
        fig, ax = plt.subplots()
        sns.boxplot(data=with_counts, x='feeling', y='stopword_count', ax=ax)
        ax.set_title('Stopword Count by Sentiment')
        render(fig)
        st.write("Insight: each sentiment has quite a number of stop words outliers")