"""Streamlit page showing exploratory plots for the sentiment text dataset."""

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
import streamlit as st
from nltk.corpus import stopwords
from wordcloud import WordCloud

# Fetch the stop word corpus when the module is imported (same module-level
# side effect as before); the corpus itself is only read inside ``app``.
nltk.download('stopwords')

# Sentiments rendered on the "Word Clouds" page, in display order.
_WORDCLOUD_SENTIMENTS = ('joy', 'sadness', 'anger', 'love', 'surprise', 'fear')


def _show(fig):
    """Render *fig* in the Streamlit page and release it.

    Figures created through pyplot stay registered until closed, so a figure
    that is never closed is kept alive across script reruns.
    """
    st.pyplot(fig)
    plt.close(fig)


def _english_stop_words():
    """Return the NLTK English stop words as a set for O(1) membership tests."""
    return set(stopwords.words('english'))


def _plot_text_length(df):
    """Show a histogram of the character length of the ``text`` column."""
    st.header('Text Length Distribution')
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(df['text'].str.len(), bins=30, color='skyblue')
    ax.set_title('Text Length Distribution')
    ax.set_xlabel('Text Length (characters)')
    ax.set_ylabel('Frequency')
    _show(fig)
    st.write("Insight: text length distribution of our data appears to be right-skewed.")


def _plot_sentiment_distribution(df):
    """Show the ``feeling`` column as a pie chart followed by a count plot."""
    sentiment_counts = df['feeling'].value_counts()
    st.header('Sentiment Distribution')

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.pie(
        sentiment_counts,
        labels=sentiment_counts.index,
        autopct='%1.1f%%',
        startangle=140,
        colors=sns.color_palette('viridis', len(sentiment_counts)),
    )
    ax.set_title('Sentiment Distribution')
    ax.axis('equal')  # keep the pie circular
    _show(fig)
    st.write("Insight: joy and sadness dominate the sentiment dataset, with joy taking the first place in 33.8%")

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(data=df, x='feeling', palette='viridis', ax=ax)
    ax.set_title('Sentiment Distribution')
    ax.set_xlabel('Sentiment')
    ax.set_ylabel('Count')
    _show(fig)
    st.write("Insight: surprise sentiment has the lowest value of around 900 data")


def _plot_wordcloud(df, sentiment, stop_words):
    """Show a word cloud built from every text labelled *sentiment*.

    A warning is displayed instead of a plot when no word cloud can be built
    (e.g. the sentiment has no rows, or nothing remains after stop word
    removal).
    """
    text = ' '.join(df.loc[df['feeling'] == sentiment, 'text'])
    try:
        cloud = WordCloud(stopwords=stop_words, background_color='white').generate(text)
    except ValueError:
        # WordCloud refuses to render when it is left with zero words.
        st.warning(f"No words available to build a word cloud for {sentiment}.")
        return

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f"Word Cloud for {sentiment} Sentiment")
    _show(fig)


def _plot_stopword_boxplot(df, stop_words):
    """Show a box plot of the per-text stop word count, grouped by sentiment."""
    counted = df.assign(
        stopword_count=df['text'].apply(
            lambda text: sum(word.lower() in stop_words for word in text.split())
        )
    )
    st.header('Stopword Count Boxplot')
    fig, ax = plt.subplots()
    sns.boxplot(data=counted, x='feeling', y='stopword_count', ax=ax)
    ax.set_title('Stopword Count by Sentiment')
    _show(fig)
    st.write("Insight: each sentiment has quite a number of stop words outliers")


def app():
    """Render the Exploratory Data Analysis page.

    Reads ``data.csv`` (``;``-delimited, with ``text`` and ``feeling``
    columns), removes duplicate rows and rows without text, then draws the
    plot selected in the sidebar radio control.
    """
    df = pd.read_csv("data.csv", delimiter=";").drop_duplicates()
    # Rows without text cannot be measured, joined or tokenised below.
    df = df.dropna(subset=['text'])

    st.header('Exploratory Data Analysis', divider='rainbow')

    eda_list = [
        "Text Length Distribution",
        "Sentiment Distribution",
        "Word Clouds",
        "Stopwords Boxplot Distributions",
    ]
    val = st.sidebar.radio("Choose plot to show", eda_list)

    if val == "Text Length Distribution":
        _plot_text_length(df)
    elif val == "Sentiment Distribution":
        _plot_sentiment_distribution(df)
    elif val == "Word Clouds":
        st.header('Word Clouds')
        stop_words = _english_stop_words()
        for sentiment in _WORDCLOUD_SENTIMENTS:
            _plot_wordcloud(df, sentiment, stop_words)
    elif val == "Stopwords Boxplot Distributions":
        _plot_stopword_boxplot(df, _english_stop_words())