import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
# Fetch the NLTK 'stopwords' corpus at import time; app() reads it through
# stopwords.words('english').
nltk.download('stopwords')
from nltk.corpus import stopwords
from wordcloud import WordCloud

def app():
  """Render the "Exploratory Data Analysis" page of the Streamlit app.

  Reads ``data.csv`` (semicolon-delimited) from the working directory, drops
  duplicate rows, and shows the plot selected in the sidebar radio:

  * "Text Length Distribution" - histogram of ``len(text)`` per row.
  * "Sentiment Distribution" - pie chart and count plot of ``feeling``.
  * "Word Clouds" - one word cloud per sentiment label.
  * "Stopwords Boxplot Distributions" - English stopword count per row,
    grouped by ``feeling``.

  The code reads two columns from the CSV: ``text`` and ``feeling``.

  Every plot is drawn on its own Figure, handed to ``st.pyplot`` explicitly,
  and closed afterwards, so figures do not pile up in matplotlib's global
  state across Streamlit reruns and no plot draws over a previous one.

  Returns:
    None. All output goes to the Streamlit page.
  """
  df = pd.read_csv("data.csv", delimiter=";")
  df.drop_duplicates(inplace=True)

  st.header('Exploratory Data Analysis', divider='rainbow')
  eda_list = ["Text Length Distribution", "Sentiment Distribution", "Word Clouds", "Stopwords Boxplot Distributions"]
  val = st.sidebar.radio("Choose plot to show", eda_list)
  stop_words = set(stopwords.words('english'))

  def render(fig):
    """Send *fig* to the page, then close it to release its memory."""
    st.pyplot(fig)
    plt.close(fig)

  def plot_wordcloud(sentiment):
    """Draw the word cloud of all texts whose ``feeling`` equals *sentiment*."""
    text = ' '.join(df.loc[df['feeling'] == sentiment, 'text'])
    try:
      wordcloud = WordCloud(stopwords=stop_words, background_color='white').generate(text)
    except ValueError:
      # WordCloud raises ValueError when there is no word left to plot
      # (e.g. the label is absent from the data); keep the page alive.
      st.warning(f"No words available to build a word cloud for {sentiment} sentiment")
      return
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f"Word Cloud for {sentiment} Sentiment")
    render(fig)

  # NOTE: the "Insight" captions below are fixed strings, not values computed
  # from the loaded data.
  if val == "Text Length Distribution":
    st.header('Text Length Distribution')
    # Computed only in this branch; the other plots do not need it.
    text_length = df['text'].apply(len)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(text_length, bins=30, color='skyblue')
    ax.set_title('Text Length Distribution')
    ax.set_xlabel('Text Length (characters)')
    ax.set_ylabel('Frequency')
    render(fig)
    st.write("Insight: text length distribution of our data appears to be right-skewed.")
  elif val == "Sentiment Distribution":
    sentiment_counts = df['feeling'].value_counts()
    st.header('Sentiment Distribution')

    # Share of each sentiment label as a pie chart.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=140, colors=sns.color_palette('viridis', len(sentiment_counts)))
    ax.set_title('Sentiment Distribution')
    ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    render(fig)
    st.write("Insight: joy and sadness dominate the sentiment dataset, with joy taking the first place in 33.8%")

    # Absolute number of rows per sentiment label.
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(data=df, x='feeling', palette='viridis', ax=ax)
    ax.set_title('Sentiment Distribution')
    ax.set_xlabel('Sentiment')
    ax.set_ylabel('Count')
    render(fig)
    st.write("Insight: surprise sentiment has the lowest value of around 900 data")
  elif val == "Word Clouds":
    st.header('Word Clouds')
    for sentiment in ('joy', 'sadness', 'anger', 'love', 'surprise', 'fear'):
      plot_wordcloud(sentiment)
  elif val == "Stopwords Boxplot Distributions":
    # assign() returns a new frame, so df itself is left without the extra column.
    with_counts = df.assign(
      stopword_count=df['text'].apply(
        lambda x: sum(1 for word in x.split() if word.lower() in stop_words)
      )
    )
    st.header('Stopword Count Boxplot')
    # A dedicated figure: without it the boxplot lands on whatever figure
    # happens to be current in pyplot's global state.
    fig, ax = plt.subplots()
    sns.boxplot(data=with_counts, x='feeling', y='stopword_count', ax=ax)
    ax.set_title('Stopword Count by Sentiment')
    render(fig)
    st.write("Insight: each sentiment has quite a number of stop words outliers")