Spaces:
Runtime error
Runtime error
santarabantoosoo commited on
Commit ·
102b824
1
Parent(s): 571d313
added word frequency
Browse files- app.py +171 -22
- requirements.txt +3 -0
app.py
CHANGED
|
@@ -6,42 +6,166 @@ import plotly.express as px
|
|
| 6 |
from stop_words import get_stop_words
|
| 7 |
from wordcloud import WordCloud
|
| 8 |
from datasets import load_dataset
|
| 9 |
-
|
| 10 |
|
| 11 |
## import data
|
| 12 |
|
| 13 |
dataset = load_dataset("Santarabantoosoo/italian_long_covid_tweets")
|
| 14 |
data = pd.DataFrame.from_dict(dataset["train"])
|
| 15 |
|
| 16 |
-
# formulate a wordcloud for each emotion
|
| 17 |
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
# Wordcloud with anger tweets
|
| 21 |
angry_tweets = data['tweet'][data["emotion"] == 'anger']
|
| 22 |
-
|
|
|
|
| 23 |
anger_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words).generate(str(angry_tweets))
|
| 24 |
|
|
|
|
| 25 |
# Wordcloud with sad tweets
|
| 26 |
sad_tweets = data['tweet'][data["emotion"] == 'sadness']
|
| 27 |
-
|
|
|
|
| 28 |
sad_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words).generate(str(sad_tweets))
|
| 29 |
|
|
|
|
| 30 |
# Wordcloud with joy tweets
|
| 31 |
joy_tweets = data['tweet'][data["emotion"] == 'joy']
|
| 32 |
-
|
|
|
|
| 33 |
joy_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words).generate(str(joy_tweets))
|
| 34 |
|
| 35 |
|
| 36 |
# Wordcloud with fear tweets
|
| 37 |
fear_tweets = data['tweet'][data["emotion"] == 'fear']
|
| 38 |
-
|
|
|
|
| 39 |
fear_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words).generate(str(fear_tweets))
|
| 40 |
|
| 41 |
-
#
|
| 42 |
|
| 43 |
wc_fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2)
|
| 44 |
|
|
|
|
|
|
|
| 45 |
wc_fig.tight_layout()
|
| 46 |
|
| 47 |
ax1.imshow(sad_wordcloud, interpolation="bilinear")
|
|
@@ -65,6 +189,7 @@ ax3.axis("off")
|
|
| 65 |
ax3.set_title('Fear', {'fontsize': 30})
|
| 66 |
|
| 67 |
|
|
|
|
| 68 |
ax4.imshow(anger_wordcloud, interpolation="bilinear")
|
| 69 |
|
| 70 |
ax4.axis("off")
|
|
@@ -72,8 +197,6 @@ ax4.axis("off")
|
|
| 72 |
ax4.set_title('Anger', {'fontsize': 30})
|
| 73 |
|
| 74 |
|
| 75 |
-
plt.show()
|
| 76 |
-
|
| 77 |
# plot a pie plot for emotions' distribution
|
| 78 |
|
| 79 |
number_tweets_per_day = data.groupby(['date', 'emotion']).agg({'id': 'count'}).reset_index()
|
|
@@ -91,7 +214,6 @@ sent_fig = px.pie(sentiment_counts, values='count', names='emotion', title='Twee
|
|
| 91 |
color_discrete_sequence=px.colors.qualitative.G10)
|
| 92 |
sent_fig
|
| 93 |
|
| 94 |
-
|
| 95 |
def display_plot(image_choice):
|
| 96 |
|
| 97 |
if image_choice == 'Sentiment distribution':
|
|
@@ -103,22 +225,49 @@ def display_plot(image_choice):
|
|
| 103 |
elif image_choice == 'Word clouds':
|
| 104 |
return wc_fig
|
| 105 |
|
|
|
|
|
|
|
|
|
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
with gr.Blocks() as demo:
|
| 108 |
gr.Markdown("## Choose your adventure")
|
|
|
|
| 109 |
with gr.Tabs():
|
| 110 |
-
|
| 111 |
-
text_input = [gr.Radio(choices = ['Sentiment distribution', 'Word clouds', 'Time series'], label = 'Choose ur plot')]
|
| 112 |
-
plot_output = gr.Plot()
|
| 113 |
-
text_button = gr.Button("Submit")
|
| 114 |
-
|
| 115 |
-
text_button.click(display_plot, inputs=text_input, outputs=plot_output)
|
| 116 |
-
|
| 117 |
-
with gr.TabItem("Word frequency"):
|
| 118 |
-
gr.Markdown("Nothing here yet")
|
| 119 |
-
|
| 120 |
with gr.TabItem("Topic modeling"):
|
| 121 |
gr.Markdown("Nothing here yet")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
from stop_words import get_stop_words
|
| 7 |
from wordcloud import WordCloud
|
| 8 |
from datasets import load_dataset
|
| 9 |
+
import re
|
| 10 |
|
| 11 |
## import data
|
| 12 |
|
| 13 |
dataset = load_dataset("Santarabantoosoo/italian_long_covid_tweets")
|
| 14 |
data = pd.DataFrame.from_dict(dataset["train"])
|
| 15 |
|
|
|
|
| 16 |
|
| 17 |
+
# Load the Italian stop-word list published on the Hub and flatten it
# to a plain list of strings for the text-cleaning helpers below.
it_stop_words = load_dataset("Santarabantoosoo/italian-stopwords")
it_stop = pd.DataFrame.from_dict(it_stop_words["train"]).text.to_list()
|
| 23 |
+
|
| 24 |
+
## Optimize stop words according to Luca's repo

def format_input(user_key, stopwords):
    '''
    Format a user input request to look it up in the database of frequencies.

    input:
        user_key is a string
        stopwords is a list of strings
    output:
        key is a string: lower-cased, punctuation replaced by spaces,
        stop words removed
    '''
    key = user_key.lower()
    # Replace every non-word, non-space character (punctuation) with a space.
    key = re.sub(r'[^\w\s]', ' ', key)

    # A set makes each membership test O(1); `el not in stop` also reads
    # better than the original `not (el in stopwords)`.
    stop = set(stopwords)
    key = ' '.join(el for el in key.split() if el not in stop)

    return key
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
### Load the per-quarter TFIDF and whole-text frequency tables from the Hub.

def _load_table(repo):
    """Fetch a Hub dataset and return its 'train' split as a DataFrame."""
    return pd.DataFrame.from_dict(load_dataset(repo)["train"])

# The original repeated the load + from_dict pair eight times; the helper
# keeps every module-level name (and its final DataFrame value) identical.
TFIDF_21_Jul_Oct = _load_table("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_21_Jul_Oct")
TFIDF_22_Feb_Apr = _load_table("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_22_Feb_Apr")
TFIDF_22_May_Jul = _load_table("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_22_May_Jul")
TFIDF_21_Nov_22_Jan = _load_table("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_21_Nov_22_Jan")

whole_text_21_Jul_Oct = _load_table("Santarabantoosoo/whole_text_TF_21_Jul_Oct")
whole_text_22_Feb_Apr = _load_table("Santarabantoosoo/whole_text_TF_22_Feb_Apr")
whole_text_22_May_Jul = _load_table("Santarabantoosoo/whole_text_TF_22_May_Jul")
whole_text_21_Nov_22_Jan = _load_table("Santarabantoosoo/whole_text_TF_21_Nov_22_Jan")
|
| 82 |
+
|
| 83 |
+
# Collapse each transposed table to its first row: a Series mapping
# word -> frequency for that quarter.
# NOTE(review): the order kept here (Jul-Oct 21, Feb-Apr 22, May-Jul 22,
# Nov 21-Jan 22) mirrors the original appends and is NOT chronological —
# confirm it matches the Q1..Q4 labels used when plotting.
ser_TFIDF = [
    frame.transpose()[0]
    for frame in (TFIDF_21_Jul_Oct, TFIDF_22_Feb_Apr,
                  TFIDF_22_May_Jul, TFIDF_21_Nov_22_Jan)
]

ser_whole_text = [
    frame.transpose()[0]
    for frame in (whole_text_21_Jul_Oct, whole_text_22_Feb_Apr,
                  whole_text_22_May_Jul, whole_text_21_Nov_22_Jan)
]
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def plot_time_series(choice, keyword, user_keys, source_label='frequency table'):
    """Plot the quarterly frequency of each keyword over the four periods.

    input:
        choice is a list of four pandas Series (one per quarter) mapping
            word -> frequency
        keyword is a list of normalized keyword strings to look up
        user_keys is a list of the raw user-typed strings (used as legend labels)
        source_label names the frequency table for the y-axis label
            (new optional parameter with a default, so existing callers work)
    output:
        a matplotlib figure
    """
    x = np.arange(2, 10, 2)  # one x position per quarter

    # Collect each keyword's frequency in every period; a keyword missing
    # from a period counts as 0.0.
    y = []
    for key in keyword:
        freqs = []
        for period in choice:
            try:
                freqs.append(period[key])
            except KeyError:
                # Was a bare `except:` — narrow it so only missing keys
                # are treated as zero frequency.
                freqs.append(0.0)
        y.append(np.array(freqs))

    fig, ax = plt.subplots(1, 1)
    for series, raw_key in zip(y, user_keys):
        ax.plot(x, series, label=raw_key.lower())

    # Label the four quarters explicitly.
    ax.set_xticks(x)
    ax.set_xticklabels(['Q1', 'Q2', 'Q3', 'Q4'], fontsize=12)

    plt.legend(loc='best')
    plt.xlabel('Time')
    plt.title("keywords quarterly analysis (July 2021 - July 2022)")
    # Fix: the original interpolated an undefined global `user_choice` here,
    # which raised NameError every time the function was called.
    plt.ylabel(f'Freq. from {source_label}')
    return fig
|
| 134 |
+
|
| 135 |
|
| 136 |
# Build one word cloud per emotion label.
# Shared stop-word list: Twitter artifacts plus the Italian stop words.
stop_words = ["https", 'http', "co", "RT"] + list(it_stop)

def _emotion_wordcloud(emotion):
    """Return (cleaned tweet Series, WordCloud) for one emotion label."""
    tweets = data['tweet'][data["emotion"] == emotion]
    tweets = tweets.apply(format_input, args=[it_stop])
    cloud = WordCloud(max_font_size=50, max_words=50,
                      background_color="white",
                      stopwords=stop_words).generate(str(tweets))
    return tweets, cloud

# The original repeated the same three statements for every emotion and
# recomputed the identical `stop_words` list four times; the helper keeps
# all the module-level names below unchanged.
angry_tweets, anger_wordcloud = _emotion_wordcloud('anger')
sad_tweets, sad_wordcloud = _emotion_wordcloud('sadness')
joy_tweets, joy_wordcloud = _emotion_wordcloud('joy')
fear_tweets, fear_wordcloud = _emotion_wordcloud('fear')
|
| 162 |
|
| 163 |
+
## Combine all plots in a single plot
|
| 164 |
|
| 165 |
wc_fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2)
|
| 166 |
|
| 167 |
+
# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
|
| 168 |
+
|
| 169 |
wc_fig.tight_layout()
|
| 170 |
|
| 171 |
ax1.imshow(sad_wordcloud, interpolation="bilinear")
|
|
|
|
| 189 |
ax3.set_title('Fear', {'fontsize': 30})
|
| 190 |
|
| 191 |
|
| 192 |
+
|
| 193 |
ax4.imshow(anger_wordcloud, interpolation="bilinear")
|
| 194 |
|
| 195 |
ax4.axis("off")
|
|
|
|
| 197 |
ax4.set_title('Anger', {'fontsize': 30})
|
| 198 |
|
| 199 |
|
|
|
|
|
|
|
| 200 |
# plot a pie plot for emotions' distribution
|
| 201 |
|
| 202 |
number_tweets_per_day = data.groupby(['date', 'emotion']).agg({'id': 'count'}).reset_index()
|
|
|
|
| 214 |
color_discrete_sequence=px.colors.qualitative.G10)
|
| 215 |
sent_fig
|
| 216 |
|
|
|
|
| 217 |
def display_plot(image_choice):
|
| 218 |
|
| 219 |
if image_choice == 'Sentiment distribution':
|
|
|
|
| 225 |
elif image_choice == 'Word clouds':
|
| 226 |
return wc_fig
|
| 227 |
|
| 228 |
+
def display_freq_plot(choice, *args):
    """Build the word-frequency time-series figure for the chosen method.

    `choice` selects the frequency table ("TFIDF" or "Whole_text"); the
    remaining positional args are the raw keyword strings typed by the user.
    Implicitly returns None when `choice` matches neither option.
    """
    user_keys = list(args)

    # Normalize the raw inputs so they match the keys stored in the tables.
    keyword = [format_input(key, it_stop) for key in user_keys]

    if choice == "TFIDF":
        return plot_time_series(ser_TFIDF, keyword, user_keys)
    elif choice == "Whole_text":
        return plot_time_series(ser_whole_text, keyword, user_keys)
|
| 242 |
+
|
| 243 |
+
|
| 244 |
# Gradio UI: three tabs — topic modeling (placeholder), word frequency,
# and sentiment analysis.
with gr.Blocks() as demo:
    gr.Markdown("## Choose your adventure")

    with gr.Tabs():

        with gr.TabItem("Topic modeling"):
            gr.Markdown("Nothing here yet")

        with gr.TabItem("Word frequency"):
            # One method selector plus up to four free-text keywords; all
            # five components are passed positionally to display_freq_plot.
            inputs = [gr.Radio(choices = ['TFIDF', 'Whole_text'], label = 'Choose ur method'),
                      gr.Textbox(label = 'word 1'),
                      gr.Textbox(label = 'word 2'),
                      gr.Textbox(label = 'word 3'),
                      gr.Textbox(label = 'word 4')]
            plot_output = gr.Plot(elem_id = 1)
            freq_button = gr.Button("Submit")

            freq_button.click(display_freq_plot, inputs=inputs, outputs=plot_output)

        with gr.TabItem("Sentiment analysis"):
            text_input = gr.Radio(choices = ['Sentiment distribution', 'Word clouds', 'Time series'], label = 'Choose ur plot')
            # Fix: the plot label was the leftover placeholder 'jhg'.
            sent_plot = gr.Plot(label = 'Sentiment plot')
            sent_button = gr.Button("Submit")

            sent_button.click(display_plot, inputs=text_input, outputs=sent_plot)


demo.launch()  # dropped the stray trailing semicolon
|
| 273 |
+
|
requirements.txt
CHANGED
|
@@ -4,3 +4,6 @@ matplotlib
|
|
| 4 |
plotly
|
| 5 |
stop_words
|
| 6 |
wordcloud
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
plotly
|
| 5 |
stop_words
|
| 6 |
wordcloud
|
| 7 |
+
datasets
|
| 8 |
+
# NOTE: 're' is part of the Python standard library — it must not be listed as a pip requirement
|
| 9 |
+
|