Spaces:
Runtime error
Runtime error
santarabantoosoo commited on
Commit ·
102b824
1
Parent(s): 571d313
added word frequency
Browse files- app.py +171 -22
- requirements.txt +3 -0
app.py
CHANGED
|
@@ -6,42 +6,166 @@ import plotly.express as px
|
|
| 6 |
from stop_words import get_stop_words
|
| 7 |
from wordcloud import WordCloud
|
| 8 |
from datasets import load_dataset
|
| 9 |
-
|
| 10 |
|
| 11 |
## import data
|
| 12 |
|
| 13 |
dataset = load_dataset("Santarabantoosoo/italian_long_covid_tweets")
|
| 14 |
data = pd.DataFrame.from_dict(dataset["train"])
|
| 15 |
|
| 16 |
-
# formulate a wordcloud for each emotion
|
| 17 |
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
# Wordcloud with anger tweets
|
| 21 |
angry_tweets = data['tweet'][data["emotion"] == 'anger']
|
| 22 |
-
|
|
|
|
| 23 |
anger_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words).generate(str(angry_tweets))
|
| 24 |
|
|
|
|
| 25 |
# Wordcloud with sad tweets
|
| 26 |
sad_tweets = data['tweet'][data["emotion"] == 'sadness']
|
| 27 |
-
|
|
|
|
| 28 |
sad_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words).generate(str(sad_tweets))
|
| 29 |
|
|
|
|
| 30 |
# Wordcloud with joy tweets
|
| 31 |
joy_tweets = data['tweet'][data["emotion"] == 'joy']
|
| 32 |
-
|
|
|
|
| 33 |
joy_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words).generate(str(joy_tweets))
|
| 34 |
|
| 35 |
|
| 36 |
# Wordcloud with fear tweets
|
| 37 |
fear_tweets = data['tweet'][data["emotion"] == 'fear']
|
| 38 |
-
|
|
|
|
| 39 |
fear_wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords = stop_words).generate(str(fear_tweets))
|
| 40 |
|
| 41 |
-
#
|
| 42 |
|
| 43 |
wc_fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2)
|
| 44 |
|
|
|
|
|
|
|
| 45 |
wc_fig.tight_layout()
|
| 46 |
|
| 47 |
ax1.imshow(sad_wordcloud, interpolation="bilinear")
|
|
@@ -65,6 +189,7 @@ ax3.axis("off")
|
|
| 65 |
ax3.set_title('Fear', {'fontsize': 30})
|
| 66 |
|
| 67 |
|
|
|
|
| 68 |
ax4.imshow(anger_wordcloud, interpolation="bilinear")
|
| 69 |
|
| 70 |
ax4.axis("off")
|
|
@@ -72,8 +197,6 @@ ax4.axis("off")
|
|
| 72 |
ax4.set_title('Anger', {'fontsize': 30})
|
| 73 |
|
| 74 |
|
| 75 |
-
plt.show()
|
| 76 |
-
|
| 77 |
# plot a pie plot for emotions' distribution
|
| 78 |
|
| 79 |
number_tweets_per_day = data.groupby(['date', 'emotion']).agg({'id': 'count'}).reset_index()
|
|
@@ -91,7 +214,6 @@ sent_fig = px.pie(sentiment_counts, values='count', names='emotion', title='Twee
|
|
| 91 |
color_discrete_sequence=px.colors.qualitative.G10)
|
| 92 |
sent_fig
|
| 93 |
|
| 94 |
-
|
| 95 |
def display_plot(image_choice):
|
| 96 |
|
| 97 |
if image_choice == 'Sentiment distribution':
|
|
@@ -103,22 +225,49 @@ def display_plot(image_choice):
|
|
| 103 |
elif image_choice == 'Word clouds':
|
| 104 |
return wc_fig
|
| 105 |
|
|
|
|
|
|
|
|
|
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
with gr.Blocks() as demo:
|
| 108 |
gr.Markdown("## Choose your adventure")
|
|
|
|
| 109 |
with gr.Tabs():
|
| 110 |
-
|
| 111 |
-
text_input = [gr.Radio(choices = ['Sentiment distribution', 'Word clouds', 'Time series'], label = 'Choose ur plot')]
|
| 112 |
-
plot_output = gr.Plot()
|
| 113 |
-
text_button = gr.Button("Submit")
|
| 114 |
-
|
| 115 |
-
text_button.click(display_plot, inputs=text_input, outputs=plot_output)
|
| 116 |
-
|
| 117 |
-
with gr.TabItem("Word frequency"):
|
| 118 |
-
gr.Markdown("Nothing here yet")
|
| 119 |
-
|
| 120 |
with gr.TabItem("Topic modeling"):
|
| 121 |
gr.Markdown("Nothing here yet")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
from stop_words import get_stop_words
|
| 7 |
from wordcloud import WordCloud
|
| 8 |
from datasets import load_dataset
|
| 9 |
+
import re
|
| 10 |
|
| 11 |
## import data
|
| 12 |
|
| 13 |
dataset = load_dataset("Santarabantoosoo/italian_long_covid_tweets")
|
| 14 |
data = pd.DataFrame.from_dict(dataset["train"])
|
| 15 |
|
|
|
|
| 16 |
|
| 17 |
+
# Load the Italian stop-word list published on the Hub and flatten it
# to a plain list of strings for the text-cleaning helpers below.
it_stop_words = load_dataset("Santarabantoosoo/italian-stopwords")
it_stop = pd.DataFrame.from_dict(it_stop_words["train"]).text.to_list()
|
| 23 |
+
|
| 24 |
+
## Optimize stop words according to Luca's repo

def format_input(user_key, stopwords):
    '''
    Format a user input request to look it up in the database of frequencies.

    input:
        user_key is a string
        stopwords is a list of strings
    output:
        key is a string: lower-cased, punctuation replaced by spaces,
        stop words removed
    '''
    key = user_key.lower()
    # Replace every non-word, non-space character (punctuation) with a space.
    key = re.sub(r'[^\w\s]', ' ', key)

    # A set makes each membership test O(1); `el not in stop` also reads
    # better than the original `not (el in stopwords)`.
    stop = set(stopwords)
    key = ' '.join(el for el in key.split() if el not in stop)

    return key
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
### Load the per-quarter TFIDF and whole-text frequency tables from the Hub.

def _load_table(repo):
    """Fetch a Hub dataset and return its 'train' split as a DataFrame."""
    return pd.DataFrame.from_dict(load_dataset(repo)["train"])

# The original repeated the load + from_dict pair eight times; the helper
# keeps every module-level name (and its final DataFrame value) identical.
TFIDF_21_Jul_Oct = _load_table("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_21_Jul_Oct")
TFIDF_22_Feb_Apr = _load_table("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_22_Feb_Apr")
TFIDF_22_May_Jul = _load_table("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_22_May_Jul")
TFIDF_21_Nov_22_Jan = _load_table("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_21_Nov_22_Jan")

whole_text_21_Jul_Oct = _load_table("Santarabantoosoo/whole_text_TF_21_Jul_Oct")
whole_text_22_Feb_Apr = _load_table("Santarabantoosoo/whole_text_TF_22_Feb_Apr")
whole_text_22_May_Jul = _load_table("Santarabantoosoo/whole_text_TF_22_May_Jul")
whole_text_21_Nov_22_Jan = _load_table("Santarabantoosoo/whole_text_TF_21_Nov_22_Jan")
|
| 82 |
+
|
| 83 |
+
# Collapse each transposed table to its first row: a Series mapping
# word -> frequency for that quarter.
# NOTE(review): the order kept here (Jul-Oct 21, Feb-Apr 22, May-Jul 22,
# Nov 21-Jan 22) mirrors the original appends and is NOT chronological —
# confirm it matches the Q1..Q4 labels used when plotting.
ser_TFIDF = [
    frame.transpose()[0]
    for frame in (TFIDF_21_Jul_Oct, TFIDF_22_Feb_Apr,
                  TFIDF_22_May_Jul, TFIDF_21_Nov_22_Jan)
]

ser_whole_text = [
    frame.transpose()[0]
    for frame in (whole_text_21_Jul_Oct, whole_text_22_Feb_Apr,
                  whole_text_22_May_Jul, whole_text_21_Nov_22_Jan)
]
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def plot_time_series(choice, keyword, user_keys, source_label='frequency table'):
    """Plot the quarterly frequency of each keyword over the four periods.

    input:
        choice is a list of four pandas Series (one per quarter) mapping
            word -> frequency
        keyword is a list of normalized keyword strings to look up
        user_keys is a list of the raw user-typed strings (used as legend labels)
        source_label names the frequency table for the y-axis label
            (new optional parameter with a default, so existing callers work)
    output:
        a matplotlib figure
    """
    x = np.arange(2, 10, 2)  # one x position per quarter

    # Collect each keyword's frequency in every period; a keyword missing
    # from a period counts as 0.0.
    y = []
    for key in keyword:
        freqs = []
        for period in choice:
            try:
                freqs.append(period[key])
            except KeyError:
                # Was a bare `except:` — narrow it so only missing keys
                # are treated as zero frequency.
                freqs.append(0.0)
        y.append(np.array(freqs))

    fig, ax = plt.subplots(1, 1)
    for series, raw_key in zip(y, user_keys):
        ax.plot(x, series, label=raw_key.lower())

    # Label the four quarters explicitly.
    ax.set_xticks(x)
    ax.set_xticklabels(['Q1', 'Q2', 'Q3', 'Q4'], fontsize=12)

    plt.legend(loc='best')
    plt.xlabel('Time')
    plt.title("keywords quarterly analysis (July 2021 - July 2022)")
    # Fix: the original interpolated an undefined global `user_choice` here,
    # which raised NameError every time the function was called.
    plt.ylabel(f'Freq. from {source_label}')
    return fig
|
| 134 |
+
|
| 135 |
|
| 136 |
# Build one word cloud per emotion label.
# Shared stop-word list: Twitter artifacts plus the Italian stop words.
stop_words = ["https", 'http', "co", "RT"] + list(it_stop)

def _emotion_wordcloud(emotion):
    """Return (cleaned tweet Series, WordCloud) for one emotion label."""
    tweets = data['tweet'][data["emotion"] == emotion]
    tweets = tweets.apply(format_input, args=[it_stop])
    cloud = WordCloud(max_font_size=50, max_words=50,
                      background_color="white",
                      stopwords=stop_words).generate(str(tweets))
    return tweets, cloud

# The original repeated the same three statements for every emotion and
# recomputed the identical `stop_words` list four times; the helper keeps
# all the module-level names below unchanged.
angry_tweets, anger_wordcloud = _emotion_wordcloud('anger')
sad_tweets, sad_wordcloud = _emotion_wordcloud('sadness')
joy_tweets, joy_wordcloud = _emotion_wordcloud('joy')
fear_tweets, fear_wordcloud = _emotion_wordcloud('fear')
|
| 162 |
|
| 163 |
+
## Combine all plots in a single plot
|
| 164 |
|
| 165 |
wc_fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2)
|
| 166 |
|
| 167 |
+
# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
|
| 168 |
+
|
| 169 |
wc_fig.tight_layout()
|
| 170 |
|
| 171 |
ax1.imshow(sad_wordcloud, interpolation="bilinear")
|
|
|
|
| 189 |
ax3.set_title('Fear', {'fontsize': 30})
|
| 190 |
|
| 191 |
|
| 192 |
+
|
| 193 |
ax4.imshow(anger_wordcloud, interpolation="bilinear")
|
| 194 |
|
| 195 |
ax4.axis("off")
|
|
|
|
| 197 |
ax4.set_title('Anger', {'fontsize': 30})
|
| 198 |
|
| 199 |
|
|
|
|
|
|
|
| 200 |
# plot a pie plot for emotions' distribution
|
| 201 |
|
| 202 |
number_tweets_per_day = data.groupby(['date', 'emotion']).agg({'id': 'count'}).reset_index()
|
|
|
|
| 214 |
color_discrete_sequence=px.colors.qualitative.G10)
|
| 215 |
sent_fig
|
| 216 |
|
|
|
|
| 217 |
def display_plot(image_choice):
|
| 218 |
|
| 219 |
if image_choice == 'Sentiment distribution':
|
|
|
|
| 225 |
elif image_choice == 'Word clouds':
|
| 226 |
return wc_fig
|
| 227 |
|
| 228 |
+
def display_freq_plot(choice, *args):
    """Build the word-frequency time-series figure for the chosen method.

    `choice` selects the frequency table ("TFIDF" or "Whole_text"); the
    remaining positional args are the raw keyword strings typed by the user.
    Implicitly returns None when `choice` matches neither option.
    """
    user_keys = list(args)

    # Normalize the raw inputs so they match the keys stored in the tables.
    keyword = [format_input(key, it_stop) for key in user_keys]

    if choice == "TFIDF":
        return plot_time_series(ser_TFIDF, keyword, user_keys)
    elif choice == "Whole_text":
        return plot_time_series(ser_whole_text, keyword, user_keys)
|
| 242 |
+
|
| 243 |
+
|
| 244 |
# Gradio UI: three tabs — topic modeling (placeholder), word frequency,
# and sentiment analysis.
with gr.Blocks() as demo:
    gr.Markdown("## Choose your adventure")

    with gr.Tabs():

        with gr.TabItem("Topic modeling"):
            gr.Markdown("Nothing here yet")

        with gr.TabItem("Word frequency"):
            # One method selector plus up to four free-text keywords; all
            # five components are passed positionally to display_freq_plot.
            inputs = [gr.Radio(choices = ['TFIDF', 'Whole_text'], label = 'Choose ur method'),
                      gr.Textbox(label = 'word 1'),
                      gr.Textbox(label = 'word 2'),
                      gr.Textbox(label = 'word 3'),
                      gr.Textbox(label = 'word 4')]
            plot_output = gr.Plot(elem_id = 1)
            freq_button = gr.Button("Submit")

            freq_button.click(display_freq_plot, inputs=inputs, outputs=plot_output)

        with gr.TabItem("Sentiment analysis"):
            text_input = gr.Radio(choices = ['Sentiment distribution', 'Word clouds', 'Time series'], label = 'Choose ur plot')
            # Fix: the plot label was the leftover placeholder 'jhg'.
            sent_plot = gr.Plot(label = 'Sentiment plot')
            sent_button = gr.Button("Submit")

            sent_button.click(display_plot, inputs=text_input, outputs=sent_plot)


demo.launch()  # dropped the stray trailing semicolon
|
| 273 |
+
|
requirements.txt
CHANGED
|
@@ -4,3 +4,6 @@ matplotlib
|
|
| 4 |
plotly
|
| 5 |
stop_words
|
| 6 |
wordcloud
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
plotly
|
| 5 |
stop_words
|
| 6 |
wordcloud
|
| 7 |
+
datasets
|
| 8 |
+
# NOTE: 're' is part of the Python standard library — it must not be listed as a pip requirement
|
| 9 |
+
|