Spaces:
Runtime error
Runtime error
Commit ·
d09b322
1
Parent(s): f746f70
first
Browse files- app.py +138 -25
- helper_functions.py +118 -65
- requirements.txt +1 -0
- static/yt_mask.png +0 -0
app.py
CHANGED
|
@@ -10,36 +10,68 @@ import plotly.io as pio
|
|
| 10 |
import plotly
|
| 11 |
|
| 12 |
# Whenever the search button is clicked, the search_callback function is called
|
| 13 |
-
def
|
| 14 |
-
if
|
| 15 |
-
if len(st.session_state.
|
| 16 |
st.error("Please enter a search term")
|
| 17 |
return
|
| 18 |
try:
|
| 19 |
-
st.session_state.df = hf.get_tweets(st.session_state.
|
| 20 |
st.session_state.df = hf.get_sentiment(st.session_state.df)
|
| 21 |
except:
|
| 22 |
st.error("Please enter a valid search term")
|
| 23 |
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
def twitter_form():
|
| 26 |
with st.form(key="search_form"):
|
| 27 |
st.subheader("Search Parameters")
|
| 28 |
-
st.text_input("Enter a User handle (like _@elonmusk_), Hashtag (like _#Bitcoin_) or Topic (like _climate change_)", key="
|
| 29 |
st.slider("Number of tweets", min_value=100, max_value=500, key="num_tweets")
|
| 30 |
-
st.form_submit_button(label="Search", on_click=
|
| 31 |
st.markdown(
|
| 32 |
"Note: it may take a while to load the results, especially with large number of tweets"
|
| 33 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
with st.sidebar:
|
| 37 |
st.title("Social Media Sentiment Analyzer")
|
| 38 |
-
st.subheader("Choose your platform")
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
-
if
|
| 42 |
twitter_form()
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
st.markdown(
|
| 45 |
"<div style='position: fixed; bottom: 0;'>Created by Taaha Bajwa</div>",
|
|
@@ -91,6 +123,56 @@ if "df" in st.session_state:
|
|
| 91 |
wordcloud = hf.plot_wordcloud(tweet_df, colormap=wc_color)
|
| 92 |
st.pyplot(wordcloud)
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
adjust_tab_font = """
|
| 95 |
<style>
|
| 96 |
button[data-baseweb="tab"] > div[data-testid="stMarkdownContainer"] > p {
|
|
@@ -101,19 +183,50 @@ if "df" in st.session_state:
|
|
| 101 |
|
| 102 |
st.write(adjust_tab_font, unsafe_allow_html=True)
|
| 103 |
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
import plotly
|
| 11 |
|
| 12 |
# Whenever a form's Search button is clicked, the matching callback below is called (search_callback_twitter or search_callback_youtube)
|
| 13 |
+
def search_callback_twitter():
    """Fetch and score tweets for the term entered in the Twitter form.

    Used as the ``on_click`` callback of the Twitter form's submit button.
    Reads ``search_term_twitter`` and ``num_tweets`` from
    ``st.session_state`` and, on success, stores the sentiment-annotated
    result in ``st.session_state.df``. Does nothing unless the selected
    platform is "Twitter".
    """
    if platform != "Twitter":
        return

    search_term = st.session_state.search_term_twitter
    if not search_term:
        st.error("Please enter a search term")
        return

    try:
        tweets = hf.get_tweets(search_term, st.session_state.num_tweets)
        # Only publish the frame once it has been scored, so a failure in the
        # sentiment step never leaves an unscored frame in session state.
        st.session_state.df = hf.get_sentiment(tweets)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that interpreter and
        # framework control-flow exceptions deriving from ``BaseException``
        # are not swallowed.
        st.error("Please enter a valid search term")
        return
|
| 24 |
+
|
| 25 |
+
def search_callback_youtube():
    """Fetch and score comments for the video URL entered in the YouTube form.

    Used as the ``on_click`` callback of the YouTube form's submit button.
    Reads ``search_term_youtube`` and ``num_comments`` from
    ``st.session_state`` and, on success, stores the sentiment-annotated
    result in ``st.session_state.df``. Does nothing unless the selected
    platform is "Youtube".
    """
    if platform != "Youtube":
        return

    video_url = st.session_state.search_term_youtube
    if not video_url:
        st.error("Please enter a valid url")
        return

    try:
        comments = hf.get_youtube_comments(video_url, st.session_state.num_comments)
        # Only publish the frame once it has been scored, so a failure in the
        # sentiment step never leaves a raw comment list in session state.
        st.session_state.df = hf.get_sentiment_youtube(comments)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that interpreter and
        # framework control-flow exceptions deriving from ``BaseException``
        # are not swallowed.
        st.error("Please enter a valid url")
        return
|
| 36 |
|
| 37 |
def twitter_form():
    """Render the Twitter search form: search term, tweet count and submit button."""
    term_prompt = "Enter a User handle (like _@elonmusk_), Hashtag (like _#Bitcoin_) or Topic (like _climate change_)"
    loading_note = "Note: it may take a while to load the results, especially with large number of tweets"

    with st.form(key="search_form"):
        st.subheader("Search Parameters")
        st.text_input(term_prompt, key="search_term_twitter")
        st.slider("Number of tweets", min_value=100, max_value=500, key="num_tweets")
        # The callback reads the two widget values from st.session_state.
        st.form_submit_button(label="Search", on_click=search_callback_twitter)
        st.markdown(loading_note)
|
| 46 |
+
|
| 47 |
+
def youtube_form():
    """Render the YouTube search form: video link, comment count and submit button."""
    link_prompt = "Enter a Video link to analyse comments"
    loading_note = "Note: it may take a while to load the results, especially with large number of comments"

    with st.form(key="search_form"):
        st.subheader("Search Parameters")
        st.text_input(link_prompt, key="search_term_youtube")
        st.slider("Number of Comments", min_value=100, max_value=500, key="num_comments")
        # The callback reads the two widget values from st.session_state.
        st.form_submit_button(label="Search", on_click=search_callback_youtube)
        st.markdown(loading_note)
|
| 56 |
|
| 57 |
|
| 58 |
with st.sidebar:
|
| 59 |
st.title("Social Media Sentiment Analyzer")
|
| 60 |
+
#st.subheader("Choose your platform")
|
| 61 |
+
platform = st.radio(
|
| 62 |
+
"Choose your platform 👇",
|
| 63 |
+
["Twitter", "Youtube"],
|
| 64 |
+
# key="visibility",
|
| 65 |
+
# label_visibility=st.session_state.visibility,
|
| 66 |
+
# disabled=st.session_state.disabled,
|
| 67 |
+
horizontal=True,
|
| 68 |
+
)
|
| 69 |
|
| 70 |
+
if platform == "Twitter":
|
| 71 |
twitter_form()
|
| 72 |
+
|
| 73 |
+
if platform == "Youtube":
|
| 74 |
+
youtube_form()
|
| 75 |
|
| 76 |
st.markdown(
|
| 77 |
"<div style='position: fixed; bottom: 0;'>Created by Taaha Bajwa</div>",
|
|
|
|
| 123 |
wordcloud = hf.plot_wordcloud(tweet_df, colormap=wc_color)
|
| 124 |
st.pyplot(wordcloud)
|
| 125 |
|
| 126 |
+
|
| 127 |
+
def make_dashboard_youtube(tweet_df, bar_color, wc_color):
    """Render the two-row sentiment dashboard for a frame of YouTube comments.

    Row one shows the sentiment chart, the top-10 unigram chart and the
    top-10 bigram chart. Row two shows a colour-coded comment table and a
    word cloud.

    Args:
        tweet_df: DataFrame with "Comment" and "Sentiment" columns.
        bar_color: Bar colour passed to the n-gram charts.
        wc_color: Colormap name passed to the word cloud.
    """
    # The helper functions are called with a "Tweet" column, so work on a
    # renamed copy and leave the caller's frame untouched.
    comment_df = tweet_df.rename(columns={"Comment": "Tweet"})

    cell_styles = {
        "Positive": "background-color: #54A24B; color: white",
        "Negative": "background-color: #FF7F0E",
    }

    def sentiment_color(sentiment):
        # Any label other than Positive/Negative gets the neutral blue.
        return cell_styles.get(sentiment, "background-color: #1F77B4")

    # first row
    col1, col2, col3 = st.columns([28, 34, 38])
    with col1:
        sentiment_plot = hf.plot_sentiment(comment_df)
        sentiment_plot.update_layout(height=350, title_x=0.5)
        st.plotly_chart(sentiment_plot, theme=None, use_container_width=True)
    with col2:
        top_unigram = hf.get_top_n_gram(comment_df, ngram_range=(1, 1), n=10)
        unigram_plot = hf.plot_n_gram(
            top_unigram, title="Top 10 Occuring Words", color=bar_color
        )
        unigram_plot.update_layout(height=350)
        st.plotly_chart(unigram_plot, theme=None, use_container_width=True)
    with col3:
        top_bigram = hf.get_top_n_gram(comment_df, ngram_range=(2, 2), n=10)
        bigram_plot = hf.plot_n_gram(
            top_bigram, title="Top 10 Occuring Bigrams", color=bar_color
        )
        bigram_plot.update_layout(height=350)
        st.plotly_chart(bigram_plot, theme=None, use_container_width=True)

    # second row
    col1, col2 = st.columns([60, 40])
    with col1:
        # The table is shown to the user with the original "Comment" heading.
        table_df = comment_df[["Sentiment", "Tweet"]].rename(columns={"Tweet": "Comment"})
        st.dataframe(
            table_df.style.applymap(sentiment_color, subset=["Sentiment"]),
            height=350,
        )
    with col2:
        wordcloud = hf.plot_wordcloud(comment_df, colormap=wc_color, mask_url='static/yt_mask.png')
        try:
            st.pyplot(wordcloud)
        except Exception:
            # plot_wordcloud hands back a fallback figure when it fails;
            # presumably st.pyplot cannot draw that one, hence this guard.
            # ``Exception`` (not a bare ``except``) keeps BaseException-derived
            # control-flow exceptions propagating.
            st.write("Wordcloud not available for this search term")
|
| 175 |
+
|
| 176 |
adjust_tab_font = """
|
| 177 |
<style>
|
| 178 |
button[data-baseweb="tab"] > div[data-testid="stMarkdownContainer"] > p {
|
|
|
|
| 183 |
|
| 184 |
st.write(adjust_tab_font, unsafe_allow_html=True)
|
| 185 |
|
| 186 |
+
# One entry per result tab: (tab label, Sentiment value to keep or None for
# every row, bar colour, word-cloud colormap, message for an empty YouTube tab).
sentiment_tabs = [
    ("All", None, "#1F77B4", "Blues", "No comments found."),
    ("Positive 😊", "Positive", "#54A24B", "Greens", "No positive comments found."),
    ("Negative ☹️", "Negative", "#FF7F0E", "Oranges", "No negative comments found."),
    ("Neutral 😐", "Neutral", "#1F77B4", "Blues", "No neutral comments found."),
]

if platform == "Twitter" and st.session_state.search_term_twitter != "":
    try:
        tabs = st.tabs([entry[0] for entry in sentiment_tabs])
        for tab, (_, sentiment, bar_color, wc_color, _) in zip(tabs, sentiment_tabs):
            with tab:
                tweet_df = st.session_state.df
                if sentiment is not None:
                    tweet_df = tweet_df[tweet_df["Sentiment"] == sentiment]
                make_dashboard(tweet_df, bar_color=bar_color, wc_color=wc_color)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so BaseException-derived
        # control-flow exceptions are not swallowed.
        st.error("No plots to display.")

elif platform == "Youtube" and st.session_state.search_term_youtube != "":
    try:
        tabs = st.tabs([entry[0] for entry in sentiment_tabs])
        for tab, (_, sentiment, bar_color, wc_color, empty_message) in zip(tabs, sentiment_tabs):
            with tab:
                tweet_df = st.session_state.df
                if sentiment is not None:
                    tweet_df = tweet_df[tweet_df["Sentiment"] == sentiment]
                # Skip the dashboard for a tab with no matching comments.
                if tweet_df.shape[0] > 0:
                    make_dashboard_youtube(tweet_df, bar_color=bar_color, wc_color=wc_color)
                else:
                    st.write(empty_message)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so BaseException-derived
        # control-flow exceptions are not swallowed.
        st.error("No plots to display.")
|
helper_functions.py
CHANGED
|
@@ -6,10 +6,15 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
|
| 6 |
from transformers import pipeline
|
| 7 |
import plotly.express as px
|
| 8 |
import plotly.io as pio
|
|
|
|
| 9 |
import matplotlib as mpl
|
| 10 |
import matplotlib.pyplot as plt
|
| 11 |
from wordcloud import WordCloud
|
| 12 |
from PIL import Image
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
@st.cache(allow_output_mutation=True)
|
| 15 |
def get_nltk():
|
|
@@ -108,6 +113,45 @@ def get_tweets(query, max_tweets):
|
|
| 108 |
tweets_df.drop('Datetime', axis=1, inplace=True)
|
| 109 |
return tweets_df
|
| 110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
def text_preprocessing(text):
|
| 112 |
stopwords = set()
|
| 113 |
with open("static/en_stopwords.txt", "r") as file:
|
|
@@ -127,7 +171,6 @@ def text_preprocessing(text):
|
|
| 127 |
cleaned_text = re.sub(entity_pattern, " ", cleaned_text)
|
| 128 |
cleaned_text = re.sub(non_alpha, " ", cleaned_text)
|
| 129 |
tokens = word_tokenize(cleaned_text)
|
| 130 |
-
#print('tokens')
|
| 131 |
# provide POS tag for lemmatization to yield better result
|
| 132 |
word_tag_tuples = pos_tag(tokens, tagset="universal")
|
| 133 |
tag_dict = {"NOUN": "n", "VERB": "v", "ADJ": "a", "ADV": "r"}
|
|
@@ -183,73 +226,83 @@ def plot_sentiment(tweet_df):
|
|
| 183 |
fig.update_layout(showlegend=False)
|
| 184 |
return fig
|
| 185 |
|
|
|
|
|
|
|
| 186 |
def get_top_n_gram(tweet_df, ngram_range, n=10):
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
def plot_n_gram(n_gram_df, title, color="#54A24B"):
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
|
| 221 |
-
def plot_wordcloud(tweet_df, colormap="Greens"):
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
|
|
|
| 254 |
|
| 255 |
|
|
|
|
| 6 |
from transformers import pipeline
|
| 7 |
import plotly.express as px
|
| 8 |
import plotly.io as pio
|
| 9 |
+
import plotly.graph_objects as go
|
| 10 |
import matplotlib as mpl
|
| 11 |
import matplotlib.pyplot as plt
|
| 12 |
from wordcloud import WordCloud
|
| 13 |
from PIL import Image
|
| 14 |
+
import requests
|
| 15 |
+
from itertools import islice
|
| 16 |
+
from youtube_comment_downloader import *
|
| 17 |
+
|
| 18 |
|
| 19 |
@st.cache(allow_output_mutation=True)
|
| 20 |
def get_nltk():
|
|
|
|
| 113 |
tweets_df.drop('Datetime', axis=1, inplace=True)
|
| 114 |
return tweets_df
|
| 115 |
|
| 116 |
+
def get_youtube_comments(url, num_comments):
    """Download up to ``num_comments`` popular comments from a YouTube video.

    The video page is fetched first; if it contains YouTube's
    "Video unavailable" playability marker, no comments are requested.

    Args:
        url: Address of the video page.
        num_comments: Maximum number of comments to collect.

    Returns:
        A list with the text of each collected comment, in the order the
        downloader yields them.

    Raises:
        Exception: If the fetched page reports the video as unavailable.
        requests.exceptions.RequestException: If the page cannot be fetched
            (including when the request times out).
    """
    unavailable_marker = '"playabilityStatus":{"status":"ERROR","reason":"Video unavailable"'

    # NOTE(review): ``url`` appears to come straight from a user text box and
    # is fetched server-side; consider restricting it to YouTube hosts.
    # A timeout is set because requests otherwise waits indefinitely on an
    # unresponsive server.
    response = requests.get(url, timeout=10)
    if unavailable_marker in response.text:
        raise Exception('Video does not exist')

    downloader = YoutubeCommentDownloader()
    comments = downloader.get_comments_from_url(url, sort_by=SORT_BY_POPULAR)
    # islice stops the comment generator after num_comments items.
    return [comment['text'] for comment in islice(comments, num_comments)]
|
| 132 |
+
|
| 133 |
+
def get_sentiment_youtube(useful_sentence):
    """Classify each comment with the ProsusAI/finbert pipeline.

    Args:
        useful_sentence: List of comment strings.

    Returns:
        DataFrame with a "Comment" column, a "Sentiment" column holding
        "Positive", "Negative" or "Neutral", and whatever other fields the
        pipeline returns per comment. An empty input yields an empty frame
        with just the "Comment" and "Sentiment" columns.
    """
    comments = list(useful_sentence)
    if not comments:
        # Nothing to classify: return an empty, correctly-shaped frame rather
        # than failing later on the missing "Sentiment" column.
        return pd.DataFrame(columns=["Comment", "Sentiment"])

    # Build the pipeline once; constructing it loads the model.
    classifier = pipeline(model="ProsusAI/finbert")

    output = []
    for text in comments:
        # The pipeline returns a list of result dicts for each input.
        output.extend(classifier(text))

    df = pd.concat(
        [pd.DataFrame({"Comment": comments}), pd.DataFrame(output)],
        axis=1,
    )
    df = df.rename(columns={"label": "Sentiment"})
    # Capitalise the lowercase labels to the spelling used in the "Sentiment" column.
    df["Sentiment"] = df["Sentiment"].replace(
        {"positive": "Positive", "negative": "Negative", "neutral": "Neutral"}
    )
    return df
|
| 153 |
+
|
| 154 |
+
|
| 155 |
def text_preprocessing(text):
|
| 156 |
stopwords = set()
|
| 157 |
with open("static/en_stopwords.txt", "r") as file:
|
|
|
|
| 171 |
cleaned_text = re.sub(entity_pattern, " ", cleaned_text)
|
| 172 |
cleaned_text = re.sub(non_alpha, " ", cleaned_text)
|
| 173 |
tokens = word_tokenize(cleaned_text)
|
|
|
|
| 174 |
# provide POS tag for lemmatization to yield better result
|
| 175 |
word_tag_tuples = pos_tag(tokens, tagset="universal")
|
| 176 |
tag_dict = {"NOUN": "n", "VERB": "v", "ADJ": "a", "ADV": "r"}
|
|
|
|
| 226 |
fig.update_layout(showlegend=False)
|
| 227 |
return fig
|
| 228 |
|
| 229 |
+
|
| 230 |
+
|
| 231 |
def get_top_n_gram(tweet_df, ngram_range, n=10):
    """Count the most frequent n-grams in the "Tweet" column.

    Stop words are read from ``static/en_stopwords_ngram.txt`` (one per line)
    and excluded from the counts.

    Args:
        tweet_df: DataFrame with a "Tweet" column of text.
        ngram_range: ``(min_n, max_n)`` tuple passed to ``CountVectorizer``.
        n: Number of top n-grams to keep.

    Returns:
        DataFrame with "words" (title-cased) and "counts" columns, sorted by
        count in descending order and limited to ``n`` rows, or ``None`` when
        the counts cannot be computed.
    """
    try:
        with open("static/en_stopwords_ngram.txt", "r") as file:
            stopwords = list({line.rstrip("\n") for line in file})

        corpus = tweet_df["Tweet"]
        vectorizer = CountVectorizer(
            analyzer="word", ngram_range=ngram_range, stop_words=stopwords
        )
        X = vectorizer.fit_transform(corpus.astype(str).values)
        words = vectorizer.get_feature_names_out()
        # Sum each n-gram's column over all documents.
        words_count = np.ravel(X.sum(axis=0))

        df = pd.DataFrame({"words": words, "counts": words_count})
        df = df.sort_values(by="counts", ascending=False).head(n)
        df["words"] = df["words"].str.title()
        return df
    except Exception:
        # Best effort: signal failure with an explicit None instead of
        # raising. ``Exception`` (not a bare ``except``) keeps
        # BaseException-derived exceptions propagating.
        return None
|
| 252 |
|
| 253 |
def plot_n_gram(n_gram_df, title, color="#54A24B"):
    """Build a horizontal bar chart of n-gram counts.

    Args:
        n_gram_df: DataFrame with "words" and "counts" columns, or ``None``.
        title: Chart title; rendered in bold.
        color: Bar colour.

    Returns:
        A plotly figure. When ``n_gram_df`` is ``None`` or the chart cannot
        be built, an empty figure is returned instead.
    """
    if n_gram_df is None:
        # No counts available: return the same blank figure as the error path.
        return go.Figure()

    try:
        fig = px.bar(
            x=n_gram_df.counts,
            y=n_gram_df.words,
            title=f"<b>{title}</b>",
            text_auto=True,
        )
        fig.update_layout(plot_bgcolor="white")
        fig.update_xaxes(title=None)
        # Reverse the y axis so the first row is drawn at the top.
        fig.update_yaxes(autorange="reversed", title=None)
        fig.update_traces(hovertemplate="<b>%{y}</b><br>Count=%{x}", marker_color=color)
        return fig
    except Exception:
        # Best effort: fall back to a blank figure. ``Exception`` (not a bare
        # ``except``) keeps BaseException-derived exceptions propagating.
        return go.Figure()
|
| 272 |
|
| 273 |
+
def plot_wordcloud(tweet_df, colormap="Greens", mask_url="static/twitter_mask.png"):
    """Draw a masked word cloud from the "Tweet" column.

    Side effect: writes the preprocessed text to a "Cleaned_Tweet" column on
    ``tweet_df``.

    Args:
        tweet_df: DataFrame with a "Tweet" column of text.
        colormap: Name of the Matplotlib colormap to take word colours from.
        mask_url: Path of the image used as the cloud's mask.

    Returns:
        A Matplotlib figure holding the word cloud. If anything fails, an
        empty plotly figure is returned instead.
    """
    try:
        with open("static/en_stopwords_ngram.txt", "r") as file:
            stopwords = {line.rstrip("\n") for line in file}

        # pyplot.get_cmap is used because matplotlib.cm.get_cmap is no longer
        # available in recent Matplotlib releases.
        shades = plt.get_cmap(colormap)(np.linspace(0, 1, 20))
        # Keep only samples 10-14 of the 20 taken across the colormap.
        cmap = mpl.colors.ListedColormap(shades[10:15])
        mask = np.array(Image.open(mask_url))
        font = "static/quartzo.ttf"

        tweet_df["Cleaned_Tweet"] = tweet_df["Tweet"].apply(text_preprocessing)
        text = " ".join(tweet_df["Cleaned_Tweet"])

        wc = WordCloud(
            background_color="white",
            font_path=font,
            stopwords=stopwords,
            max_words=90,
            colormap=cmap,
            mask=mask,
            random_state=42,
            collocations=False,
            min_word_length=2,
            max_font_size=200,
        )
        wc.generate(text)

        fig = plt.figure(figsize=(8, 8))
        ax = fig.add_subplot(1, 1, 1)
        # Draw on this figure's own axes rather than through pyplot's global
        # "current figure" state.
        ax.imshow(wc, interpolation="bilinear")
        ax.axis("off")
        ax.set_title("Wordcloud", fontdict={"fontsize": 16}, fontweight="heavy", pad=20, y=1.0)
        return fig
    except Exception:
        # Best effort: fall back to a blank figure. ``Exception`` (not a bare
        # ``except``) keeps BaseException-derived exceptions propagating.
        return go.Figure()
|
| 307 |
|
| 308 |
|
requirements.txt
CHANGED
|
@@ -7,3 +7,4 @@ plotly==5.9.0
|
|
| 7 |
nltk
|
| 8 |
scikit-learn
|
| 9 |
wordcloud
|
|
|
|
|
|
| 7 |
nltk
|
| 8 |
scikit-learn
|
| 9 |
wordcloud
|
| 10 |
+
youtube-comment-downloader
|
static/yt_mask.png
ADDED
|
|