Spaces:
Runtime error
Runtime error
updated app
Browse files- app.py +28 -22
- test.ipynb +3 -3
app.py
CHANGED
|
@@ -27,12 +27,17 @@ def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'], nlp=Non
|
|
| 27 |
]))
|
| 28 |
return texts_out
|
| 29 |
|
| 30 |
-
def get_lda(n_components):
|
| 31 |
df = pd.read_csv('./data/results.csv', index_col=0)
|
| 32 |
data = concat_comments(df.subreddit, df.sup_comment, df.comment)
|
| 33 |
data_words = list(sent_to_words(data))
|
| 34 |
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
if not spacy.util.is_package("en_core_web_sm"):
|
| 37 |
print('[x] en_core_web_sm not found, downloading...')
|
| 38 |
os.system("python -m spacy download en_core_web_sm")
|
|
@@ -162,7 +167,7 @@ def get_lda(n_components):
|
|
| 162 |
print('Percentuale di commenti ironici per ogni topic')
|
| 163 |
perc_topic_irony = {}
|
| 164 |
for t in topics:
|
| 165 |
-
total_0label = sum((df
|
| 166 |
if total_0label != 0:
|
| 167 |
total_X_topic = df.Topic_key_word.value_counts()[t]
|
| 168 |
else:
|
|
@@ -182,10 +187,6 @@ def get_lda(n_components):
|
|
| 182 |
plt.xticks(rotation=70)
|
| 183 |
plt.legend()
|
| 184 |
plt.axhline(0.5, color = 'red', ls=":")
|
| 185 |
-
|
| 186 |
-
# Should this be a parameter?
|
| 187 |
-
# Max number of biggest subreddits to analyse
|
| 188 |
-
n_top_subreddit_to_analyse = 20
|
| 189 |
|
| 190 |
# probably not necessary (?) To drop eventually if log are to much cluttered!
|
| 191 |
print('Percentage of each topic for each subreddit')
|
|
@@ -205,17 +206,11 @@ def get_lda(n_components):
|
|
| 205 |
print('[x] Generating plot [2]')
|
| 206 |
# plot
|
| 207 |
subreddits = list(df.subreddit.value_counts().index)[:n_top_subreddit_to_analyse]
|
| 208 |
-
|
| 209 |
-
# weight_counts = {
|
| 210 |
-
# t: [
|
| 211 |
-
# df[df.Topic_key_word == t].subreddit.value_counts()[subreddit] / df.subreddit.value_counts()[subreddit] for subreddit in subreddits
|
| 212 |
-
# ] for t in topics
|
| 213 |
-
# }
|
| 214 |
|
| 215 |
irony_percs = {
|
| 216 |
t: [
|
| 217 |
len(
|
| 218 |
-
df[df.subreddit == subreddit][(df[df.subreddit == subreddit].Topic_key_word == t) & (df[df.subreddit == subreddit]
|
| 219 |
) /
|
| 220 |
len(
|
| 221 |
df[df.subreddit == subreddit]
|
|
@@ -234,7 +229,7 @@ def get_lda(n_components):
|
|
| 234 |
ax.bar(subreddits, irony_percs[k], width - 0.01, bottom=bottom, color = 'black', edgecolor = 'white', alpha = .2, hatch = '\\')
|
| 235 |
bottom += v
|
| 236 |
|
| 237 |
-
ax.set_title("
|
| 238 |
ax.legend(loc="upper right")
|
| 239 |
plt.xticks(rotation=50)
|
| 240 |
|
|
@@ -250,21 +245,32 @@ def get_lda(n_components):
|
|
| 250 |
|
| 251 |
with gr.Blocks() as demo:
|
| 252 |
gr.Markdown("# Dashboard per l'analisi con LDA")
|
| 253 |
-
gr.Markdown("###
|
| 254 |
# gradio.Dataframe(···)
|
| 255 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
|
| 257 |
btn = gr.Button(value="Submit")
|
|
|
|
|
|
|
|
|
|
| 258 |
|
| 259 |
btn.click(
|
| 260 |
get_lda,
|
| 261 |
-
inputs=
|
| 262 |
-
gr.Slider(2, 25, value=5, step = 1, label="N components", info="Scegli il numero di componenti per LDA"),
|
| 263 |
-
],
|
| 264 |
outputs=[
|
| 265 |
gr.DataFrame(),
|
| 266 |
-
gr.Plot(label="
|
| 267 |
-
gr.Plot(label="
|
| 268 |
]
|
| 269 |
)
|
| 270 |
|
|
|
|
| 27 |
]))
|
| 28 |
return texts_out
|
| 29 |
|
| 30 |
+
def get_lda(n_components, n_top_subreddit_to_analyse, what_label_to_use):
|
| 31 |
df = pd.read_csv('./data/results.csv', index_col=0)
|
| 32 |
data = concat_comments(df.subreddit, df.sup_comment, df.comment)
|
| 33 |
data_words = list(sent_to_words(data))
|
| 34 |
|
| 35 |
+
if what_label_to_use == 'Use True label':
|
| 36 |
+
label = 'label'
|
| 37 |
+
else:
|
| 38 |
+
label = 'prediction'
|
| 39 |
+
|
| 40 |
+
|
| 41 |
if not spacy.util.is_package("en_core_web_sm"):
|
| 42 |
print('[x] en_core_web_sm not found, downloading...')
|
| 43 |
os.system("python -m spacy download en_core_web_sm")
|
|
|
|
| 167 |
print('Percentuale di commenti ironici per ogni topic')
|
| 168 |
perc_topic_irony = {}
|
| 169 |
for t in topics:
|
| 170 |
+
total_0label = sum((df[label] == 1) & (df.Topic_key_word == t))
|
| 171 |
if total_0label != 0:
|
| 172 |
total_X_topic = df.Topic_key_word.value_counts()[t]
|
| 173 |
else:
|
|
|
|
| 187 |
plt.xticks(rotation=70)
|
| 188 |
plt.legend()
|
| 189 |
plt.axhline(0.5, color = 'red', ls=":")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
|
| 191 |
# probably not necessary (?) To drop eventually if log are to much cluttered!
|
| 192 |
print('Percentage of each topic for each subreddit')
|
|
|
|
| 206 |
print('[x] Generating plot [2]')
|
| 207 |
# plot
|
| 208 |
subreddits = list(df.subreddit.value_counts().index)[:n_top_subreddit_to_analyse]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
irony_percs = {
|
| 211 |
t: [
|
| 212 |
len(
|
| 213 |
+
df[df.subreddit == subreddit][(df[df.subreddit == subreddit].Topic_key_word == t) & (df[df.subreddit == subreddit][label] == 1)]
|
| 214 |
) /
|
| 215 |
len(
|
| 216 |
df[df.subreddit == subreddit]
|
|
|
|
| 229 |
ax.bar(subreddits, irony_percs[k], width - 0.01, bottom=bottom, color = 'black', edgecolor = 'white', alpha = .2, hatch = '\\')
|
| 230 |
bottom += v
|
| 231 |
|
| 232 |
+
ax.set_title("% of topics for each subreddit")
|
| 233 |
ax.legend(loc="upper right")
|
| 234 |
plt.xticks(rotation=50)
|
| 235 |
|
|
|
|
| 245 |
|
| 246 |
with gr.Blocks() as demo:
|
| 247 |
gr.Markdown("# Dashboard per l'analisi con LDA")
|
| 248 |
+
gr.Markdown("### La dashboard permette l'addestramento di un modello LDA per controllare se e quali topic sono più propensi a commenti di tipo sarcastico")
|
| 249 |
# gradio.Dataframe(···)
|
| 250 |
+
|
| 251 |
+
inputs = []
|
| 252 |
+
with gr.Row():
|
| 253 |
+
inputs.append(gr.Slider(2, 25, value=5, step = 1, label="LDA N components", info="Scegli il numero di componenti per LDA"))
|
| 254 |
+
inputs.append(gr.Slider(2, 20, value=5, step = 1, label="Subreddit dal dataset", info="Numero di subreddit da analizzare"))
|
| 255 |
+
inputs.append(gr.Radio(
|
| 256 |
+
choices = ['Use True label', 'Use BERT prediction'],
|
| 257 |
+
value = 'Use True label',
|
| 258 |
+
label = "Scegliere quali label sull'ironia utilizzare:",
|
| 259 |
+
)
|
| 260 |
+
)
|
| 261 |
|
| 262 |
btn = gr.Button(value="Submit")
|
| 263 |
+
|
| 264 |
+
gr.Markdown("## Risulati ottenuti")
|
| 265 |
+
gr.Markdown("#### Top 15 parole che più contribuiscono al topic di riferimento (utlima colonna):")
|
| 266 |
|
| 267 |
btn.click(
|
| 268 |
get_lda,
|
| 269 |
+
inputs=inputs,
|
|
|
|
|
|
|
| 270 |
outputs=[
|
| 271 |
gr.DataFrame(),
|
| 272 |
+
gr.Plot(label="Quanto i topic trovati portano ironia?"),
|
| 273 |
+
gr.Plot(label="Come i topic sono correlati ai diversi subreddit del dataset?"),
|
| 274 |
]
|
| 275 |
)
|
| 276 |
|
test.ipynb
CHANGED
|
@@ -255,9 +255,9 @@
|
|
| 255 |
"metadata": {},
|
| 256 |
"source": [
|
| 257 |
"TODO:\n",
|
| 258 |
-
"- Show LDA top words for each topic\n",
|
| 259 |
-
"- I topic con una bassa percentuale di ironia sono i topic considerati più \"seri\" (?)\n",
|
| 260 |
-
"- Per ora sto utilizzando le label assegnate dal dataset, se non avessi le label e dovessi prevedere l'ironia LDA è cmq affidabile?"
|
| 261 |
]
|
| 262 |
}
|
| 263 |
],
|
|
|
|
| 255 |
"metadata": {},
|
| 256 |
"source": [
|
| 257 |
"TODO:\n",
|
| 258 |
+
"- [x] Show LDA top words for each topic\n",
|
| 259 |
+
"- [ ] I topic con una bassa percentuale di ironia sono i topic considerati più \"seri\" (?)\n",
|
| 260 |
+
"- [x] Per ora sto utilizzando le label assegnate dal dataset, se non avessi le label e dovessi prevedere l'ironia LDA è cmq affidabile?"
|
| 261 |
]
|
| 262 |
}
|
| 263 |
],
|