Spaces:

DanielSc4
/

DataAnalyticsNLP

Runtime error

App Files Files Community

DanielSc4 committed on Sep 13, 2023

Commit

465ab59

1 Parent(s): 6534dfb

updated app

Browse files

Files changed (2) hide show

app.py +28 -22
test.ipynb +3 -3

app.py CHANGED Viewed

@@ -27,12 +27,17 @@ def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'], nlp=Non
         ]))
     return texts_out
-def get_lda(n_components):
     df = pd.read_csv('./data/results.csv', index_col=0)
     data = concat_comments(df.subreddit, df.sup_comment, df.comment)
     data_words = list(sent_to_words(data))
     if not spacy.util.is_package("en_core_web_sm"):
         print('[x] en_core_web_sm not found, downloading...')
         os.system("python -m spacy download en_core_web_sm")
@@ -162,7 +167,7 @@ def get_lda(n_components):
     print('Percentuale di commenti ironici per ogni topic')
     perc_topic_irony = {}
     for t in topics:
-        total_0label = sum((df.label == 1) & (df.Topic_key_word == t))
         if total_0label != 0:
             total_X_topic = df.Topic_key_word.value_counts()[t]
         else:
@@ -182,10 +187,6 @@ def get_lda(n_components):
     plt.xticks(rotation=70)
     plt.legend()
     plt.axhline(0.5, color = 'red', ls=":")
-    # Should this be a parameter?
-    # Max number of biggest subreddits to analyse
-    n_top_subreddit_to_analyse = 20
     # Probably not necessary (?). Drop this eventually if the logs become too cluttered.
     print('Percentage of each topic for each subreddit')
@@ -205,17 +206,11 @@ def get_lda(n_components):
     print('[x] Generating plot [2]')
     # plot
     subreddits = list(df.subreddit.value_counts().index)[:n_top_subreddit_to_analyse]
-    # weight_counts = {
-    #     t: [
-    #         df[df.Topic_key_word == t].subreddit.value_counts()[subreddit] / df.subreddit.value_counts()[subreddit] for subreddit in subreddits
-    #     ] for t in topics
-    # }
     irony_percs = {
         t: [
             len(
-                df[df.subreddit == subreddit][(df[df.subreddit == subreddit].Topic_key_word == t) & (df[df.subreddit == subreddit].label == 1)]
             ) /
             len(
                 df[df.subreddit == subreddit]
@@ -234,7 +229,7 @@ def get_lda(n_components):
         ax.bar(subreddits, irony_percs[k], width - 0.01, bottom=bottom, color = 'black', edgecolor = 'white', alpha = .2, hatch = '\\')
         bottom += v
-    ax.set_title("Perc of topics for each subreddit")
     ax.legend(loc="upper right")
     plt.xticks(rotation=50)
@@ -250,21 +245,32 @@ def get_lda(n_components):
 with gr.Blocks() as demo:
     gr.Markdown("# Dashboard per l'analisi con LDA")
-    gr.Markdown("### Questo è un sottotitolo")
     # gradio.Dataframe(···)
     btn = gr.Button(value="Submit")
     btn.click(
         get_lda,
-        inputs=[
-            gr.Slider(2, 25, value=5, step = 1, label="N components", info="Scegli il numero di componenti per LDA"),
-        ],
         outputs=[
             gr.DataFrame(),
-            gr.Plot(label="Plot 1"),
-            gr.Plot(label="Plot 2"),
         ]
     )

         ]))
     return texts_out
+def get_lda(n_components, n_top_subreddit_to_analyse, what_label_to_use):
     df = pd.read_csv('./data/results.csv', index_col=0)
     data = concat_comments(df.subreddit, df.sup_comment, df.comment)
     data_words = list(sent_to_words(data))
+    if what_label_to_use == 'Use True label':
+        label = 'label'
+    else:
+        label = 'prediction'
     if not spacy.util.is_package("en_core_web_sm"):
         print('[x] en_core_web_sm not found, downloading...')
         os.system("python -m spacy download en_core_web_sm")
     print('Percentuale di commenti ironici per ogni topic')
     perc_topic_irony = {}
     for t in topics:
+        total_0label = sum((df[label] == 1) & (df.Topic_key_word == t))
         if total_0label != 0:
             total_X_topic = df.Topic_key_word.value_counts()[t]
         else:
     plt.xticks(rotation=70)
     plt.legend()
     plt.axhline(0.5, color = 'red', ls=":")
     # Probably not necessary (?). Drop this eventually if the logs become too cluttered.
     print('Percentage of each topic for each subreddit')
     print('[x] Generating plot [2]')
     # plot
     subreddits = list(df.subreddit.value_counts().index)[:n_top_subreddit_to_analyse]
     irony_percs = {
         t: [
             len(
+                df[df.subreddit == subreddit][(df[df.subreddit == subreddit].Topic_key_word == t) & (df[df.subreddit == subreddit][label] == 1)]
             ) /
             len(
                 df[df.subreddit == subreddit]
         ax.bar(subreddits, irony_percs[k], width - 0.01, bottom=bottom, color = 'black', edgecolor = 'white', alpha = .2, hatch = '\\')
         bottom += v
+    ax.set_title("% of topics for each subreddit")
     ax.legend(loc="upper right")
     plt.xticks(rotation=50)
 with gr.Blocks() as demo:
     gr.Markdown("# Dashboard per l'analisi con LDA")
+    gr.Markdown("### La dashboard permette l'addestramento di un modello LDA per controllare se e quali topic sono più propensi a commenti di tipo sarcastico")
     # gradio.Dataframe(···)
+    inputs = []
+    with gr.Row():
+        inputs.append(gr.Slider(2, 25, value=5, step = 1, label="LDA N components", info="Scegli il numero di componenti per LDA"))
+        inputs.append(gr.Slider(2, 20, value=5, step = 1, label="Subreddit dal dataset", info="Numero di subreddit da analizzare"))
+        inputs.append(gr.Radio(
+            choices = ['Use True label', 'Use BERT prediction'],
+            value = 'Use True label',
+            label = "Scegliere quali label sull'ironia utilizzare:",
+            )
+        )
     btn = gr.Button(value="Submit")
+    gr.Markdown("## Risulati ottenuti")
+    gr.Markdown("#### Top 15 parole che più contribuiscono al topic di riferimento (utlima colonna):")
     btn.click(
         get_lda,
+        inputs=inputs,
         outputs=[
             gr.DataFrame(),
+            gr.Plot(label="Quanto i topic trovati portano ironia?"),
+            gr.Plot(label="Come i topic sono correlati ai diversi subreddit del dataset?"),
         ]
     )

test.ipynb CHANGED Viewed

@@ -255,9 +255,9 @@
    "metadata": {},
    "source": [
     "TODO:\n",
-    "- Show LDA top words for each topic\n",
-    "- I topic con una bassa percentuale di ironia sono i topic considerati più \"seri\" (?)\n",
-    "- Per ora sto utilizzando le label assegnate dal dataset, se non avessi le label e dovessi prevedere l'ironia LDA è cmq affidabile?"
    ]
   }
  ],

    "metadata": {},
    "source": [
     "TODO:\n",
+    "- [x] Show LDA top words for each topic\n",
+    "- [ ] I topic con una bassa percentuale di ironia sono i topic considerati più \"seri\" (?)\n",
+    "- [x] Per ora sto utilizzando le label assegnate dal dataset, se non avessi le label e dovessi prevedere l'ironia LDA è cmq affidabile?"
    ]
   }
  ],