Spaces:

DanielSc4
/

DataAnalyticsNLP

Runtime error

App Files Files Community

DanielSc4 committed on Sep 13, 2023

Commit

6534dfb

1 Parent(s): 2180e70

now working

Browse files

Files changed (1) hide show

app.py +77 -24

app.py CHANGED Viewed

@@ -56,7 +56,7 @@ def get_lda(n_components):
     print('[x] Init LDA model')
     lda_model = LatentDirichletAllocation(
-        n_components=5,
         max_iter=10,
         learning_method='online',
         random_state=100,
@@ -65,7 +65,7 @@ def get_lda(n_components):
         n_jobs = -1,
         verbose=1,
     )
     print('[x] Fitting LDA model')
     lda_output = lda_model.fit_transform(data_vectorized)
     print(lda_model)    # Model attributes
@@ -87,13 +87,16 @@ def get_lda(n_components):
     print('[x] Getting LDA output')
     lda_output = best_lda_model.transform(data_vectorized)
     topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
     docnames = ["Doc" + str(i) for i in range(len(data))]
     df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
     dominant_topic = np.argmax(df_document_topic.values, axis=1)
     df_document_topic["dominant_topic"] = dominant_topic
     # Topic-Keyword Matrix
     df_topic_keywords = pd.DataFrame(best_lda_model.components_)
     df_topic_keywords
@@ -101,6 +104,7 @@ def get_lda(n_components):
     df_topic_keywords.columns = vectorizer.get_feature_names_out()
     df_topic_keywords.index = topicnames
     # Show top n keywords for each topic
     def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
         keywords = np.array(vectorizer.get_feature_names_out())
@@ -122,6 +126,7 @@ def get_lda(n_components):
     df_topic_keywords["Topics"] = topics
     df_topic_keywords
     # Define function to predict topic for a given text document.
     def predict_topic(text, nlp=nlp):
         global sent_to_words
@@ -142,9 +147,9 @@ def get_lda(n_components):
         #topic_guess = df_topic_keywords.iloc[np.argmax(topic_probability_scores), Topics]
         return infer_topic, topic, topic_probability_scores
-    # Predict the topic
-    mytext = ["This is a test of a random topic where I talk about politics"]
-    infer_topic, topic, prob_scores = predict_topic(text = mytext, nlp=nlp)
     def apply_predict_topic(text):
         text = [text]
@@ -153,16 +158,60 @@ def get_lda(n_components):
     df["Topic_key_word"] = df['comment'].apply(apply_predict_topic)
-    # plot
-    subreddits = df.subreddit.value_counts().index[:22]
-    weight_counts = {
-        t: [
-            df[df.Topic_key_word == t].subreddit.value_counts()[subreddit] / df.subreddit.value_counts()[subreddit] for subreddit in subreddits
-        ] for t in topics
-    }
     irony_percs = {
         t: [
             len(
@@ -175,7 +224,7 @@ def get_lda(n_components):
     }
     width = 0.9
-    fig, ax = plt.subplots(figsize = (10, 7))
     plt.axhline(0.5, color = 'red', ls=":", alpha = .3)
     bottom = np.zeros(len(subreddits))
@@ -187,9 +236,11 @@ def get_lda(n_components):
     ax.set_title("Perc of topics for each subreddit")
     ax.legend(loc="upper right")
-    plt.xticks(rotation=70)
-    return fig
 # def main():
@@ -202,18 +253,20 @@ with gr.Blocks() as demo:
     gr.Markdown("### Questo è un sottotitolo")
     # gradio.Dataframe(···)
-    n_comp = gr.Slider(2, 25, value=5, step = 1, label="N components", info="Scegli il numero di componenti per LDA"),
     btn = gr.Button(value="Submit")
-    plot = gr.Plot(label="Plot")
-    btn.click(get_lda, inputs=[n_comp[0]], outputs=[plot])
-    # demo.load(main, inputs=[], outputs=[plot])
 # iface = gr.Interface(fn=greet, inputs="text", outputs="text")

     print('[x] Init LDA model')
     lda_model = LatentDirichletAllocation(
+        n_components=n_components,
         max_iter=10,
         learning_method='online',
         random_state=100,
         n_jobs = -1,
         verbose=1,
     )
     print('[x] Fitting LDA model')
     lda_output = lda_model.fit_transform(data_vectorized)
     print(lda_model)    # Model attributes
     print('[x] Getting LDA output')
     lda_output = best_lda_model.transform(data_vectorized)
+    print('[x] Assigning topics')
     topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
     docnames = ["Doc" + str(i) for i in range(len(data))]
     df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
+    print('[x] Checking dominant topics')
     dominant_topic = np.argmax(df_document_topic.values, axis=1)
     df_document_topic["dominant_topic"] = dominant_topic
     # Topic-Keyword Matrix
     df_topic_keywords = pd.DataFrame(best_lda_model.components_)
     df_topic_keywords
     df_topic_keywords.columns = vectorizer.get_feature_names_out()
     df_topic_keywords.index = topicnames
+    print('[x] Computing word-topic association')
     # Show top n keywords for each topic
     def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
         keywords = np.array(vectorizer.get_feature_names_out())
     df_topic_keywords["Topics"] = topics
     df_topic_keywords
+    print('[x] Predicting dominant topic for each document')
     # Define function to predict topic for a given text document.
     def predict_topic(text, nlp=nlp):
         global sent_to_words
         #topic_guess = df_topic_keywords.iloc[np.argmax(topic_probability_scores), Topics]
         return infer_topic, topic, topic_probability_scores
+    # # Predict the topic
+    # mytext = ["This is a test of a random topic where I talk about politics"]
+    # infer_topic, topic, prob_scores = predict_topic(text = mytext, nlp=nlp)
     def apply_predict_topic(text):
         text = [text]
     df["Topic_key_word"] = df['comment'].apply(apply_predict_topic)
+    print('[x] Generating plot [1]')
+    print('Percentuale di commenti ironici per ogni topic')
+    perc_topic_irony = {}
+    for t in topics:
+        total_0label = sum((df.label == 1) & (df.Topic_key_word == t))
+        if total_0label != 0:
+            total_X_topic = df.Topic_key_word.value_counts()[t]
+        else:
+            total_0label, total_X_topic = 0, 0.001      # No label-1 comments for this topic (it may not occur in the dataset at all); 0.001 avoids division by zero below
+        perc_topic_irony[t] = total_0label / total_X_topic
+        print(f'{t} w/ label 1: {total_0label}/{total_X_topic} ({total_0label / total_X_topic * 100 :.2f}%)')
+    fig1, ax = plt.subplots(figsize = (10, 7))
+    bottom = np.zeros(len(perc_topic_irony))
+    width = 0.9
+    ax.bar(perc_topic_irony.keys(), perc_topic_irony.values(), width, label = 'sarcastic')
+    comp = list(map(lambda x: 1 - x if x > 0 else 0, perc_topic_irony.values()))
+    ax.bar(perc_topic_irony.keys(), comp, width, bottom=list(perc_topic_irony.values()), label = 'not sarcastic')
+    ax.set_title("% of sarcastic comments for each topic")
+    plt.xticks(rotation=70)
+    plt.legend()
+    plt.axhline(0.5, color = 'red', ls=":")
+    # Should this be a parameter?
+    # Max number of biggest subreddits to analyse
+    n_top_subreddit_to_analyse = 20
+    # Probably not necessary (?). Drop this eventually if the logs get too cluttered.
+    print('Percentage of each topic for each subreddit')
+    weight_counts = {}
+    for t in topics:
+        weight_counts[t] = []
+        for subreddit in df['subreddit'].value_counts().index[:n_top_subreddit_to_analyse]:        # the n_top_subreddit_to_analyse most frequent subreddits
+            if sum(df[df.Topic_key_word == t].subreddit == subreddit) > 0:         # if topic t occurs in this subreddit (at least one row in the df)
+                perc_sub = df[df.Topic_key_word == t]['subreddit'].value_counts()[subreddit] / df['subreddit'].value_counts()[subreddit]
+            else:
+                perc_sub = 0
+            weight_counts[t].append(perc_sub)
+            print(f'Perc of topic {t} in subreddit {subreddit}: {perc_sub * 100:.2f}')
+        print()
+    print('[x] Generating plot [2]')
+    # plot
+    subreddits = list(df.subreddit.value_counts().index)[:n_top_subreddit_to_analyse]
+    # weight_counts = {
+    #     t: [
+    #         df[df.Topic_key_word == t].subreddit.value_counts()[subreddit] / df.subreddit.value_counts()[subreddit] for subreddit in subreddits
+    #     ] for t in topics
+    # }
     irony_percs = {
         t: [
             len(
     }
     width = 0.9
+    fig2, ax = plt.subplots(figsize = (10, 7))
     plt.axhline(0.5, color = 'red', ls=":", alpha = .3)
     bottom = np.zeros(len(subreddits))
     ax.set_title("Perc of topics for each subreddit")
     ax.legend(loc="upper right")
+    plt.xticks(rotation=50)
+    print('[v] All looking good!')
+    return df_topic_keywords, fig1, fig2
 # def main():
     gr.Markdown("### Questo è un sottotitolo")
     # gradio.Dataframe(···)
     btn = gr.Button(value="Submit")
+    btn.click(
+        get_lda,
+        inputs=[
+            gr.Slider(2, 25, value=5, step = 1, label="N components", info="Scegli il numero di componenti per LDA"),
+        ],
+        outputs=[
+            gr.DataFrame(),
+            gr.Plot(label="Plot 1"),
+            gr.Plot(label="Plot 2"),
+        ]
+    )
 # iface = gr.Interface(fn=greet, inputs="text", outputs="text")