Spaces:
Runtime error
Runtime error
Fixed some code smell
Browse files
app.py
CHANGED
|
@@ -8,12 +8,11 @@ import nltk, spacy, gensim
|
|
| 8 |
from sklearn.decomposition import LatentDirichletAllocation
|
| 9 |
from sklearn.feature_extraction.text import CountVectorizer
|
| 10 |
from pprint import pprint
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
def concat_comments(
|
| 13 |
-
|
| 14 |
-
return [
|
| 15 |
-
format_s.format(s=s, c=c) for s, c in zip(sup_comment, comment)
|
| 16 |
-
]
|
| 17 |
|
| 18 |
def sent_to_words(sentences):
|
| 19 |
for sentence in sentences:
|
|
@@ -28,38 +27,34 @@ def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'], nlp=Non
|
|
| 28 |
]))
|
| 29 |
return texts_out
|
| 30 |
|
| 31 |
-
|
| 32 |
-
def main(choose_context):
|
| 33 |
df = pd.read_csv('./data/results.csv', index_col=0)
|
| 34 |
-
|
| 35 |
-
print(choose_context)
|
| 36 |
-
|
| 37 |
-
if choose_context == 'comment':
|
| 38 |
-
data = df.comment
|
| 39 |
-
elif choose_context == 'sup comment':
|
| 40 |
-
data = df.sup_comment
|
| 41 |
-
elif choose_context == 'sup comment + comment':
|
| 42 |
-
data = concat_comments(df.sup_comment, df.comment)
|
| 43 |
-
|
| 44 |
data_words = list(sent_to_words(data))
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
| 50 |
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
|
| 51 |
data_lemmatized = lemmatization(data_words, allowed_postags=["NOUN", "ADJ"], nlp=nlp) # keep only nouns and adjectives
|
| 52 |
|
|
|
|
| 53 |
vectorizer = CountVectorizer(
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
)
|
| 60 |
-
data_vectorized = vectorizer.fit_transform(data_lemmatized)
|
| 61 |
|
|
|
|
|
|
|
| 62 |
|
|
|
|
| 63 |
lda_model = LatentDirichletAllocation(
|
| 64 |
n_components=5,
|
| 65 |
max_iter=10,
|
|
@@ -68,19 +63,28 @@ def main(choose_context):
|
|
| 68 |
batch_size=128,
|
| 69 |
evaluate_every = -1,
|
| 70 |
n_jobs = -1,
|
|
|
|
| 71 |
)
|
|
|
|
|
|
|
| 72 |
lda_output = lda_model.fit_transform(data_vectorized)
|
| 73 |
print(lda_model) # Model attributes
|
| 74 |
|
|
|
|
|
|
|
| 75 |
# Log Likelihood: higher is better
|
| 76 |
-
print("Log Likelihood: ",
|
| 77 |
# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
|
| 78 |
-
print("Perplexity: ",
|
|
|
|
|
|
|
| 79 |
# See model parameters
|
| 80 |
pprint(lda_model.get_params())
|
| 81 |
|
|
|
|
| 82 |
best_lda_model = lda_model
|
| 83 |
|
|
|
|
| 84 |
lda_output = best_lda_model.transform(data_vectorized)
|
| 85 |
|
| 86 |
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
|
|
@@ -96,8 +100,6 @@ def main(choose_context):
|
|
| 96 |
# Assign Column and Index
|
| 97 |
df_topic_keywords.columns = vectorizer.get_feature_names_out()
|
| 98 |
df_topic_keywords.index = topicnames
|
| 99 |
-
# View
|
| 100 |
-
df_topic_keywords
|
| 101 |
|
| 102 |
# Show top n keywords for each topic
|
| 103 |
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
|
|
@@ -190,25 +192,28 @@ def main(choose_context):
|
|
| 190 |
return fig
|
| 191 |
|
| 192 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
with gr.Blocks() as demo:
|
| 194 |
-
gr.Markdown("
|
| 195 |
-
gr.Markdown("
|
| 196 |
-
button = gr.Radio(
|
| 197 |
-
label="Plot type",
|
| 198 |
-
choices=['scatter_plot', 'heatmap', 'us_map', 'interactive_barplot', "radial", "multiline"], value='scatter_plot'
|
| 199 |
-
)
|
| 200 |
# gradio.Dataframe(···)
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
choices=['comment', 'sup comment', 'sup comment + comment'], value='sup comment'
|
| 204 |
-
)
|
| 205 |
-
plot = gr.Plot(label="Plot")
|
| 206 |
-
choose_context.change(main, inputs=[choose_context], outputs=[plot])
|
| 207 |
|
| 208 |
btn = gr.Button(value="Submit")
|
| 209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
|
| 211 |
-
demo.load(main, inputs=[
|
| 212 |
|
| 213 |
|
| 214 |
# iface = gr.Interface(fn=greet, inputs="text", outputs="text")
|
|
|
|
| 8 |
from sklearn.decomposition import LatentDirichletAllocation
|
| 9 |
from sklearn.feature_extraction.text import CountVectorizer
|
| 10 |
from pprint import pprint
|
| 11 |
+
import matplotlib
|
| 12 |
+
matplotlib.use('agg')
|
| 13 |
|
| 14 |
+
def concat_comments(*kwargs):
|
| 15 |
+
return ['\n'.join(ele) for ele in zip(*kwargs)]
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
def sent_to_words(sentences):
|
| 18 |
for sentence in sentences:
|
|
|
|
| 27 |
]))
|
| 28 |
return texts_out
|
| 29 |
|
| 30 |
+
def get_lda(n_components):
|
|
|
|
| 31 |
df = pd.read_csv('./data/results.csv', index_col=0)
|
| 32 |
+
data = concat_comments(df.subreddit, df.sup_comment, df.comment)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
data_words = list(sent_to_words(data))
|
| 34 |
|
| 35 |
+
|
| 36 |
+
if not spacy.util.is_package("en_core_web_sm"):
|
| 37 |
+
print('[x] en_core_web_sm not found, downloading...')
|
| 38 |
+
os.system("python -m spacy download en_core_web_sm")
|
| 39 |
+
print('[x] en_core_web_sm downloaded')
|
| 40 |
+
|
| 41 |
+
print('[x] Lemmatization begins')
|
| 42 |
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
|
| 43 |
data_lemmatized = lemmatization(data_words, allowed_postags=["NOUN", "ADJ"], nlp=nlp) # keep only nouns and adjectives
|
| 44 |
|
| 45 |
+
print('[x] Vectorizing')
|
| 46 |
vectorizer = CountVectorizer(
|
| 47 |
+
analyzer='word',
|
| 48 |
+
min_df=10,
|
| 49 |
+
stop_words='english',
|
| 50 |
+
lowercase=True,
|
| 51 |
+
token_pattern='[a-zA-Z0-9]{3,}'
|
| 52 |
+
)
|
|
|
|
| 53 |
|
| 54 |
+
print('[x] Fitting vectorized data on lemmatization')
|
| 55 |
+
data_vectorized = vectorizer.fit_transform(data_lemmatized)
|
| 56 |
|
| 57 |
+
print('[x] Init LDA model')
|
| 58 |
lda_model = LatentDirichletAllocation(
|
| 59 |
n_components=5,
|
| 60 |
max_iter=10,
|
|
|
|
| 63 |
batch_size=128,
|
| 64 |
evaluate_every = -1,
|
| 65 |
n_jobs = -1,
|
| 66 |
+
verbose=1,
|
| 67 |
)
|
| 68 |
+
|
| 69 |
+
print('[x] Fitting LDA model')
|
| 70 |
lda_output = lda_model.fit_transform(data_vectorized)
|
| 71 |
print(lda_model) # Model attributes
|
| 72 |
|
| 73 |
+
print('[x] Getting performances')
|
| 74 |
+
performances = lda_model.score(data_vectorized), lda_model.perplexity(data_vectorized)
|
| 75 |
# Log Likelihood: higher is better
|
| 76 |
+
print("Log Likelihood: ", performances[0])
|
| 77 |
# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
|
| 78 |
+
print("Perplexity: ", performances[1])
|
| 79 |
+
|
| 80 |
+
print('[x] Check parameters if they look correct')
|
| 81 |
# See model parameters
|
| 82 |
pprint(lda_model.get_params())
|
| 83 |
|
| 84 |
+
# switching to the best model
|
| 85 |
best_lda_model = lda_model
|
| 86 |
|
| 87 |
+
print('[x] Getting LDA output')
|
| 88 |
lda_output = best_lda_model.transform(data_vectorized)
|
| 89 |
|
| 90 |
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
|
|
|
|
| 100 |
# Assign Column and Index
|
| 101 |
df_topic_keywords.columns = vectorizer.get_feature_names_out()
|
| 102 |
df_topic_keywords.index = topicnames
|
|
|
|
|
|
|
| 103 |
|
| 104 |
# Show top n keywords for each topic
|
| 105 |
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
|
|
|
|
| 192 |
return fig
|
| 193 |
|
| 194 |
|
| 195 |
+
# def main():
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
|
| 200 |
with gr.Blocks() as demo:
|
| 201 |
+
gr.Markdown("# Dashboard per l'analisi con LDA")
|
| 202 |
+
gr.Markdown("### Questo è un sottotitolo")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
# gradio.Dataframe(···)
|
| 204 |
+
|
| 205 |
+
n_comp = gr.Slider(2, 25, value=5, step = 1, label="N components", info="Scegli il numero di componenti per LDA"),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
btn = gr.Button(value="Submit")
|
| 208 |
+
|
| 209 |
+
plot = gr.Plot(label="Plot")
|
| 210 |
+
|
| 211 |
+
btn.click(get_lda, inputs=[n_comp[0]], outputs=[plot])
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
|
| 215 |
|
| 216 |
+
# demo.load(main, inputs=[], outputs=[plot])
|
| 217 |
|
| 218 |
|
| 219 |
# iface = gr.Interface(fn=greet, inputs="text", outputs="text")
|