haotle commited on
Commit
593d9c6
ยท
verified ยท
1 Parent(s): e0f3355

Update pages/2 Topic Modeling.py

Browse files
Files changed (1) hide show
  1. pages/2 Topic Modeling.py +681 -673
pages/2 Topic Modeling.py CHANGED
@@ -1,677 +1,685 @@
1
  <<<<<<< HEAD
2
- #import module
3
- import streamlit as st
4
- import streamlit.components.v1 as components
5
- import pandas as pd
6
- import numpy as np
7
- import re
8
- import string
9
- import nltk
10
- nltk.download('wordnet')
11
- from nltk.stem import WordNetLemmatizer
12
- nltk.download('stopwords')
13
- from nltk.corpus import stopwords
14
- import gensim
15
- import gensim.corpora as corpora
16
- from gensim.corpora import Dictionary
17
- from gensim.models.coherencemodel import CoherenceModel
18
- from gensim.models.ldamodel import LdaModel
19
- from gensim.models import Phrases
20
- from gensim.models.phrases import Phraser
21
- from pprint import pprint
22
- import pickle
23
- import pyLDAvis
24
- import pyLDAvis.gensim_models as gensimvis
25
- from io import StringIO
26
- from ipywidgets.embed import embed_minimal_html
27
- from nltk.stem.snowball import SnowballStemmer
28
- from bertopic import BERTopic
29
- from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, TextGeneration
30
- import plotly.express as px
31
- from sklearn.cluster import KMeans
32
- from sklearn.feature_extraction.text import CountVectorizer
33
- import bitermplus as btm
34
- import tmplot as tmp
35
- import tomotopy
36
- import sys
37
- import spacy
38
- import en_core_web_sm
39
- import pipeline
40
- from html2image import Html2Image
41
- from umap import UMAP
42
- import os
43
- import time
44
- import json
45
- from tools import sourceformat as sf
46
- import datamapplot
47
- from sentence_transformers import SentenceTransformer
48
- import openai
49
- from transformers import pipeline
50
-
51
- #===config===
52
- st.set_page_config(
53
- page_title="Coconut",
54
- page_icon="๐Ÿฅฅ",
55
- layout="wide",
56
- initial_sidebar_state="collapsed"
57
- )
58
-
59
- hide_streamlit_style = """
60
- <style>
61
- #MainMenu
62
- {visibility: hidden;}
63
- footer {visibility: hidden;}
64
- [data-testid="collapsedControl"] {display: none}
65
- </style>
66
- """
67
- st.markdown(hide_streamlit_style, unsafe_allow_html=True)
68
-
69
- with st.popover("๐Ÿ”— Menu"):
70
- st.page_link("https://www.coconut-libtool.com/", label="Home", icon="๐Ÿ ")
71
- st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1๏ธโƒฃ")
72
- st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2๏ธโƒฃ")
73
- st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3๏ธโƒฃ")
74
- st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4๏ธโƒฃ")
75
- st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5๏ธโƒฃ")
76
- st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6๏ธโƒฃ")
77
- st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7๏ธโƒฃ")
78
- st.page_link("pages/8 Shifterator.py", label="Shifterator", icon="8๏ธโƒฃ")
79
- st.page_link("pages/9 Summarization.py", label = "Summarization",icon ="9๏ธโƒฃ")
80
- st.page_link("pages/10 WordCloud.py", label = "WordCloud", icon = "๐Ÿ”Ÿ")
81
-
82
- st.header("Topic Modeling", anchor=False)
83
- st.subheader('Put your file here...', anchor=False)
84
-
85
- #========unique id========
86
- @st.cache_resource(ttl=3600)
87
- def create_list():
88
- l = [1, 2, 3]
89
- return l
90
-
91
- l = create_list()
92
- first_list_value = l[0]
93
- l[0] = first_list_value + 1
94
- uID = str(l[0])
95
-
96
- @st.cache_data(ttl=3600)
97
- def get_ext(uploaded_file):
98
- extype = uID+uploaded_file.name
99
- return extype
100
-
101
- #===clear cache===
102
-
103
- def reset_biterm():
104
- try:
105
- biterm_map.clear()
106
- biterm_bar.clear()
107
- except NameError:
108
- biterm_topic.clear()
109
-
110
- def reset_all():
111
- st.cache_data.clear()
112
-
113
- #===avoiding deadlock===
114
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
115
-
116
- #===upload file===
117
- @st.cache_data(ttl=3600)
118
- def upload(file):
119
- papers = pd.read_csv(uploaded_file)
120
- if "About the data" in papers.columns[0]:
121
- papers = sf.dim(papers)
122
- col_dict = {'MeSH terms': 'Keywords',
123
- 'PubYear': 'Year',
124
- 'Times cited': 'Cited by',
125
- 'Publication Type': 'Document Type'
126
- }
127
- papers.rename(columns=col_dict, inplace=True)
128
-
129
- return papers
130
-
131
- @st.cache_data(ttl=3600)
132
- def conv_txt(extype):
133
- if("PMID" in (uploaded_file.read()).decode()):
134
- uploaded_file.seek(0)
135
- papers = sf.medline(uploaded_file)
136
- print(papers)
137
- return papers
138
- col_dict = {'TI': 'Title',
139
- 'SO': 'Source title',
140
- 'DE': 'Author Keywords',
141
- 'DT': 'Document Type',
142
- 'AB': 'Abstract',
143
- 'TC': 'Cited by',
144
- 'PY': 'Year',
145
- 'ID': 'Keywords Plus',
146
- 'rights_date_used': 'Year'}
147
- uploaded_file.seek(0)
148
- papers = pd.read_csv(uploaded_file, sep='\t')
149
- if("htid" in papers.columns):
150
- papers = sf.htrc(papers)
151
- papers.rename(columns=col_dict, inplace=True)
152
- print(papers)
153
- return papers
154
-
155
-
156
- @st.cache_data(ttl=3600)
157
- def conv_json(extype):
158
- col_dict={'title': 'title',
159
- 'rights_date_used': 'Year',
160
- }
161
-
162
- data = json.load(uploaded_file)
163
- hathifile = data['gathers']
164
- keywords = pd.DataFrame.from_records(hathifile)
165
-
166
- keywords = sf.htrc(keywords)
167
- keywords.rename(columns=col_dict,inplace=True)
168
- return keywords
169
-
170
- @st.cache_resource(ttl=3600)
171
- def conv_pub(extype):
172
- if (get_ext(extype)).endswith('.tar.gz'):
173
- bytedata = extype.read()
174
- keywords = sf.readPub(bytedata)
175
- elif (get_ext(extype)).endswith('.xml'):
176
- bytedata = extype.read()
177
- keywords = sf.readxml(bytedata)
178
- return keywords
179
-
180
- #===Read data===
181
- uploaded_file = st.file_uploader('', type=['csv', 'txt','json','tar.gz','xml'], on_change=reset_all)
182
-
183
- if uploaded_file is not None:
184
- try:
185
- extype = get_ext(uploaded_file)
186
-
187
- if extype.endswith('.csv'):
188
- papers = upload(extype)
189
- elif extype.endswith('.txt'):
190
- papers = conv_txt(extype)
191
-
192
- elif extype.endswith('.json'):
193
- papers = conv_json(extype)
194
- elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
195
- papers = conv_pub(uploaded_file)
196
-
197
- coldf = sorted(papers.select_dtypes(include=['object']).columns.tolist())
198
-
199
- c1, c2, c3 = st.columns([3,3,4])
200
- method = c1.selectbox(
201
- 'Choose method',
202
- ('Choose...', 'pyLDA', 'Biterm', 'BERTopic'))
203
- ColCho = c2.selectbox('Choose column', (["Abstract","Title", "Abstract + Title"]))
204
- num_cho = c3.number_input('Choose number of topics', min_value=2, max_value=30, value=5)
205
-
206
- d1, d2 = st.columns([3,7])
207
- xgram = d1.selectbox("N-grams", ("1", "2", "3"))
208
- xgram = int(xgram)
209
- words_to_remove = d2.text_input("Remove specific words. Separate words by semicolons (;)")
210
-
211
- rem_copyright = d1.toggle('Remove copyright statement', value=True)
212
- rem_punc = d2.toggle('Remove punctuation', value=True)
213
-
214
- #===advance settings===
215
- with st.expander("๐Ÿงฎ Show advance settings"):
216
- t1, t2, t3 = st.columns([3,3,4])
217
- if method == 'pyLDA':
218
- py_random_state = t1.number_input('Random state', min_value=0, max_value=None, step=1)
219
- py_chunksize = t2.number_input('Chunk size', value=100 , min_value=10, max_value=None, step=1)
220
- opt_threshold = t3.number_input('Threshold', value=100 , min_value=1, max_value=None, step=1)
221
-
222
- elif method == 'Biterm':
223
- btm_seed = t1.number_input('Random state seed', value=100 , min_value=1, max_value=None, step=1)
224
- btm_iterations = t2.number_input('Iterations number', value=20 , min_value=2, max_value=None, step=1)
225
- opt_threshold = t3.number_input('Threshold', value=100 , min_value=1, max_value=None, step=1)
226
-
227
- elif method == 'BERTopic':
228
- u1, u2 = st.columns([5,5])
229
-
230
- bert_top_n_words = u1.number_input('top_n_words', value=5 , min_value=5, max_value=25, step=1)
231
- bert_random_state = u2.number_input('random_state', value=42 , min_value=1, max_value=None, step=1)
232
- bert_n_components = u1.number_input('n_components', value=5 , min_value=1, max_value=None, step=1)
233
- bert_n_neighbors = u2.number_input('n_neighbors', value=15 , min_value=1, max_value=None, step=1)
234
- bert_embedding_model = st.radio(
235
- "embedding_model",
236
- ["all-MiniLM-L6-v2", "paraphrase-multilingual-MiniLM-L12-v2", "en_core_web_sm"], index=0, horizontal=True)
237
-
238
- fine_tuning = st.toggle("Use Fine-tuning")
239
- if fine_tuning:
240
- topic_labelling = st.toggle("Automatic topic labelling")
241
- if topic_labelling:
242
- llm_provider = st.selectbox("Model",["OpenAI/gpt-4o","Google/flan-t5","LiquidAI/LFM2-350M"])
243
- if llm_provider == "OpenAI/gpt-4o":
244
- api_key = st.text_input("API Key")
245
-
246
- else:
247
- st.write('Please choose your preferred method')
248
-
249
- #===clean csv===
250
- @st.cache_data(ttl=3600, show_spinner=False)
251
- def clean_csv(extype):
252
- if (ColCho=="Abstract + Title"):
253
- papers["Abstract + Title"] = papers["Title"] + " " + papers["Abstract"]
254
- st.write(papers["Abstract + Title"])
255
-
256
- paper = papers.dropna(subset=[ColCho])
257
-
258
- #===mapping===
259
- paper['Abstract_pre'] = paper[ColCho].map(lambda x: x.lower())
260
- if rem_punc:
261
- paper['Abstract_pre'] = paper['Abstract_pre'].map(
262
- lambda x: re.sub(f"[{re.escape(string.punctuation)}]", " ", x)
263
- ).map(lambda x: re.sub(r"\s+", " ", x).strip())
264
- paper['Abstract_pre'] = paper['Abstract_pre'].str.replace('[\u2018\u2019\u201c\u201d]', '', regex=True)
265
- if rem_copyright:
266
- paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: re.sub('ยฉ.*', '', x))
267
-
268
- #===stopword removal===
269
- stop = stopwords.words('english')
270
- paper['Abstract_stop'] = paper['Abstract_pre'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
271
-
272
- #===lemmatize===
273
- lemmatizer = WordNetLemmatizer()
274
-
275
- @st.cache_resource(ttl=3600)
276
- def lemmatize_words(text):
277
- words = text.split()
278
- words = [lemmatizer.lemmatize(word) for word in words]
279
- return ' '.join(words)
280
- paper['Abstract_lem'] = paper['Abstract_stop'].apply(lemmatize_words)
281
-
282
- words_rmv = [word.strip() for word in words_to_remove.split(";")]
283
- remove_dict = {word: None for word in words_rmv}
284
-
285
- @st.cache_resource(ttl=3600)
286
- def remove_words(text):
287
- words = text.split()
288
- cleaned_words = [word for word in words if word not in remove_dict]
289
- return ' '.join(cleaned_words)
290
- paper['Abstract_lem'] = paper['Abstract_lem'].map(remove_words)
291
-
292
- topic_abs = paper.Abstract_lem.values.tolist()
293
- return topic_abs, paper
294
-
295
- topic_abs, paper=clean_csv(extype)
296
-
297
- if st.button("Submit", on_click=reset_all):
298
- num_topic = num_cho
299
-
300
- if method == 'BERTopic':
301
- st.info('BERTopic is an expensive process when dealing with a large volume of text with our existing resources. Please kindly wait until the visualization appears.', icon="โ„น๏ธ")
302
-
303
- #===topic===
304
- if method == 'Choose...':
305
- st.write('')
306
-
307
- elif method == 'pyLDA':
308
- tab1, tab2, tab3, tab4 = st.tabs(["๐Ÿ“ˆ Generate visualization", "๐Ÿ“ƒ Reference", "๐Ÿ““ Recommended Reading", "โฌ‡๏ธ Download Help"])
309
-
310
- with tab1:
311
- #===visualization===
312
- @st.cache_data(ttl=3600, show_spinner=False)
313
- def pylda(extype):
314
- topic_abs_LDA = [t.split(' ') for t in topic_abs]
315
-
316
- bigram = Phrases(topic_abs_LDA, min_count=xgram, threshold=opt_threshold)
317
- trigram = Phrases(bigram[topic_abs_LDA], threshold=opt_threshold)
318
- bigram_mod = Phraser(bigram)
319
- trigram_mod = Phraser(trigram)
320
-
321
- topic_abs_LDA = [trigram_mod[bigram_mod[doc]] for doc in topic_abs_LDA]
322
-
323
- id2word = Dictionary(topic_abs_LDA)
324
- corpus = [id2word.doc2bow(text) for text in topic_abs_LDA]
325
- #===LDA===
326
- lda_model = LdaModel(corpus=corpus,
327
- id2word=id2word,
328
- num_topics=num_topic,
329
- random_state=py_random_state,
330
- chunksize=py_chunksize,
331
- alpha='auto',
332
- per_word_topics=False)
333
- pprint(lda_model.print_topics())
334
- doc_lda = lda_model[corpus]
335
- topics = lda_model.show_topics(num_words = 30,formatted=False)
336
-
337
- #===visualization===
338
- coherence_model_lda = CoherenceModel(model=lda_model, texts=topic_abs_LDA, dictionary=id2word, coherence='c_v')
339
- coherence_lda = coherence_model_lda.get_coherence()
340
- vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
341
- py_lda_vis_html = pyLDAvis.prepared_data_to_html(vis)
342
- return py_lda_vis_html, coherence_lda, vis, topics
343
-
344
- with st.spinner('Performing computations. Please wait ...'):
345
- try:
346
- py_lda_vis_html, coherence_lda, vis, topics = pylda(extype)
347
- st.write('Coherence score: ', coherence_lda)
348
- components.html(py_lda_vis_html, width=1500, height=800)
349
- st.markdown('Copyright (c) 2015, Ben Mabey. https://github.com/bmabey/pyLDAvis')
350
-
351
- @st.cache_data(ttl=3600, show_spinner=False)
352
- def img_lda(vis):
353
- pyLDAvis.save_html(vis, 'output.html')
354
- hti = Html2Image()
355
- hti.browser.flags = ['--default-background-color=ffffff', '--hide-scrollbars']
356
- hti.browser.use_new_headless = None
357
- css = "body {background: white;}"
358
- hti.screenshot(
359
- other_file='output.html', css_str=css, size=(1500, 800),
360
- save_as='ldavis_img.png'
361
- )
362
-
363
- img_lda(vis)
364
-
365
- d1, d2 = st.columns(2)
366
- with open("ldavis_img.png", "rb") as file:
367
- btn = d1.download_button(
368
- label="Download image",
369
- data=file,
370
- file_name="ldavis_img.png",
371
- mime="image/png"
372
- )
373
-
374
- #===download results===#
375
- resultf = pd.DataFrame(topics)
376
- #formatting
377
- resultf = resultf.transpose()
378
- resultf = resultf.drop([0])
379
- resultf = resultf.explode(list(range(len(resultf.columns))), ignore_index=False)
380
-
381
- resultcsv = resultf.to_csv().encode("utf-8")
382
- d2.download_button(
383
- label = "Download Results",
384
- data=resultcsv,
385
- file_name="results.csv",
386
- mime="text\csv",
387
- on_click="ignore")
388
-
389
- except NameError as f:
390
- st.warning('๐Ÿ–ฑ๏ธ Please click Submit')
391
-
392
- with tab2:
393
- st.markdown('**Sievert, C., & Shirley, K. (2014). LDAvis: A method for visualizing and interpreting topics. Proceedings of the Workshop on Interactive Language Learning, Visualization, and Interfaces.** https://doi.org/10.3115/v1/w14-3110')
394
-
395
- with tab3:
396
- st.markdown('**Chen, X., & Wang, H. (2019, January). Automated chat transcript analysis using topic modeling for library reference services. Proceedings of the Association for Information Science and Technology, 56(1), 368โ€“371.** https://doi.org/10.1002/pra2.31')
397
- st.markdown('**Joo, S., Ingram, E., & Cahill, M. (2021, December 15). Exploring Topics and Genres in Storytime Books: A Text Mining Approach. Evidence Based Library and Information Practice, 16(4), 41โ€“62.** https://doi.org/10.18438/eblip29963')
398
- st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Topic Modeling. Text Mining for Information Professionals, 105โ€“137.** https://doi.org/10.1007/978-3-030-85085-2_4')
399
- st.markdown('**Lamba, M., & Madhusudhan, M. (2019, June 7). Mapping of topics in DESIDOC Journal of Library and Information Technology, India: a study. Scientometrics, 120(2), 477โ€“505.** https://doi.org/10.1007/s11192-019-03137-5')
400
-
401
- with tab4:
402
- st.subheader(':blue[pyLDA]', anchor=False)
403
- st.button('Download image')
404
- st.text("Click Download Image button.")
405
- st.divider()
406
- st.subheader(':blue[Downloading CSV Results]', anchor=False)
407
- st.button("Download Results")
408
- st.text("Click Download results button at bottom of page")
409
-
410
- #===Biterm===
411
- elif method == 'Biterm':
412
-
413
- #===optimize Biterm===
414
- @st.cache_data(ttl=3600, show_spinner=False)
415
- def biterm_topic(extype):
416
- tokenized_abs = [t.split(' ') for t in topic_abs]
417
-
418
- bigram = Phrases(tokenized_abs, min_count=xgram, threshold=opt_threshold)
419
- trigram = Phrases(bigram[tokenized_abs], threshold=opt_threshold)
420
- bigram_mod = Phraser(bigram)
421
- trigram_mod = Phraser(trigram)
422
-
423
- topic_abs_ngram = [trigram_mod[bigram_mod[doc]] for doc in tokenized_abs]
424
-
425
- topic_abs_str = [' '.join(doc) for doc in topic_abs_ngram]
426
-
427
-
428
- X, vocabulary, vocab_dict = btm.get_words_freqs(topic_abs_str)
429
- tf = np.array(X.sum(axis=0)).ravel()
430
- docs_vec = btm.get_vectorized_docs(topic_abs, vocabulary)
431
- docs_lens = list(map(len, docs_vec))
432
- biterms = btm.get_biterms(docs_vec)
433
-
434
- model = btm.BTM(X, vocabulary, seed=btm_seed, T=num_topic, M=20, alpha=50/8, beta=0.01)
435
- model.fit(biterms, iterations=btm_iterations)
436
-
437
- p_zd = model.transform(docs_vec)
438
- coherence = model.coherence_
439
- phi = tmp.get_phi(model)
440
- topics_coords = tmp.prepare_coords(model)
441
- totaltop = topics_coords.label.values.tolist()
442
- perplexity = model.perplexity_
443
- top_topics = model.df_words_topics_
444
-
445
- return topics_coords, phi, totaltop, perplexity, top_topics
446
-
447
- tab1, tab2, tab3, tab4 = st.tabs(["๐Ÿ“ˆ Generate visualization", "๐Ÿ“ƒ Reference", "๐Ÿ““ Recommended Reading", "โฌ‡๏ธ Download Help"])
448
- with tab1:
449
- try:
450
- with st.spinner('Performing computations. Please wait ...'):
451
- topics_coords, phi, totaltop, perplexity, top_topics = biterm_topic(extype)
452
- col1, col2 = st.columns([4,6])
453
-
454
- @st.cache_data(ttl=3600)
455
- def biterm_map(extype):
456
- btmvis_coords = tmp.plot_scatter_topics(topics_coords, size_col='size', label_col='label', topic=numvis)
457
- return btmvis_coords
458
-
459
- @st.cache_data(ttl=3600)
460
- def biterm_bar(extype):
461
- terms_probs = tmp.calc_terms_probs_ratio(phi, topic=numvis, lambda_=1)
462
- btmvis_probs = tmp.plot_terms(terms_probs, font_size=12)
463
- return btmvis_probs
464
-
465
- with col1:
466
- st.write('Perplexity score: ', perplexity)
467
- st.write('')
468
- numvis = st.selectbox(
469
- 'Choose topic',
470
- (totaltop), on_change=reset_biterm)
471
- btmvis_coords = biterm_map(extype)
472
- st.altair_chart(btmvis_coords)
473
- with col2:
474
- btmvis_probs = biterm_bar(extype)
475
- st.altair_chart(btmvis_probs, use_container_width=True)
476
-
477
- #===download results===#
478
- resultcsv = top_topics.to_csv().encode("utf-8")
479
- st.download_button(label = "Download Results", data=resultcsv, file_name="results.csv", mime="text\csv", on_click="ignore")
480
-
481
- except ValueError as g:
482
- st.error('๐Ÿ™‡โ€โ™‚๏ธ Please raise the number of topics and click submit')
483
-
484
- except NameError as f:
485
- st.warning('๐Ÿ–ฑ๏ธ Please click Submit')
486
-
487
- with tab2:
488
- st.markdown('**Yan, X., Guo, J., Lan, Y., & Cheng, X. (2013, May 13). A biterm topic model for short texts. Proceedings of the 22nd International Conference on World Wide Web.** https://doi.org/10.1145/2488388.2488514')
489
- with tab3:
490
- st.markdown('**Cai, M., Shah, N., Li, J., Chen, W. H., Cuomo, R. E., Obradovich, N., & Mackey, T. K. (2020, August 26). Identification and characterization of tweets related to the 2015 Indiana HIV outbreak: A retrospective infoveillance study. PLOS ONE, 15(8), e0235150.** https://doi.org/10.1371/journal.pone.0235150')
491
- st.markdown('**Chen, Y., Dong, T., Ban, Q., & Li, Y. (2021). What Concerns Consumers about Hypertension? A Comparison between the Online Health Community and the Q&A Forum. International Journal of Computational Intelligence Systems, 14(1), 734.** https://doi.org/10.2991/ijcis.d.210203.002')
492
- st.markdown('**George, Crissandra J., "AMBIGUOUS APPALACHIANNESS: A LINGUISTIC AND PERCEPTUAL INVESTIGATION INTO ARC-LABELED PENNSYLVANIA COUNTIES" (2022). Theses and Dissertations-- Linguistics. 48.** https://doi.org/10.13023/etd.2022.217')
493
- st.markdown('**Li, J., Chen, W. H., Xu, Q., Shah, N., Kohler, J. C., & Mackey, T. K. (2020). Detection of self-reported experiences with corruption on twitter using unsupervised machine learning. Social Sciences & Humanities Open, 2(1), 100060.** https://doi.org/10.1016/j.ssaho.2020.100060')
494
- with tab4:
495
- st.subheader(':blue[Biterm]', anchor=False)
496
- st.text("Click the three dots at the top right then select the desired format.")
497
- st.markdown("![Downloading visualization](https://raw.githubusercontent.com/faizhalas/library-tools/main/images/download_biterm.jpg)")
498
- st.divider()
499
- st.subheader(':blue[Downloading CSV Results]', anchor=False)
500
- st.button("Download Results")
501
- st.text("Click Download results button at bottom of page")
502
-
503
-
504
- #===BERTopic===
505
- elif method == 'BERTopic':
506
- @st.cache_resource(ttl = 3600, show_spinner=False)
507
- #@st.cache_data(ttl=3600, show_spinner=False)
508
- def bertopic_vis(extype):
509
- umap_model = UMAP(n_neighbors=bert_n_neighbors, n_components=bert_n_components,
510
- min_dist=0.0, metric='cosine', random_state=bert_random_state)
511
- cluster_model = KMeans(n_clusters=num_topic)
512
- if bert_embedding_model == 'all-MiniLM-L6-v2':
513
- model = SentenceTransformer('all-MiniLM-L6-v2')
514
- lang = 'en'
515
- embeddings = model.encode(topic_abs, show_progress_bar=True)
516
-
517
- elif bert_embedding_model == 'en_core_web_sm':
518
- nlp = en_core_web_sm.load(exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
519
- model = nlp
520
- lang = 'en'
521
- embeddings = np.array([nlp(text).vector for text in topic_abs])
522
-
523
- elif bert_embedding_model == 'paraphrase-multilingual-MiniLM-L12-v2':
524
- model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
525
- lang = 'multilingual'
526
- embeddings = model.encode(topic_abs, show_progress_bar=True)
527
-
528
- representation_model = ""
529
-
530
- if fine_tuning:
531
- keybert = KeyBERTInspired()
532
- mmr = MaximalMarginalRelevance(diversity=0.3)
533
- representation_model = {
534
- "KeyBERT": keybert,
535
- "MMR": mmr,
536
- }
537
- if topic_labelling:
538
- if llm_provider == "OpenAI/gpt-4o":
539
- client = openai.OpenAI(api_key=api_key)
540
- representation_model = {
541
- "KeyBERT": keybert,
542
- "MMR": mmr,
543
- "test": OpenAI(client, model = "gpt-4o-mini", delay_in_seconds=10)
544
- }
545
- elif llm_provider == "Google/flan-t5":
546
- pipe = pipeline("text2text-generation", model = "google/flan-t5-base")
547
- clientmod = TextGeneration(pipe)
548
- representation_model = {
549
- "KeyBERT": keybert,
550
- "MMR": mmr,
551
- "test": clientmod
552
- }
553
- elif llm_provider == "LiquidAI/LFM2-350M":
554
- pipe = pipeline("text-generation", model = "LiquidAI/LFM2-350M")
555
- clientmod = TextGeneration(pipe)
556
- representation_model = {
557
- "KeyBERT": keybert,
558
- "MMR": mmr,
559
- "test": clientmod
560
- }
561
-
562
- vectorizer_model = CountVectorizer(ngram_range=(1, xgram), stop_words='english')
563
- topic_model = BERTopic(representation_model = representation_model, embedding_model=model, hdbscan_model=cluster_model, language=lang, umap_model=umap_model, vectorizer_model=vectorizer_model, top_n_words=bert_top_n_words)
564
- topics, probs = topic_model.fit_transform(topic_abs, embeddings=embeddings)
565
-
566
- if(fine_tuning and topic_labelling):
567
- generated_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["test"].values()]
568
- topic_model.set_topic_labels(generated_labels)
569
-
570
- return topic_model, topics, probs, embeddings
571
-
572
- @st.cache_resource(ttl = 3600, show_spinner=False)
573
- def Vis_Topics(extype):
574
- fig1 = topic_model.visualize_topics()
575
- return fig1
576
- @st.cache_resource(ttl = 3600, show_spinner=False)
577
- def Vis_Documents(extype):
578
- fig2 = topic_model.visualize_document_datamap(topic_abs, embeddings=embeddings, custom_labels = True)
579
- return fig2
580
- @st.cache_resource(ttl = 3600, show_spinner=False)
581
- def Vis_Hierarchy(extype):
582
- fig3 = topic_model.visualize_hierarchy(top_n_topics=num_topic, custom_labels = True)
583
- return fig3
584
- @st.cache_resource(ttl = 3600, show_spinner=False)
585
- def Vis_Heatmap(extype):
586
- global topic_model
587
- fig4 = topic_model.visualize_heatmap(n_clusters=num_topic-1, width=1000, height=1000, custom_labels = True)
588
- return fig4
589
- @st.cache_resource(ttl = 3600, show_spinner=False)
590
- def Vis_Barchart(extype):
591
- fig5 = topic_model.visualize_barchart(top_n_topics=num_topic, custom_labels = True)
592
- return fig5
593
-
594
- tab1, tab2, tab3, tab4 = st.tabs(["๐Ÿ“ˆ Generate visualization", "๐Ÿ“ƒ Reference", "๐Ÿ““ Recommended Reading", "โฌ‡๏ธ Download Help"])
595
- with tab1:
596
- try:
597
- with st.spinner('Performing computations. Please wait ...'):
598
-
599
- topic_model, topics, probs, embeddings = bertopic_vis(extype)
600
- time.sleep(.5)
601
- st.toast('Visualize Topics', icon='๐Ÿƒ')
602
- fig1 = Vis_Topics(extype)
603
-
604
- time.sleep(.5)
605
- st.toast('Visualize Document', icon='๐Ÿƒ')
606
- fig2 = Vis_Documents(extype)
607
-
608
- time.sleep(.5)
609
- st.toast('Visualize Document Hierarchy', icon='๐Ÿƒ')
610
- fig3 = Vis_Hierarchy(extype)
611
-
612
- time.sleep(.5)
613
- st.toast('Visualize Topic Similarity', icon='๐Ÿƒ')
614
- fig4 = Vis_Heatmap(extype)
615
-
616
- time.sleep(.5)
617
- st.toast('Visualize Terms', icon='๐Ÿƒ')
618
- fig5 = Vis_Barchart(extype)
619
-
620
- bertab1, bertab2, bertab3, bertab4, bertab5 = st.tabs(["Visualize Topics", "Visualize Terms", "Visualize Documents",
621
- "Visualize Document Hierarchy", "Visualize Topic Similarity"])
622
-
623
- with bertab1:
624
- st.plotly_chart(fig1, use_container_width=True)
625
- with bertab2:
626
- st.plotly_chart(fig5, use_container_width=True)
627
- with bertab3:
628
- st.plotly_chart(fig2, use_container_width=True)
629
- with bertab4:
630
- st.plotly_chart(fig3, use_container_width=True)
631
- with bertab5:
632
- st.plotly_chart(fig4, use_container_width=True)
633
-
634
- #===download results===#
635
- results = topic_model.get_topic_info()
636
- resultf = pd.DataFrame(results)
637
- resultcsv = resultf.to_csv().encode("utf-8")
638
- st.download_button(
639
- label = "Download Results",
640
- data=resultcsv,
641
- file_name="results.csv",
642
- mime="text\csv",
643
- on_click="ignore",
644
- )
645
-
646
- except ValueError as e:
647
- st.write(e)
648
- st.error('๐Ÿ™‡โ€โ™‚๏ธ Please raise the number of topics and click submit')
649
-
650
-
651
- except NameError as e:
652
- st.warning('๐Ÿ–ฑ๏ธ Please click Submit')
653
- st.write(e)
654
-
655
- with tab2:
656
- st.markdown('**Grootendorst, M. (2022). BERTopic: Neural topic modeling with a class-based TF-IDF procedure. arXiv preprint arXiv:2203.05794.** https://doi.org/10.48550/arXiv.2203.05794')
657
-
658
- with tab3:
659
- st.markdown('**Jeet Rawat, A., Ghildiyal, S., & Dixit, A. K. (2022, December 1). Topic modelling of legal documents using NLP and bidirectional encoder representations from transformers. Indonesian Journal of Electrical Engineering and Computer Science, 28(3), 1749.** https://doi.org/10.11591/ijeecs.v28.i3.pp1749-1755')
660
- st.markdown('**Yao, L. F., Ferawati, K., Liew, K., Wakamiya, S., & Aramaki, E. (2023, April 20). Disruptions in the Cystic Fibrosis Communityโ€™s Experiences and Concerns During the COVID-19 Pandemic: Topic Modeling and Time Series Analysis of Reddit Comments. Journal of Medical Internet Research, 25, e45249.** https://doi.org/10.2196/45249')
661
-
662
- with tab4:
663
- st.divider()
664
- st.subheader(':blue[BERTopic]', anchor=False)
665
- st.text("Click the camera icon on the top right menu")
666
- st.markdown("![Downloading visualization](https://raw.githubusercontent.com/faizhalas/library-tools/main/images/download_bertopic.jpg)")
667
- st.divider()
668
- st.subheader(':blue[Downloading CSV Results]', anchor=False)
669
- st.button("Download Results")
670
- st.text("Click Download results button at bottom of page")
671
-
672
- except:
673
- st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="๐Ÿšจ")
674
- st.stop()
 
 
 
 
 
 
 
 
675
  =======
676
  #import module
677
  import streamlit as st
 
1
  <<<<<<< HEAD
2
+ #import module
3
+ import streamlit as st
4
+ import streamlit.components.v1 as components
5
+ import pandas as pd
6
+ import numpy as np
7
+ import re
8
+ import string
9
+ import nltk
10
+ nltk.download('wordnet')
11
+ from nltk.stem import WordNetLemmatizer
12
+ nltk.download('stopwords')
13
+ from nltk.corpus import stopwords
14
+ import gensim
15
+ import gensim.corpora as corpora
16
+ from gensim.corpora import Dictionary
17
+ from gensim.models.coherencemodel import CoherenceModel
18
+ from gensim.models.ldamodel import LdaModel
19
+ from gensim.models import Phrases
20
+ from gensim.models.phrases import Phraser
21
+ from pprint import pprint
22
+ import pickle
23
+ import pyLDAvis
24
+ import pyLDAvis.gensim_models as gensimvis
25
+ from io import StringIO
26
+ from ipywidgets.embed import embed_minimal_html
27
+ from nltk.stem.snowball import SnowballStemmer
28
+ from bertopic import BERTopic
29
+ from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, TextGeneration
30
+ import plotly.express as px
31
+ from sklearn.cluster import KMeans
32
+ from sklearn.feature_extraction.text import CountVectorizer
33
+ import bitermplus as btm
34
+ import tmplot as tmp
35
+ import tomotopy
36
+ import sys
37
+ import spacy
38
+ import en_core_web_sm
39
+ import pipeline
40
+ from html2image import Html2Image
41
+ from umap import UMAP
42
+ import os
43
+ import time
44
+ import json
45
+ from tools import sourceformat as sf
46
+ import datamapplot
47
+ from sentence_transformers import SentenceTransformer
48
+ import openai
49
+ from transformers import pipeline
50
+
51
+ #===config===
52
+ st.set_page_config(
53
+ page_title="Coconut",
54
+ page_icon="๐Ÿฅฅ",
55
+ layout="wide",
56
+ initial_sidebar_state="collapsed"
57
+ )
58
+
59
+ hide_streamlit_style = """
60
+ <style>
61
+ #MainMenu
62
+ {visibility: hidden;}
63
+ footer {visibility: hidden;}
64
+ [data-testid="collapsedControl"] {display: none}
65
+ </style>
66
+ """
67
+ st.markdown(hide_streamlit_style, unsafe_allow_html=True)
68
+
69
+ with st.popover("๐Ÿ”— Menu"):
70
+ st.page_link("https://www.coconut-libtool.com/", label="Home", icon="๐Ÿ ")
71
+ st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1๏ธโƒฃ")
72
+ st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2๏ธโƒฃ")
73
+ st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3๏ธโƒฃ")
74
+ st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4๏ธโƒฃ")
75
+ st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5๏ธโƒฃ")
76
+ st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6๏ธโƒฃ")
77
+ st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7๏ธโƒฃ")
78
+ st.page_link("pages/8 Shifterator.py", label="Shifterator", icon="8๏ธโƒฃ")
79
+ st.page_link("pages/9 Summarization.py", label = "Summarization",icon ="9๏ธโƒฃ")
80
+ st.page_link("pages/10 WordCloud.py", label = "WordCloud", icon = "๐Ÿ”Ÿ")
81
+
82
+ st.header("Topic Modeling", anchor=False)
83
+ st.subheader('Put your file here...', anchor=False)
84
+
85
#========unique id========
# Crude per-session counter: @st.cache_resource returns the SAME list object
# to every session for up to an hour, so mutating l[0] below increments a
# shared counter. The resulting value is prefixed to the uploaded file's name
# (see get_ext) so cache keys differ between uploads/sessions.
@st.cache_resource(ttl=3600)
def create_list():
    l = [1, 2, 3]
    return l

l = create_list()
first_list_value = l[0]
l[0] = first_list_value + 1
uID = str(l[0])

@st.cache_data(ttl=3600)
def get_ext(uploaded_file):
    # Cache key for this upload: unique id + original file name (keeps the
    # extension, which the dispatch code below relies on).
    extype = uID+uploaded_file.name
    return extype
100
+
101
#===clear cache===

def reset_biterm():
    # Clear the cached Biterm figures; if they were never created in this
    # session (NameError), fall back to clearing the cached topic model.
    try:
        biterm_map.clear()
        biterm_bar.clear()
    except NameError:
        biterm_topic.clear()

def reset_all():
    # Drop every @st.cache_data entry; wired to widgets whose change
    # invalidates downstream results.
    st.cache_data.clear()

#===avoiding deadlock===
# Disable HuggingFace tokenizers' thread parallelism to avoid fork-related
# deadlock warnings when transformers models run inside Streamlit.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
115
+
116
#===upload file===
@st.cache_data(ttl=3600)
def upload(file):
    """Read the uploaded CSV into a DataFrame and normalize column names.

    NOTE(review): the ``file`` parameter is unused — the function reads the
    module-level ``uploaded_file``; the call site passes the ``extype`` string,
    which effectively serves only as the cache key. Confirm before refactoring.
    """
    papers = pd.read_csv(uploaded_file)
    # Dimensions exports begin with an "About the data" banner column.
    if "About the data" in papers.columns[0]:
        papers = sf.dim(papers)
    # Map Dimensions-style column names onto the canonical Scopus-like names
    # used by the rest of the app (no-op when the columns are absent).
    col_dict = {'MeSH terms': 'Keywords',
            'PubYear': 'Year',
            'Times cited': 'Cited by',
            'Publication Type': 'Document Type'
            }
    papers.rename(columns=col_dict, inplace=True)

    return papers
130
+
131
@st.cache_data(ttl=3600)
def conv_txt(extype):
    """Read a .txt export into a DataFrame with normalized column names.

    Handles two formats: PubMed MEDLINE (detected by the "PMID" tag) and
    tab-separated Web of Science / HathiTrust exports.
    """
    # MEDLINE detection consumes the stream, so rewind before re-reading.
    if("PMID" in (uploaded_file.read()).decode()):
        uploaded_file.seek(0)
        papers = sf.medline(uploaded_file)
        print(papers)
        return papers
    # Web of Science two-letter field tags -> human-readable column names.
    col_dict = {'TI': 'Title',
            'SO': 'Source title',
            'DE': 'Author Keywords',
            'DT': 'Document Type',
            'AB': 'Abstract',
            'TC': 'Cited by',
            'PY': 'Year',
            'ID': 'Keywords Plus',
            'rights_date_used': 'Year'}
    uploaded_file.seek(0)
    papers = pd.read_csv(uploaded_file, sep='\t')
    # HathiTrust exports carry an "htid" column and need their own reshaping.
    if("htid" in papers.columns):
        papers = sf.htrc(papers)
    papers.rename(columns=col_dict, inplace=True)
    print(papers)
    return papers
154
+
155
+
156
+ @st.cache_data(ttl=3600)
157
+ def conv_json(extype):
158
+ col_dict={'title': 'title',
159
+ 'rights_date_used': 'Year',
160
+ }
161
+
162
+ data = json.load(uploaded_file)
163
+ hathifile = data['gathers']
164
+ keywords = pd.DataFrame.from_records(hathifile)
165
+
166
+ keywords = sf.htrc(keywords)
167
+ keywords.rename(columns=col_dict,inplace=True)
168
+ return keywords
169
+
170
@st.cache_resource(ttl=3600)
def conv_pub(extype):
    """Read a PubMed .tar.gz archive or an .xml export into a DataFrame.

    NOTE(review): unlike the other conv_* helpers, ``extype`` here is the
    uploaded file object itself (the call site passes ``uploaded_file``).
    If the name ends with neither suffix, ``keywords`` is never bound and
    this raises UnboundLocalError; in practice the file uploader's type
    filter prevents that case.
    """
    if (get_ext(extype)).endswith('.tar.gz'):
        bytedata = extype.read()
        keywords = sf.readPub(bytedata)
    elif (get_ext(extype)).endswith('.xml'):
        bytedata = extype.read()
        keywords = sf.readxml(bytedata)
    return keywords
179
+
180
+ #===Read data===
181
+ uploaded_file = st.file_uploader('', type=['csv', 'txt','json','tar.gz','xml'], on_change=reset_all)
182
+
183
+ if uploaded_file is not None:
184
+ try:
185
+ extype = get_ext(uploaded_file)
186
+
187
+ if extype.endswith('.csv'):
188
+ papers = upload(extype)
189
+ elif extype.endswith('.txt'):
190
+ papers = conv_txt(extype)
191
+
192
+ elif extype.endswith('.json'):
193
+ papers = conv_json(extype)
194
+ elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
195
+ papers = conv_pub(uploaded_file)
196
+
197
+ coldf = sorted(papers.select_dtypes(include=['object']).columns.tolist())
198
+
199
+ c1, c2, c3 = st.columns([3,3,4])
200
+ method = c1.selectbox(
201
+ 'Choose method',
202
+ ('Choose...', 'pyLDA', 'Biterm', 'BERTopic'))
203
+ ColCho = c2.selectbox('Choose column', (["Abstract","Title", "Abstract + Title"]))
204
+ num_cho = c3.number_input('Choose number of topics', min_value=2, max_value=30, value=5)
205
+
206
+ d1, d2 = st.columns([3,7])
207
+ xgram = d1.selectbox("N-grams", ("1", "2", "3"))
208
+ xgram = int(xgram)
209
+ words_to_remove = d2.text_input("Remove specific words. Separate words by semicolons (;)")
210
+
211
+ rem_copyright = d1.toggle('Remove copyright statement', value=True)
212
+ rem_punc = d2.toggle('Remove punctuation', value=True)
213
+
214
+ #===advance settings===
215
+ with st.expander("๐Ÿงฎ Show advance settings"):
216
+ t1, t2, t3 = st.columns([3,3,4])
217
+ if method == 'pyLDA':
218
+ py_random_state = t1.number_input('Random state', min_value=0, max_value=None, step=1)
219
+ py_chunksize = t2.number_input('Chunk size', value=100 , min_value=10, max_value=None, step=1)
220
+ opt_threshold = t3.number_input('Threshold', value=100 , min_value=1, max_value=None, step=1)
221
+
222
+ elif method == 'Biterm':
223
+ btm_seed = t1.number_input('Random state seed', value=100 , min_value=1, max_value=None, step=1)
224
+ btm_iterations = t2.number_input('Iterations number', value=20 , min_value=2, max_value=None, step=1)
225
+ opt_threshold = t3.number_input('Threshold', value=100 , min_value=1, max_value=None, step=1)
226
+
227
+ elif method == 'BERTopic':
228
+ u1, u2 = st.columns([5,5])
229
+
230
+ bert_top_n_words = u1.number_input('top_n_words', value=5 , min_value=5, max_value=25, step=1)
231
+ bert_random_state = u2.number_input('random_state', value=42 , min_value=1, max_value=None, step=1)
232
+ bert_n_components = u1.number_input('n_components', value=5 , min_value=1, max_value=None, step=1)
233
+ bert_n_neighbors = u2.number_input('n_neighbors', value=15 , min_value=1, max_value=None, step=1)
234
+ bert_embedding_model = st.radio(
235
+ "embedding_model",
236
+ ["all-MiniLM-L6-v2", "paraphrase-multilingual-MiniLM-L12-v2", "en_core_web_sm"], index=0, horizontal=True)
237
+
238
+ fine_tuning = st.toggle("Use Fine-tuning")
239
+ if fine_tuning:
240
+ topic_labelling = st.toggle("Automatic topic labelling")
241
+ if topic_labelling:
242
+ llm_provider = st.selectbox("Model",["OpenAI/gpt-4o","Google/flan-t5","LiquidAI/LFM2-350M","deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"])
243
+ if llm_provider == "OpenAI/gpt-4o":
244
+ api_key = st.text_input("API Key")
245
+
246
+ else:
247
+ st.write('Please choose your preferred method')
248
+
249
        #===clean csv===
        @st.cache_data(ttl=3600, show_spinner=False)
        def clean_csv(extype):
            """Preprocess the chosen text column for topic modeling.

            Pipeline: optional Abstract+Title concatenation, drop empty rows,
            lowercase, optional punctuation and copyright-statement removal,
            NLTK stopword removal, WordNet lemmatization, then removal of
            user-specified words. Returns (list of cleaned document strings,
            the filtered DataFrame).
            """
            # Build the combined column on demand before dropping empty rows.
            if (ColCho=="Abstract + Title"):
                papers["Abstract + Title"] = papers["Title"] + " " + papers["Abstract"]
                st.write(papers["Abstract + Title"])

            paper = papers.dropna(subset=[ColCho])

            #===mapping===
            paper['Abstract_pre'] = paper[ColCho].map(lambda x: x.lower())
            if rem_punc:
                # Replace ASCII punctuation with spaces, then collapse runs of
                # whitespace so tokenization stays clean.
                paper['Abstract_pre'] = paper['Abstract_pre'].map(
                    lambda x: re.sub(f"[{re.escape(string.punctuation)}]", " ", x)
                ).map(lambda x: re.sub(r"\s+", " ", x).strip())
                # Curly quotes are not in string.punctuation; strip separately.
                paper['Abstract_pre'] = paper['Abstract_pre'].str.replace('[\u2018\u2019\u201c\u201d]', '', regex=True)
            if rem_copyright:
                # Drop everything from the © sign to the end of the text.
                paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: re.sub('©.*', '', x))

            #===stopword removal===
            stop = stopwords.words('english')
            paper['Abstract_stop'] = paper['Abstract_pre'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

            #===lemmatize===
            lemmatizer = WordNetLemmatizer()

            @st.cache_resource(ttl=3600)
            def lemmatize_words(text):
                words = text.split()
                words = [lemmatizer.lemmatize(word) for word in words]
                return ' '.join(words)
            paper['Abstract_lem'] = paper['Abstract_stop'].apply(lemmatize_words)

            # User-supplied removal list, split on semicolons; a dict gives
            # O(1) membership tests in remove_words below.
            words_rmv = [word.strip() for word in words_to_remove.split(";")]
            remove_dict = {word: None for word in words_rmv}

            @st.cache_resource(ttl=3600)
            def remove_words(text):
                words = text.split()
                cleaned_words = [word for word in words if word not in remove_dict]
                return ' '.join(cleaned_words)
            paper['Abstract_lem'] = paper['Abstract_lem'].map(remove_words)

            topic_abs = paper.Abstract_lem.values.tolist()
            return topic_abs, paper
294
+
295
+ topic_abs, paper=clean_csv(extype)
296
+
297
+ if st.button("Submit", on_click=reset_all):
298
+ num_topic = num_cho
299
+
300
+ if method == 'BERTopic':
301
+ st.info('BERTopic is an expensive process when dealing with a large volume of text with our existing resources. Please kindly wait until the visualization appears.', icon="โ„น๏ธ")
302
+
303
+ #===topic===
304
+ if method == 'Choose...':
305
+ st.write('')
306
+
307
+ elif method == 'pyLDA':
308
+ tab1, tab2, tab3, tab4 = st.tabs(["๐Ÿ“ˆ Generate visualization", "๐Ÿ“ƒ Reference", "๐Ÿ““ Recommended Reading", "โฌ‡๏ธ Download Help"])
309
+
310
+ with tab1:
311
                #===visualization===
                @st.cache_data(ttl=3600, show_spinner=False)
                def pylda(extype):
                    """Train a gensim LDA model on the cleaned documents and
                    build its pyLDAvis visualization.

                    Returns (pyLDAvis HTML string, c_v coherence score,
                    prepared pyLDAvis data, list of (topic, top-30 words)).
                    """
                    topic_abs_LDA = [t.split(' ') for t in topic_abs]

                    # Learn bigram/trigram phrases and merge frequent
                    # collocations into single tokens before modeling.
                    bigram = Phrases(topic_abs_LDA, min_count=xgram, threshold=opt_threshold)
                    trigram = Phrases(bigram[topic_abs_LDA], threshold=opt_threshold)
                    bigram_mod = Phraser(bigram)
                    trigram_mod = Phraser(trigram)

                    topic_abs_LDA = [trigram_mod[bigram_mod[doc]] for doc in topic_abs_LDA]

                    id2word = Dictionary(topic_abs_LDA)
                    corpus = [id2word.doc2bow(text) for text in topic_abs_LDA]
                    #===LDA===
                    lda_model = LdaModel(corpus=corpus,
                        id2word=id2word,
                        num_topics=num_topic, 
                        random_state=py_random_state,
                        chunksize=py_chunksize,
                        alpha='auto',
                        per_word_topics=False)
                    pprint(lda_model.print_topics())
                    doc_lda = lda_model[corpus]
                    # Top 30 words per topic, unformatted, for the CSV export.
                    topics = lda_model.show_topics(num_words = 30,formatted=False)

                    #===visualization===
                    coherence_model_lda = CoherenceModel(model=lda_model, texts=topic_abs_LDA, dictionary=id2word, coherence='c_v')
                    coherence_lda = coherence_model_lda.get_coherence()
                    vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
                    py_lda_vis_html = pyLDAvis.prepared_data_to_html(vis)
                    return py_lda_vis_html, coherence_lda, vis, topics
343
+
344
+ with st.spinner('Performing computations. Please wait ...'):
345
+ try:
346
+ py_lda_vis_html, coherence_lda, vis, topics = pylda(extype)
347
+ st.write('Coherence score: ', coherence_lda)
348
+ components.html(py_lda_vis_html, width=1500, height=800)
349
+ st.markdown('Copyright (c) 2015, Ben Mabey. https://github.com/bmabey/pyLDAvis')
350
+
351
                        @st.cache_data(ttl=3600, show_spinner=False)
                        def img_lda(vis):
                            """Render the pyLDAvis output to ldavis_img.png via
                            a headless-browser screenshot (html2image)."""
                            pyLDAvis.save_html(vis, 'output.html')
                            hti = Html2Image()
                            # Force a white background and hide scrollbars in
                            # the captured image.
                            hti.browser.flags = ['--default-background-color=ffffff', '--hide-scrollbars']
                            hti.browser.use_new_headless = None
                            css = "body {background: white;}"
                            hti.screenshot(
                                other_file='output.html', css_str=css, size=(1500, 800),
                                save_as='ldavis_img.png'
                            )

                        img_lda(vis)
364
+
365
+ d1, d2 = st.columns(2)
366
+ with open("ldavis_img.png", "rb") as file:
367
+ btn = d1.download_button(
368
+ label="Download image",
369
+ data=file,
370
+ file_name="ldavis_img.png",
371
+ mime="image/png"
372
+ )
373
+
374
+ #===download results===#
375
+ resultf = pd.DataFrame(topics)
376
+ #formatting
377
+ resultf = resultf.transpose()
378
+ resultf = resultf.drop([0])
379
+ resultf = resultf.explode(list(range(len(resultf.columns))), ignore_index=False)
380
+
381
+ resultcsv = resultf.to_csv().encode("utf-8")
382
+ d2.download_button(
383
+ label = "Download Results",
384
+ data=resultcsv,
385
+ file_name="results.csv",
386
+ mime="text\csv",
387
+ on_click="ignore")
388
+
389
+ except NameError as f:
390
+ st.warning('๐Ÿ–ฑ๏ธ Please click Submit')
391
+
392
+ with tab2:
393
+ st.markdown('**Sievert, C., & Shirley, K. (2014). LDAvis: A method for visualizing and interpreting topics. Proceedings of the Workshop on Interactive Language Learning, Visualization, and Interfaces.** https://doi.org/10.3115/v1/w14-3110')
394
+
395
+ with tab3:
396
+ st.markdown('**Chen, X., & Wang, H. (2019, January). Automated chat transcript analysis using topic modeling for library reference services. Proceedings of the Association for Information Science and Technology, 56(1), 368โ€“371.** https://doi.org/10.1002/pra2.31')
397
+ st.markdown('**Joo, S., Ingram, E., & Cahill, M. (2021, December 15). Exploring Topics and Genres in Storytime Books: A Text Mining Approach. Evidence Based Library and Information Practice, 16(4), 41โ€“62.** https://doi.org/10.18438/eblip29963')
398
+ st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Topic Modeling. Text Mining for Information Professionals, 105โ€“137.** https://doi.org/10.1007/978-3-030-85085-2_4')
399
+ st.markdown('**Lamba, M., & Madhusudhan, M. (2019, June 7). Mapping of topics in DESIDOC Journal of Library and Information Technology, India: a study. Scientometrics, 120(2), 477โ€“505.** https://doi.org/10.1007/s11192-019-03137-5')
400
+
401
+ with tab4:
402
+ st.subheader(':blue[pyLDA]', anchor=False)
403
+ st.button('Download image')
404
+ st.text("Click Download Image button.")
405
+ st.divider()
406
+ st.subheader(':blue[Downloading CSV Results]', anchor=False)
407
+ st.button("Download Results")
408
+ st.text("Click Download results button at bottom of page")
409
+
410
+ #===Biterm===
411
+ elif method == 'Biterm':
412
+
413
            #===optimize Biterm===
            @st.cache_data(ttl=3600, show_spinner=False)
            def biterm_topic(extype):
                """Train a Biterm Topic Model (bitermplus) on the documents.

                Returns (topic coordinates for the scatter map, phi word-topic
                matrix, topic labels, perplexity, top words per topic).
                """
                tokenized_abs = [t.split(' ') for t in topic_abs]

                # Merge frequent bi-/tri-gram collocations into single tokens.
                bigram = Phrases(tokenized_abs, min_count=xgram, threshold=opt_threshold)
                trigram = Phrases(bigram[tokenized_abs], threshold=opt_threshold)
                bigram_mod = Phraser(bigram)
                trigram_mod = Phraser(trigram)

                topic_abs_ngram = [trigram_mod[bigram_mod[doc]] for doc in tokenized_abs]

                topic_abs_str = [' '.join(doc) for doc in topic_abs_ngram]


                X, vocabulary, vocab_dict = btm.get_words_freqs(topic_abs_str)
                tf = np.array(X.sum(axis=0)).ravel()
                # NOTE(review): vectorizes the pre-ngram documents (topic_abs)
                # rather than topic_abs_str — confirm this is intentional.
                docs_vec = btm.get_vectorized_docs(topic_abs, vocabulary)
                docs_lens = list(map(len, docs_vec))
                biterms = btm.get_biterms(docs_vec)

                model = btm.BTM(X, vocabulary, seed=btm_seed, T=num_topic, M=20, alpha=50/8, beta=0.01)
                model.fit(biterms, iterations=btm_iterations)

                p_zd = model.transform(docs_vec)
                coherence = model.coherence_
                phi = tmp.get_phi(model)
                topics_coords = tmp.prepare_coords(model)
                totaltop = topics_coords.label.values.tolist()
                perplexity = model.perplexity_
                top_topics = model.df_words_topics_

                return topics_coords, phi, totaltop, perplexity, top_topics
446
+
447
+ tab1, tab2, tab3, tab4 = st.tabs(["๐Ÿ“ˆ Generate visualization", "๐Ÿ“ƒ Reference", "๐Ÿ““ Recommended Reading", "โฌ‡๏ธ Download Help"])
448
+ with tab1:
449
+ try:
450
+ with st.spinner('Performing computations. Please wait ...'):
451
+ topics_coords, phi, totaltop, perplexity, top_topics = biterm_topic(extype)
452
+ col1, col2 = st.columns([4,6])
453
+
454
+ @st.cache_data(ttl=3600)
455
+ def biterm_map(extype):
456
+ btmvis_coords = tmp.plot_scatter_topics(topics_coords, size_col='size', label_col='label', topic=numvis)
457
+ return btmvis_coords
458
+
459
+ @st.cache_data(ttl=3600)
460
+ def biterm_bar(extype):
461
+ terms_probs = tmp.calc_terms_probs_ratio(phi, topic=numvis, lambda_=1)
462
+ btmvis_probs = tmp.plot_terms(terms_probs, font_size=12)
463
+ return btmvis_probs
464
+
465
+ with col1:
466
+ st.write('Perplexity score: ', perplexity)
467
+ st.write('')
468
+ numvis = st.selectbox(
469
+ 'Choose topic',
470
+ (totaltop), on_change=reset_biterm)
471
+ btmvis_coords = biterm_map(extype)
472
+ st.altair_chart(btmvis_coords)
473
+ with col2:
474
+ btmvis_probs = biterm_bar(extype)
475
+ st.altair_chart(btmvis_probs, use_container_width=True)
476
+
477
+ #===download results===#
478
+ resultcsv = top_topics.to_csv().encode("utf-8")
479
+ st.download_button(label = "Download Results", data=resultcsv, file_name="results.csv", mime="text\csv", on_click="ignore")
480
+
481
+ except ValueError as g:
482
+ st.error('๐Ÿ™‡โ€โ™‚๏ธ Please raise the number of topics and click submit')
483
+
484
+ except NameError as f:
485
+ st.warning('๐Ÿ–ฑ๏ธ Please click Submit')
486
+
487
+ with tab2:
488
+ st.markdown('**Yan, X., Guo, J., Lan, Y., & Cheng, X. (2013, May 13). A biterm topic model for short texts. Proceedings of the 22nd International Conference on World Wide Web.** https://doi.org/10.1145/2488388.2488514')
489
+ with tab3:
490
+ st.markdown('**Cai, M., Shah, N., Li, J., Chen, W. H., Cuomo, R. E., Obradovich, N., & Mackey, T. K. (2020, August 26). Identification and characterization of tweets related to the 2015 Indiana HIV outbreak: A retrospective infoveillance study. PLOS ONE, 15(8), e0235150.** https://doi.org/10.1371/journal.pone.0235150')
491
+ st.markdown('**Chen, Y., Dong, T., Ban, Q., & Li, Y. (2021). What Concerns Consumers about Hypertension? A Comparison between the Online Health Community and the Q&A Forum. International Journal of Computational Intelligence Systems, 14(1), 734.** https://doi.org/10.2991/ijcis.d.210203.002')
492
+ st.markdown('**George, Crissandra J., "AMBIGUOUS APPALACHIANNESS: A LINGUISTIC AND PERCEPTUAL INVESTIGATION INTO ARC-LABELED PENNSYLVANIA COUNTIES" (2022). Theses and Dissertations-- Linguistics. 48.** https://doi.org/10.13023/etd.2022.217')
493
+ st.markdown('**Li, J., Chen, W. H., Xu, Q., Shah, N., Kohler, J. C., & Mackey, T. K. (2020). Detection of self-reported experiences with corruption on twitter using unsupervised machine learning. Social Sciences & Humanities Open, 2(1), 100060.** https://doi.org/10.1016/j.ssaho.2020.100060')
494
+ with tab4:
495
+ st.subheader(':blue[Biterm]', anchor=False)
496
+ st.text("Click the three dots at the top right then select the desired format.")
497
+ st.markdown("![Downloading visualization](https://raw.githubusercontent.com/faizhalas/library-tools/main/images/download_biterm.jpg)")
498
+ st.divider()
499
+ st.subheader(':blue[Downloading CSV Results]', anchor=False)
500
+ st.button("Download Results")
501
+ st.text("Click Download results button at bottom of page")
502
+
503
+
504
+ #===BERTopic===
505
+ elif method == 'BERTopic':
506
            @st.cache_resource(ttl = 3600, show_spinner=False)
            #@st.cache_data(ttl=3600, show_spinner=False)
            def bertopic_vis(extype):
                """Fit a BERTopic model on the cleaned documents.

                Uses KMeans (k = num_topic) in place of HDBSCAN so the topic
                count is exactly the user's choice, together with the UMAP /
                embedding / representation settings chosen in the advanced
                expander (read from the enclosing scope). Returns
                (topic_model, topics, probs, embeddings).
                """
                umap_model = UMAP(n_neighbors=bert_n_neighbors, n_components=bert_n_components, 
                    min_dist=0.0, metric='cosine', random_state=bert_random_state)
                cluster_model = KMeans(n_clusters=num_topic)
                # Pre-compute document embeddings with the selected backend so
                # they can be reused by the document-map visualization.
                if bert_embedding_model == 'all-MiniLM-L6-v2':
                    model = SentenceTransformer('all-MiniLM-L6-v2')
                    lang = 'en'
                    embeddings = model.encode(topic_abs, show_progress_bar=True)

                elif bert_embedding_model == 'en_core_web_sm':
                    # spaCy vectors; pipeline components not needed for .vector
                    # are excluded for speed.
                    nlp = en_core_web_sm.load(exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
                    model = nlp
                    lang = 'en'
                    embeddings = np.array([nlp(text).vector for text in topic_abs])

                elif bert_embedding_model == 'paraphrase-multilingual-MiniLM-L12-v2':
                    model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
                    lang = 'multilingual'
                    embeddings = model.encode(topic_abs, show_progress_bar=True)

                # Empty string = no representation fine-tuning.
                representation_model = ""

                if fine_tuning:
                    keybert = KeyBERTInspired()
                    mmr = MaximalMarginalRelevance(diversity=0.3)
                    representation_model = {
                        "KeyBERT": keybert,
                        "MMR": mmr,
                    }
                    if topic_labelling:
                        # Add an LLM-backed label generator under the "test"
                        # aspect key; its output is read back below.
                        if llm_provider == "OpenAI/gpt-4o":
                            client = openai.OpenAI(api_key=api_key)
                            representation_model = {
                                "KeyBERT": keybert,
                                "MMR": mmr,
                                "test": OpenAI(client, model = "gpt-4o-mini", delay_in_seconds=10)
                            }
                        elif llm_provider == "Google/flan-t5":
                            pipe = pipeline("text2text-generation", model = "google/flan-t5-base")
                            clientmod = TextGeneration(pipe)
                            representation_model = {
                                "KeyBERT": keybert,
                                "MMR": mmr,
                                "test": clientmod
                            }
                        elif llm_provider == "LiquidAI/LFM2-350M":
                            pipe = pipeline("text-generation", model = "LiquidAI/LFM2-350M")
                            clientmod = TextGeneration(pipe)
                            representation_model = {
                                "KeyBERT": keybert,
                                "MMR": mmr,
                                "test": clientmod
                            }
                        elif llm_provider == "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B":
                            pipe = pipeline("text-generation", model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
                            clientmod = TextGeneration(pipe)
                            representation_model = {
                                "KeyBERT": keybert,
                                "MMR": mmr,
                                "test": clientmod
                            }

                vectorizer_model = CountVectorizer(ngram_range=(1, xgram), stop_words='english')
                topic_model = BERTopic(representation_model = representation_model, embedding_model=model, hdbscan_model=cluster_model, language=lang, umap_model=umap_model, vectorizer_model=vectorizer_model, top_n_words=bert_top_n_words)
                topics, probs = topic_model.fit_transform(topic_abs, embeddings=embeddings)

                if(fine_tuning and topic_labelling):
                    # Use the first line of each generated "test"-aspect label
                    # as the custom topic label shown in the visualizations.
                    generated_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["test"].values()]
                    topic_model.set_topic_labels(generated_labels)

                return topic_model, topics, probs, embeddings
579
+
580
            @st.cache_resource(ttl = 3600, show_spinner=False)
            def Vis_Topics(extype):
                # Intertopic distance map.
                fig1 = topic_model.visualize_topics()
                return fig1
            @st.cache_resource(ttl = 3600, show_spinner=False)
            def Vis_Documents(extype):
                # 2-D document map (datamapplot) reusing the precomputed
                # embeddings and the custom (LLM-generated) labels if set.
                fig2 = topic_model.visualize_document_datamap(topic_abs, embeddings=embeddings, custom_labels = True)
                return fig2
            @st.cache_resource(ttl = 3600, show_spinner=False)
            def Vis_Hierarchy(extype):
                # Hierarchical clustering of the topics.
                fig3 = topic_model.visualize_hierarchy(top_n_topics=num_topic, custom_labels = True)
                return fig3
            @st.cache_resource(ttl = 3600, show_spinner=False)
            def Vis_Heatmap(extype):
                global topic_model
                # Topic-similarity heatmap; n_clusters must be < topic count.
                fig4 = topic_model.visualize_heatmap(n_clusters=num_topic-1, width=1000, height=1000, custom_labels = True)
                return fig4
            @st.cache_resource(ttl = 3600, show_spinner=False)
            def Vis_Barchart(extype):
                # Top terms per topic as bar charts.
                fig5 = topic_model.visualize_barchart(top_n_topics=num_topic, custom_labels = True)
                return fig5
601
+
602
+ tab1, tab2, tab3, tab4 = st.tabs(["๐Ÿ“ˆ Generate visualization", "๐Ÿ“ƒ Reference", "๐Ÿ““ Recommended Reading", "โฌ‡๏ธ Download Help"])
603
+ with tab1:
604
+ try:
605
+ with st.spinner('Performing computations. Please wait ...'):
606
+
607
+ topic_model, topics, probs, embeddings = bertopic_vis(extype)
608
+ time.sleep(.5)
609
+ st.toast('Visualize Topics', icon='๐Ÿƒ')
610
+ fig1 = Vis_Topics(extype)
611
+
612
+ time.sleep(.5)
613
+ st.toast('Visualize Document', icon='๐Ÿƒ')
614
+ fig2 = Vis_Documents(extype)
615
+
616
+ time.sleep(.5)
617
+ st.toast('Visualize Document Hierarchy', icon='๐Ÿƒ')
618
+ fig3 = Vis_Hierarchy(extype)
619
+
620
+ time.sleep(.5)
621
+ st.toast('Visualize Topic Similarity', icon='๐Ÿƒ')
622
+ fig4 = Vis_Heatmap(extype)
623
+
624
+ time.sleep(.5)
625
+ st.toast('Visualize Terms', icon='๐Ÿƒ')
626
+ fig5 = Vis_Barchart(extype)
627
+
628
+ bertab1, bertab2, bertab3, bertab4, bertab5 = st.tabs(["Visualize Topics", "Visualize Terms", "Visualize Documents",
629
+ "Visualize Document Hierarchy", "Visualize Topic Similarity"])
630
+
631
+ with bertab1:
632
+ st.plotly_chart(fig1, use_container_width=True)
633
+ with bertab2:
634
+ st.plotly_chart(fig5, use_container_width=True)
635
+ with bertab3:
636
+ st.plotly_chart(fig2, use_container_width=True)
637
+ with bertab4:
638
+ st.plotly_chart(fig3, use_container_width=True)
639
+ with bertab5:
640
+ st.plotly_chart(fig4, use_container_width=True)
641
+
642
+ #===download results===#
643
+ results = topic_model.get_topic_info()
644
+ resultf = pd.DataFrame(results)
645
+ resultcsv = resultf.to_csv().encode("utf-8")
646
+ st.download_button(
647
+ label = "Download Results",
648
+ data=resultcsv,
649
+ file_name="results.csv",
650
+ mime="text\csv",
651
+ on_click="ignore",
652
+ )
653
+
654
+ except ValueError as e:
655
+ st.write(e)
656
+ st.error('๐Ÿ™‡โ€โ™‚๏ธ Please raise the number of topics and click submit')
657
+
658
+
659
+ except NameError as e:
660
+ st.warning('๐Ÿ–ฑ๏ธ Please click Submit')
661
+ st.write(e)
662
+
663
+ with tab2:
664
+ st.markdown('**Grootendorst, M. (2022). BERTopic: Neural topic modeling with a class-based TF-IDF procedure. arXiv preprint arXiv:2203.05794.** https://doi.org/10.48550/arXiv.2203.05794')
665
+
666
+ with tab3:
667
+ st.markdown('**Jeet Rawat, A., Ghildiyal, S., & Dixit, A. K. (2022, December 1). Topic modelling of legal documents using NLP and bidirectional encoder representations from transformers. Indonesian Journal of Electrical Engineering and Computer Science, 28(3), 1749.** https://doi.org/10.11591/ijeecs.v28.i3.pp1749-1755')
668
+ st.markdown('**Yao, L. F., Ferawati, K., Liew, K., Wakamiya, S., & Aramaki, E. (2023, April 20). Disruptions in the Cystic Fibrosis Communityโ€™s Experiences and Concerns During the COVID-19 Pandemic: Topic Modeling and Time Series Analysis of Reddit Comments. Journal of Medical Internet Research, 25, e45249.** https://doi.org/10.2196/45249')
669
+
670
+ with tab4:
671
+ st.divider()
672
+ st.subheader(':blue[BERTopic]', anchor=False)
673
+ st.text("Click the camera icon on the top right menu")
674
+ st.markdown("![Downloading visualization](https://raw.githubusercontent.com/faizhalas/library-tools/main/images/download_bertopic.jpg)")
675
+ st.divider()
676
+ st.subheader(':blue[Downloading CSV Results]', anchor=False)
677
+ st.button("Download Results")
678
+ st.text("Click Download results button at bottom of page")
679
+
680
+ except:
681
+ st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="๐Ÿšจ")
682
+ st.stop()
683
  =======
684
  #import module
685
  import streamlit as st