haotle committed on
Commit
6e13ca0
·
verified ·
1 Parent(s): dcbe703

Update pages/2 Topic Modeling.py

Browse files
Files changed (1) hide show
  1. pages/2 Topic Modeling.py +0 -673
pages/2 Topic Modeling.py CHANGED
@@ -679,677 +679,4 @@ if uploaded_file is not None:
679
  except:
680
  st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
681
  st.stop()
682
- =======
683
- #import module
684
- import streamlit as st
685
- import streamlit.components.v1 as components
686
- import pandas as pd
687
- import numpy as np
688
- import re
689
- import string
690
- import nltk
691
- nltk.download('wordnet')
692
- from nltk.stem import WordNetLemmatizer
693
- nltk.download('stopwords')
694
- from nltk.corpus import stopwords
695
- import gensim
696
- import gensim.corpora as corpora
697
- from gensim.corpora import Dictionary
698
- from gensim.models.coherencemodel import CoherenceModel
699
- from gensim.models.ldamodel import LdaModel
700
- from gensim.models import Phrases
701
- from gensim.models.phrases import Phraser
702
- from pprint import pprint
703
- import pickle
704
- import pyLDAvis
705
- import pyLDAvis.gensim_models as gensimvis
706
- from io import StringIO
707
- from ipywidgets.embed import embed_minimal_html
708
- from nltk.stem.snowball import SnowballStemmer
709
- from bertopic import BERTopic
710
- from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, TextGeneration
711
- import plotly.express as px
712
- from sklearn.cluster import KMeans
713
- from sklearn.feature_extraction.text import CountVectorizer
714
- import bitermplus as btm
715
- import tmplot as tmp
716
- import tomotopy
717
- import sys
718
- import spacy
719
- import en_core_web_sm
720
- import pipeline
721
- from html2image import Html2Image
722
- from umap import UMAP
723
- import os
724
- import time
725
- import json
726
- from tools import sourceformat as sf
727
- import datamapplot
728
- from sentence_transformers import SentenceTransformer
729
- import openai
730
- from transformers import pipeline
731
-
732
- #===config===
733
- st.set_page_config(
734
- page_title="Coconut",
735
- page_icon="🥥",
736
- layout="wide",
737
- initial_sidebar_state="collapsed"
738
- )
739
-
740
- hide_streamlit_style = """
741
- <style>
742
- #MainMenu
743
- {visibility: hidden;}
744
- footer {visibility: hidden;}
745
- [data-testid="collapsedControl"] {display: none}
746
- </style>
747
- """
748
- st.markdown(hide_streamlit_style, unsafe_allow_html=True)
749
-
750
- with st.popover("🔗 Menu"):
751
- st.page_link("https://www.coconut-libtool.com/", label="Home", icon="🏠")
752
- st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
753
- st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
754
- st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
755
- st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
756
- st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
757
- st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
758
- st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
759
- st.page_link("pages/8 Shifterator.py", label="Shifterator", icon="8️⃣")
760
- st.page_link("pages/9 Summarization.py", label = "Summarization",icon ="9️⃣")
761
- st.page_link("pages/10 WordCloud.py", label = "WordCloud", icon = "🔟")
762
-
763
- st.header("Topic Modeling", anchor=False)
764
- st.subheader('Put your file here...', anchor=False)
765
-
766
- #========unique id========
767
- @st.cache_resource(ttl=3600)
768
- def create_list():
769
- l = [1, 2, 3]
770
- return l
771
-
772
- l = create_list()
773
- first_list_value = l[0]
774
- l[0] = first_list_value + 1
775
- uID = str(l[0])
776
-
777
- @st.cache_data(ttl=3600)
778
- def get_ext(uploaded_file):
779
- extype = uID+uploaded_file.name
780
- return extype
781
-
782
- #===clear cache===
783
-
784
- def reset_biterm():
785
- try:
786
- biterm_map.clear()
787
- biterm_bar.clear()
788
- except NameError:
789
- biterm_topic.clear()
790
-
791
- def reset_all():
792
- st.cache_data.clear()
793
-
794
- #===avoiding deadlock===
795
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
796
-
797
- #===upload file===
798
- @st.cache_data(ttl=3600)
799
- def upload(file):
800
- papers = pd.read_csv(uploaded_file)
801
- if "About the data" in papers.columns[0]:
802
- papers = sf.dim(papers)
803
- col_dict = {'MeSH terms': 'Keywords',
804
- 'PubYear': 'Year',
805
- 'Times cited': 'Cited by',
806
- 'Publication Type': 'Document Type'
807
- }
808
- papers.rename(columns=col_dict, inplace=True)
809
-
810
- return papers
811
-
812
- @st.cache_data(ttl=3600)
813
- def conv_txt(extype):
814
- if("PMID" in (uploaded_file.read()).decode()):
815
- uploaded_file.seek(0)
816
- papers = sf.medline(uploaded_file)
817
- print(papers)
818
- return papers
819
- col_dict = {'TI': 'Title',
820
- 'SO': 'Source title',
821
- 'DE': 'Author Keywords',
822
- 'DT': 'Document Type',
823
- 'AB': 'Abstract',
824
- 'TC': 'Cited by',
825
- 'PY': 'Year',
826
- 'ID': 'Keywords Plus',
827
- 'rights_date_used': 'Year'}
828
- uploaded_file.seek(0)
829
- papers = pd.read_csv(uploaded_file, sep='\t')
830
- if("htid" in papers.columns):
831
- papers = sf.htrc(papers)
832
- papers.rename(columns=col_dict, inplace=True)
833
- print(papers)
834
- return papers
835
-
836
-
837
- @st.cache_data(ttl=3600)
838
- def conv_json(extype):
839
- col_dict={'title': 'title',
840
- 'rights_date_used': 'Year',
841
- }
842
-
843
- data = json.load(uploaded_file)
844
- hathifile = data['gathers']
845
- keywords = pd.DataFrame.from_records(hathifile)
846
-
847
- keywords = sf.htrc(keywords)
848
- keywords.rename(columns=col_dict,inplace=True)
849
- return keywords
850
-
851
- @st.cache_resource(ttl=3600)
852
- def conv_pub(extype):
853
- if (get_ext(extype)).endswith('.tar.gz'):
854
- bytedata = extype.read()
855
- keywords = sf.readPub(bytedata)
856
- elif (get_ext(extype)).endswith('.xml'):
857
- bytedata = extype.read()
858
- keywords = sf.readxml(bytedata)
859
- return keywords
860
-
861
- #===Read data===
862
- uploaded_file = st.file_uploader('', type=['csv', 'txt','json','tar.gz','xml'], on_change=reset_all)
863
-
864
- if uploaded_file is not None:
865
- try:
866
- extype = get_ext(uploaded_file)
867
-
868
- if extype.endswith('.csv'):
869
- papers = upload(extype)
870
- elif extype.endswith('.txt'):
871
- papers = conv_txt(extype)
872
-
873
- elif extype.endswith('.json'):
874
- papers = conv_json(extype)
875
- elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
876
- papers = conv_pub(uploaded_file)
877
-
878
- coldf = sorted(papers.select_dtypes(include=['object']).columns.tolist())
879
-
880
- c1, c2, c3 = st.columns([3,3,4])
881
- method = c1.selectbox(
882
- 'Choose method',
883
- ('Choose...', 'pyLDA', 'Biterm', 'BERTopic'))
884
- ColCho = c2.selectbox('Choose column', (["Abstract","Title", "Abstract + Title"]))
885
- num_cho = c3.number_input('Choose number of topics', min_value=2, max_value=30, value=5)
886
-
887
- d1, d2 = st.columns([3,7])
888
- xgram = d1.selectbox("N-grams", ("1", "2", "3"))
889
- xgram = int(xgram)
890
- words_to_remove = d2.text_input("Remove specific words. Separate words by semicolons (;)")
891
-
892
- rem_copyright = d1.toggle('Remove copyright statement', value=True)
893
- rem_punc = d2.toggle('Remove punctuation', value=True)
894
-
895
- #===advance settings===
896
- with st.expander("🧮 Show advance settings"):
897
- t1, t2, t3 = st.columns([3,3,4])
898
- if method == 'pyLDA':
899
- py_random_state = t1.number_input('Random state', min_value=0, max_value=None, step=1)
900
- py_chunksize = t2.number_input('Chunk size', value=100 , min_value=10, max_value=None, step=1)
901
- opt_threshold = t3.number_input('Threshold', value=100 , min_value=1, max_value=None, step=1)
902
-
903
- elif method == 'Biterm':
904
- btm_seed = t1.number_input('Random state seed', value=100 , min_value=1, max_value=None, step=1)
905
- btm_iterations = t2.number_input('Iterations number', value=20 , min_value=2, max_value=None, step=1)
906
- opt_threshold = t3.number_input('Threshold', value=100 , min_value=1, max_value=None, step=1)
907
-
908
- elif method == 'BERTopic':
909
- u1, u2 = st.columns([5,5])
910
-
911
- bert_top_n_words = u1.number_input('top_n_words', value=5 , min_value=5, max_value=25, step=1)
912
- bert_random_state = u2.number_input('random_state', value=42 , min_value=1, max_value=None, step=1)
913
- bert_n_components = u1.number_input('n_components', value=5 , min_value=1, max_value=None, step=1)
914
- bert_n_neighbors = u2.number_input('n_neighbors', value=15 , min_value=1, max_value=None, step=1)
915
- bert_embedding_model = st.radio(
916
- "embedding_model",
917
- ["all-MiniLM-L6-v2", "paraphrase-multilingual-MiniLM-L12-v2", "en_core_web_sm"], index=0, horizontal=True)
918
-
919
- fine_tuning = st.toggle("Use Fine-tuning")
920
- if fine_tuning:
921
- topic_labelling = st.toggle("Automatic topic labelling")
922
- if topic_labelling:
923
- llm_provider = st.selectbox("Model",["OpenAI/gpt-4o","Google/flan-t5","LiquidAI/LFM2-350M"])
924
- if llm_provider == "OpenAI/gpt-4o":
925
- api_key = st.text_input("API Key")
926
-
927
- else:
928
- st.write('Please choose your preferred method')
929
-
930
- #===clean csv===
931
- @st.cache_data(ttl=3600, show_spinner=False)
932
- def clean_csv(extype):
933
- if (ColCho=="Abstract + Title"):
934
- papers["Abstract + Title"] = papers["Title"] + " " + papers["Abstract"]
935
- st.write(papers["Abstract + Title"])
936
-
937
- paper = papers.dropna(subset=[ColCho])
938
-
939
- #===mapping===
940
- paper['Abstract_pre'] = paper[ColCho].map(lambda x: x.lower())
941
- if rem_punc:
942
- paper['Abstract_pre'] = paper['Abstract_pre'].map(
943
- lambda x: re.sub(f"[{re.escape(string.punctuation)}]", " ", x)
944
- ).map(lambda x: re.sub(r"\s+", " ", x).strip())
945
- paper['Abstract_pre'] = paper['Abstract_pre'].str.replace('[\u2018\u2019\u201c\u201d]', '', regex=True)
946
- if rem_copyright:
947
- paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: re.sub('©.*', '', x))
948
-
949
- #===stopword removal===
950
- stop = stopwords.words('english')
951
- paper['Abstract_stop'] = paper['Abstract_pre'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
952
-
953
- #===lemmatize===
954
- lemmatizer = WordNetLemmatizer()
955
-
956
- @st.cache_resource(ttl=3600)
957
- def lemmatize_words(text):
958
- words = text.split()
959
- words = [lemmatizer.lemmatize(word) for word in words]
960
- return ' '.join(words)
961
- paper['Abstract_lem'] = paper['Abstract_stop'].apply(lemmatize_words)
962
-
963
- words_rmv = [word.strip() for word in words_to_remove.split(";")]
964
- remove_dict = {word: None for word in words_rmv}
965
-
966
- @st.cache_resource(ttl=3600)
967
- def remove_words(text):
968
- words = text.split()
969
- cleaned_words = [word for word in words if word not in remove_dict]
970
- return ' '.join(cleaned_words)
971
- paper['Abstract_lem'] = paper['Abstract_lem'].map(remove_words)
972
-
973
- topic_abs = paper.Abstract_lem.values.tolist()
974
- return topic_abs, paper
975
-
976
- topic_abs, paper=clean_csv(extype)
977
-
978
- if st.button("Submit", on_click=reset_all):
979
- num_topic = num_cho
980
-
981
- if method == 'BERTopic':
982
- st.info('BERTopic is an expensive process when dealing with a large volume of text with our existing resources. Please kindly wait until the visualization appears.', icon="ℹ️")
983
-
984
- #===topic===
985
- if method == 'Choose...':
986
- st.write('')
987
-
988
- elif method == 'pyLDA':
989
- tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
990
-
991
- with tab1:
992
- #===visualization===
993
- @st.cache_data(ttl=3600, show_spinner=False)
994
- def pylda(extype):
995
- topic_abs_LDA = [t.split(' ') for t in topic_abs]
996
-
997
- bigram = Phrases(topic_abs_LDA, min_count=xgram, threshold=opt_threshold)
998
- trigram = Phrases(bigram[topic_abs_LDA], threshold=opt_threshold)
999
- bigram_mod = Phraser(bigram)
1000
- trigram_mod = Phraser(trigram)
1001
-
1002
- topic_abs_LDA = [trigram_mod[bigram_mod[doc]] for doc in topic_abs_LDA]
1003
-
1004
- id2word = Dictionary(topic_abs_LDA)
1005
- corpus = [id2word.doc2bow(text) for text in topic_abs_LDA]
1006
- #===LDA===
1007
- lda_model = LdaModel(corpus=corpus,
1008
- id2word=id2word,
1009
- num_topics=num_topic,
1010
- random_state=py_random_state,
1011
- chunksize=py_chunksize,
1012
- alpha='auto',
1013
- per_word_topics=False)
1014
- pprint(lda_model.print_topics())
1015
- doc_lda = lda_model[corpus]
1016
- topics = lda_model.show_topics(num_words = 30,formatted=False)
1017
-
1018
- #===visualization===
1019
- coherence_model_lda = CoherenceModel(model=lda_model, texts=topic_abs_LDA, dictionary=id2word, coherence='c_v')
1020
- coherence_lda = coherence_model_lda.get_coherence()
1021
- vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
1022
- py_lda_vis_html = pyLDAvis.prepared_data_to_html(vis)
1023
- return py_lda_vis_html, coherence_lda, vis, topics
1024
-
1025
- with st.spinner('Performing computations. Please wait ...'):
1026
- try:
1027
- py_lda_vis_html, coherence_lda, vis, topics = pylda(extype)
1028
- st.write('Coherence score: ', coherence_lda)
1029
- components.html(py_lda_vis_html, width=1500, height=800)
1030
- st.markdown('Copyright (c) 2015, Ben Mabey. https://github.com/bmabey/pyLDAvis')
1031
-
1032
- @st.cache_data(ttl=3600, show_spinner=False)
1033
- def img_lda(vis):
1034
- pyLDAvis.save_html(vis, 'output.html')
1035
- hti = Html2Image()
1036
- hti.browser.flags = ['--default-background-color=ffffff', '--hide-scrollbars']
1037
- hti.browser.use_new_headless = None
1038
- css = "body {background: white;}"
1039
- hti.screenshot(
1040
- other_file='output.html', css_str=css, size=(1500, 800),
1041
- save_as='ldavis_img.png'
1042
- )
1043
-
1044
- img_lda(vis)
1045
-
1046
- d1, d2 = st.columns(2)
1047
- with open("ldavis_img.png", "rb") as file:
1048
- btn = d1.download_button(
1049
- label="Download image",
1050
- data=file,
1051
- file_name="ldavis_img.png",
1052
- mime="image/png"
1053
- )
1054
-
1055
- #===download results===#
1056
- resultf = pd.DataFrame(topics)
1057
- #formatting
1058
- resultf = resultf.transpose()
1059
- resultf = resultf.drop([0])
1060
- resultf = resultf.explode(list(range(len(resultf.columns))), ignore_index=False)
1061
-
1062
- resultcsv = resultf.to_csv().encode("utf-8")
1063
- d2.download_button(
1064
- label = "Download Results",
1065
- data=resultcsv,
1066
- file_name="results.csv",
1067
- mime="text\csv",
1068
- on_click="ignore")
1069
-
1070
- except NameError as f:
1071
- st.warning('🖱️ Please click Submit')
1072
-
1073
- with tab2:
1074
- st.markdown('**Sievert, C., & Shirley, K. (2014). LDAvis: A method for visualizing and interpreting topics. Proceedings of the Workshop on Interactive Language Learning, Visualization, and Interfaces.** https://doi.org/10.3115/v1/w14-3110')
1075
-
1076
- with tab3:
1077
- st.markdown('**Chen, X., & Wang, H. (2019, January). Automated chat transcript analysis using topic modeling for library reference services. Proceedings of the Association for Information Science and Technology, 56(1), 368–371.** https://doi.org/10.1002/pra2.31')
1078
- st.markdown('**Joo, S., Ingram, E., & Cahill, M. (2021, December 15). Exploring Topics and Genres in Storytime Books: A Text Mining Approach. Evidence Based Library and Information Practice, 16(4), 41–62.** https://doi.org/10.18438/eblip29963')
1079
- st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Topic Modeling. Text Mining for Information Professionals, 105–137.** https://doi.org/10.1007/978-3-030-85085-2_4')
1080
- st.markdown('**Lamba, M., & Madhusudhan, M. (2019, June 7). Mapping of topics in DESIDOC Journal of Library and Information Technology, India: a study. Scientometrics, 120(2), 477–505.** https://doi.org/10.1007/s11192-019-03137-5')
1081
-
1082
- with tab4:
1083
- st.subheader(':blue[pyLDA]', anchor=False)
1084
- st.button('Download image')
1085
- st.text("Click Download Image button.")
1086
- st.divider()
1087
- st.subheader(':blue[Downloading CSV Results]', anchor=False)
1088
- st.button("Download Results")
1089
- st.text("Click Download results button at bottom of page")
1090
-
1091
- #===Biterm===
1092
- elif method == 'Biterm':
1093
-
1094
- #===optimize Biterm===
1095
- @st.cache_data(ttl=3600, show_spinner=False)
1096
- def biterm_topic(extype):
1097
- tokenized_abs = [t.split(' ') for t in topic_abs]
1098
-
1099
- bigram = Phrases(tokenized_abs, min_count=xgram, threshold=opt_threshold)
1100
- trigram = Phrases(bigram[tokenized_abs], threshold=opt_threshold)
1101
- bigram_mod = Phraser(bigram)
1102
- trigram_mod = Phraser(trigram)
1103
-
1104
- topic_abs_ngram = [trigram_mod[bigram_mod[doc]] for doc in tokenized_abs]
1105
-
1106
- topic_abs_str = [' '.join(doc) for doc in topic_abs_ngram]
1107
-
1108
-
1109
- X, vocabulary, vocab_dict = btm.get_words_freqs(topic_abs_str)
1110
- tf = np.array(X.sum(axis=0)).ravel()
1111
- docs_vec = btm.get_vectorized_docs(topic_abs, vocabulary)
1112
- docs_lens = list(map(len, docs_vec))
1113
- biterms = btm.get_biterms(docs_vec)
1114
-
1115
- model = btm.BTM(X, vocabulary, seed=btm_seed, T=num_topic, M=20, alpha=50/8, beta=0.01)
1116
- model.fit(biterms, iterations=btm_iterations)
1117
-
1118
- p_zd = model.transform(docs_vec)
1119
- coherence = model.coherence_
1120
- phi = tmp.get_phi(model)
1121
- topics_coords = tmp.prepare_coords(model)
1122
- totaltop = topics_coords.label.values.tolist()
1123
- perplexity = model.perplexity_
1124
- top_topics = model.df_words_topics_
1125
-
1126
- return topics_coords, phi, totaltop, perplexity, top_topics
1127
-
1128
- tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
1129
- with tab1:
1130
- try:
1131
- with st.spinner('Performing computations. Please wait ...'):
1132
- topics_coords, phi, totaltop, perplexity, top_topics = biterm_topic(extype)
1133
- col1, col2 = st.columns([4,6])
1134
-
1135
- @st.cache_data(ttl=3600)
1136
- def biterm_map(extype):
1137
- btmvis_coords = tmp.plot_scatter_topics(topics_coords, size_col='size', label_col='label', topic=numvis)
1138
- return btmvis_coords
1139
-
1140
- @st.cache_data(ttl=3600)
1141
- def biterm_bar(extype):
1142
- terms_probs = tmp.calc_terms_probs_ratio(phi, topic=numvis, lambda_=1)
1143
- btmvis_probs = tmp.plot_terms(terms_probs, font_size=12)
1144
- return btmvis_probs
1145
-
1146
- with col1:
1147
- st.write('Perplexity score: ', perplexity)
1148
- st.write('')
1149
- numvis = st.selectbox(
1150
- 'Choose topic',
1151
- (totaltop), on_change=reset_biterm)
1152
- btmvis_coords = biterm_map(extype)
1153
- st.altair_chart(btmvis_coords)
1154
- with col2:
1155
- btmvis_probs = biterm_bar(extype)
1156
- st.altair_chart(btmvis_probs, use_container_width=True)
1157
-
1158
- #===download results===#
1159
- resultcsv = top_topics.to_csv().encode("utf-8")
1160
- st.download_button(label = "Download Results", data=resultcsv, file_name="results.csv", mime="text\csv", on_click="ignore")
1161
-
1162
- except ValueError as g:
1163
- st.error('🙇‍♂️ Please raise the number of topics and click submit')
1164
-
1165
- except NameError as f:
1166
- st.warning('🖱️ Please click Submit')
1167
-
1168
- with tab2:
1169
- st.markdown('**Yan, X., Guo, J., Lan, Y., & Cheng, X. (2013, May 13). A biterm topic model for short texts. Proceedings of the 22nd International Conference on World Wide Web.** https://doi.org/10.1145/2488388.2488514')
1170
- with tab3:
1171
- st.markdown('**Cai, M., Shah, N., Li, J., Chen, W. H., Cuomo, R. E., Obradovich, N., & Mackey, T. K. (2020, August 26). Identification and characterization of tweets related to the 2015 Indiana HIV outbreak: A retrospective infoveillance study. PLOS ONE, 15(8), e0235150.** https://doi.org/10.1371/journal.pone.0235150')
1172
- st.markdown('**Chen, Y., Dong, T., Ban, Q., & Li, Y. (2021). What Concerns Consumers about Hypertension? A Comparison between the Online Health Community and the Q&A Forum. International Journal of Computational Intelligence Systems, 14(1), 734.** https://doi.org/10.2991/ijcis.d.210203.002')
1173
- st.markdown('**George, Crissandra J., "AMBIGUOUS APPALACHIANNESS: A LINGUISTIC AND PERCEPTUAL INVESTIGATION INTO ARC-LABELED PENNSYLVANIA COUNTIES" (2022). Theses and Dissertations-- Linguistics. 48.** https://doi.org/10.13023/etd.2022.217')
1174
- st.markdown('**Li, J., Chen, W. H., Xu, Q., Shah, N., Kohler, J. C., & Mackey, T. K. (2020). Detection of self-reported experiences with corruption on twitter using unsupervised machine learning. Social Sciences & Humanities Open, 2(1), 100060.** https://doi.org/10.1016/j.ssaho.2020.100060')
1175
- with tab4:
1176
- st.subheader(':blue[Biterm]', anchor=False)
1177
- st.text("Click the three dots at the top right then select the desired format.")
1178
- st.markdown("![Downloading visualization](https://raw.githubusercontent.com/faizhalas/library-tools/main/images/download_biterm.jpg)")
1179
- st.divider()
1180
- st.subheader(':blue[Downloading CSV Results]', anchor=False)
1181
- st.button("Download Results")
1182
- st.text("Click Download results button at bottom of page")
1183
-
1184
-
1185
- #===BERTopic===
1186
- elif method == 'BERTopic':
1187
- @st.cache_resource(ttl = 3600, show_spinner=False)
1188
- #@st.cache_data(ttl=3600, show_spinner=False)
1189
- def bertopic_vis(extype):
1190
- umap_model = UMAP(n_neighbors=bert_n_neighbors, n_components=bert_n_components,
1191
- min_dist=0.0, metric='cosine', random_state=bert_random_state)
1192
- cluster_model = KMeans(n_clusters=num_topic)
1193
- if bert_embedding_model == 'all-MiniLM-L6-v2':
1194
- model = SentenceTransformer('all-MiniLM-L6-v2')
1195
- lang = 'en'
1196
- embeddings = model.encode(topic_abs, show_progress_bar=True)
1197
-
1198
- elif bert_embedding_model == 'en_core_web_sm':
1199
- nlp = en_core_web_sm.load(exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
1200
- model = nlp
1201
- lang = 'en'
1202
- embeddings = np.array([nlp(text).vector for text in topic_abs])
1203
-
1204
- elif bert_embedding_model == 'paraphrase-multilingual-MiniLM-L12-v2':
1205
- model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
1206
- lang = 'multilingual'
1207
- embeddings = model.encode(topic_abs, show_progress_bar=True)
1208
-
1209
- representation_model = ""
1210
-
1211
- if fine_tuning:
1212
- keybert = KeyBERTInspired()
1213
- mmr = MaximalMarginalRelevance(diversity=0.3)
1214
- representation_model = {
1215
- "KeyBERT": keybert,
1216
- "MMR": mmr,
1217
- }
1218
- if topic_labelling:
1219
- if llm_provider == "OpenAI/gpt-4o":
1220
- client = openai.OpenAI(api_key=api_key)
1221
- representation_model = {
1222
- "KeyBERT": keybert,
1223
- "MMR": mmr,
1224
- "test": OpenAI(client, model = "gpt-4o-mini", delay_in_seconds=10)
1225
- }
1226
- elif llm_provider == "Google/flan-t5":
1227
- pipe = pipeline("text2text-generation", model = "google/flan-t5-base")
1228
- clientmod = TextGeneration(pipe)
1229
- representation_model = {
1230
- "KeyBERT": keybert,
1231
- "MMR": mmr,
1232
- "test": clientmod
1233
- }
1234
- elif llm_provider == "LiquidAI/LFM2-350M":
1235
- pipe = pipeline("text-generation", model = "LiquidAI/LFM2-350M")
1236
- clientmod = TextGeneration(pipe)
1237
- representation_model = {
1238
- "KeyBERT": keybert,
1239
- "MMR": mmr,
1240
- "test": clientmod
1241
- }
1242
-
1243
- vectorizer_model = CountVectorizer(ngram_range=(1, xgram), stop_words='english')
1244
- topic_model = BERTopic(representation_model = representation_model, embedding_model=model, hdbscan_model=cluster_model, language=lang, umap_model=umap_model, vectorizer_model=vectorizer_model, top_n_words=bert_top_n_words)
1245
- topics, probs = topic_model.fit_transform(topic_abs, embeddings=embeddings)
1246
-
1247
- if(fine_tuning and topic_labelling):
1248
- generated_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["test"].values()]
1249
- topic_model.set_topic_labels(generated_labels)
1250
-
1251
- return topic_model, topics, probs, embeddings
1252
-
1253
- @st.cache_resource(ttl = 3600, show_spinner=False)
1254
- def Vis_Topics(extype):
1255
- fig1 = topic_model.visualize_topics()
1256
- return fig1
1257
- @st.cache_resource(ttl = 3600, show_spinner=False)
1258
- def Vis_Documents(extype):
1259
- fig2 = topic_model.visualize_document_datamap(topic_abs, embeddings=embeddings, custom_labels = True)
1260
- return fig2
1261
- @st.cache_resource(ttl = 3600, show_spinner=False)
1262
- def Vis_Hierarchy(extype):
1263
- fig3 = topic_model.visualize_hierarchy(top_n_topics=num_topic, custom_labels = True)
1264
- return fig3
1265
- @st.cache_resource(ttl = 3600, show_spinner=False)
1266
- def Vis_Heatmap(extype):
1267
- global topic_model
1268
- fig4 = topic_model.visualize_heatmap(n_clusters=num_topic-1, width=1000, height=1000, custom_labels = True)
1269
- return fig4
1270
- @st.cache_resource(ttl = 3600, show_spinner=False)
1271
- def Vis_Barchart(extype):
1272
- fig5 = topic_model.visualize_barchart(top_n_topics=num_topic, custom_labels = True)
1273
- return fig5
1274
-
1275
- tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
1276
- with tab1:
1277
- try:
1278
- with st.spinner('Performing computations. Please wait ...'):
1279
-
1280
- topic_model, topics, probs, embeddings = bertopic_vis(extype)
1281
- time.sleep(.5)
1282
- st.toast('Visualize Topics', icon='🏃')
1283
- fig1 = Vis_Topics(extype)
1284
-
1285
- time.sleep(.5)
1286
- st.toast('Visualize Document', icon='🏃')
1287
- fig2 = Vis_Documents(extype)
1288
-
1289
- time.sleep(.5)
1290
- st.toast('Visualize Document Hierarchy', icon='🏃')
1291
- fig3 = Vis_Hierarchy(extype)
1292
-
1293
- time.sleep(.5)
1294
- st.toast('Visualize Topic Similarity', icon='🏃')
1295
- fig4 = Vis_Heatmap(extype)
1296
-
1297
- time.sleep(.5)
1298
- st.toast('Visualize Terms', icon='🏃')
1299
- fig5 = Vis_Barchart(extype)
1300
-
1301
- bertab1, bertab2, bertab3, bertab4, bertab5 = st.tabs(["Visualize Topics", "Visualize Terms", "Visualize Documents",
1302
- "Visualize Document Hierarchy", "Visualize Topic Similarity"])
1303
-
1304
- with bertab1:
1305
- st.plotly_chart(fig1, use_container_width=True)
1306
- with bertab2:
1307
- st.plotly_chart(fig5, use_container_width=True)
1308
- with bertab3:
1309
- st.plotly_chart(fig2, use_container_width=True)
1310
- with bertab4:
1311
- st.plotly_chart(fig3, use_container_width=True)
1312
- with bertab5:
1313
- st.plotly_chart(fig4, use_container_width=True)
1314
-
1315
- #===download results===#
1316
- results = topic_model.get_topic_info()
1317
- resultf = pd.DataFrame(results)
1318
- resultcsv = resultf.to_csv().encode("utf-8")
1319
- st.download_button(
1320
- label = "Download Results",
1321
- data=resultcsv,
1322
- file_name="results.csv",
1323
- mime="text\csv",
1324
- on_click="ignore",
1325
- )
1326
-
1327
- except ValueError as e:
1328
- st.write(e)
1329
- st.error('🙇‍♂️ Please raise the number of topics and click submit')
1330
-
1331
-
1332
- except NameError as e:
1333
- st.warning('🖱️ Please click Submit')
1334
- st.write(e)
1335
-
1336
- with tab2:
1337
- st.markdown('**Grootendorst, M. (2022). BERTopic: Neural topic modeling with a class-based TF-IDF procedure. arXiv preprint arXiv:2203.05794.** https://doi.org/10.48550/arXiv.2203.05794')
1338
-
1339
- with tab3:
1340
- st.markdown('**Jeet Rawat, A., Ghildiyal, S., & Dixit, A. K. (2022, December 1). Topic modelling of legal documents using NLP and bidirectional encoder representations from transformers. Indonesian Journal of Electrical Engineering and Computer Science, 28(3), 1749.** https://doi.org/10.11591/ijeecs.v28.i3.pp1749-1755')
1341
- st.markdown('**Yao, L. F., Ferawati, K., Liew, K., Wakamiya, S., & Aramaki, E. (2023, April 20). Disruptions in the Cystic Fibrosis Community’s Experiences and Concerns During the COVID-19 Pandemic: Topic Modeling and Time Series Analysis of Reddit Comments. Journal of Medical Internet Research, 25, e45249.** https://doi.org/10.2196/45249')
1342
-
1343
- with tab4:
1344
- st.divider()
1345
- st.subheader(':blue[BERTopic]', anchor=False)
1346
- st.text("Click the camera icon on the top right menu")
1347
- st.markdown("![Downloading visualization](https://raw.githubusercontent.com/faizhalas/library-tools/main/images/download_bertopic.jpg)")
1348
- st.divider()
1349
- st.subheader(':blue[Downloading CSV Results]', anchor=False)
1350
- st.button("Download Results")
1351
- st.text("Click Download results button at bottom of page")
1352
 
1353
- except:
1354
- st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
1355
- st.stop()
 
679
  except:
680
  st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
681
  st.stop()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
682