Spaces: Sleeping
Update pages/2 Topic Modeling.py
Browse files
- pages/2 Topic Modeling.py +19 -20
pages/2 Topic Modeling.py
CHANGED
|
@@ -74,6 +74,9 @@ with st.popover("🔗 Menu"):
|
|
| 74 |
st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
|
| 75 |
st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
|
| 76 |
st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
st.header("Topic Modeling", anchor=False)
|
| 79 |
st.subheader('Put your file here...', anchor=False)
|
|
@@ -196,7 +199,7 @@ if uploaded_file is not None:
|
|
| 196 |
method = c1.selectbox(
|
| 197 |
'Choose method',
|
| 198 |
('Choose...', 'pyLDA', 'Biterm', 'BERTopic'))
|
| 199 |
-
ColCho = c2.selectbox('Choose column', (["Title","Abstract"]))
|
| 200 |
num_cho = c3.number_input('Choose number of topics', min_value=2, max_value=30, value=5)
|
| 201 |
|
| 202 |
d1, d2 = st.columns([3,7])
|
|
@@ -235,8 +238,8 @@ if uploaded_file is not None:
|
|
| 235 |
if fine_tuning:
|
| 236 |
topic_labelling = st.toggle("Automatic topic labelling")
|
| 237 |
if topic_labelling:
|
| 238 |
-
|
| 239 |
-
if
|
| 240 |
api_key = st.text_input("API Key")
|
| 241 |
|
| 242 |
else:
|
|
@@ -245,6 +248,10 @@ if uploaded_file is not None:
|
|
| 245 |
#===clean csv===
|
| 246 |
@st.cache_data(ttl=3600, show_spinner=False)
|
| 247 |
def clean_csv(extype):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
paper = papers.dropna(subset=[ColCho])
|
| 249 |
|
| 250 |
#===mapping===
|
|
@@ -527,37 +534,30 @@ if uploaded_file is not None:
|
|
| 527 |
"MMR": mmr,
|
| 528 |
}
|
| 529 |
if topic_labelling:
|
| 530 |
-
if
|
| 531 |
client = openai.OpenAI(api_key=api_key)
|
| 532 |
representation_model = {
|
| 533 |
"KeyBERT": keybert,
|
| 534 |
"MMR": mmr,
|
| 535 |
"test": OpenAI(client, model = "gpt-4o-mini", delay_in_seconds=10)
|
| 536 |
}
|
| 537 |
-
elif
|
| 538 |
-
|
| 539 |
-
clientmod = TextGeneration(
|
| 540 |
representation_model = {
|
| 541 |
"KeyBERT": keybert,
|
| 542 |
"MMR": mmr,
|
| 543 |
"test": clientmod
|
| 544 |
}
|
| 545 |
-
elif
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
torch_dtype = "auto",
|
| 549 |
-
device_map = "auto",
|
| 550 |
-
)
|
| 551 |
-
clientmod = TextGeneration(gen)
|
| 552 |
-
|
| 553 |
representation_model = {
|
| 554 |
"KeyBERT": keybert,
|
| 555 |
"MMR": mmr,
|
| 556 |
-
"test":
|
| 557 |
}
|
| 558 |
|
| 559 |
-
|
| 560 |
-
|
| 561 |
vectorizer_model = CountVectorizer(ngram_range=(1, xgram), stop_words='english')
|
| 562 |
topic_model = BERTopic(representation_model = representation_model, embedding_model=model, hdbscan_model=cluster_model, language=lang, umap_model=umap_model, vectorizer_model=vectorizer_model, top_n_words=bert_top_n_words)
|
| 563 |
topics, probs = topic_model.fit_transform(topic_abs, embeddings=embeddings)
|
|
@@ -668,7 +668,6 @@ if uploaded_file is not None:
|
|
| 668 |
st.button("Download Results")
|
| 669 |
st.text("Click Download results button at bottom of page")
|
| 670 |
|
| 671 |
-
except
|
| 672 |
st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
|
| 673 |
-
st.write(e)
|
| 674 |
st.stop()
|
|
|
|
| 74 |
st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
|
| 75 |
st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
|
| 76 |
st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
|
| 77 |
+
st.page_link("pages/8 Shifterator.py", label="Shifterator", icon="8️⃣")
|
| 78 |
+
st.page_link("pages/9 Summarization.py", label = "Summarization",icon ="9️⃣")
|
| 79 |
+
st.page_link("pages/10 WordCloud.py", label = "WordCloud", icon = "🔟")
|
| 80 |
|
| 81 |
st.header("Topic Modeling", anchor=False)
|
| 82 |
st.subheader('Put your file here...', anchor=False)
|
|
|
|
| 199 |
method = c1.selectbox(
|
| 200 |
'Choose method',
|
| 201 |
('Choose...', 'pyLDA', 'Biterm', 'BERTopic'))
|
| 202 |
+
ColCho = c2.selectbox('Choose column', (["Abstract","Title", "Abstract + Title"]))
|
| 203 |
num_cho = c3.number_input('Choose number of topics', min_value=2, max_value=30, value=5)
|
| 204 |
|
| 205 |
d1, d2 = st.columns([3,7])
|
|
|
|
| 238 |
if fine_tuning:
|
| 239 |
topic_labelling = st.toggle("Automatic topic labelling")
|
| 240 |
if topic_labelling:
|
| 241 |
+
llm_provider = st.selectbox("Model",["OpenAI/gpt-4o","Google/flan-t5","LiquidAI/LFM2-350M"])
|
| 242 |
+
if llm_provider == "OpenAI/gpt-4o":
|
| 243 |
api_key = st.text_input("API Key")
|
| 244 |
|
| 245 |
else:
|
|
|
|
| 248 |
#===clean csv===
|
| 249 |
@st.cache_data(ttl=3600, show_spinner=False)
|
| 250 |
def clean_csv(extype):
|
| 251 |
+
if (ColCho=="Abstract + Title"):
|
| 252 |
+
papers["Abstract + Title"] = papers["Title"] + " " + papers["Abstract"]
|
| 253 |
+
st.write(papers["Abstract + Title"])
|
| 254 |
+
|
| 255 |
paper = papers.dropna(subset=[ColCho])
|
| 256 |
|
| 257 |
#===mapping===
|
|
|
|
| 534 |
"MMR": mmr,
|
| 535 |
}
|
| 536 |
if topic_labelling:
|
| 537 |
+
if llm_provider == "OpenAI/gpt-4o":
|
| 538 |
client = openai.OpenAI(api_key=api_key)
|
| 539 |
representation_model = {
|
| 540 |
"KeyBERT": keybert,
|
| 541 |
"MMR": mmr,
|
| 542 |
"test": OpenAI(client, model = "gpt-4o-mini", delay_in_seconds=10)
|
| 543 |
}
|
| 544 |
+
elif llm_provider == "Google/flan-t5":
|
| 545 |
+
pipe = pipeline("text2text-generation", model = "google/flan-t5-base")
|
| 546 |
+
clientmod = TextGeneration(pipe)
|
| 547 |
representation_model = {
|
| 548 |
"KeyBERT": keybert,
|
| 549 |
"MMR": mmr,
|
| 550 |
"test": clientmod
|
| 551 |
}
|
| 552 |
+
elif llm_provider == "LiquidAI/LFM2-350M":
|
| 553 |
+
pipe = pipeline("text-generation", model = "LiquidAI/LFM2-350M")
|
| 554 |
+
clientmod = TextGeneration(pipe)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 555 |
representation_model = {
|
| 556 |
"KeyBERT": keybert,
|
| 557 |
"MMR": mmr,
|
| 558 |
+
"test": clientmod
|
| 559 |
}
|
| 560 |
|
|
|
|
|
|
|
| 561 |
vectorizer_model = CountVectorizer(ngram_range=(1, xgram), stop_words='english')
|
| 562 |
topic_model = BERTopic(representation_model = representation_model, embedding_model=model, hdbscan_model=cluster_model, language=lang, umap_model=umap_model, vectorizer_model=vectorizer_model, top_n_words=bert_top_n_words)
|
| 563 |
topics, probs = topic_model.fit_transform(topic_abs, embeddings=embeddings)
|
|
|
|
| 668 |
st.button("Download Results")
|
| 669 |
st.text("Click Download results button at bottom of page")
|
| 670 |
|
| 671 |
+
except:
|
| 672 |
st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
|
|
|
|
| 673 |
st.stop()
|