Spaces:

keneonyeachonam
/

Topic-Modeling-Clusters-Free-Text-032123

Runtime error

App Files Files Community

keneonyeachonam committed on Mar 21, 2023

Commit

729f89e

1 Parent(s): 164e254

Upload 2 files

Browse files

Files changed (2) hide show

app.py +145 -0
requirements.txt +8 -0

app.py ADDED Viewed

	@@ -0,0 +1,145 @@

+import streamlit as st
+import spacy
+import numpy as np
+from gensim import corpora, models
+from itertools import chain
+from sklearn.preprocessing import MultiLabelBinarizer
+from sklearn.metrics.pairwise import cosine_similarity
+from itertools import islice
+from scipy.signal import argrelmax
+nlp = spacy.load('en_core_web_sm')  # shared spaCy pipeline, loaded once at import; used below for sentence splitting and lemmas
+def window(seq, n=3):  # Generator: slide a width-n window over seq and yield each window as a tuple.
+    it = iter(seq)
+    result = tuple(islice(it, n))  # first window: up to n leading items
+    if len(result) == n:  # fewer than n items: nothing is yielded (the iterator is already exhausted)
+        yield result
+    for elem in it:
+        result = result[1:] + (elem,)  # drop the oldest item, append the next one
+        yield result
+def get_depths(scores):  # For every position, measure how far its score sits below the nearest peak on each side; returns a NumPy array.
+    def climb(seq, i, mode='left'):  # Walk from i in the given direction while values strictly increase; return the highest value reached.
+        if mode == 'left':
+            while True:
+                curr = seq[i]
+                if i == 0:  # reached the start of the sequence
+                    return curr
+                i = i-1
+                if not seq[i] > curr:  # neighbour is not strictly higher, so curr is the peak
+                    return curr
+        if mode == 'right':  # mirror image of the 'left' branch; any other mode falls through and returns None
+            while True:
+                curr = seq[i]
+                if i == (len(seq)-1):  # reached the end of the sequence
+                    return curr
+                i = i+1
+                if not seq[i] > curr:  # neighbour is not strictly higher, so curr is the peak
+                    return curr
+    depths = []
+    for i in range(len(scores)):
+        score = scores[i]
+        l_peak = climb(scores, i, mode='left')
+        r_peak = climb(scores, i, mode='right')
+        depth = 0.5 * (l_peak + r_peak - (2*score))  # average of (l_peak - score) and (r_peak - score)
+        depths.append(depth)
+    return np.array(depths)  # one depth per input score, in the same order
+def get_local_maxima(depth_scores, order=1):  # Keep depth scores only at their relative maxima; every other position becomes 0.
+    maxima_ids = argrelmax(depth_scores, order=order)[0]  # indices of relative maxima; order = number of points compared on each side
+    filtered_scores = np.zeros(len(depth_scores))
+    filtered_scores[maxima_ids] = depth_scores[maxima_ids]  # index-array lookup, so depth_scores must be a NumPy array
+    return filtered_scores  # same length as depth_scores
+def compute_threshold(scores):  # Cut-off derived from the non-zero scores: their mean minus half their standard deviation.
+    s = scores[np.nonzero(scores)]  # zero entries are ignored; NOTE(review): if none are non-zero, s is empty and the result is presumably nan - confirm callers tolerate that
+    threshold = np.mean(s) - (np.std(s) / 2)
+    return threshold
+def get_threshold_segments(scores, threshold=0.1):  # Return the indices whose score is greater than or equal to threshold.
+    segment_ids = np.where(scores >= threshold)[0]  # scores must support elementwise >= (e.g. a NumPy array)
+    return segment_ids
+def print_list(lst):  # Render every string in lst as its own Markdown bullet in the Streamlit page.
+    for e in lst:
+        st.markdown("- " + e)  # one st.markdown call per item; e must be a str for the concatenation to work
+st.subheader("Topic Modeling with Segmentation")  # --- Streamlit page: collect text, model topics per sentence, split into segments ---
+uploaded_file = st.file_uploader("choose a text file", type=["txt"])
+if uploaded_file is not None:
+    st.session_state["text"] = uploaded_file.getvalue().decode('utf-8')  # stored under "text", the same key the text area below uses
+st.write("OR")
+input_text = st.text_area(
+    label="Enter text separated by newlines",
+    value="",
+    key="text",
+    height=150
+)
+button=st.button('Get Segments')
+if (button==True) and input_text != "":  # run the pipeline only on a button click with non-empty text
+    texts = input_text.split('\n')  # each input line is processed as a separate spaCy document
+    sents = []
+    for text in texts:
+        doc = nlp(text)
+        for sent in doc.sents:
+            sents.append(sent)  # keep the spaCy sentence objects; their .text is printed at the end
+    MIN_LENGTH = 3
+    tokenized_sents = [[token.lemma_.lower() for token in sent if  # lower-cased lemmas per sentence
+                        not token.is_stop and not token.is_punct and token.text.strip() and len(token) >= MIN_LENGTH]  # drop stop words, punctuation, whitespace-only and short tokens
+                        for sent in sents]
+    st.write("Modeling topics:")
+    np.random.seed(123)  # seeds NumPy's global RNG; NOTE(review): presumably meant to make LDA repeatable - LdaModel gets no random_state here, confirm
+    N_TOPICS = 5
+    N_PASSES = 5
+    dictionary = corpora.Dictionary(tokenized_sents)
+    bow = [dictionary.doc2bow(sent) for sent in tokenized_sents]  # bag-of-words vector per sentence
+    topic_model = models.LdaModel(corpus=bow, id2word=dictionary, num_topics=N_TOPICS, passes=N_PASSES)
+    st.write("inferring topics ...")
+    THRESHOLD = 0.05
+    doc_topics = list(topic_model.get_document_topics(bow, minimum_probability=THRESHOLD))  # per-sentence (topic_id, probability) pairs
+    k = 3
+    top_k_topics = [[t[0] for t in sorted(sent_topics, key=lambda x: x[1], reverse=True)][:k]  # ids of the (up to) k most probable topics
+                    for sent_topics in doc_topics]
+    WINDOW_SIZE = 3
+    window_topics = window(top_k_topics, n=WINDOW_SIZE)  # sliding windows over consecutive sentences
+    window_topics = [list(set(chain.from_iterable(window))) for window in window_topics]  # union of topic ids inside each window
+    binarizer = MultiLabelBinarizer(classes=range(N_TOPICS))
+    encoded_topic = binarizer.fit_transform(window_topics)  # one multi-hot row of length N_TOPICS per window
+    st.write("generating segments ...")
+    sims_topic = [cosine_similarity([pair[0]], [pair[1]])[0][0] for pair in zip(encoded_topic, encoded_topic[1:])]  # similarity of each window to the next
+    depths_topic = get_depths(sims_topic)
+    filtered_topic = get_local_maxima(depths_topic, order=1)
+    threshold_topic = compute_threshold(filtered_topic)
+    threshold_segments_topic = get_threshold_segments(filtered_topic, threshold_topic)
+    segment_ids = threshold_segments_topic + WINDOW_SIZE  # NOTE(review): offset presumably maps window-gap indices onto sentence positions - confirm alignment
+    segment_ids = [0] + segment_ids.tolist() + [len(sents)]  # add the start and end boundaries
+    slices = list(zip(segment_ids[:-1], segment_ids[1:]))  # consecutive boundaries become (start, end) pairs
+    segmented = [sents[s[0]: s[1]] for s in slices]
+    for segment in segmented[:-1]:
+        print_list([s.text for s in segment])
+        st.markdown("""---""")  # horizontal rule between segments
+    print_list([s.text for s in segmented[-1]])  # final segment, printed without a trailing rule

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+gensim
+https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0.tar.gz
+matplotlib
+numpy
+scikit-learn
+scipy
+sentence_transformers
+spacy