keneonyeachonam committed on
Commit
729f89e
·
1 Parent(s): 164e254

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +145 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import spacy
3
+ import numpy as np
4
+ from gensim import corpora, models
5
+ from itertools import chain
6
+ from sklearn.preprocessing import MultiLabelBinarizer
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
+ from itertools import islice
9
+ from scipy.signal import argrelmax
10
+
11
# Small English spaCy pipeline, used below for sentence segmentation
# (doc.sents) and per-token attributes (lemma_, is_stop, is_punct).
nlp = spacy.load('en_core_web_sm')
12
+
13
+
14
def window(seq, n=3):
    """Yield successive overlapping tuples of length ``n`` from ``seq``.

    Standard sliding-window recipe: if ``seq`` has fewer than ``n``
    elements, nothing is yielded.
    """
    iterator = iter(seq)
    current = tuple(islice(iterator, n))
    if len(current) != n:
        # Sequence shorter than the window: no windows at all.
        return
    yield current
    for item in iterator:
        # Drop the oldest element, append the newest one.
        current = current[1:] + (item,)
        yield current
22
+
23
def get_depths(scores):
    """Compute a TextTiling-style depth score for every position.

    The depth at index ``i`` is half of the rise from ``scores[i]`` to
    the nearest strictly-increasing peak on its left plus the rise to
    the nearest such peak on its right.  Deeper "valleys" (larger
    depth) mark stronger segment-boundary candidates.
    """

    def _peak(seq, start, step):
        # Walk from `start` in direction `step` (-1 left, +1 right)
        # while values keep strictly increasing; return the value of
        # the peak reached (or the start value if no climb is possible).
        best = seq[start]
        pos = start + step
        while 0 <= pos < len(seq) and seq[pos] > best:
            best = seq[pos]
            pos += step
        return best

    depths = []
    for i, score in enumerate(scores):
        left_peak = _peak(scores, i, -1)
        right_peak = _peak(scores, i, +1)
        depths.append(0.5 * (left_peak + right_peak - 2 * score))
    return np.array(depths)
54
+
55
+
56
def get_local_maxima(depth_scores, order=1):
    """Keep only the local maxima of ``depth_scores``; zero the rest.

    ``order`` is forwarded to ``scipy.signal.argrelmax`` (how many
    neighbours on each side a point must exceed to count as a maximum).
    """
    peak_indices = argrelmax(depth_scores, order=order)[0]
    result = np.zeros(len(depth_scores))
    result[peak_indices] = depth_scores[peak_indices]
    return result
61
+
62
def compute_threshold(scores):
    """Threshold for selecting boundary depths: mean - std/2 of the
    non-zero entries of ``scores``.

    Returns ``np.inf`` when ``scores`` has no non-zero entries (i.e. no
    local maxima survived filtering), so that downstream no position
    clears the threshold — instead of propagating a NaN (plus a
    RuntimeWarning) from the mean/std of an empty slice, as the
    original code did.
    """
    nonzero = scores[np.nonzero(scores)]
    if nonzero.size == 0:
        # No candidate boundaries at all: make the threshold
        # unreachable rather than computing mean-of-empty.
        return np.inf
    return np.mean(nonzero) - (np.std(nonzero) / 2)
66
+
67
def get_threshold_segments(scores, threshold=0.1):
    """Return the indices of ``scores`` at or above ``threshold``.

    These indices are the boundary candidates for segmentation.
    """
    return np.flatnonzero(scores >= threshold)
70
+
71
+
72
def print_list(lst):
    """Render each element of ``lst`` as a Markdown bullet in the app."""
    for item in lst:
        st.markdown("- " + item)
75
+
76
+
77
# --- UI: collect input text either from an uploaded .txt file or from
# --- the text area (both share the session-state key "text").
st.subheader("Topic Modeling with Segmentation")
uploaded_file = st.file_uploader("choose a text file", type=["txt"])
if uploaded_file is not None:
    # Pre-fill the text area below with the uploaded file's contents.
    st.session_state["text"] = uploaded_file.getvalue().decode('utf-8')

st.write("OR")

input_text = st.text_area(
    label="Enter text separated by newlines",
    value="",
    key="text",
    height=150
)

button=st.button('Get Segments')
if (button==True) and input_text != "":
    # 1) Split the raw input into sentences with spaCy (each newline-
    #    separated chunk may itself contain several sentences).
    texts = input_text.split('\n')
    sents = []
    for text in texts:
        doc = nlp(text)
        for sent in doc.sents:
            sents.append(sent)
    # 2) Tokenize: lowercase lemmas, dropping stopwords, punctuation,
    #    whitespace-only tokens and tokens shorter than MIN_LENGTH chars.
    MIN_LENGTH = 3
    tokenized_sents = [[token.lemma_.lower() for token in sent if
        not token.is_stop and not token.is_punct and token.text.strip() and len(token) >= MIN_LENGTH]
        for sent in sents]
    st.write("Modeling topics:")

    # Fixed seed so LDA topic assignments are reproducible across runs.
    np.random.seed(123)

    N_TOPICS = 5
    N_PASSES = 5

    # 3) Train an LDA topic model over the per-sentence bag-of-words.
    dictionary = corpora.Dictionary(tokenized_sents)
    bow = [dictionary.doc2bow(sent) for sent in tokenized_sents]
    topic_model = models.LdaModel(corpus=bow, id2word=dictionary, num_topics=N_TOPICS, passes=N_PASSES)
    st.write("inferring topics ...")
    # 4) For each sentence keep its top-k topic ids (probability >= THRESHOLD).
    THRESHOLD = 0.05
    doc_topics = list(topic_model.get_document_topics(bow, minimum_probability=THRESHOLD))
    k = 3
    top_k_topics = [[t[0] for t in sorted(sent_topics, key=lambda x: x[1], reverse=True)][:k]
        for sent_topics in doc_topics]
    # 5) Slide a window over the sentence topic lists and collapse each
    #    window into the set of topics it contains.
    WINDOW_SIZE = 3
    window_topics = window(top_k_topics, n=WINDOW_SIZE)
    window_topics = [list(set(chain.from_iterable(window))) for window in window_topics]

    # One-hot encode each window's topic set into a fixed-size vector.
    binarizer = MultiLabelBinarizer(classes=range(N_TOPICS))

    encoded_topic = binarizer.fit_transform(window_topics)
    st.write("generating segments ...")
    # 6) Cosine similarity between consecutive windows, then depth
    #    scoring: deep similarity "valleys" are topic-shift boundaries.
    sims_topic = [cosine_similarity([pair[0]], [pair[1]])[0][0] for pair in zip(encoded_topic, encoded_topic[1:])]
    depths_topic = get_depths(sims_topic)
    filtered_topic = get_local_maxima(depths_topic, order=1)
    threshold_topic = compute_threshold(filtered_topic)
    threshold_segments_topic = get_threshold_segments(filtered_topic, threshold_topic)

    # Shift boundary indices from window space back to sentence space
    # (each window covers WINDOW_SIZE sentences).
    segment_ids = threshold_segments_topic + WINDOW_SIZE

    # Bracket with the start and end of the document, then pair up
    # consecutive boundaries into (start, end) sentence slices.
    segment_ids = [0] + segment_ids.tolist() + [len(sents)]
    slices = list(zip(segment_ids[:-1], segment_ids[1:]))

    segmented = [sents[s[0]: s[1]] for s in slices]

    # 7) Render each segment as a bullet list, separated by rules;
    #    the last segment gets no trailing separator.
    for segment in segmented[:-1]:
        print_list([s.text for s in segment])
        st.markdown("""---""")

    print_list([s.text for s in segmented[-1]])
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gensim
2
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0.tar.gz
3
+ matplotlib
4
+ numpy
5
+ scikit-learn
6
+ scipy
7
+ sentence_transformers
8
+ spacy