Spaces:
Runtime error
Runtime error
Commit ·
fcdc8ee
1
Parent(s): d9987f2
Create kbert_topics.py
Browse files- kbert_topics.py +28 -0
kbert_topics.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sentence_transformers import SentenceTransformer, util
|
| 2 |
+
from keybert import KeyBERT
|
| 3 |
+
from preprocess_function import bert_preprocess
|
| 4 |
+
import spacy
|
| 5 |
+
# spaCy pipeline shared by extract_topics for lemmas, entity labels and POS tags.
# Loaded once at import time so each call reuses the same pipeline object.
nlp = spacy.load("en_core_web_trf")
|
| 6 |
+
|
| 7 |
+
# Sentence-embedding model; passed to KeyBERT below as its backing model.
model_sent = SentenceTransformer("all-mpnet-base-v2")
|
| 8 |
+
|
| 9 |
+
# KeyBERT keyword extractor built on model_sent; used by extract_topics.
kw_model = KeyBERT(model = model_sent)
|
| 10 |
+
|
| 11 |
+
# spaCy entity labels whose presence causes a candidate word to be discarded.
_EXCLUDED_ENTITY_LABELS = frozenset({
    'CARDINAL', 'DATE', 'GPE', 'LANGUAGE', 'LOC',
    'NORP', 'ORDINAL', 'PERCENT', 'PERSON', 'QUANTITY', 'TIME',
})


def extract_topics(event_value, *, top_n: int = 10, min_score: float = 0.3):
    """Extract topic words from text using KeyBERT keywords and spaCy filters.

    The input is run through ``bert_preprocess`` and handed to KeyBERT, which
    is asked for up to ``top_n`` keywords. Keywords scoring at least
    ``min_score`` are tokenized with spaCy and reduced to their lemmas. Each
    lemma is then parsed on its own: it is dropped when spaCy assigns any
    entity in it one of the excluded labels, otherwise its tokens that are
    not tagged ``VERB`` are appended to the result.

    Args:
        event_value: Raw input, forwarded unchanged to ``bert_preprocess``.
        top_n: Maximum number of keywords requested from KeyBERT
            (defaults to 10).
        min_score: Lowest KeyBERT score a keyword may have and still be
            kept (defaults to 0.3).

    Returns:
        A list of token texts, in the order KeyBERT returned the keywords.
        Duplicates are not removed.
    """
    keywords = kw_model.extract_keywords(bert_preprocess(event_value), top_n=top_n)

    # Each keyword is a (phrase, score) pair; a phrase may span several tokens,
    # so it is split into one lemma per spaCy token.
    lemmas = [
        token.lemma_
        for phrase, score in keywords
        if score >= min_score
        for token in nlp(phrase)
    ]

    topics = []
    for lemma in lemmas:
        # Re-parse the lemma alone: the entity and POS checks below are made
        # on the lemma form, not on the original keyword phrase.
        doc = nlp(lemma)
        if any(ent.label_ in _EXCLUDED_ENTITY_LABELS for ent in doc.ents):
            continue
        topics.extend(token.text for token in doc if token.pos_ != 'VERB')

    return topics
|
| 28 |
+
|