File size: 1,223 Bytes
fcdc8ee
 
 
 
772fc72
ec5d4bf
 
fcdc8ee
 
 
 
 
 
5b6ee8a
 
9217965
5b6ee8a
9dea803
5b6ee8a
fcdc8ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5b6ee8a
 
 
fcdc8ee
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from sentence_transformers import SentenceTransformer, util
from keybert import KeyBERT
from preprocess_function import bert_preprocess
import spacy
import spacy_transformers
import spacy.cli
# Name of the spaCy transformer pipeline loaded into ``nlp`` below.
_SPACY_MODEL = "en_core_web_trf"

# Only fetch the pipeline when it is not installed yet; downloading on every
# import would trigger a pip install (and need network access) each time the
# module is loaded.
if not spacy.util.is_package(_SPACY_MODEL):
    spacy.cli.download(_SPACY_MODEL)
nlp = spacy.load(_SPACY_MODEL)

# Sentence-embedding model handed to KeyBERT as its backend.
model_sent = SentenceTransformer("all-mpnet-base-v2")

kw_model = KeyBERT(model=model_sent)


# Word list read from disk, one entry per line; surrounding whitespace
# (including the trailing newline) is stripped from every entry.
with open("common_words_v2.txt", "r") as file:
    common_words = [entry.strip() for entry in file]

def extract_topics(event_value):
    """Extract a deduplicated list of topic words from ``event_value``.

    The input is passed through ``bert_preprocess`` and then to KeyBERT,
    which is asked for its top 10 keywords. Keywords scoring below 0.3 are
    dropped and the remaining ones are split into spaCy token lemmas. Each
    lemma is then parsed on its own: lemmas in which spaCy finds an entity of
    an excluded type are discarded, verb tokens are skipped, and anything
    listed in ``common_words`` is filtered out.

    Args:
        event_value: Input handed to ``bert_preprocess`` (presumably the raw
            text of an event -- confirm against callers).

    Returns:
        A list of unique words, ordered by first appearance (i.e. following
        the order in which KeyBERT returned the keywords). The order is
        therefore stable across runs instead of depending on set iteration
        order.
    """
    min_score = 0.3
    entity_remove = {'CARDINAL', 'DATE', 'GPE', 'LANGUAGE', 'LOC',
                     'NORP', 'ORDINAL', 'PERCENT', 'PERSON', 'QUANTITY', 'TIME'}

    keywords = kw_model.extract_keywords(bert_preprocess(event_value), top_n=10)

    # Lemmatize every token of every keyword that is relevant enough.
    lemmas = [
        token.lemma_
        for phrase, score in keywords
        if score >= min_score
        for token in nlp(phrase)
    ]

    # Build the lookup set once so membership tests are O(1).
    excluded_words = set(common_words)

    # A dict keeps insertion order, giving an order-preserving de-duplication.
    topics = {}
    for lemma in lemmas:
        doc = nlp(lemma)
        # Skip lemmas that spaCy recognises as an unwanted entity type.
        if any(ent.label_ in entity_remove for ent in doc.ents):
            continue
        for token in doc:
            if token.pos_ != 'VERB' and token.text not in excluded_words:
                topics.setdefault(token.text, None)

    return list(topics)