Spaces:
Running
Running
| from keybert import KeyBERT | |
| import spacy | |
| from collections import Counter | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
# Heavy models are loaded once at import time and shared by the helpers below.
kw_model = KeyBERT()  # KeyBERT keyphrase-extraction model
nlp = spacy.load("en_core_web_md")  # spaCy pipeline (md = has word vectors) used for NER
def get_top_keywords(text, top_n=5):
    """Return up to ``top_n`` KeyBERT keyphrases for ``text``, de-duplicated.

    Phrases that differ only by case, hyphens, or underscores are treated as
    duplicates; the first-ranked spelling of each is kept.

    Parameters
    ----------
    text : str
        Document to extract keyphrases from.
    top_n : int
        Maximum number of distinct keyphrases to return.
    """
    # KeyBERT returns (phrase, score) pairs, best first; scores are discarded.
    candidates = kw_model.extract_keywords(
        text, keyphrase_ngram_range=(1, 2), stop_words='english'
    )
    picked = []
    normalized_seen = set()
    for phrase, _score in candidates:
        # Normalize so "state-of-the-art" / "state_of_the_art" collapse together.
        key = phrase.lower().replace("-", " ").replace("_", " ")
        if key in normalized_seen:
            continue
        normalized_seen.add(key)
        picked.append(phrase)
        if len(picked) >= top_n:
            break
    return picked
def get_top_named_entities(text, top_n=15):
    """Return the ``top_n`` most frequent named entities found in ``text``.

    Only entity labels of interest (people, organizations, places, events,
    products, facilities) are counted; entities are ranked by mention count.

    Parameters
    ----------
    text : str
        Document to run the spaCy pipeline over.
    top_n : int
        Maximum number of entity strings to return.
    """
    wanted_labels = {"PERSON", "ORG", "GPE", "EVENT", "PRODUCT", "LOC", "FAC"}
    mention_counts = Counter(
        ent.text.strip()
        for ent in nlp(text).ents
        if ent.label_ in wanted_labels
    )
    return [entity for entity, _count in mention_counts.most_common(top_n)]
def detect_events(text):
    """Run keyword and named-entity extraction on ``text`` and bundle both.

    Returns a dict with two entries: the top KeyBERT keyphrases and the most
    frequent spaCy named entities (default limits of the respective helpers).
    """
    return {
        "Top Keywords (KeyBERT)": get_top_keywords(text),
        "Top Named Entities (NER)": get_top_named_entities(text),
    }