# NOTE: removed upload-page residue ("kisejin's picture / Upload 261 files / 19b102a verified")
# that was not part of the Python source.
import numpy as np
import pandas as pd
import spacy
from spacy.matcher import Matcher
from spacy.language import Language
from packaging import version
from scipy.sparse import csr_matrix
from typing import List, Mapping, Tuple, Union
from sklearn import __version__ as sklearn_version
from bertopic.representation._base import BaseRepresentation
class PartOfSpeech(BaseRepresentation):
    """ Extract Topic Keywords based on their Part-of-Speech

    DEFAULT_PATTERNS = [
                [{'POS': 'ADJ'}, {'POS': 'NOUN'}],
                [{'POS': 'NOUN'}],
                [{'POS': 'ADJ'}]
    ]

    From candidate topics, as extracted with c-TF-IDF,
    find documents that contain keywords found in the
    candidate topics. These candidate documents then
    serve as the representative set of documents from
    which the Spacy model can extract a set of candidate
    keywords for each topic.

    These candidate keywords are first judged by whether
    they fall within the DEFAULT_PATTERNS or the user-defined
    pattern. Then, the resulting keywords are sorted by
    their respective c-TF-IDF values.

    Arguments:
        model: The Spacy model to use
        top_n_words: The top n words to extract
        pos_patterns: Patterns for Spacy to use.
                      See https://spacy.io/usage/rule-based-matching

    Usage:

    ```python
    from bertopic.representation import PartOfSpeech
    from bertopic import BERTopic

    # Create your representation model
    representation_model = PartOfSpeech("en_core_web_sm")

    # Use the representation model in BERTopic on top of the default pipeline
    topic_model = BERTopic(representation_model=representation_model)
    ```

    You can define custom POS patterns to be extracted:

    ```python
    pos_patterns = [
                [{'POS': 'ADJ'}, {'POS': 'NOUN'}],
                [{'POS': 'NOUN'}], [{'POS': 'ADJ'}]
    ]
    representation_model = PartOfSpeech("en_core_web_sm", pos_patterns=pos_patterns)
    ```
    """
    def __init__(self,
                 model: Union[str, Language] = "en_core_web_sm",
                 top_n_words: int = 10,
                 pos_patterns: List[List[dict]] = None):
        if isinstance(model, str):
            self.model = spacy.load(model)
        elif isinstance(model, Language):
            self.model = model
        else:
            # Trailing spaces keep the concatenated message readable
            # (previously rendered as "that youpass is either astring...").
            raise ValueError("Make sure that the Spacy model that you "
                             "pass is either a string referring to a "
                             "Spacy model or a Spacy nlp object.")
        self.top_n_words = top_n_words

        if pos_patterns is None:
            self.pos_patterns = [
                [{'POS': 'ADJ'}, {'POS': 'NOUN'}],
                [{'POS': 'NOUN'}], [{'POS': 'ADJ'}]
            ]
        else:
            self.pos_patterns = pos_patterns

    def extract_topics(self,
                       topic_model,
                       documents: pd.DataFrame,
                       c_tf_idf: csr_matrix,
                       topics: Mapping[str, List[Tuple[str, float]]]
                       ) -> Mapping[str, List[Tuple[str, float]]]:
        """ Extract topics

        Arguments:
            topic_model: A BERTopic model
            documents: All input documents
            c_tf_idf: Not used
            topics: The candidate topics as calculated with c-TF-IDF

        Returns:
            updated_topics: Updated topic representations
        """
        matcher = Matcher(self.model.vocab)
        matcher.add("Pattern", self.pos_patterns)

        candidate_topics = {}
        for topic, values in topics.items():
            keywords = list(zip(*values))[0]

            # Extract candidate documents: for each keyword, take up to two
            # documents of this topic that contain it.
            candidate_documents = []
            # Hoisted out of the keyword loop: the topic selection is invariant.
            topic_documents = documents.loc[documents.Topic == topic, :]
            for keyword in keywords:
                # regex=False: keywords may contain regex metacharacters
                # (e.g. "c++", "(a)") which would otherwise raise re.error.
                selection = topic_documents.loc[
                    topic_documents.Document.str.contains(keyword, regex=False), "Document"
                ]
                if len(selection) > 0:
                    for document in selection[:2]:
                        candidate_documents.append(document)
            candidate_documents = list(set(candidate_documents))

            # Extract keywords matching the POS patterns
            docs_pipeline = self.model.pipe(candidate_documents)
            updated_keywords = []
            for doc in docs_pipeline:
                matches = matcher(doc)
                for _, start, end in matches:
                    updated_keywords.append(doc[start:end].text)
            candidate_topics[topic] = list(set(updated_keywords))

        # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
        # and will be removed in 1.2. Please use get_feature_names_out instead.
        if version.parse(sklearn_version) >= version.parse("1.0.0"):
            words = list(topic_model.vectorizer_model.get_feature_names_out())
        else:
            words = list(topic_model.vectorizer_model.get_feature_names())

        # Match updated keywords with c-TF-IDF values
        words_lookup = dict(zip(words, range(len(words))))
        updated_topics = {topic: [] for topic in topics.keys()}

        for topic, candidate_keywords in candidate_topics.items():
            # Membership test instead of truthiness: `words_lookup.get(kw)`
            # returns 0 (falsy) for the word at index 0 and would drop it.
            word_indices = [words_lookup[keyword]
                            for keyword in candidate_keywords
                            if keyword in words_lookup]
            if word_indices:
                vals = topic_model.c_tf_idf_[:, np.array(word_indices)][topic + topic_model._outliers]
                row = np.array(vals.todense().reshape(1, -1))[0]
                indices = np.argsort(row)[-self.top_n_words:][::-1]
                topic_words = [(words[word_indices[index]], row[index]) for index in indices]
            else:
                # No candidate keyword survived the vocabulary lookup;
                # topic falls through to the ("", 0) padding below.
                topic_words = []
            # Pad so every topic has exactly top_n_words entries
            if len(topic_words) < self.top_n_words:
                topic_words += [("", 0) for _ in range(self.top_n_words - len(topic_words))]
            updated_topics[topic] = topic_words

        return updated_topics