|
|
import os |
|
|
import sys |
|
|
import json |
|
|
import numpy as np |
|
|
|
|
|
from pathlib import Path |
|
|
from tempfile import TemporaryDirectory |
|
|
|
|
|
|
|
|
|
|
|
try: |
|
|
from huggingface_hub import ( |
|
|
create_repo, get_hf_file_metadata, |
|
|
hf_hub_download, hf_hub_url, |
|
|
repo_type_and_id_from_hf_id, upload_folder) |
|
|
_has_hf_hub = True |
|
|
except ImportError: |
|
|
_has_hf_hub = False |
|
|
|
|
|
|
|
|
if sys.version_info >= (3, 8): |
|
|
from typing import Literal |
|
|
else: |
|
|
from typing_extensions import Literal |
|
|
from typing import Union, Mapping, Any |
|
|
|
|
|
|
|
|
try: |
|
|
import torch |
|
|
_has_torch = True |
|
|
except ImportError: |
|
|
_has_torch = False |
|
|
|
|
|
|
|
|
try: |
|
|
from PIL import Image |
|
|
_has_vision = True |
|
|
except: |
|
|
_has_vision = False |
|
|
|
|
|
|
|
|
# File names used when (de)serializing a BERTopic model.
TOPICS_NAME = "topics.json"
CONFIG_NAME = "config.json"

# Topic embedding weights: safetensors (preferred) vs. legacy pytorch .bin.
HF_WEIGHTS_NAME = "topic_embeddings.bin"
HF_SAFE_WEIGHTS_NAME = "topic_embeddings.safetensors"

# c-TF-IDF sparse matrix weights and the configuration needed to rebuild
# the CountVectorizer / c-TF-IDF model on load.
CTFIDF_WEIGHTS_NAME = "ctfidf.bin"
CTFIDF_SAFE_WEIGHTS_NAME = "ctfidf.safetensors"
CTFIDF_CFG_NAME = "ctfidf_config.json"

# Template for the auto-generated HuggingFace model card (README.md).
# The {UPPER_CASE} placeholders are substituted in `generate_readme`.
MODEL_CARD_TEMPLATE = """
---
tags:
- bertopic
library_name: bertopic
pipeline_tag: {PIPELINE_TAG}
---

# {MODEL_NAME}

This is a [BERTopic](https://github.com/MaartenGr/BERTopic) model.
BERTopic is a flexible and modular topic modeling framework that allows for the generation of easily interpretable topics from large datasets.

## Usage

To use this model, please install BERTopic:

```
pip install -U bertopic
```

You can use the model as follows:

```python
from bertopic import BERTopic
topic_model = BERTopic.load("{PATH}")

topic_model.get_topic_info()
```

## Topic overview

* Number of topics: {NR_TOPICS}
* Number of training documents: {NR_DOCUMENTS}

<details>
<summary>Click here for an overview of all topics.</summary>

{TOPICS}

</details>

## Training hyperparameters

{HYPERPARAMS}

## Framework versions

{FRAMEWORKS}
"""
|
|
|
|
|
|
|
|
|
|
|
def push_to_hf_hub(
        model,
        repo_id: str,
        commit_message: str = 'Add BERTopic model',
        token: str = None,
        revision: str = None,
        private: bool = False,
        create_pr: bool = False,
        model_card: bool = True,
        serialization: str = "safetensors",
        save_embedding_model: Union[str, bool] = True,
        save_ctfidf: bool = False,
        ):
    """ Push your BERTopic model to a HuggingFace Hub

    Arguments:
        model: The trained BERTopic model to upload
        repo_id: The name of your HuggingFace repository
        commit_message: A commit message
        token: Token to add if not already logged in
        revision: Repository revision
        private: Whether to create a private repository
        create_pr: Whether to upload the model as a Pull Request
        model_card: Whether to automatically create a modelcard
        serialization: The type of serialization.
                       Either `safetensors` or `pytorch`
        save_embedding_model: A pointer towards a HuggingFace model to be loaded in with
                              SentenceTransformers. E.g.,
                              `sentence-transformers/all-MiniLM-L6-v2`
        save_ctfidf: Whether to save c-TF-IDF information

    Returns:
        The URL of the commit created by `upload_folder`
    """
    if not _has_hf_hub:
        raise ValueError("Make sure you have the huggingface hub installed via `pip install --upgrade huggingface_hub`")

    # Create the repo (no-op if it already exists) and normalize repo_id
    # to the fully-qualified "owner/name" form.
    repo_url = create_repo(repo_id, token=token, private=private, exist_ok=True)
    _, repo_owner, repo_name = repo_type_and_id_from_hf_id(repo_url)
    repo_id = f"{repo_owner}/{repo_name}"

    with TemporaryDirectory() as tmpdir:

        # Serialize the model (weights, config, optionally c-TF-IDF) locally
        model.save(tmpdir, serialization=serialization, save_embedding_model=save_embedding_model, save_ctfidf=save_ctfidf)

        # Only generate a model card if the repo does not already have a README.md.
        # `get_hf_file_metadata` raises when the file is missing; was a bare
        # `except:` which also swallowed KeyboardInterrupt/SystemExit.
        try:
            get_hf_file_metadata(hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision))
        except Exception:
            if model_card:
                readme_text = generate_readme(model, repo_id)
                readme_path = Path(tmpdir) / "README.md"
                readme_path.write_text(readme_text, encoding='utf8')

        # Upload everything in one commit (or as a PR when requested)
        return upload_folder(repo_id=repo_id, folder_path=tmpdir, revision=revision,
                             create_pr=create_pr, commit_message=commit_message)
|
|
|
|
|
|
|
|
def load_local_files(path):
    """ Load local BERTopic files

    Arguments:
        path: Directory (a `pathlib.Path`) containing the saved model files

    Returns:
        Tuple of (topics, params, tensors, ctfidf_tensors, ctfidf_config, images)

    Raises:
        FileNotFoundError: If neither a safetensors nor a pytorch topic
                           embedding file exists in `path`.
    """
    # Topic and configuration information
    topics = load_cfg_from_json(path / TOPICS_NAME)
    params = load_cfg_from_json(path / CONFIG_NAME)

    # Topic embeddings: prefer safetensors, fall back to legacy pytorch .bin
    safetensor_path = path / HF_SAFE_WEIGHTS_NAME
    if safetensor_path.is_file():
        tensors = load_safetensors(safetensor_path)
    else:
        torch_path = path / HF_WEIGHTS_NAME
        if not torch_path.is_file():
            # Previously `tensors` was left unbound here, crashing later
            # with an opaque NameError; fail fast with a clear message.
            raise FileNotFoundError(
                f"Could not find topic embeddings in {path} "
                f"(expected {HF_SAFE_WEIGHTS_NAME} or {HF_WEIGHTS_NAME})."
            )
        tensors = torch.load(torch_path, map_location="cpu")

    # c-TF-IDF information is optional; keep the best-effort semantics but
    # do not trap SystemExit/KeyboardInterrupt (was a bare `except:`)
    try:
        ctfidf_tensors = None
        safetensor_path = path / CTFIDF_SAFE_WEIGHTS_NAME
        if safetensor_path.is_file():
            ctfidf_tensors = load_safetensors(safetensor_path)
        else:
            torch_path = path / CTFIDF_WEIGHTS_NAME
            if torch_path.is_file():
                ctfidf_tensors = torch.load(torch_path, map_location="cpu")
        ctfidf_config = load_cfg_from_json(path / CTFIDF_CFG_NAME)
    except Exception:
        ctfidf_config, ctfidf_tensors = None, None

    # Topic images (only present for models trained with a visual aspect);
    # probing images/0.jpg is how we detect whether any images were saved
    images = None
    if _has_vision:
        try:
            Image.open(path / "images/0.jpg")
            _has_images = True
        except Exception:
            _has_images = False

        if _has_images:
            topic_list = list(topics["topic_representations"].keys())
            images = {}
            for topic in topic_list:
                image = Image.open(path / f"images/{topic}.jpg")
                images[int(topic)] = image

    return topics, params, tensors, ctfidf_tensors, ctfidf_config, images
|
|
|
|
|
|
|
|
def load_files_from_hf(path):
    """ Load files from HuggingFace.

    Arguments:
        path: The HuggingFace repository id to download the model files from

    Returns:
        Tuple of (topics, params, tensors, ctfidf_tensors, ctfidf_config, images)
    """
    path = str(path)

    # Topic and configuration information
    topics = load_cfg_from_json(hf_hub_download(path, TOPICS_NAME, revision=None))
    params = load_cfg_from_json(hf_hub_download(path, CONFIG_NAME, revision=None))

    # Topic embeddings: prefer safetensors, fall back to legacy pytorch .bin.
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
    try:
        tensors = hf_hub_download(path, HF_SAFE_WEIGHTS_NAME, revision=None)
        tensors = load_safetensors(tensors)
    except Exception:
        tensors = hf_hub_download(path, HF_WEIGHTS_NAME, revision=None)
        tensors = torch.load(tensors, map_location="cpu")

    # c-TF-IDF information is optional; keep the best-effort behavior
    try:
        ctfidf_config = load_cfg_from_json(hf_hub_download(path, CTFIDF_CFG_NAME, revision=None))
        try:
            ctfidf_tensors = hf_hub_download(path, CTFIDF_SAFE_WEIGHTS_NAME, revision=None)
            ctfidf_tensors = load_safetensors(ctfidf_tensors)
        except Exception:
            ctfidf_tensors = hf_hub_download(path, CTFIDF_WEIGHTS_NAME, revision=None)
            ctfidf_tensors = torch.load(ctfidf_tensors, map_location="cpu")
    except Exception:
        ctfidf_config, ctfidf_tensors = None, None

    # Topic images; probing images/0.jpg detects whether any were uploaded
    images = None
    if _has_vision:
        try:
            hf_hub_download(path, "images/0.jpg", revision=None)
            _has_images = True
        except Exception:
            _has_images = False

        if _has_images:
            topic_list = list(topics["topic_representations"].keys())
            images = {}
            for topic in topic_list:
                image = Image.open(hf_hub_download(path, f"images/{topic}.jpg", revision=None))
                images[int(topic)] = image

    return topics, params, tensors, ctfidf_tensors, ctfidf_config, images
|
|
|
|
|
|
|
|
def generate_readme(model, repo_id: str):
    """ Generate README for HuggingFace model card

    Arguments:
        model: The trained BERTopic model described by the card
        repo_id: Fully-qualified repository id ("owner/name")

    Returns:
        The filled-in model card as a markdown string
    """
    model_card = MODEL_CARD_TEMPLATE
    topic_table_head = "| Topic ID | Topic Keywords | Topic Frequency | Label | \n|----------|----------------|-----------------|-------| \n"

    # Hyperparameters: keep only plain values, drop sub-model objects
    model_name = repo_id.split("/")[-1]
    params = {param: value for param, value in model.get_params().items() if "model" not in param}
    params = "\n".join([f"* {param}: {value}" for param, value in params.items()])
    topics = sorted(set(model.topics_))
    nr_topics = str(len(set(model.topics_)))

    if model.topic_sizes_ is not None:
        nr_documents = str(sum(model.topic_sizes_.values()))
    else:
        nr_documents = ""

    # Topic overview table; the three lists below are parallel to `topics`
    topic_keywords = [" - ".join(list(zip(*model.get_topic(topic)))[0][:5]) for topic in topics]
    topic_freq = [model.get_topic_freq(topic) for topic in topics]
    topic_labels = model.custom_labels_ if model.custom_labels_ else [model.topic_labels_[topic] for topic in topics]
    # BUGFIX: index `topic_freq` positionally like the other parallel lists;
    # indexing it by the topic ID mapped the -1 outlier topic to the last entry
    topics = [f"| {topic} | {topic_keywords[index]} | {topic_freq[index]} | {topic_labels[index]} | \n" for index, topic in enumerate(topics)]
    topics = topic_table_head + "".join(topics)
    # `get_package_versions` may fail and not return a mapping; degrade to an
    # empty section rather than crashing on `.items()`
    package_versions = get_package_versions()
    if isinstance(package_versions, dict):
        frameworks = "\n".join([f"* {param}: {value}" for param, value in package_versions.items()])
    else:
        frameworks = ""

    # Fill in the template placeholders
    model_card = model_card.replace("{MODEL_NAME}", model_name)
    model_card = model_card.replace("{PATH}", repo_id)
    model_card = model_card.replace("{NR_TOPICS}", nr_topics)
    model_card = model_card.replace("{TOPICS}", topics.strip())
    model_card = model_card.replace("{NR_DOCUMENTS}", nr_documents)
    model_card = model_card.replace("{HYPERPARAMS}", params)
    model_card = model_card.replace("{FRAMEWORKS}", frameworks)

    # Multimodal models have no fitting pipeline tag, so the line is dropped
    has_visual_aspect = check_has_visual_aspect(model)
    if not has_visual_aspect:
        model_card = model_card.replace("{PIPELINE_TAG}", "text-classification")
    else:
        model_card = model_card.replace("pipeline_tag: {PIPELINE_TAG}\n","")

    return model_card
|
|
|
|
|
|
|
|
def save_hf(model, save_directory, serialization: str):
    """ Save topic embeddings, either safely (using safetensors) or using legacy pytorch

    Arguments:
        model: The BERTopic model whose `topic_embeddings_` are saved
        save_directory: Target directory (a `pathlib.Path`)
        serialization: Either `safetensors` or `pytorch`; any other value is a no-op
    """
    tensors = torch.from_numpy(np.array(model.topic_embeddings_, dtype=np.float32))
    tensors = {"topic_embeddings": tensors}

    # The two serialization formats are mutually exclusive
    if serialization == "safetensors":
        save_safetensors(save_directory / HF_SAFE_WEIGHTS_NAME, tensors)
    elif serialization == "pytorch":
        # BUGFIX: the package is `torch`, not `pytorch`
        assert _has_torch, "`pip install torch` to save as bin"
        torch.save(tensors, save_directory / HF_WEIGHTS_NAME)
|
|
|
|
|
|
|
|
def save_ctfidf(model,
                save_directory: str,
                serialization: str):
    """ Save c-TF-IDF sparse matrix

    Arguments:
        model: The BERTopic model whose `c_tf_idf_` CSR matrix is saved
        save_directory: Target directory (a `pathlib.Path`)
        serialization: Either `safetensors` or `pytorch`; any other value is a no-op
    """
    # Decompose the scipy CSR matrix into dense index/data tensors so it can
    # be stored in formats that only support tensors
    indptr = torch.from_numpy(model.c_tf_idf_.indptr)
    indices = torch.from_numpy(model.c_tf_idf_.indices)
    data = torch.from_numpy(model.c_tf_idf_.data)
    shape = torch.from_numpy(np.array(model.c_tf_idf_.shape))
    diag = torch.from_numpy(np.array(model.ctfidf_model._idf_diag.data))
    tensors = {
        "indptr": indptr,
        "indices": indices,
        "data": data,
        "shape": shape,
        "diag": diag
    }

    # The two serialization formats are mutually exclusive
    if serialization == "safetensors":
        save_safetensors(save_directory / CTFIDF_SAFE_WEIGHTS_NAME, tensors)
    elif serialization == "pytorch":
        # BUGFIX: the package is `torch`, not `pytorch`
        assert _has_torch, "`pip install torch` to save as .bin"
        torch.save(tensors, save_directory / CTFIDF_WEIGHTS_NAME)
|
|
|
|
|
|
|
|
def save_ctfidf_config(model, path):
    """ Save parameters to recreate CountVectorizer and c-TF-IDF """
    # Weighting options of the c-TF-IDF model
    ctfidf_settings = {
        "ctfidf_model": {
            "bm25_weighting": model.ctfidf_model.bm25_weighting,
            "reduce_frequent_words": model.ctfidf_model.reduce_frequent_words
        }
    }

    # CountVectorizer parameters, minus entries that cannot be serialized to JSON
    vectorizer_params = model.vectorizer_model.get_params()
    for key in ("tokenizer", "preprocessor", "dtype"):
        del vectorizer_params[key]
    if not isinstance(vectorizer_params["analyzer"], str):
        del vectorizer_params["analyzer"]

    ctfidf_settings["vectorizer_model"] = {
        "params": vectorizer_params,
        "vocab": model.vectorizer_model.vocabulary_
    }

    with path.open('w') as f:
        json.dump(ctfidf_settings, f, indent=2)
|
|
|
|
|
|
|
|
def save_config(model, path: str, embedding_model):
    """ Save BERTopic configuration """
    path = Path(path)

    # Keep only plain hyperparameters; sub-model objects are not serializable
    config = {}
    for name, value in model.get_params().items():
        if "model" not in name:
            config[name] = value

    # Record the embedding model only when it can be referenced by name
    if isinstance(embedding_model, str):
        config["embedding_model"] = embedding_model

    with path.open('w') as f:
        json.dump(config, f, indent=2)

    return config
|
|
|
|
|
def check_has_visual_aspect(model):
    """ Check whether any topic aspect stores PIL images.

    Arguments:
        model: The BERTopic model whose `topic_aspects_` are inspected

    Returns:
        True if a visual aspect is found, False otherwise (previously the
        negative path implicitly returned None; False is equivalent for all
        boolean uses but explicit). Without PIL installed, always False.
    """
    if _has_vision:
        for aspect, value in model.topic_aspects_.items():
            # An aspect is "visual" when its first representation is an image
            if isinstance(value[0], Image.Image):
                return True
    return False
|
|
|
|
|
def save_images(model, path: str):
    """ Save topic images """
    if _has_vision:
        # Locate the first aspect whose representations are PIL images
        visual_aspects = None
        for aspect_name, representations in model.topic_aspects_.items():
            if isinstance(representations[0], Image.Image):
                visual_aspects = model.topic_aspects_[aspect_name]
                break

        # Write one jpg per topic into the target directory
        if visual_aspects is not None:
            path.mkdir(exist_ok=True, parents=True)
            for topic_id, topic_image in visual_aspects.items():
                topic_image.save(path / f"{topic_id}.jpg")
|
|
|
|
|
|
|
|
def save_topics(model, path: str):
    """ Save Topic-specific information """
    path = Path(path)

    # Image-based aspects cannot be serialized to JSON; replace them with a
    # "Visual_Aspect" marker when PIL is available
    if _has_vision:
        selected_topic_aspects = {}
        for aspect_name, representations in model.topic_aspects_.items():
            if isinstance(representations[0], Image.Image):
                selected_topic_aspects["Visual_Aspect"] = True
            else:
                selected_topic_aspects[aspect_name] = representations
    else:
        selected_topic_aspects = model.topic_aspects_

    # Everything needed to restore topic state, coerced to JSON-friendly types
    topic_state = {
        "topic_representations": model.topic_representations_,
        "topics": [int(topic) for topic in model.topics_],
        "topic_sizes": model.topic_sizes_,
        "topic_mapper": np.array(model.topic_mapper_.mappings_, dtype=int).tolist(),
        "topic_labels": model.topic_labels_,
        "custom_labels": model.custom_labels_,
        "_outliers": int(model._outliers),
        "topic_aspects": selected_topic_aspects
    }

    with path.open('w') as f:
        json.dump(topic_state, f, indent=2, cls=NumpyEncoder)
|
|
|
|
|
|
|
|
def load_cfg_from_json(json_file: Union[str, os.PathLike]):
    """ Load configuration from json """
    # json.load consumes the file object directly; equivalent to
    # reading the text and passing it through json.loads
    with open(json_file, "r", encoding="utf-8") as reader:
        return json.load(reader)
|
|
|
|
|
|
|
|
class NumpyEncoder(json.JSONEncoder):
    """ JSON encoder that converts NumPy scalars (and arrays) to native Python types. """

    def default(self, obj):
        # np.integer covers all int widths (int8..int64, uint*)
        if isinstance(obj, np.integer):
            return int(obj)
        # np.floating covers float16/32/64
        if isinstance(obj, np.floating):
            return float(obj)
        # Generalization: arrays previously raised TypeError; serialize them
        # as (nested) lists of native Python scalars
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NumpyEncoder, self).default(obj)
|
|
|
|
|
|
|
|
|
|
|
def get_package_versions():
    """ Get versions of main dependencies of BERTopic.

    Returns:
        A dict mapping dependency name to its version string. If any of the
        dependencies cannot be imported, an empty dict is returned instead
        (previously the caught exception OBJECT was returned, which crashed
        callers that iterate `.items()` on the result).
    """
    try:
        import platform
        from numpy import __version__ as np_version

        # hdbscan exposes no __version__; query package metadata instead
        try:
            from importlib.metadata import version
            hdbscan_version = version('hdbscan')
        except Exception:
            hdbscan_version = None

        from umap import __version__ as umap_version
        from pandas import __version__ as pandas_version
        from sklearn import __version__ as sklearn_version
        from sentence_transformers import __version__ as sbert_version
        from numba import __version__ as numba_version
        from transformers import __version__ as transformers_version

        from plotly import __version__ as plotly_version
        return {"Numpy": np_version, "HDBSCAN": hdbscan_version, "UMAP": umap_version,
                "Pandas": pandas_version, "Scikit-Learn": sklearn_version,
                "Sentence-transformers": sbert_version, "Transformers": transformers_version,
                "Numba": numba_version, "Plotly": plotly_version, "Python": platform.python_version()}
    except Exception:
        # Best effort: version info is cosmetic (model card), so degrade silently
        return {}
|
|
|
|
|
|
|
|
def load_safetensors(path):
    """ Load safetensors and check whether it is installed """
    try:
        import safetensors
        import safetensors.torch
        return safetensors.torch.load_file(path, device="cpu")
    except ImportError:
        # Installation hint rather than a raw ImportError traceback
        raise ValueError("`pip install safetensors` to load .safetensors")
|
|
|
|
|
|
|
|
def save_safetensors(path, tensors):
    """ Save safetensors and check whether it is installed """
    try:
        import safetensors
        import safetensors.torch
        safetensors.torch.save_file(tensors, path)
    except ImportError:
        # Installation hint rather than a raw ImportError traceback
        raise ValueError("`pip install safetensors` to save as .safetensors")
|
|
|