| | import os |
| | import sys |
| | import json |
| | import numpy as np |
| |
|
| | from pathlib import Path |
| | from tempfile import TemporaryDirectory |
| |
|
| |
|
| | |
| | try: |
| | from huggingface_hub import ( |
| | create_repo, get_hf_file_metadata, |
| | hf_hub_download, hf_hub_url, |
| | repo_type_and_id_from_hf_id, upload_folder) |
| | _has_hf_hub = True |
| | except ImportError: |
| | _has_hf_hub = False |
| |
|
| | |
| | if sys.version_info >= (3, 8): |
| | from typing import Literal |
| | else: |
| | from typing_extensions import Literal |
| | from typing import Union, Mapping, Any |
| |
|
| | |
| | try: |
| | import torch |
| | _has_torch = True |
| | except ImportError: |
| | _has_torch = False |
| |
|
| | |
| | try: |
| | from PIL import Image |
| | _has_vision = True |
| | except: |
| | _has_vision = False |
| |
|
| |
|
# File names shared by local saving and HuggingFace Hub (de)serialization
TOPICS_NAME = "topics.json"
CONFIG_NAME = "config.json"

# Topic embedding weights: legacy pytorch .bin vs. safetensors
HF_WEIGHTS_NAME = "topic_embeddings.bin"
HF_SAFE_WEIGHTS_NAME = "topic_embeddings.safetensors"

# c-TF-IDF sparse matrix weights plus the config needed to rebuild the vectorizer
CTFIDF_WEIGHTS_NAME = "ctfidf.bin"
CTFIDF_SAFE_WEIGHTS_NAME = "ctfidf.safetensors"
CTFIDF_CFG_NAME = "ctfidf_config.json"

# Model-card template filled in by `generate_readme`. The {PLACEHOLDER}
# tokens are substituted with str.replace (NOT str.format), which is why
# literal braces elsewhere in the markdown are safe.
MODEL_CARD_TEMPLATE = """
---
tags:
- bertopic
library_name: bertopic
pipeline_tag: {PIPELINE_TAG}
---

# {MODEL_NAME}

This is a [BERTopic](https://github.com/MaartenGr/BERTopic) model.
BERTopic is a flexible and modular topic modeling framework that allows for the generation of easily interpretable topics from large datasets.

## Usage

To use this model, please install BERTopic:

```
pip install -U bertopic
```

You can use the model as follows:

```python
from bertopic import BERTopic
topic_model = BERTopic.load("{PATH}")

topic_model.get_topic_info()
```

## Topic overview

* Number of topics: {NR_TOPICS}
* Number of training documents: {NR_DOCUMENTS}

<details>
<summary>Click here for an overview of all topics.</summary>

{TOPICS}

</details>

## Training hyperparameters

{HYPERPARAMS}

## Framework versions

{FRAMEWORKS}
"""
| |
|
| |
|
| |
|
def push_to_hf_hub(
        model,
        repo_id: str,
        commit_message: str = 'Add BERTopic model',
        token: str = None,
        revision: str = None,
        private: bool = False,
        create_pr: bool = False,
        model_card: bool = True,
        serialization: str = "safetensors",
        save_embedding_model: Union[str, bool] = True,
        save_ctfidf: bool = False,
        ):
    """ Push your BERTopic model to a HuggingFace Hub

    Arguments:
        model: The fitted BERTopic model to upload
        repo_id: The name of your HuggingFace repository
        commit_message: A commit message
        token: Token to add if not already logged in
        revision: Repository revision
        private: Whether to create a private repository
        create_pr: Whether to upload the model as a Pull Request
        model_card: Whether to automatically create a modelcard
        serialization: The type of serialization.
                       Either `safetensors` or `pytorch`
        save_embedding_model: A pointer towards a HuggingFace model to be loaded in with
                              SentenceTransformers. E.g.,
                              `sentence-transformers/all-MiniLM-L6-v2`
        save_ctfidf: Whether to save c-TF-IDF information

    Returns:
        The commit info returned by `huggingface_hub.upload_folder`

    Raises:
        ValueError: If `huggingface_hub` is not installed
    """
    if not _has_hf_hub:
        raise ValueError("Make sure you have the huggingface hub installed via `pip install --upgrade huggingface_hub`")

    # Create (or reuse) the repo and normalize repo_id to "owner/name"
    repo_url = create_repo(repo_id, token=token, private=private, exist_ok=True)
    _, repo_owner, repo_name = repo_type_and_id_from_hf_id(repo_url)
    repo_id = f"{repo_owner}/{repo_name}"

    with TemporaryDirectory() as tmpdir:
        # Serialize everything locally first, then upload the folder in one commit
        model.save(tmpdir, serialization=serialization, save_embedding_model=save_embedding_model, save_ctfidf=save_ctfidf)

        # Only generate a README if the repository does not already contain one.
        # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit propagate.
        try:
            get_hf_file_metadata(hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision))
        except Exception:
            if model_card:
                readme_text = generate_readme(model, repo_id)
                readme_path = Path(tmpdir) / "README.md"
                readme_path.write_text(readme_text, encoding='utf8')

        return upload_folder(repo_id=repo_id, folder_path=tmpdir, revision=revision,
                             create_pr=create_pr, commit_message=commit_message)
| |
|
| |
|
def load_local_files(path):
    """ Load local BERTopic files from a directory.

    Arguments:
        path: A `Path` to the directory the model was saved in

    Returns:
        Tuple of (topics, params, tensors, ctfidf_tensors, ctfidf_config, images)

    Raises:
        FileNotFoundError: If no topic embedding weights are found. (The
            original code left `tensors` unbound in that case, producing a
            confusing NameError at return time.)
    """
    # Topic and model configuration
    topics = load_cfg_from_json(path / TOPICS_NAME)
    params = load_cfg_from_json(path / CONFIG_NAME)

    # Topic embeddings: prefer safetensors, fall back to the legacy pytorch .bin
    tensors = None
    safetensor_path = path / HF_SAFE_WEIGHTS_NAME
    if safetensor_path.is_file():
        tensors = load_safetensors(safetensor_path)
    else:
        torch_path = path / HF_WEIGHTS_NAME
        if torch_path.is_file():
            tensors = torch.load(torch_path, map_location="cpu")
    if tensors is None:
        raise FileNotFoundError(
            f"Could not find topic embeddings ({HF_SAFE_WEIGHTS_NAME} or {HF_WEIGHTS_NAME}) in {path}"
        )

    # c-TF-IDF information is optional; skip silently when missing/unreadable
    try:
        ctfidf_tensors = None
        safetensor_path = path / CTFIDF_SAFE_WEIGHTS_NAME
        if safetensor_path.is_file():
            ctfidf_tensors = load_safetensors(safetensor_path)
        else:
            torch_path = path / CTFIDF_WEIGHTS_NAME
            if torch_path.is_file():
                ctfidf_tensors = torch.load(torch_path, map_location="cpu")
        ctfidf_config = load_cfg_from_json(path / CTFIDF_CFG_NAME)
    except Exception:  # was a bare `except:`
        ctfidf_config, ctfidf_tensors = None, None

    # Topic images (only present for models trained with a visual aspect)
    images = None
    if _has_vision:
        try:
            Image.open(path / "images/0.jpg")
            _has_images = True
        except Exception:  # was a bare `except:`
            _has_images = False

        if _has_images:
            topic_list = list(topics["topic_representations"].keys())
            images = {}
            for topic in topic_list:
                image = Image.open(path / f"images/{topic}.jpg")
                images[int(topic)] = image

    return topics, params, tensors, ctfidf_tensors, ctfidf_config, images
| |
|
| |
|
def load_files_from_hf(path):
    """ Load BERTopic files from the HuggingFace Hub.

    Arguments:
        path: A repository id on the HuggingFace Hub

    Returns:
        Tuple of (topics, params, tensors, ctfidf_tensors, ctfidf_config, images)
    """
    path = str(path)

    # Topic and model configuration
    topics = load_cfg_from_json(hf_hub_download(path, TOPICS_NAME, revision=None))
    params = load_cfg_from_json(hf_hub_download(path, CONFIG_NAME, revision=None))

    # Topic embeddings: prefer safetensors, fall back to the legacy pytorch .bin.
    # All bare `except:` clauses below were narrowed to `except Exception:` so
    # KeyboardInterrupt/SystemExit still propagate.
    try:
        tensors = hf_hub_download(path, HF_SAFE_WEIGHTS_NAME, revision=None)
        tensors = load_safetensors(tensors)
    except Exception:
        tensors = hf_hub_download(path, HF_WEIGHTS_NAME, revision=None)
        tensors = torch.load(tensors, map_location="cpu")

    # c-TF-IDF information is optional; skip silently when absent
    try:
        ctfidf_config = load_cfg_from_json(hf_hub_download(path, CTFIDF_CFG_NAME, revision=None))
        try:
            ctfidf_tensors = hf_hub_download(path, CTFIDF_SAFE_WEIGHTS_NAME, revision=None)
            ctfidf_tensors = load_safetensors(ctfidf_tensors)
        except Exception:
            ctfidf_tensors = hf_hub_download(path, CTFIDF_WEIGHTS_NAME, revision=None)
            ctfidf_tensors = torch.load(ctfidf_tensors, map_location="cpu")
    except Exception:
        ctfidf_config, ctfidf_tensors = None, None

    # Topic images (only present for models trained with a visual aspect)
    images = None
    if _has_vision:
        try:
            hf_hub_download(path, "images/0.jpg", revision=None)
            _has_images = True
        except Exception:
            _has_images = False

        if _has_images:
            topic_list = list(topics["topic_representations"].keys())
            images = {}
            for topic in topic_list:
                image = Image.open(hf_hub_download(path, f"images/{topic}.jpg", revision=None))
                images[int(topic)] = image

    return topics, params, tensors, ctfidf_tensors, ctfidf_config, images
| |
|
| |
|
def generate_readme(model, repo_id: str):
    """ Generate a README (model card) for the HuggingFace Hub.

    Arguments:
        model: The fitted BERTopic model
        repo_id: The "owner/name" repository id; its last component is used
                 as the model name

    Returns:
        The filled-in markdown model card as a string
    """
    model_card = MODEL_CARD_TEMPLATE
    topic_table_head = "| Topic ID | Topic Keywords | Topic Frequency | Label | \n|----------|----------------|-----------------|-------| \n"

    # Hyperparameters and topic statistics; sub-models are not serializable
    model_name = repo_id.split("/")[-1]
    params = {param: value for param, value in model.get_params().items() if "model" not in param}
    params = "\n".join([f"* {param}: {value}" for param, value in params.items()])
    topics = sorted(list(set(model.topics_)))
    nr_topics = str(len(set(model.topics_)))

    if model.topic_sizes_ is not None:
        nr_documents = str(sum(model.topic_sizes_.values()))
    else:
        nr_documents = ""

    # Topic overview table. NOTE: `topic_keywords`, `topic_freq`, and
    # `topic_labels` are positional lists aligned with `topics`, so they must
    # be indexed with `index`, not the topic id. The previous code used
    # `topic_freq[topic]`, which shifted every frequency by one row whenever
    # the outlier topic (-1) was present.
    topic_keywords = [" - ".join(list(zip(*model.get_topic(topic)))[0][:5]) for topic in topics]
    topic_freq = [model.get_topic_freq(topic) for topic in topics]
    topic_labels = model.custom_labels_ if model.custom_labels_ else [model.topic_labels_[topic] for topic in topics]
    topics = [f"| {topic} | {topic_keywords[index]} | {topic_freq[index]} | {topic_labels[index]} | \n" for index, topic in enumerate(topics)]
    topics = topic_table_head + "".join(topics)
    frameworks = "\n".join([f"* {param}: {value}" for param, value in get_package_versions().items()])

    # Fill in the template via str.replace (the template is not a format string)
    model_card = model_card.replace("{MODEL_NAME}", model_name)
    model_card = model_card.replace("{PATH}", repo_id)
    model_card = model_card.replace("{NR_TOPICS}", nr_topics)
    model_card = model_card.replace("{TOPICS}", topics.strip())
    model_card = model_card.replace("{NR_DOCUMENTS}", nr_documents)
    model_card = model_card.replace("{HYPERPARAMS}", params)
    model_card = model_card.replace("{FRAMEWORKS}", frameworks)

    # Visual models get no pipeline tag; text models are tagged for search
    has_visual_aspect = check_has_visual_aspect(model)
    if not has_visual_aspect:
        model_card = model_card.replace("{PIPELINE_TAG}", "text-classification")
    else:
        model_card = model_card.replace("pipeline_tag: {PIPELINE_TAG}\n", "")

    return model_card
| |
|
| |
|
def save_hf(model, save_directory, serialization: str):
    """ Save topic embeddings, either safely (using safetensors) or using legacy pytorch.

    Arguments:
        save_directory: A `Path` to the output directory
        serialization: Either `safetensors` or `pytorch`
    """
    tensors = torch.from_numpy(np.array(model.topic_embeddings_, dtype=np.float32))
    tensors = {"topic_embeddings": tensors}

    if serialization == "safetensors":
        save_safetensors(save_directory / HF_SAFE_WEIGHTS_NAME, tensors)
    elif serialization == "pytorch":
        # The PyPI package is `torch`; the old message said `pip install pytorch`,
        # which installs a dummy package
        assert _has_torch, "`pip install torch` to save as bin"
        torch.save(tensors, save_directory / HF_WEIGHTS_NAME)
| |
|
| |
|
def save_ctfidf(model,
                save_directory: str,
                serialization: str):
    """ Save c-TF-IDF sparse matrix """
    # Decompose the CSR matrix into its raw components plus the IDF diagonal
    sparse_matrix = model.c_tf_idf_
    tensors = {
        "indptr": torch.from_numpy(sparse_matrix.indptr),
        "indices": torch.from_numpy(sparse_matrix.indices),
        "data": torch.from_numpy(sparse_matrix.data),
        "shape": torch.from_numpy(np.array(sparse_matrix.shape)),
        "diag": torch.from_numpy(np.array(model.ctfidf_model._idf_diag.data)),
    }

    if serialization == "safetensors":
        save_safetensors(save_directory / CTFIDF_SAFE_WEIGHTS_NAME, tensors)
    if serialization == "pytorch":
        assert _has_torch, "`pip install pytorch` to save as .bin"
        torch.save(tensors, save_directory / CTFIDF_WEIGHTS_NAME)
| |
|
| |
|
def save_ctfidf_config(model, path):
    """ Save parameters to recreate CountVectorizer and c-TF-IDF """
    # Settings needed to rebuild the ClassTfidfTransformer
    ctfidf_settings = {
        "bm25_weighting": model.ctfidf_model.bm25_weighting,
        "reduce_frequent_words": model.ctfidf_model.reduce_frequent_words
    }

    # CountVectorizer parameters; drop entries that cannot be json-serialized
    cv_params = model.vectorizer_model.get_params()
    for key in ("tokenizer", "preprocessor", "dtype"):
        del cv_params[key]
    if not isinstance(cv_params["analyzer"], str):
        del cv_params["analyzer"]

    config = {
        "ctfidf_model": ctfidf_settings,
        "vectorizer_model": {
            "params": cv_params,
            "vocab": model.vectorizer_model.vocabulary_
        }
    }

    with path.open('w') as f:
        json.dump(config, f, indent=2)
| |
|
| |
|
def save_config(model, path: str, embedding_model):
    """ Save BERTopic configuration """
    path = Path(path)

    # Sub-model objects cannot be json-serialized, so keep only plain params
    all_params = model.get_params()
    config = {name: value for name, value in all_params.items() if "model" not in name}

    # An embedding model can only be stored as a string pointer to its name
    if isinstance(embedding_model, str):
        config["embedding_model"] = embedding_model

    with path.open('w') as f:
        json.dump(config, f, indent=2)

    return config
| |
|
def check_has_visual_aspect(model):
    """ Check whether the model stores a visual (image-based) topic aspect.

    Returns:
        True if any topic aspect holds PIL images, otherwise False
        (including when PIL is not installed). The original implicitly
        returned None in the negative cases and kept a dead
        `visual_aspects` assignment; both falsy, so callers are unaffected.
    """
    if _has_vision:
        for aspect, value in model.topic_aspects_.items():
            if isinstance(value[0], Image.Image):
                return True
    return False
| |
|
def save_images(model, path: str):
    """ Save topic images as one jpg per topic in `path`.

    Arguments:
        path: Output directory; accepts either a str or a Path. (The
              annotation said `str`, but the original only worked with a
              Path — `path.mkdir` / the `/` operator would fail on a str.)
    """
    if _has_vision:
        path = Path(path)

        # Find the first aspect whose values are PIL images
        visual_aspects = None
        for aspect, value in model.topic_aspects_.items():
            if isinstance(value[0], Image.Image):
                visual_aspects = model.topic_aspects_[aspect]
                break

        if visual_aspects is not None:
            path.mkdir(exist_ok=True, parents=True)
            for topic, image in visual_aspects.items():
                image.save(path / f"{topic}.jpg")
| |
|
| |
|
def save_topics(model, path: str):
    """ Save Topic-specific information """
    path = Path(path)

    # Keep only json-serializable aspects; image aspects are flagged, not embedded
    if _has_vision:
        selected_topic_aspects = {}
        for aspect, value in model.topic_aspects_.items():
            if isinstance(value[0], Image.Image):
                selected_topic_aspects["Visual_Aspect"] = True
            else:
                selected_topic_aspects[aspect] = value
    else:
        selected_topic_aspects = model.topic_aspects_

    topics = {
        "topic_representations": model.topic_representations_,
        "topics": [int(topic) for topic in model.topics_],
        "topic_sizes": model.topic_sizes_,
        "topic_mapper": np.array(model.topic_mapper_.mappings_, dtype=int).tolist(),
        "topic_labels": model.topic_labels_,
        "custom_labels": model.custom_labels_,
        "_outliers": int(model._outliers),
        "topic_aspects": selected_topic_aspects
    }

    with path.open('w') as f:
        json.dump(topics, f, indent=2, cls=NumpyEncoder)
| |
|
| |
|
def load_cfg_from_json(json_file: Union[str, os.PathLike]):
    """ Load configuration from json """
    contents = Path(json_file).read_text(encoding="utf-8")
    return json.loads(contents)
| |
|
| |
|
class NumpyEncoder(json.JSONEncoder):
    """ JSON encoder that understands numpy scalar and array types. """

    def default(self, obj):
        # numpy scalars are not handled by the stock encoder
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        # Generalization: serialize arrays as (nested) lists
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
| |
|
| |
|
| |
|
def get_package_versions():
    """ Get versions of main dependencies of BERTopic.

    Returns:
        A mapping of dependency name to version string. On any failure an
        empty dict is returned so callers can still iterate `.items()`;
        the previous code returned the caught exception object itself,
        which made `generate_readme` crash with an AttributeError.
    """
    try:
        import platform
        from numpy import __version__ as np_version

        # hdbscan is optional; record None when it cannot be resolved
        try:
            from importlib.metadata import version
            hdbscan_version = version('hdbscan')
        except Exception:
            hdbscan_version = None

        from umap import __version__ as umap_version
        from pandas import __version__ as pandas_version
        from sklearn import __version__ as sklearn_version
        from sentence_transformers import __version__ as sbert_version
        from numba import __version__ as numba_version
        from transformers import __version__ as transformers_version
        from plotly import __version__ as plotly_version
        return {"Numpy": np_version, "HDBSCAN": hdbscan_version, "UMAP": umap_version,
                "Pandas": pandas_version, "Scikit-Learn": sklearn_version,
                "Sentence-transformers": sbert_version, "Transformers": transformers_version,
                "Numba": numba_version, "Plotly": plotly_version, "Python": platform.python_version()}
    except Exception:
        # Version reporting is best-effort; never let it break model-card creation
        return {}
| | |
| |
|
def load_safetensors(path):
    """ Load safetensors and check whether it is installed """
    # Import lazily so safetensors stays an optional dependency
    try:
        import safetensors.torch
        return safetensors.torch.load_file(path, device="cpu")
    except ImportError:
        raise ValueError("`pip install safetensors` to load .safetensors")
| |
|
| |
|
def save_safetensors(path, tensors):
    """ Save safetensors and check whether it is installed """
    # Import lazily so safetensors stays an optional dependency
    try:
        import safetensors.torch
        safetensors.torch.save_file(tensors, path)
    except ImportError:
        raise ValueError("`pip install safetensors` to save as .safetensors")
| |
|