import os
import sys
import json
import numpy as np
from pathlib import Path
from tempfile import TemporaryDirectory

# HuggingFace Hub is optional; only required when pushing to / pulling from the Hub.
try:
    from huggingface_hub import (
        create_repo,
        get_hf_file_metadata,
        hf_hub_download,
        hf_hub_url,
        repo_type_and_id_from_hf_id,
        upload_folder,
    )

    _has_hf_hub = True
except ImportError:
    _has_hf_hub = False

# Typing
if sys.version_info >= (3, 8):
    from typing import Literal
else:
    from typing_extensions import Literal
from typing import Union, Mapping, Any

# Pytorch check (optional dependency; needed for .bin weights and safetensors)
try:
    import torch

    _has_torch = True
except ImportError:
    _has_torch = False

# Image check (optional dependency; needed for visual topic aspects)
try:
    from PIL import Image

    _has_vision = True
except ImportError:
    _has_vision = False


TOPICS_NAME = "topics.json"
CONFIG_NAME = "config.json"

HF_WEIGHTS_NAME = "topic_embeddings.bin"  # default pytorch pkl
HF_SAFE_WEIGHTS_NAME = "topic_embeddings.safetensors"  # safetensors version

CTFIDF_WEIGHTS_NAME = "ctfidf.bin"  # default pytorch pkl
CTFIDF_SAFE_WEIGHTS_NAME = "ctfidf.safetensors"  # safetensors version
CTFIDF_CFG_NAME = "ctfidf_config.json"

MODEL_CARD_TEMPLATE = """
---
tags:
- bertopic
library_name: bertopic
pipeline_tag: {PIPELINE_TAG}
---

# {MODEL_NAME}

This is a [BERTopic](https://github.com/MaartenGr/BERTopic) model.
BERTopic is a flexible and modular topic modeling framework that allows for the generation of easily interpretable topics from large datasets.

## Usage

To use this model, please install BERTopic:

```
pip install -U bertopic
```

You can use the model as follows:

```python
from bertopic import BERTopic
topic_model = BERTopic.load("{PATH}")

topic_model.get_topic_info()
```

## Topic overview

* Number of topics: {NR_TOPICS}
* Number of training documents: {NR_DOCUMENTS}

<details>
  <summary>Click here for an overview of all topics.</summary>

  {TOPICS}

</details>

## Training hyperparameters

{HYPERPARAMS}

## Framework versions

{FRAMEWORKS}
"""


def push_to_hf_hub(
        model,
        repo_id: str,
        commit_message: str = 'Add BERTopic model',
        token: str = None,
        revision: str = None,
        private: bool = False,
        create_pr: bool = False,
        model_card: bool = True,
        serialization: str = "safetensors",
        save_embedding_model: Union[str, bool] = True,
        save_ctfidf: bool = False,
        ):
    """ Push your BERTopic model to a HuggingFace Hub

    Arguments:
        model: The BERTopic model to push
        repo_id: The name of your HuggingFace repository
        commit_message: A commit message
        token: Token to add if not already logged in
        revision: Repository revision
        private: Whether to create a private repository
        create_pr: Whether to upload the model as a Pull Request
        model_card: Whether to automatically create a modelcard
        serialization: The type of serialization.
                       Either `safetensors` or `pytorch`
        save_embedding_model: A pointer towards a HuggingFace model to be loaded in with
                              SentenceTransformers. E.g.,
                              `sentence-transformers/all-MiniLM-L6-v2`
        save_ctfidf: Whether to save c-TF-IDF information

    Returns:
        The return value of `huggingface_hub.upload_folder`.

    Raises:
        ValueError: If `huggingface_hub` is not installed.
    """
    if not _has_hf_hub:
        raise ValueError("Make sure you have the huggingface hub installed via `pip install --upgrade huggingface_hub`")

    # Create repo if it doesn't exist yet and infer complete repo_id
    repo_url = create_repo(repo_id, token=token, private=private, exist_ok=True)
    _, repo_owner, repo_name = repo_type_and_id_from_hf_id(repo_url)
    repo_id = f"{repo_owner}/{repo_name}"

    # Temporarily save model and push to HF
    with TemporaryDirectory() as tmpdir:
        # Save model weights and config.
        model.save(tmpdir, serialization=serialization, save_embedding_model=save_embedding_model, save_ctfidf=save_ctfidf)

        # Add README if it does not exist on the Hub yet. Any failure of the
        # metadata probe (missing file, network issue) is treated as "no README".
        try:
            get_hf_file_metadata(hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision))
        except Exception:
            if model_card:
                readme_text = generate_readme(model, repo_id)
                readme_path = Path(tmpdir) / "README.md"
                readme_path.write_text(readme_text, encoding='utf8')

        # Upload model
        return upload_folder(repo_id=repo_id, folder_path=tmpdir, revision=revision, create_pr=create_pr, commit_message=commit_message)


def load_local_files(path):
    """ Load local BERTopic files

    Arguments:
        path: A `Path` to the directory holding the saved model files.

    Returns:
        Tuple of (topics, params, tensors, ctfidf_tensors, ctfidf_config, images).

    Raises:
        FileNotFoundError: If no topic-embedding weights file is found.
    """
    # Load json configs
    topics = load_cfg_from_json(path / TOPICS_NAME)
    params = load_cfg_from_json(path / CONFIG_NAME)

    # Load Topic Embeddings; prefer safetensors, fall back to the pytorch .bin
    tensors = None
    safetensor_path = path / HF_SAFE_WEIGHTS_NAME
    if safetensor_path.is_file():
        tensors = load_safetensors(safetensor_path)
    else:
        torch_path = path / HF_WEIGHTS_NAME
        if torch_path.is_file():
            tensors = torch.load(torch_path, map_location="cpu")
    if tensors is None:
        # Previously this fell through and crashed later with an unbound name
        raise FileNotFoundError(
            f"Could not find topic embeddings in {path}. Expected either "
            f"{HF_SAFE_WEIGHTS_NAME} or {HF_WEIGHTS_NAME}."
        )

    # c-TF-IDF is optional; loading it is best-effort
    try:
        ctfidf_tensors = None
        safetensor_path = path / CTFIDF_SAFE_WEIGHTS_NAME
        if safetensor_path.is_file():
            ctfidf_tensors = load_safetensors(safetensor_path)
        else:
            torch_path = path / CTFIDF_WEIGHTS_NAME
            if torch_path.is_file():
                ctfidf_tensors = torch.load(torch_path, map_location="cpu")
        ctfidf_config = load_cfg_from_json(path / CTFIDF_CFG_NAME)
    except Exception:
        ctfidf_config, ctfidf_tensors = None, None

    # Load images if they were saved alongside the model.
    # Topic "0" is used as a probe for whether any images exist.
    images = None
    if _has_vision:
        try:
            Image.open(path / "images/0.jpg")
            _has_images = True
        except Exception:
            _has_images = False

        if _has_images:
            topic_list = list(topics["topic_representations"].keys())
            images = {}
            for topic in topic_list:
                image = Image.open(path / f"images/{topic}.jpg")
                images[int(topic)] = image

    return topics, params, tensors, ctfidf_tensors, ctfidf_config, images


def load_files_from_hf(path):
    """ Load BERTopic files from the HuggingFace Hub

    Arguments:
        path: The repository id on the Hub, e.g. `"user/my-bertopic-model"`.

    Returns:
        Tuple of (topics, params, tensors, ctfidf_tensors, ctfidf_config, images).
    """
    path = str(path)

    # Configs
    topics = load_cfg_from_json(hf_hub_download(path, TOPICS_NAME, revision=None))
    params = load_cfg_from_json(hf_hub_download(path, CONFIG_NAME, revision=None))

    # Topic Embeddings; prefer safetensors, fall back to the pytorch .bin
    try:
        tensors = hf_hub_download(path, HF_SAFE_WEIGHTS_NAME, revision=None)
        tensors = load_safetensors(tensors)
    except Exception:
        tensors = hf_hub_download(path, HF_WEIGHTS_NAME, revision=None)
        tensors = torch.load(tensors, map_location="cpu")

    # c-TF-IDF is optional; loading it is best-effort
    try:
        ctfidf_config = load_cfg_from_json(hf_hub_download(path, CTFIDF_CFG_NAME, revision=None))
        try:
            ctfidf_tensors = hf_hub_download(path, CTFIDF_SAFE_WEIGHTS_NAME, revision=None)
            ctfidf_tensors = load_safetensors(ctfidf_tensors)
        except Exception:
            ctfidf_tensors = hf_hub_download(path, CTFIDF_WEIGHTS_NAME, revision=None)
            ctfidf_tensors = torch.load(ctfidf_tensors, map_location="cpu")
    except Exception:
        ctfidf_config, ctfidf_tensors = None, None

    # Load images if they exist; topic "0" is used as the existence probe
    images = None
    if _has_vision:
        try:
            hf_hub_download(path, "images/0.jpg", revision=None)
            _has_images = True
        except Exception:
            _has_images = False

        if _has_images:
            topic_list = list(topics["topic_representations"].keys())
            images = {}
            for topic in topic_list:
                image = Image.open(hf_hub_download(path, f"images/{topic}.jpg", revision=None))
                images[int(topic)] = image

    return topics, params, tensors, ctfidf_tensors, ctfidf_config, images


def generate_readme(model, repo_id: str):
    """ Generate README for the HuggingFace model card

    Arguments:
        model: A fitted BERTopic model
        repo_id: The repository id, e.g. `"user/my-bertopic-model"`

    Returns:
        The rendered model card as a string.
    """
    model_card = MODEL_CARD_TEMPLATE
    topic_table_head = "| Topic ID | Topic Keywords | Topic Frequency | Label | \n|----------|----------------|-----------------|-------| \n"

    # Get Statistics
    model_name = repo_id.split("/")[-1]
    params = {param: value for param, value in model.get_params().items() if "model" not in param}
    params = "\n".join([f"* {param}: {value}" for param, value in params.items()])
    topics = sorted(list(set(model.topics_)))
    nr_topics = str(len(set(model.topics_)))

    if model.topic_sizes_ is not None:
        nr_documents = str(sum(model.topic_sizes_.values()))
    else:
        nr_documents = ""

    # Topic information: top-5 keywords, frequency and label per topic.
    # NOTE: these lists are positional, so they must be indexed with `index`,
    # not the topic ID — topic IDs can include -1 (outliers), which would
    # silently pick the wrong element.
    topic_keywords = [" - ".join(list(zip(*model.get_topic(topic)))[0][:5]) for topic in topics]
    topic_freq = [model.get_topic_freq(topic) for topic in topics]
    topic_labels = model.custom_labels_ if model.custom_labels_ else [model.topic_labels_[topic] for topic in topics]
    topics = [f"| {topic} | {topic_keywords[index]} | {topic_freq[index]} | {topic_labels[index]} | \n" for index, topic in enumerate(topics)]
    topics = topic_table_head + "".join(topics)
    frameworks = "\n".join([f"* {param}: {value}" for param, value in get_package_versions().items()])

    # Fill Statistics into model card
    model_card = model_card.replace("{MODEL_NAME}", model_name)
    model_card = model_card.replace("{PATH}", repo_id)
    model_card = model_card.replace("{NR_TOPICS}", nr_topics)
    model_card = model_card.replace("{TOPICS}", topics.strip())
    model_card = model_card.replace("{NR_DOCUMENTS}", nr_documents)
    model_card = model_card.replace("{HYPERPARAMS}", params)
    model_card = model_card.replace("{FRAMEWORKS}", frameworks)

    # Fill Pipeline tag
    has_visual_aspect = check_has_visual_aspect(model)
    if not has_visual_aspect:
        model_card = model_card.replace("{PIPELINE_TAG}", "text-classification")
    else:
        model_card = model_card.replace("pipeline_tag: {PIPELINE_TAG}\n","")  # TODO add proper tag for this instance

    return model_card


def save_hf(model, save_directory, serialization: str):
    """ Save topic embeddings, either safely (using safetensors) or using legacy pytorch

    Arguments:
        model: A fitted BERTopic model
        save_directory: `Path` of the directory to save into
        serialization: Either `"safetensors"` or `"pytorch"`

    Raises:
        ValueError: If pytorch is not installed.
    """
    # Check torch *before* using it; previously `torch.from_numpy` ran first
    # and raised NameError when torch was missing, and the check was a bare
    # `assert` (stripped under `python -O`).
    if not _has_torch:
        raise ValueError("`pip install torch` to save topic embeddings")

    tensors = torch.from_numpy(np.array(model.topic_embeddings_, dtype=np.float32))
    tensors = {"topic_embeddings": tensors}

    if serialization == "safetensors":
        save_safetensors(save_directory / HF_SAFE_WEIGHTS_NAME, tensors)
    if serialization == "pytorch":
        torch.save(tensors, save_directory / HF_WEIGHTS_NAME)


def save_ctfidf(model, save_directory: str, serialization: str):
    """ Save c-TF-IDF sparse matrix

    The CSR matrix is decomposed into its `indptr`/`indices`/`data`/`shape`
    components (plus the IDF diagonal) so it can be stored as plain tensors.

    Arguments:
        model: A fitted BERTopic model
        save_directory: Directory to save into
        serialization: Either `"safetensors"` or `"pytorch"`

    Raises:
        ValueError: If pytorch is not installed.
    """
    # See `save_hf`: torch must be checked before any `torch.` usage
    if not _has_torch:
        raise ValueError("`pip install torch` to save the c-TF-IDF matrix")

    indptr = torch.from_numpy(model.c_tf_idf_.indptr)
    indices = torch.from_numpy(model.c_tf_idf_.indices)
    data = torch.from_numpy(model.c_tf_idf_.data)
    shape = torch.from_numpy(np.array(model.c_tf_idf_.shape))
    diag = torch.from_numpy(np.array(model.ctfidf_model._idf_diag.data))
    tensors = {
        "indptr": indptr,
        "indices": indices,
        "data": data,
        "shape": shape,
        "diag": diag
    }

    if serialization == "safetensors":
        save_safetensors(save_directory / CTFIDF_SAFE_WEIGHTS_NAME, tensors)
    if serialization == "pytorch":
        torch.save(tensors, save_directory / CTFIDF_WEIGHTS_NAME)


def save_ctfidf_config(model, path):
    """ Save parameters to recreate CountVectorizer and c-TF-IDF

    Arguments:
        model: A fitted BERTopic model
        path: A `Path` to the output json file
    """
    config = {}

    # Recreate ClassTfidfTransformer
    config["ctfidf_model"] = {
        "bm25_weighting": model.ctfidf_model.bm25_weighting,
        "reduce_frequent_words": model.ctfidf_model.reduce_frequent_words
    }

    # Recreate CountVectorizer; drop params that are not json-serializable
    cv_params = model.vectorizer_model.get_params()
    del cv_params["tokenizer"], cv_params["preprocessor"], cv_params["dtype"]
    if not isinstance(cv_params["analyzer"], str):
        del cv_params["analyzer"]

    config["vectorizer_model"] = {
        "params": cv_params,
        "vocab": model.vectorizer_model.vocabulary_
    }

    with path.open('w') as f:
        json.dump(config, f, indent=2)


def save_config(model, path: str, embedding_model):
    """ Save BERTopic configuration

    Arguments:
        model: A fitted BERTopic model
        path: Output path of the json config
        embedding_model: Embedding model tag to be used in sentence-transformers;
                         only stored when given as a string.

    Returns:
        The configuration dictionary that was written to disk.
    """
    path = Path(path)
    params = model.get_params()
    # Sub-models themselves are not json-serializable; keep only plain params
    config = {param: value for param, value in params.items() if "model" not in param}

    # Embedding model tag to be used in sentence-transformers
    if isinstance(embedding_model, str):
        config["embedding_model"] = embedding_model

    with path.open('w') as f:
        json.dump(config, f, indent=2)

    return config


def check_has_visual_aspect(model):
    """Check if model has a visual aspect (i.e. any topic aspect holds images)"""
    if _has_vision:
        for aspect, value in model.topic_aspects_.items():
            if isinstance(value[0], Image.Image):
                return True
    return False


def save_images(model, path: str):
    """ Save topic images

    Arguments:
        model: A fitted BERTopic model
        path: Directory in which the topic images are saved as `<topic>.jpg`
    """
    if _has_vision:
        path = Path(path)
        visual_aspects = None
        for aspect, value in model.topic_aspects_.items():
            if isinstance(value[0], Image.Image):
                visual_aspects = model.topic_aspects_[aspect]
                break

        if visual_aspects is not None:
            path.mkdir(exist_ok=True, parents=True)
            for topic, image in visual_aspects.items():
                image.save(path / f"{topic}.jpg")


def save_topics(model, path: str):
    """ Save Topic-specific information

    Arguments:
        model: A fitted BERTopic model
        path: Output path of the topics json file
    """
    path = Path(path)

    if _has_vision:
        # Image aspects cannot be serialized to json; record their presence
        # with a `Visual_Aspect` flag instead
        selected_topic_aspects = {}
        for aspect, value in model.topic_aspects_.items():
            if not isinstance(value[0], Image.Image):
                selected_topic_aspects[aspect] = value
            else:
                selected_topic_aspects["Visual_Aspect"] = True
    else:
        selected_topic_aspects = model.topic_aspects_

    topics = {
        "topic_representations": model.topic_representations_,
        "topics": [int(topic) for topic in model.topics_],
        "topic_sizes": model.topic_sizes_,
        "topic_mapper": np.array(model.topic_mapper_.mappings_, dtype=int).tolist(),
        "topic_labels": model.topic_labels_,
        "custom_labels": model.custom_labels_,
        "_outliers": int(model._outliers),
        "topic_aspects": selected_topic_aspects
    }

    with path.open('w') as f:
        json.dump(topics, f, indent=2, cls=NumpyEncoder)


def load_cfg_from_json(json_file: Union[str, os.PathLike]):
    """ Load configuration from json """
    with open(json_file, "r", encoding="utf-8") as reader:
        text = reader.read()
    return json.loads(text)


class NumpyEncoder(json.JSONEncoder):
    """ json encoder that converts numpy scalars to native Python numbers """
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        return super(NumpyEncoder, self).default(obj)


def get_package_versions():
    """ Get versions of the main dependencies of BERTopic

    Returns:
        Mapping of package name to version string. Packages that are not
        installed map to None instead of failing, so the result is always a
        dict (previously the caught exception object itself was returned,
        which crashed callers iterating over `.items()`).
    """
    import platform

    def _version(module_name):
        # Best-effort version lookup: module attribute first, then package
        # metadata, and None when the package is unavailable.
        try:
            return __import__(module_name).__version__
        except Exception:
            try:
                from importlib.metadata import version
                return version(module_name)
            except Exception:
                return None

    return {"Numpy": _version("numpy"),
            "HDBSCAN": _version("hdbscan"),
            "UMAP": _version("umap"),
            "Pandas": _version("pandas"),
            "Scikit-Learn": _version("sklearn"),
            "Sentence-transformers": _version("sentence_transformers"),
            "Transformers": _version("transformers"),
            "Numba": _version("numba"),
            "Plotly": _version("plotly"),
            "Python": platform.python_version()}


def load_safetensors(path):
    """ Load safetensors and check whether it is installed

    Raises:
        ValueError: If safetensors is not installed.
    """
    try:
        import safetensors.torch
        import safetensors
        return safetensors.torch.load_file(path, device="cpu")
    except ImportError:
        raise ValueError("`pip install safetensors` to load .safetensors")


def save_safetensors(path, tensors):
    """ Save safetensors and check whether it is installed

    Raises:
        ValueError: If safetensors is not installed.
    """
    try:
        import safetensors.torch
        import safetensors
        safetensors.torch.save_file(tensors, path)
    except ImportError:
        raise ValueError("`pip install safetensors` to save as .safetensors")