kisejin
/

TopicModelingRepo

Model card Files Files and versions

TopicModelingRepo / BERTopic /bertopic /backend /_use.py

kisejin's picture

Upload 261 files

19b102a verified almost 2 years ago

1.77 kB

	import numpy as np
	from tqdm import tqdm
	from typing import List

	from bertopic.backend import BaseEmbedder


	class USEBackend(BaseEmbedder):
	    """ Universal Sentence Encoder

	    USE encodes text into high-dimensional vectors that
	    are used for semantic similarity in BERTopic.

	    Arguments:
	        embedding_model: A USE embedding model; it must be callable with a
	                         list of strings.

	    Raises:
	        ValueError: If calling `embedding_model` with a list containing one
	                    test sentence raises a `TypeError`.

	    Examples:

	    ```python
	    import tensorflow_hub
	    from bertopic.backend import USEBackend

	    embedding_model = tensorflow_hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
	    use_embedder = USEBackend(embedding_model)
	    ```
	    """
	    def __init__(self, embedding_model):
	        super().__init__()

	        # Probe the model with a single sentence. Only the probe call sits in
	        # the `try` so that a TypeError can only come from the model itself.
	        try:
	            embedding_model(["test sentence"])
	        except TypeError as err:
	            # Chain the original error so the root cause stays visible in
	            # the traceback while callers still receive a ValueError.
	            raise ValueError("Please select a correct USE model: \n"
	                             "`import tensorflow_hub` \n"
	                             "`embedding_model = tensorflow_hub.load(path_to_model)`") from err

	        self.embedding_model = embedding_model

	    def embed(self,
	              documents: List[str],
	              verbose: bool = False) -> np.ndarray:
	        """ Embed a list of n documents/words into an n-dimensional
	        matrix of embeddings

	        Each document is passed to the model on its own, wrapped in a
	        single-element list, and the first row of the result is kept.

	        Arguments:
	            documents: A list of documents or words to be embedded
	            verbose: Controls the verbosity of the process; when `True` a
	                     tqdm progress bar is shown

	        Returns:
	            Document/words embeddings with shape (n, m) with `n` documents/words
	            that each have an embeddings size of `m`. An empty `documents`
	            list produces an empty array of shape (0,).
	        """
	        # The progress bar is disabled unless verbosity was requested.
	        progress = tqdm(documents, disable=not verbose)
	        embeddings = np.array(
	            [
	                self.embedding_model([doc]).cpu().numpy()[0]
	                for doc in progress
	            ]
	        )
	        return embeddings