import functools
import random
import time
| |
|
| |
|
def truncate_document(topic_model, doc_length, tokenizer, document: str):
    """Truncate a document to a certain length.

    If you want to add a custom tokenizer, then it will need to have a `decode` and
    `encode` method. An example would be the following custom tokenizer:

    ```python
    class Tokenizer:
        'A custom tokenizer that splits on commas'
        def encode(self, doc):
            return doc.split(",")

        def decode(self, doc_chunks):
            return ",".join(doc_chunks)
    ```

    You can use this tokenizer by passing it to the `tokenizer` parameter.

    Arguments:
        topic_model: A BERTopic model. Only used when `tokenizer="vectorizer"`,
                     in which case its internal CountVectorizer tokenizes the document.
        doc_length: The maximum length of each document. If a document is longer,
                    it will be truncated. If None, the entire document is passed.
        tokenizer: The tokenizer used to split the document into segments
                   used to count the length of a document.
                       * If tokenizer is 'char', then the document is split up
                         into characters which are counted to adhere to `doc_length`
                       * If tokenizer is 'whitespace', the document is split up
                         into words separated by whitespaces. These words are counted
                         and truncated depending on `doc_length`
                       * If tokenizer is 'vectorizer', then the internal CountVectorizer
                         is used to tokenize the document. These tokens are counted
                         and truncated depending on `doc_length`. They are decoded with
                         whitespaces.
                       * If tokenizer is a callable with `encode` and `decode` methods,
                         then it is used to tokenize the document. These tokens are
                         counted and truncated depending on `doc_length`
        document: A single document

    Returns:
        truncated_document: A truncated document

    Raises:
        ValueError: If `doc_length` is set but `tokenizer` is not one of
                    'char', 'whitespace', 'vectorizer', or an object with
                    `encode`/`decode` methods. (Previously this raised an
                    opaque UnboundLocalError.)
    """
    # No length limit: pass the document through untouched.
    if doc_length is None:
        return document

    if tokenizer == "char":
        return document[:doc_length]
    if tokenizer == "whitespace":
        return " ".join(document.split()[:doc_length])
    if tokenizer == "vectorizer":
        # Reuse the model's own tokenization so counting matches training.
        vectorizer = topic_model.vectorizer_model.build_tokenizer()
        return " ".join(vectorizer(document)[:doc_length])
    if hasattr(tokenizer, "encode") and hasattr(tokenizer, "decode"):
        encoded_document = tokenizer.encode(document)
        return tokenizer.decode(encoded_document[:doc_length])

    raise ValueError(
        "Please select from 'char', 'whitespace', 'vectorizer', or provide a "
        "tokenizer with both an `encode` and a `decode` method."
    )
| |
|
| |
|
def retry_with_exponential_backoff(
    func,
    initial_delay: float = 1,
    exponential_base: float = 2,
    jitter: bool = True,
    max_retries: int = 10,
    errors: tuple = None,
):
    """Retry a function with exponential backoff.

    Arguments:
        func: The function to wrap with retry behavior.
        initial_delay: Seconds to wait before the first retry.
        exponential_base: Multiplier applied to the delay after each failure.
        jitter: If True, multiply the backoff factor by (1 + random()) to
                spread retries from concurrent callers apart.
        max_retries: Maximum number of retries before giving up.
        errors: Tuple of exception types that trigger a retry. If None,
                retries on any `Exception`.

    Returns:
        A wrapped version of `func` that retries on the given errors.

    Raises:
        Exception: If `max_retries` is exceeded.
    """
    if errors is None:
        # `except None` would raise a TypeError the first time `func` failed;
        # fall back to retrying on any Exception instead.
        errors = (Exception,)

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        num_retries = 0
        delay = initial_delay

        # Keep calling until success, a non-retryable error, or retry exhaustion.
        while True:
            try:
                return func(*args, **kwargs)
            except errors:
                num_retries += 1
                if num_retries > max_retries:
                    raise Exception(
                        f"Maximum number of retries ({max_retries}) exceeded."
                    )
                # Grow the delay exponentially; jitter desynchronizes callers.
                delay *= exponential_base * (1 + jitter * random.random())
                time.sleep(delay)

    return wrapper