Upload 261 files

19b102a verified almost 2 years ago

8.16 kB

	import pandas as pd
	from langchain.docstore.document import Document
	from scipy.sparse import csr_matrix
	from typing import Callable, Dict, Mapping, List, Tuple, Union

	from bertopic.representation._base import BaseRepresentation
	from bertopic.representation._utils import truncate_document

	DEFAULT_PROMPT = "What are these documents about? Please give a single label."


	class LangChain(BaseRepresentation):
	""" Using chains in langchain to generate topic labels.

	The classic example uses `langchain.chains.question_answering.load_qa_chain`.
	This returns a chain that takes a list of documents and a question as input.

	You can also use Runnables such as those composed using the LangChain Expression Language.

	Arguments:
	chain: The langchain chain or Runnable with a `batch` method.
	Input keys must be `input_documents` and `question`.
	Output key must be `output_text`.
	prompt: The prompt to be used in the model. If no prompt is given,
	`self.default_prompt_` is used instead.
	nr_docs: The number of documents to pass to LangChain if a prompt
	with the `["DOCUMENTS"]` tag is used.
	diversity: The diversity of documents to pass to LangChain.
	Accepts values between 0 and 1. A higher
	values results in passing more diverse documents
	whereas lower values passes more similar documents.
	doc_length: The maximum length of each document. If a document is longer,
	it will be truncated. If None, the entire document is passed.
	tokenizer: The tokenizer used to calculate to split the document into segments
	used to count the length of a document.
	* If tokenizer is 'char', then the document is split up
	into characters which are counted to adhere to `doc_length`
	* If tokenizer is 'whitespace', the document is split up
	into words separated by whitespaces. These words are counted
	and truncated depending on `doc_length`
	* If tokenizer is 'vectorizer', then the internal CountVectorizer
	is used to tokenize the document. These tokens are counted
	and trunctated depending on `doc_length`. They are decoded with
	whitespaces.
	* If tokenizer is a callable, then that callable is used to tokenize
	the document. These tokens are counted and truncated depending
	on `doc_length`
	chain_config: The configuration for the langchain chain. Can be used to set options
	like max_concurrency to avoid rate limiting errors.
	Usage:

	To use this, you will need to install the langchain package first.
	Additionally, you will need an underlying LLM to support langchain,
	like openai:

	`pip install langchain`
	`pip install openai`

	Then, you can create your chain as follows:

	```python
	from langchain.chains.question_answering import load_qa_chain
	from langchain.llms import OpenAI
	chain = load_qa_chain(OpenAI(temperature=0, openai_api_key=my_openai_api_key), chain_type="stuff")
	```

	Finally, you can pass the chain to BERTopic as follows:

	```python
	from bertopic.representation import LangChain

	# Create your representation model
	representation_model = LangChain(chain)

	# Use the representation model in BERTopic on top of the default pipeline
	topic_model = BERTopic(representation_model=representation_model)
	```

	You can also use a custom prompt:

	```python
	prompt = "What are these documents about? Please give a single label."
	representation_model = LangChain(chain, prompt=prompt)
	```

	You can also use a Runnable instead of a chain.
	The example below uses the LangChain Expression Language:

	```python
	from bertopic.representation import LangChain
	from langchain.chains.question_answering import load_qa_chain
	from langchain.chat_models import ChatAnthropic
	from langchain.schema.document import Document
	from langchain.schema.runnable import RunnablePassthrough
	from langchain_experimental.data_anonymizer.presidio import PresidioReversibleAnonymizer

	prompt = ...
	llm = ...

	# We will construct a special privacy-preserving chain using Microsoft Presidio

	pii_handler = PresidioReversibleAnonymizer(analyzed_fields=["PERSON"])

	chain = (
	{
	"input_documents": (
	lambda inp: [
	Document(
	page_content=pii_handler.anonymize(
	d.page_content,
	language="en",
	),
	)
	for d in inp["input_documents"]
	]
	),
	"question": RunnablePassthrough(),
	}
	\| load_qa_chain(representation_llm, chain_type="stuff")
	\| (lambda output: {"output_text": pii_handler.deanonymize(output["output_text"])})
	)

	representation_model = LangChain(chain, prompt=representation_prompt)
	```
	"""
	def __init__(self,
	chain,
	prompt: str = None,
	nr_docs: int = 4,
	diversity: float = None,
	doc_length: int = None,
	tokenizer: Union[str, Callable] = None,
	chain_config = None,
	):
	self.chain = chain
	self.prompt = prompt if prompt is not None else DEFAULT_PROMPT
	self.default_prompt_ = DEFAULT_PROMPT
	self.chain_config = chain_config
	self.nr_docs = nr_docs
	self.diversity = diversity
	self.doc_length = doc_length
	self.tokenizer = tokenizer

	def extract_topics(self,
	topic_model,
	documents: pd.DataFrame,
	c_tf_idf: csr_matrix,
	topics: Mapping[str, List[Tuple[str, float]]]
	) -> Mapping[str, List[Tuple[str, int]]]:
	""" Extract topics

	Arguments:
	topic_model: A BERTopic model
	documents: All input documents
	c_tf_idf: The topic c-TF-IDF representation
	topics: The candidate topics as calculated with c-TF-IDF

	Returns:
	updated_topics: Updated topic representations
	"""
	# Extract the top 4 representative documents per topic
	repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(
	c_tf_idf=c_tf_idf,
	documents=documents,
	topics=topics,
	nr_samples=500,
	nr_repr_docs=self.nr_docs,
	diversity=self.diversity
	)

	# Generate label using langchain's batch functionality
	chain_docs: List[List[Document]] = [
	[
	Document(
	page_content=truncate_document(
	topic_model,
	self.doc_length,
	self.tokenizer,
	doc
	)
	)
	for doc in docs
	]
	for docs in repr_docs_mappings.values()
	]

	# `self.chain` must take `input_documents` and `question` as input keys
	inputs = [
	{"input_documents": docs, "question": self.prompt}
	for docs in chain_docs
	]

	# `self.chain` must return a dict with an `output_text` key
	# same output key as the `StuffDocumentsChain` returned by `load_qa_chain`
	outputs = self.chain.batch(inputs=inputs, config=self.chain_config)
	labels = [output["output_text"].strip() for output in outputs]

	updated_topics = {
	topic: [(label, 1)] + [("", 0) for _ in range(9)]
	for topic, label in zip(repr_docs_mappings.keys(), labels)
	}

	return updated_topics