from typing import Callable, Dict, List, Mapping, Optional, Tuple, Union

import pandas as pd
from langchain.docstore.document import Document
from scipy.sparse import csr_matrix

from bertopic.representation._base import BaseRepresentation
from bertopic.representation._utils import truncate_document
|
|
| DEFAULT_PROMPT = "What are these documents about? Please give a single label." |
|
|
|
|
class LangChain(BaseRepresentation):
    """ Using chains in langchain to generate topic labels.

    The classic example uses `langchain.chains.question_answering.load_qa_chain`.
    This returns a chain that takes a list of documents and a question as input.

    You can also use Runnables such as those composed using the LangChain Expression Language.

    Arguments:
        chain: The langchain chain or Runnable with a `batch` method.
               Input keys must be `input_documents` and `question`.
               Output key must be `output_text`.
        prompt: The prompt to be used in the model. If no prompt is given,
                `self.default_prompt_` is used instead.
        nr_docs: The number of documents to pass to LangChain if a prompt
                 with the `["DOCUMENTS"]` tag is used.
        diversity: The diversity of documents to pass to LangChain.
                   Accepts values between 0 and 1. A higher
                   values results in passing more diverse documents
                   whereas lower values passes more similar documents.
        doc_length: The maximum length of each document. If a document is longer,
                    it will be truncated. If None, the entire document is passed.
        tokenizer: The tokenizer used to split the document into segments
                   used to count the length of a document.
                       * If tokenizer is 'char', then the document is split up
                         into characters which are counted to adhere to `doc_length`
                       * If tokenizer is 'whitespace', the document is split up
                         into words separated by whitespaces. These words are counted
                         and truncated depending on `doc_length`
                       * If tokenizer is 'vectorizer', then the internal CountVectorizer
                         is used to tokenize the document. These tokens are counted
                         and truncated depending on `doc_length`. They are decoded with
                         whitespaces.
                       * If tokenizer is a callable, then that callable is used to tokenize
                         the document. These tokens are counted and truncated depending
                         on `doc_length`
        chain_config: The configuration for the langchain chain. Can be used to set options
                      like max_concurrency to avoid rate limiting errors.
    Usage:

    To use this, you will need to install the langchain package first.
    Additionally, you will need an underlying LLM to support langchain,
    like openai:

    `pip install langchain`
    `pip install openai`

    Then, you can create your chain as follows:

    ```python
    from langchain.chains.question_answering import load_qa_chain
    from langchain.llms import OpenAI
    chain = load_qa_chain(OpenAI(temperature=0, openai_api_key=my_openai_api_key), chain_type="stuff")
    ```

    Finally, you can pass the chain to BERTopic as follows:

    ```python
    from bertopic.representation import LangChain

    # Create your representation model
    representation_model = LangChain(chain)

    # Use the representation model in BERTopic on top of the default pipeline
    topic_model = BERTopic(representation_model=representation_model)
    ```

    You can also use a custom prompt:

    ```python
    prompt = "What are these documents about? Please give a single label."
    representation_model = LangChain(chain, prompt=prompt)
    ```

    You can also use a Runnable instead of a chain.
    The example below uses the LangChain Expression Language:

    ```python
    from bertopic.representation import LangChain
    from langchain.chains.question_answering import load_qa_chain
    from langchain.chat_models import ChatAnthropic
    from langchain.schema.document import Document
    from langchain.schema.runnable import RunnablePassthrough
    from langchain_experimental.data_anonymizer.presidio import PresidioReversibleAnonymizer

    prompt = ...
    llm = ...

    # We will construct a special privacy-preserving chain using Microsoft Presidio

    pii_handler = PresidioReversibleAnonymizer(analyzed_fields=["PERSON"])

    chain = (
        {
            "input_documents": (
                lambda inp: [
                    Document(
                        page_content=pii_handler.anonymize(
                            d.page_content,
                            language="en",
                        ),
                    )
                    for d in inp["input_documents"]
                ]
            ),
            "question": RunnablePassthrough(),
        }
        | load_qa_chain(llm, chain_type="stuff")
        | (lambda output: {"output_text": pii_handler.deanonymize(output["output_text"])})
    )

    representation_model = LangChain(chain, prompt=prompt)
    ```
    """
    def __init__(self,
                 chain,
                 prompt: Optional[str] = None,
                 nr_docs: int = 4,
                 diversity: Optional[float] = None,
                 doc_length: Optional[int] = None,
                 tokenizer: Optional[Union[str, Callable]] = None,
                 chain_config=None,
                 ):
        self.chain = chain
        # Fall back to the module-level default when no custom prompt is given
        self.prompt = prompt if prompt is not None else DEFAULT_PROMPT
        self.default_prompt_ = DEFAULT_PROMPT
        self.chain_config = chain_config
        self.nr_docs = nr_docs
        self.diversity = diversity
        self.doc_length = doc_length
        self.tokenizer = tokenizer

    def extract_topics(self,
                       topic_model,
                       documents: pd.DataFrame,
                       c_tf_idf: csr_matrix,
                       topics: Mapping[str, List[Tuple[str, float]]]
                       ) -> Mapping[str, List[Tuple[str, int]]]:
        """ Extract topics

        Arguments:
            topic_model: A BERTopic model
            documents: All input documents
            c_tf_idf: The topic c-TF-IDF representation
            topics: The candidate topics as calculated with c-TF-IDF

        Returns:
            updated_topics: Updated topic representations
        """
        # Select the most representative documents per topic to feed the chain
        repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(
            c_tf_idf=c_tf_idf,
            documents=documents,
            topics=topics,
            nr_samples=500,
            nr_repr_docs=self.nr_docs,
            diversity=self.diversity
        )

        # Wrap each (optionally truncated) document in a langchain Document,
        # grouped per topic
        chain_docs: List[List[Document]] = [
            [
                Document(
                    page_content=truncate_document(
                        topic_model,
                        self.doc_length,
                        self.tokenizer,
                        doc
                    )
                )
                for doc in docs
            ]
            for docs in repr_docs_mappings.values()
        ]

        # One chain input per topic, using the keys the chain contract requires
        inputs = [
            {"input_documents": docs, "question": self.prompt}
            for docs in chain_docs
        ]

        # `batch` runs all topics through the chain; `chain_config` can carry
        # options such as max_concurrency to avoid rate limiting
        outputs = self.chain.batch(inputs=inputs, config=self.chain_config)
        labels = [output["output_text"].strip() for output in outputs]

        # BERTopic expects exactly 10 (word, weight) pairs per topic, so pad
        # the single generated label with empty placeholders
        updated_topics = {
            topic: [(label, 1)] + [("", 0)] * 9
            for topic, label in zip(repr_docs_mappings.keys(), labels)
        }

        return updated_topics
|
|