Spaces:
Sleeping
Sleeping
| from langchain_openai import OpenAIEmbeddings | |
| from langchain_openai import ChatOpenAI | |
| from langchain_chroma import Chroma | |
| from langchain_core.prompts import ChatPromptTemplate | |
| from langchain_core.runnables import RunnablePassthrough | |
| from langchain_core.output_parsers import XMLOutputParser | |
| from langchain.chains import create_retrieval_chain | |
| from langchain_core.documents import Document | |
| from typing import List | |
# System prompt for the citation-producing RAG chain. Instructs the model to
# answer roof-cover questions ONLY from the retrieved articles and to emit a
# strictly structured XML response with verbatim quotes. `{context}` is filled
# with the XML-formatted retrieved documents by the chain; `{input}` (the user
# question) arrives via the human message.
# NOTE: an earlier duplicate of the intro paragraph ("You're a helpful AI
# assistant. Given a user question...") was removed — the prompt stated the
# same instructions twice with slightly different wording.
XML_SYSTEM_PROMPT = """You’re a helpful AI assistant. Given a user question and scientific literature on various roof cover materials (e.g., asphalt shingles, metal, tile)
and their performance against natural hazards (e.g., wind, hail), provide clear, concise, and informed answers without unnecessary fluff.
When addressing questions about ‘what is the best roof,’ consider the following factors:
• Geography, aesthetic preferences, budget, frequency of weather-related hazards, roof cover performance, and how performance changes with age.
• For the insurance industry, the ‘best roof’ depends on the specific hazards (their location and frequency), performance expectations and predictability, and the cost of materials.
If none of the articles answer the question, simply say that there are no articles relevant to your inquiry.
Remember, you must return both an answer and citations. A citation consists of a VERBATIM quote that
justifies the answer and the ID and also Source Name of the quote article. Return a citation for every quote across all articles
that justify the answer. Use the following format for your final output:
<cited_answer>
    <answer></answer>
    <citations>
        <citation><source_id></source_id><source></source><quote></quote></citation>
        <citation><source_id></source_id><source></source><quote></quote></citation>
        ...
    </citations>
</cited_answer>
If none of the articles answer the question, return:
<cited_answer>
    <answer>Nothing</answer>
    <citations/>
</cited_answer>
ALWAYS maintain valid XML structure with properly closed tags. Here are the articles:{context}"""
class RoofCoverChatbot:
    """RAG chatbot over scientific roof-cover literature.

    Retrieves relevant documents from a persisted Chroma vector store and asks
    an OpenAI chat model to produce an XML-formatted answer with verbatim
    citations (see ``XML_SYSTEM_PROMPT``).
    """

    def __init__(self, model: str = "gpt-4.1", temperature: float = 0.1):
        """
        Initialize the RoofCoverChatbot by setting up the retrieval chain,
        which uses scientific literature documents to generate an
        XML-formatted answer.

        :param model: OpenAI chat model name.
        :param temperature: Sampling temperature for the chat model.
        """
        # Create the XML prompt template: system instructions + user question.
        self.xml_prompt = ChatPromptTemplate.from_messages(
            [("system", XML_SYSTEM_PROMPT), ("human", "{input}")]
        )
        # Initialize the language model.
        self.llm = ChatOpenAI(model=model, temperature=temperature)
        # Answer-generation chain: format the retrieved docs as XML, fill the
        # prompt, call the LLM, then parse the model's XML output.
        rag_chain_from_docs = (
            RunnablePassthrough.assign(
                context=(lambda x: self.format_docs_xml(x["context"]))
            )
            | self.xml_prompt
            | self.llm
            | XMLOutputParser()
        )
        embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
        # Persisted vector store; cosine distance for similarity search.
        self.vectordb = Chroma(
            embedding_function=embeddings,
            persist_directory="./chroma_db",
            collection_name="document_collection",
            collection_metadata={"hnsw:space": "cosine"},
        )
        # Use similarity search to retrieve the top-K documents.
        self.retriever = self.vectordb.as_retriever(
            search_type="similarity", search_kwargs={"k": 5}
        )
        # This lambda extracts the "input" key for retrieval.
        retrieve_docs = (lambda x: x["input"]) | self.retriever
        # Final chain: retrieve documents, then generate an answer.
        # invoke() returns a dict with "input", "context" and "answer" keys.
        self.chain = RunnablePassthrough.assign(context=retrieve_docs).assign(
            answer=rag_chain_from_docs
        )

    @staticmethod
    def format_docs_xml(docs: List[Document]) -> str:
        """
        Format a list of documents into an XML ``<sources>`` listing.

        Declared as a ``@staticmethod`` (bug fix): ``__init__`` invokes it as
        ``self.format_docs_xml(...)``, and as a plain function the bound call
        would pass the instance as ``docs`` and raise a TypeError.

        :param docs: Retrieved documents; each contributes its "source"
            metadata and page content.
        :return: XML string with one ``<source id="...">`` element per doc.
        """
        formatted_docs = [
            (
                f"<source id=\"{i}\">\n"
                # .get() guards against documents indexed without a "source"
                # metadata key (the original raised KeyError).
                f"<source>{doc.metadata.get('source', 'unknown')}</source>\n"
                f"<article_snippet>{doc.page_content}</article_snippet>\n"
                f"</source>"
            )
            for i, doc in enumerate(docs)
        ]
        return f"\n\n<sources>\n{chr(10).join(formatted_docs)}\n</sources>"

    def get_response(self, query: str) -> dict:
        """
        Return the chatbot response for the given query.

        The method retrieves relevant documents and then uses the XML chain
        to generate an answer with citations.

        :param query: The user question.
        :return: Dict with "input", "context" (retrieved documents) and
            "answer" (parsed XML answer with citations) keys.
        """
        return self.chain.invoke({"input": query})

    def get_extra_resources(self, query: str, original_sources: List[str]) -> List[Document]:
        """
        Retrieve additional resources for a query, excluding known sources.

        Builds a similarity retriever with a Chroma ``$nin`` metadata filter
        so documents whose "source" is already in ``original_sources`` are
        not returned again.

        :param query: Query string to be processed by the retriever.
        :param original_sources: Source names to exclude from the results.
        :return: Up to 8 additional documents relevant to the query.
        """
        retriever = self.vectordb.as_retriever(
            search_type="similarity",
            search_kwargs={
                "k": 8,
                "filter": {"source": {"$nin": original_sources}},
            },
        )
        result = retriever.invoke(query)
        return result