"""RAG chatbot answering roof-cover-material questions from scientific literature.

Retrieves relevant documents from a persistent Chroma vector store and asks an
OpenAI chat model to produce an XML-formatted answer with verbatim citations.
"""

from typing import List
from xml.sax.saxutils import escape

from langchain.chains import create_retrieval_chain  # noqa: F401 -- kept from original file
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_core.output_parsers import XMLOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# NOTE(review): the original prompt duplicated its opening paragraph and the XML
# output template had been stripped out (only "..." remained), even though the
# chain ends in XMLOutputParser. The <cited_answer> schema below is reconstructed
# from the prompt's own description (answer + citations; each citation = verbatim
# quote + article ID + source name) -- confirm tag names against any downstream
# consumer of the parsed XML.
XML_SYSTEM_PROMPT = """You're a helpful AI assistant. Given a user question and some \
scientific literature documents which highlight research on different roof cover \
materials (e.g., asphalt shingles, metal, tile) and their performance against \
natural hazards (e.g., wind, hail), provide clear, concise, and informed answers \
without unnecessary fluff.

When addressing questions about "what is the best roof," consider the following factors:
- Geography, aesthetic preferences, budget, frequency of weather-related hazards, \
roof cover performance, and how performance changes with age.
- For the insurance industry, the "best roof" depends on the specific hazards \
(their location and frequency), performance expectations and predictability, and \
the cost of materials.

If none of the articles answer the question, simply say that there are no articles \
relevant to the inquiry.

Remember, you must return both an answer and citations. A citation consists of a \
VERBATIM quote that justifies the answer and the ID and also Source Name of the \
quoted article. Return a citation for every quote across all articles that justify \
the answer.

Use the following format for your final output:

<cited_answer>
    <answer>Your answer here</answer>
    <citations>
        <citation>
            <source_id>ID of the article</source_id>
            <source_name>Source name of the article</source_name>
            <quote>VERBATIM quote from the article</quote>
        </citation>
    </citations>
</cited_answer>

If none of the articles answer the question, return:

<cited_answer>
    <answer>Nothing</answer>
    <citations></citations>
</cited_answer>

ALWAYS maintain valid XML structure with properly closed tags.

Here are the articles:{context}"""


class RoofCoverChatbot:
    """Retrieval-augmented chatbot over roof-cover research literature."""

    def __init__(self, model: str = "gpt-4.1", temperature: float = 0.1):
        """Initialize the retrieval chain used to generate XML-formatted answers.

        :param model: OpenAI chat model name.
        :param temperature: Sampling temperature for the chat model.
        """
        # System prompt (with {context} slot) plus the raw user question.
        self.xml_prompt = ChatPromptTemplate.from_messages(
            [("system", XML_SYSTEM_PROMPT), ("human", "{input}")]
        )

        # Language model that produces the XML answer.
        self.llm = ChatOpenAI(model=model, temperature=temperature)

        # Answer-generation sub-chain: render retrieved docs as XML context,
        # prompt the model, and parse its XML reply into a Python structure.
        rag_chain_from_docs = (
            RunnablePassthrough.assign(
                context=(lambda x: self.format_docs_xml(x["context"]))
            )
            | self.xml_prompt
            | self.llm
            | XMLOutputParser()
        )

        embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
        self.vectordb = Chroma(
            embedding_function=embeddings,
            persist_directory="./chroma_db",
            collection_name="document_collection",
            collection_metadata={"hnsw:space": "cosine"},
        )

        # Top-K cosine-similarity retrieval over the persisted collection.
        self.retriever = self.vectordb.as_retriever(
            search_type="similarity", search_kwargs={"k": 5}
        )

        # Pull the "input" key out of the invocation dict before retrieval.
        retrieve_docs = (lambda x: x["input"]) | self.retriever

        # Final chain: retrieve documents, then generate the answer from them.
        # Invoking it yields a dict with "input", "context", and "answer" keys.
        self.chain = RunnablePassthrough.assign(context=retrieve_docs).assign(
            answer=rag_chain_from_docs
        )

    @staticmethod
    def format_docs_xml(docs: List[Document]) -> str:
        """Format retrieved documents as an XML ``<sources>`` listing.

        Each document becomes a ``<source id="...">`` element carrying its
        source metadata and page content. Text is XML-escaped so document
        content containing ``<``, ``>`` or ``&`` cannot break the structure.

        :param docs: Retrieved documents to serialize.
        :return: XML string embedding every document.
        """
        formatted_docs = []
        for i, doc in enumerate(docs):
            # Fall back to "unknown" rather than raising KeyError when a
            # document was ingested without a "source" metadata entry.
            source_name = escape(str(doc.metadata.get("source", "unknown")))
            snippet = escape(doc.page_content)
            formatted_docs.append(
                f'<source id="{i}">\n'
                f"<source_name>{source_name}</source_name>\n"
                f"<snippet>{snippet}</snippet>\n"
                f"</source>"
            )
        return "<sources>\n" + "\n".join(formatted_docs) + "\n</sources>"

    def get_response(self, query: str) -> dict:
        """Return the chatbot response for the given query.

        Retrieves relevant documents and then uses the XML chain to generate
        an answer with citations.

        :param query: The user question.
        :return: Dict with "input" (the query), "context" (retrieved
            documents), and "answer" (the parsed XML answer with citations).
        """
        return self.chain.invoke({"input": query})