Spaces:
Sleeping
Sleeping
| from langchain_openai import OpenAIEmbeddings | |
| from langchain_openai import ChatOpenAI | |
| from langchain_chroma import Chroma | |
| from langchain_core.prompts import ChatPromptTemplate | |
| from langchain_core.runnables import RunnablePassthrough | |
| from langchain_core.output_parsers import XMLOutputParser | |
| from langchain.chains import create_retrieval_chain | |
| from langchain_core.documents import Document | |
| from typing import List | |
# System prompt for the citation-producing RAG chain. Instructs the model to
# answer roof-cover questions ONLY from the retrieved articles and to emit a
# strictly structured XML response with verbatim quotes. `{context}` is filled
# with the XML-formatted retrieved documents by the chain; `{input}` (the user
# question) arrives via the human message.
# NOTE: an earlier duplicate of the intro paragraph ("You're a helpful AI
# assistant. Given a user question...") was removed — the prompt stated the
# same instructions twice with slightly different wording.
XML_SYSTEM_PROMPT = """You’re a helpful AI assistant. Given a user question and scientific literature on various roof cover materials (e.g., asphalt shingles, metal, tile)
and their performance against natural hazards (e.g., wind, hail), provide clear, concise, and informed answers without unnecessary fluff.
When addressing questions about ‘what is the best roof,’ consider the following factors:
• Geography, aesthetic preferences, budget, frequency of weather-related hazards, roof cover performance, and how performance changes with age.
• For the insurance industry, the ‘best roof’ depends on the specific hazards (their location and frequency), performance expectations and predictability, and the cost of materials.
If none of the articles answer the question, simply say that there are no articles relevant to your inquiry.
Remember, you must return both an answer and citations. A citation consists of a VERBATIM quote that
justifies the answer and the ID and also Source Name of the quote article. Return a citation for every quote across all articles
that justify the answer. Use the following format for your final output:
<cited_answer>
    <answer></answer>
    <citations>
        <citation><source_id></source_id><source></source><quote></quote></citation>
        <citation><source_id></source_id><source></source><quote></quote></citation>
        ...
    </citations>
</cited_answer>
If none of the articles answer the question, return:
<cited_answer>
    <answer>Nothing</answer>
    <citations/>
</cited_answer>
ALWAYS maintain valid XML structure with properly closed tags. Here are the articles:{context}"""
class RoofCoverChatbot:
    """RAG chatbot over scientific roof-cover literature.

    Retrieves relevant documents from a persisted Chroma vector store and asks
    an OpenAI chat model to produce an XML-formatted answer with verbatim
    citations (see ``XML_SYSTEM_PROMPT``).
    """

    def __init__(self, model: str = "gpt-4.1", temperature: float = 0.1):
        """
        Initialize the RoofCoverChatbot by setting up the retrieval chain,
        which uses scientific literature documents to generate an
        XML-formatted answer.

        :param model: OpenAI chat model name.
        :param temperature: Sampling temperature for the chat model.
        """
        # Create the XML prompt template: system instructions + user question.
        self.xml_prompt = ChatPromptTemplate.from_messages(
            [("system", XML_SYSTEM_PROMPT), ("human", "{input}")]
        )
        # Initialize the language model.
        self.llm = ChatOpenAI(model=model, temperature=temperature)
        # Answer-generation chain: format the retrieved docs as XML, fill the
        # prompt, call the LLM, then parse the model's XML output.
        rag_chain_from_docs = (
            RunnablePassthrough.assign(
                context=(lambda x: self.format_docs_xml(x["context"]))
            )
            | self.xml_prompt
            | self.llm
            | XMLOutputParser()
        )
        embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
        # Persisted vector store; cosine distance for similarity search.
        self.vectordb = Chroma(
            embedding_function=embeddings,
            persist_directory="./chroma_db",
            collection_name="document_collection",
            collection_metadata={"hnsw:space": "cosine"},
        )
        # Use similarity search to retrieve the top-K documents.
        self.retriever = self.vectordb.as_retriever(
            search_type="similarity", search_kwargs={"k": 5}
        )
        # This lambda extracts the "input" key for retrieval.
        retrieve_docs = (lambda x: x["input"]) | self.retriever
        # Final chain: retrieve documents, then generate an answer.
        # invoke() returns a dict with "input", "context" and "answer" keys.
        self.chain = RunnablePassthrough.assign(context=retrieve_docs).assign(
            answer=rag_chain_from_docs
        )

    @staticmethod
    def format_docs_xml(docs: List[Document]) -> str:
        """
        Format a list of documents into an XML ``<sources>`` listing.

        Declared as a ``@staticmethod`` (bug fix): ``__init__`` invokes it as
        ``self.format_docs_xml(...)``, and as a plain function the bound call
        would pass the instance as ``docs`` and raise a TypeError.

        :param docs: Retrieved documents; each contributes its "source"
            metadata and page content.
        :return: XML string with one ``<source id="...">`` element per doc.
        """
        formatted_docs = [
            (
                f"<source id=\"{i}\">\n"
                # .get() guards against documents indexed without a "source"
                # metadata key (the original raised KeyError).
                f"<source>{doc.metadata.get('source', 'unknown')}</source>\n"
                f"<article_snippet>{doc.page_content}</article_snippet>\n"
                f"</source>"
            )
            for i, doc in enumerate(docs)
        ]
        return f"\n\n<sources>\n{chr(10).join(formatted_docs)}\n</sources>"

    def get_response(self, query: str) -> dict:
        """
        Return the chatbot response for the given query.

        The method retrieves relevant documents and then uses the XML chain
        to generate an answer with citations.

        :param query: The user question.
        :return: Dict with "input", "context" (retrieved documents) and
            "answer" (parsed XML answer with citations) keys.
        """
        return self.chain.invoke({"input": query})

    def get_extra_resources(self, query: str, original_sources: List[str]) -> List[Document]:
        """
        Retrieve additional resources for a query, excluding known sources.

        Builds a similarity retriever with a Chroma ``$nin`` metadata filter
        so documents whose "source" is already in ``original_sources`` are
        not returned again.

        :param query: Query string to be processed by the retriever.
        :param original_sources: Source names to exclude from the results.
        :return: Up to 8 additional documents relevant to the query.
        """
        retriever = self.vectordb.as_retriever(
            search_type="similarity",
            search_kwargs={
                "k": 8,
                "filter": {"source": {"$nin": original_sources}},
            },
        )
        result = retriever.invoke(query)
        return result