from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import XMLOutputParser
from langchain.chains import create_retrieval_chain
from langchain_core.documents import Document
from typing import List
# System prompt for the XML-citation answering chain. The {context} placeholder
# is filled with the formatted retrieved documents at prompt-construction time.
# Fixed: the original duplicated its entire intro paragraph (two back-to-back
# "You're a helpful AI assistant..." openings); the fuller version is kept.
# NOTE(review): the bare "..." and "Nothing" lines below look like placeholders
# where literal XML output examples were stripped — confirm the intended XML
# answer/citation schema and restore the examples.
XML_SYSTEM_PROMPT = """You’re a helpful AI assistant. Given a user question and scientific literature on various roof cover materials (e.g., asphalt shingles, metal, tile)
and their performance against natural hazards (e.g., wind, hail), provide clear, concise, and informed answers without unnecessary fluff.
When addressing questions about ‘what is the best roof,’ consider the following factors:
• Geography, aesthetic preferences, budget, frequency of weather-related hazards, roof cover performance, and how performance changes with age.
• For the insurance industry, the ‘best roof’ depends on the specific hazards (their location and frequency), performance expectations and predictability, and the cost of materials.
If none of the articles answer the question, simply say that there are no articles relevant to your inquiry.
Remember, you must return both an answer and citations. A citation consists of a VERBATIM quote that
justifies the answer and the ID and also Source Name of the quote article. Return a citation for every quote across all articles
that justify the answer. Use the following format for your final output:
...
If none of the articles answer the question, return:
Nothing
ALWAYS maintain valid XML structure with properly closed tags. Here are the articles:{context}"""
class RoofCoverChatbot:
    """RAG chatbot answering roof-cover questions from scientific literature.

    Retrieves relevant documents from a persistent Chroma vector store and
    asks an OpenAI chat model to produce an XML-formatted, citation-backed
    answer via ``XML_SYSTEM_PROMPT``.
    """

    def __init__(self, model: str = "gpt-4.1", temperature: float = 0.1):
        """
        Initialize the RoofCoverChatbot by setting up the retrieval chain,
        which uses scientific literature documents to generate an
        XML-formatted answer.

        :param model: OpenAI chat model name.
        :param temperature: Sampling temperature for the chat model.
        """
        # Create the XML prompt template.
        self.xml_prompt = ChatPromptTemplate.from_messages(
            [("system", XML_SYSTEM_PROMPT), ("human", "{input}")]
        )
        # Initialize the language model.
        self.llm = ChatOpenAI(model=model, temperature=temperature)
        # Answer-generation sub-chain: format the retrieved docs into the
        # prompt's {context} slot, call the LLM, then parse its XML output.
        rag_chain_from_docs = (
            RunnablePassthrough.assign(
                context=(lambda x: self.format_docs_xml(x["context"]))
            )
            | self.xml_prompt
            | self.llm
            | XMLOutputParser()
        )
        embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
        self.vectordb = Chroma(
            embedding_function=embeddings,
            persist_directory="./chroma_db",
            collection_name="document_collection",
            collection_metadata={"hnsw:space": "cosine"},
        )
        # Use similarity search to retrieve the top-K documents.
        self.retriever = self.vectordb.as_retriever(
            search_type="similarity", search_kwargs={"k": 5}
        )
        # Extract the "input" key from the chain payload before retrieval.
        retrieve_docs = (lambda x: x["input"]) | self.retriever
        # Final chain: retrieve documents, then generate an answer. Invoking
        # it yields a dict carrying "input", "context" and "answer" keys.
        self.chain = RunnablePassthrough.assign(context=retrieve_docs).assign(
            answer=rag_chain_from_docs
        )

    @staticmethod
    def format_docs_xml(docs: "List[Document]") -> str:
        """
        Format a list of documents into the context string sent to the LLM.

        Each document contributes its ``source`` metadata entry followed by
        its page content, with entries separated by blank lines.

        NOTE(review): despite the "XML" name and docstring, these f-strings
        emit no tags — the angle-bracket markup (e.g. a ``<source id="{i}">``
        wrapper using the enumerate index) appears to have been lost. Confirm
        the intended schema before relying on tag structure.

        :param docs: Retrieved documents; each must expose
            ``metadata["source"]`` and ``page_content``.
        :return: The concatenated context string.
        """
        formatted_docs = [
            (
                f"\n"
                f"{doc.metadata['source']}\n"
                f"{doc.page_content}\n"
                f""
            )
            # ``i`` is currently unused; kept because it likely fed the
            # stripped id attribute noted above.
            for i, doc in enumerate(docs)
        ]
        return f"\n\n\n{chr(10).join(formatted_docs)}\n"

    def get_response(self, query: str) -> dict:
        """
        Return the chatbot response payload for the given query.

        The method retrieves relevant documents and then uses the XML chain
        to generate an answer with citations.

        :param query: The user question.
        :return: Chain output dict with "input" (the query), "context" (the
            retrieved documents) and "answer" (the parsed XML answer with
            citations). The previous ``-> str`` annotation was incorrect:
            ``chain.invoke`` returns the whole payload dict, not a string.
        """
        return self.chain.invoke({"input": query})

    def get_extra_resources(self, query: str, original_sources: List[str]) -> "List[Document]":
        """
        Retrieve additional documents, excluding already-cited sources.

        Builds a fresh retriever whose metadata filter drops any document
        whose ``source`` is in ``original_sources`` (Chroma ``$nin`` filter),
        then returns the top matches for the query.

        :param query: A string representing the query to be processed by the
            retriever.
        :param original_sources: ``source`` metadata values to exclude from
            the results.
        :return: Up to 8 additional matching documents.
        """
        retriever = self.vectordb.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 8, "filter": {"source": {"$nin": original_sources}}},
        )
        return retriever.invoke(query)