from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import XMLOutputParser
from langchain_core.documents import Document
from typing import List
XML_SYSTEM_PROMPT = """You're a helpful AI assistant. Given a user question and some scientific literature
documents which highlight research on different roof cover materials (e.g., asphalt shingles, metal, tile)
and their performance against natural hazards(e.g., wind, hail), answer the user
question.
You’re a helpful AI assistant. Given a user question and scientific literature on various roof cover materials (e.g., asphalt shingles, metal, tile)
and their performance against natural hazards (e.g., wind, hail), provide clear, concise, and informed answers without unnecessary fluff.
When addressing questions about ‘what is the best roof,’ consider the following factors:
• Geography, aesthetic preferences, budget, frequency of weather-related hazards, roof cover performance, and how performance changes with age.
• For the insurance industry, the ‘best roof’ depends on the specific hazards (their location and frequency), performance expectations and predictability, and the cost of materials.
If none of the articles answer the question, simply say that there are no articles relevant to your inquiry.
Remember, you must return both an answer and citations. A citation consists of a VERBATIM quote that
justifies the answer and the ID and also Source Name of the quote article. Return a citation for every quote across all articles
that justify the answer. Use the following format for your final output:
<cited_answer>
<answer></answer>
<citations>
<citation><source_id></source_id><source></source><quote></quote></citation>
<citation><source_id></source_id><source></source><quote></quote></citation>
...
</citations>
</cited_answer>
If none of the articles answer the question, return:
<cited_answer>
<answer>Nothing</answer>
<citations/>
</cited_answer>
ALWAYS maintain valid XML structure with properly closed tags. Here are the articles: {context}"""
class RoofCoverChatbot:
def __init__(self, model: str = "gpt-4.1", temperature: float = 0.1):
"""
Initialize the RoofCoverChatbot by setting up the retrieval chain,
which uses scientific literature documents to generate an XML-formatted answer.
"""
# Create the XML prompt template.
self.xml_prompt = ChatPromptTemplate.from_messages(
[("system", XML_SYSTEM_PROMPT), ("human", "{input}")]
)
# Initialize the language model.
self.llm = ChatOpenAI(model=model, temperature=temperature)
# Create the chain that refines answers using retrieved documents.
# The first step formats the retrieved context as XML.
rag_chain_from_docs = (
RunnablePassthrough.assign(
context=(lambda x: self.format_docs_xml(x["context"]))
)
| self.xml_prompt
| self.llm
| XMLOutputParser()
)
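        # Load the persisted Chroma vector store (assumes an index was built
        # beforehand at ./chroma_db) configured for cosine similarity.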
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
self.vectordb = Chroma(
embedding_function=embeddings,
persist_directory="./chroma_db",
collection_name="document_collection",
collection_metadata={"hnsw:space": "cosine"}
)
# Use similarity search to retrieve the top-K documents.
self.retriever = self.vectordb.as_retriever(
search_type="similarity", search_kwargs={"k": 5}
)
# This lambda extracts the "input" key for retrieval.
retrieve_docs = (lambda x: x["input"]) | self.retriever
# Build the final chain: retrieve documents then generate an answer.
self.chain = RunnablePassthrough.assign(context=retrieve_docs).assign(
answer=rag_chain_from_docs
)
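        # chain.invoke({"input": ...}) returns a dict with keys "input",
        # "context" (the retrieved Documents), and "answer" (the parsed XML).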
@staticmethod
def format_docs_xml(docs: List[Document]) -> str:
"""
Format a list of documents into XML snippets.
Each document is formatted with its source metadata and a snippet of its content.
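        Illustrative output shape (source names depend on the indexed files):
        <sources>
        <source id="0">
        <source>example_paper.pdf</source>
        <article_snippet>...</article_snippet>
        </source>
        </sources>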
"""
formatted_docs = [
(
f"<source id=\"{i}\">\n"
f"<source>{doc.metadata['source']}</source>\n"
f"<article_snippet>{doc.page_content}</article_snippet>\n"
f"</source>"
)
for i, doc in enumerate(docs)
]
return f"\n\n<sources>\n{chr(10).join(formatted_docs)}\n</sources>"
    def get_response(self, query: str) -> dict:
        """
        Return the chatbot response for the given query.
        The method retrieves relevant documents and then uses the XML chain to generate
        an answer with citations.
        :param query: The user question.
        :return: A dict with the original "input", the retrieved "context" documents,
            and the parsed XML "answer" produced by XMLOutputParser.
        """
        return self.chain.invoke({"input": query})
    def get_extra_resources(self, query: str, original_sources: List[str]) -> List[Document]:
        """
        Retrieve additional resources for the given query, excluding sources
        that were already cited in the original answer.
        :param query: The query string to be processed by the retriever.
        :param original_sources: Source names to exclude from the results.
        :return: The documents returned by the retriever for the query.
        """
        retriever = self.vectordb.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 8, "filter": {"source": {"$nin": original_sources}}},
        )
        return retriever.invoke(query)
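
# --- Minimal usage sketch (illustrative; assumes OPENAI_API_KEY is set and a
# Chroma index with "source" metadata already exists at ./chroma_db) ---
if __name__ == "__main__":
    bot = RoofCoverChatbot()
    result = bot.get_response("Which roof cover performs best against hail?")
    # "answer" holds the XML output parsed into nested dicts/lists.
    print(result["answer"])
    # Fetch further reading, excluding sources already used in the answer.
    cited = [doc.metadata["source"] for doc in result["context"]]
    for doc in bot.get_extra_resources("hail damage to asphalt shingles", cited):
        print(doc.metadata["source"])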