File size: 6,019 Bytes
fa8ee23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29f53db
fa8ee23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29f53db
642592e
 
fa8ee23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import XMLOutputParser
from langchain.chains import create_retrieval_chain
from langchain_core.documents import Document
from typing import List


# System prompt for the citation chain. The {context} placeholder is filled
# with the XML-formatted retrieved articles (see RoofCoverChatbot.format_docs_xml);
# the model must answer using the <cited_answer> schema below.
XML_SYSTEM_PROMPT = """You're a helpful AI assistant. Given a user question and scientific literature on various roof cover materials (e.g., asphalt shingles, metal, tile) 
and their performance against natural hazards (e.g., wind, hail), provide clear, concise, and informed answers without unnecessary fluff.

When addressing questions about 'what is the best roof,' consider the following factors:
- Geography, aesthetic preferences, budget, frequency of weather-related hazards, roof cover performance, and how performance changes with age.
- For the insurance industry, the 'best roof' depends on the specific hazards (their location and frequency), performance expectations and predictability, and the cost of materials.

If none of the articles answer the question, simply say that there are no articles relevant to your inquiry.
Remember, you must return both an answer and citations. A citation consists of a VERBATIM quote that 
justifies the answer and the ID and also Source Name of the quote article. Return a citation for every quote across all articles 
that justify the answer. Use the following format for your final output:
<cited_answer>
    <answer></answer>
    <citations>
        <citation><source_id></source_id><source></source><quote></quote></citation>
        <citation><source_id></source_id><source></source><quote></quote></citation>
        ...
    </citations>
</cited_answer>

If none of the articles answer the question, return:
<cited_answer>
    <answer>Nothing</answer>
    <citations/>
</cited_answer>

ALWAYS maintain valid XML structure with properly closed tags. Here are the articles:{context}"""


class RoofCoverChatbot:
    """RAG chatbot that answers roof-cover questions with XML-cited answers.

    Retrieves scientific-literature snippets from a persistent Chroma vector
    store and feeds them, formatted as XML, to an OpenAI chat model that must
    respond in the <cited_answer> format defined by XML_SYSTEM_PROMPT.
    """

    def __init__(self, model: str = "gpt-4.1", temperature: float = 0.1):
        """
        Build the retrieval-augmented generation chain.

        :param model: OpenAI chat model name passed to ChatOpenAI.
        :param temperature: Sampling temperature for the chat model.
        """
        # Prompt template: the system message carries the citation rules and
        # the XML-formatted articles ({context}); the human message is {input}.
        self.xml_prompt = ChatPromptTemplate.from_messages(
            [("system", XML_SYSTEM_PROMPT), ("human", "{input}")]
        )

        self.llm = ChatOpenAI(model=model, temperature=temperature)

        # Answer sub-chain: format the retrieved documents as XML, fill the
        # prompt, call the model, then parse its XML output.
        rag_chain_from_docs = (
            RunnablePassthrough.assign(
                context=(lambda x: self.format_docs_xml(x["context"]))
            )
            | self.xml_prompt
            | self.llm
            | XMLOutputParser()
        )

        embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

        # Persistent Chroma collection configured for cosine similarity.
        self.vectordb = Chroma(
            embedding_function=embeddings,
            persist_directory="./chroma_db",
            collection_name="document_collection",
            collection_metadata={"hnsw:space": "cosine"},
        )
        # Retrieve the top-5 most similar documents per query.
        self.retriever = self.vectordb.as_retriever(
            search_type="similarity", search_kwargs={"k": 5}
        )

        # Extract the "input" key from the chain payload before retrieval.
        retrieve_docs = (lambda x: x["input"]) | self.retriever

        # Full chain: retrieve documents into "context", then generate "answer".
        self.chain = RunnablePassthrough.assign(context=retrieve_docs).assign(
            answer=rag_chain_from_docs
        )

    @staticmethod
    def format_docs_xml(docs: List[Document]) -> str:
        """
        Format documents as the XML <sources> block expected by the prompt.

        Each document becomes a <source id="i"> element containing its source
        metadata and page content.

        :param docs: Retrieved documents; each must carry a 'source' entry in
            its metadata (KeyError otherwise).
        :return: XML string wrapping all formatted documents.
        """
        formatted_docs = [
            (
                f'<source id="{i}">\n'
                f"<source>{doc.metadata['source']}</source>\n"
                f"<article_snippet>{doc.page_content}</article_snippet>\n"
                f"</source>"
            )
            for i, doc in enumerate(docs)
        ]
        joined = "\n".join(formatted_docs)
        return f"\n\n<sources>\n{joined}\n</sources>"

    def get_response(self, query: str) -> dict:
        """
        Run the full RAG chain for the given query.

        :param query: The user question.
        :return: Chain output dict with keys "input" (the query), "context"
            (the retrieved documents), and "answer" (the parsed XML answer
            with citations). Note: RunnablePassthrough.assign returns the
            whole payload dict, not a bare string.
        """
        return self.chain.invoke({"input": query})

    def get_extra_resources(
        self, query: str, original_sources: List[str]
    ) -> List[Document]:
        """
        Retrieve additional documents, excluding already-cited sources.

        Builds a one-off retriever filtered to documents whose 'source'
        metadata is NOT in ``original_sources`` (Chroma ``$nin`` filter) and
        returns the top-8 similarity matches.

        :param query: Query string for the similarity search.
        :param original_sources: Source names to exclude from the results.
        :return: Documents relevant to the query from previously unused sources.
        """
        retriever = self.vectordb.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 8, "filter": {"source": {"$nin": original_sources}}},
        )
        return retriever.invoke(query)