Spaces:
Sleeping
Sleeping
Upload 13 files
Browse files- .gitattributes +1 -0
- LiteratureAgent.py +139 -0
- Refiner.py +60 -0
- chroma_langchain_db/0fadc0a3-4d1d-4f84-8e80-2aaf4d3d5940/data_level0.bin +3 -0
- chroma_langchain_db/0fadc0a3-4d1d-4f84-8e80-2aaf4d3d5940/header.bin +3 -0
- chroma_langchain_db/0fadc0a3-4d1d-4f84-8e80-2aaf4d3d5940/index_metadata.pickle +3 -0
- chroma_langchain_db/0fadc0a3-4d1d-4f84-8e80-2aaf4d3d5940/length.bin +3 -0
- chroma_langchain_db/0fadc0a3-4d1d-4f84-8e80-2aaf4d3d5940/link_lists.bin +3 -0
- chroma_langchain_db/chroma.sqlite3 +3 -0
- config.py +56 -5
- helpers.py +2 -1
- main.py +64 -107
- refine.pdf +0 -0
- requirements.txt +2 -1
.gitattributes
CHANGED
|
@@ -78,3 +78,4 @@ Articles/wind_loads_on_discontinuous_metal_roofing_ibhs.pdf filter=lfs diff=lfs
|
|
| 78 |
Articles/wind_uplift_resistance_of_artificially_and_naturally_aged_asphalt_shingles.pdf filter=lfs diff=lfs merge=lfs -text
|
| 79 |
Articles/wind_vulnerability_analysis_of_standing_seam_roof_system_considering_fatigue_damage.pdf filter=lfs diff=lfs merge=lfs -text
|
| 80 |
chroma_store/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 78 |
Articles/wind_uplift_resistance_of_artificially_and_naturally_aged_asphalt_shingles.pdf filter=lfs diff=lfs merge=lfs -text
|
| 79 |
Articles/wind_vulnerability_analysis_of_standing_seam_roof_system_considering_fatigue_damage.pdf filter=lfs diff=lfs merge=lfs -text
|
| 80 |
chroma_store/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
| 81 |
+
chroma_langchain_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
LiteratureAgent.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_openai import OpenAIEmbeddings
|
| 2 |
+
from langchain_openai import ChatOpenAI
|
| 3 |
+
from langchain_chroma import Chroma
|
| 4 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 5 |
+
from langchain_core.runnables import RunnablePassthrough
|
| 6 |
+
from langchain_core.output_parsers import XMLOutputParser
|
| 7 |
+
from langchain.chains import create_retrieval_chain
|
| 8 |
+
from langchain_core.documents import Document
|
| 9 |
+
from typing import List
|
| 10 |
+
|
| 11 |
+
|
# System prompt for the literature RAG chain. "{context}" is filled with
# XML-formatted article snippets (see RoofCoverChatbot.format_docs_xml).
# NOTE: the original text duplicated the entire intro paragraph ("...answer the
# user question. You're a helpful AI assistant..."); collapsed to one intro,
# matching the equivalent fix already applied to config.py in this commit.
XML_SYSTEM_PROMPT = """You're a helpful AI assistant. Given a user question and some scientific literature
documents which highlight research on different roof cover materials (e.g., asphalt shingles, metal, tile)
and their performance against natural hazards(e.g., wind, hail), provide clear, concise, and informed answers without unnecessary fluff.

When addressing questions about ‘what is the best roof,’ consider the following factors:
• Geography, aesthetic preferences, budget, frequency of weather-related hazards, roof cover performance, and how performance changes with age.
• For the insurance industry, the ‘best roof’ depends on the specific hazards (their location and frequency), performance expectations and predictability, and the cost of materials.

If none of the articles answer the question, simply say that there are no articles relevant to your inquiry.
Remember, you must return both an answer and citations. A citation consists of a VERBATIM quote that
justifies the answer and the ID and also Source Name of the quote article. Return a citation for every quote across all articles
that justify the answer. Use the following format for your final output:
<cited_answer>
<answer></answer>
<citations>
<citation><source_id></source_id><source></source><quote></quote></citation>
<citation><source_id></source_id><source></source><quote></quote></citation>
...
</citations>
</cited_answer>

If none of the articles answer the question, return:
<cited_answer>
<answer>Nothing</answer>
<citations/>
</cited_answer>

ALWAYS maintain valid XML structure with properly closed tags. Here are the articles:{context}"""
class RoofCoverChatbot:
    """RAG chatbot over a persisted Chroma store of roof-cover literature.

    Retrieves the top-K most similar article chunks for a query and asks the
    LLM to produce an XML answer with verbatim, per-source citations (the
    format is dictated by XML_SYSTEM_PROMPT).
    """

    def __init__(self, model: str = "gpt-4o", temperature: float = 0.1):
        """
        Initialize the RoofCoverChatbot by setting up the retrieval chain,
        which uses scientific literature documents to generate an XML-formatted answer.

        :param model: OpenAI chat model name.
        :param temperature: Sampling temperature; kept low for factual answers.
        """
        # Create the XML prompt template.
        self.xml_prompt = ChatPromptTemplate.from_messages(
            [("system", XML_SYSTEM_PROMPT), ("human", "{input}")]
        )

        # Initialize the language model.
        self.llm = ChatOpenAI(model=model, temperature=temperature)

        # Chain that answers from retrieved documents.
        # The first step renders the retrieved Documents as XML context.
        rag_chain_from_docs = (
            RunnablePassthrough.assign(
                context=(lambda x: self.format_docs_xml(x["context"]))
            )
            | self.xml_prompt
            | self.llm
            | XMLOutputParser()
        )

        embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

        # Persisted store; must have been built with the same embedding model.
        self.vectordb = Chroma(
            collection_name="roofs_collection",
            embedding_function=embeddings,
            persist_directory="./chroma_langchain_db",
        )
        # Use similarity search to retrieve the top-K documents.
        self.retriever = self.vectordb.as_retriever(
            search_type="similarity", search_kwargs={"k": 5}
        )

        # This lambda extracts the "input" key for retrieval.
        retrieve_docs = (lambda x: x["input"]) | self.retriever

        # Final chain: retrieve documents, then generate the cited answer.
        # Output dict carries both "context" (Documents) and "answer" (parsed XML).
        self.chain = RunnablePassthrough.assign(context=retrieve_docs).assign(
            answer=rag_chain_from_docs
        )

    @staticmethod
    def format_docs_xml(docs: List[Document]) -> str:
        """
        Format a list of documents into XML snippets.

        Each document is wrapped with its positional id, its source metadata,
        and its page content, inside a single <sources> element.
        """
        formatted_docs = [
            (
                f"<source id=\"{i}\">\n"
                # .get() guards against documents missing 'source' metadata,
                # which previously raised KeyError mid-formatting.
                f"<source>{doc.metadata.get('source', 'unknown')}</source>\n"
                f"<article_snippet>{doc.page_content}</article_snippet>\n"
                f"</source>"
            )
            for i, doc in enumerate(docs)
        ]
        return f"\n\n<sources>\n{chr(10).join(formatted_docs)}\n</sources>"

    def get_response(self, query: str) -> dict:
        """
        Return the chatbot response for the given query.

        The method retrieves relevant documents and then uses the XML chain to
        generate an answer with citations.

        :param query: The user question.
        :return: Dict with "input", "context" (retrieved Documents) and
            "answer" (the XMLOutputParser result); callers index
            result['answer']['cited_answer'].
        """
        return self.chain.invoke({"input": query})

    def get_extra_resources(self, query: str, original_sources: List[str]):
        """
        Retrieve additional documents for a query, excluding already-cited sources.

        :param query: The query string to run against the vector store.
        :param original_sources: Source names to exclude via the Chroma
            ``$nin`` metadata filter, so only new material is returned.
        :return: List of retrieved Documents not originating from
            ``original_sources``.
        """
        retriever = self.vectordb.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 8, "filter": {"source": {"$nin": original_sources}}},
        )
        return retriever.invoke(query)
Refiner.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_core.output_parsers import JsonOutputParser
|
| 2 |
+
from langchain_core.runnables import RunnableLambda
|
| 3 |
+
from langchain_core.prompts import PromptTemplate
|
| 4 |
+
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
|
| 5 |
+
from pydantic import BaseModel, Field
|
| 6 |
+
from langchain_community.document_loaders import PyMuPDFLoader
|
| 7 |
+
from langchain_core.vectorstores import InMemoryVectorStore
|
| 8 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 9 |
+
from operator import itemgetter
|
| 10 |
+
from config import NEW_REFINE_SYSTEM_PROMPT_JSON
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class Answer(BaseModel):
    """Schema for the refiner's JSON output.

    JsonOutputParser uses this model to build the format instructions injected
    into NEW_REFINE_SYSTEM_PROMPT_JSON and to describe the expected keys.
    """

    # The user's question, rephrased to fit the reference PDF's context.
    enhanced_question: str = Field(description="Paraphrased question")
    # The refined answer; the literal "Nothing" signals no supporting evidence.
    enhanced_answer: str = Field(description="Enhanced answer")
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class RefinementPipeline:
    """Refine a draft answer against the authoritative "refine.pdf" document.

    Construction loads the PDF, splits it into overlapping chunks, embeds the
    chunks into an in-memory vector store, and wires a retrieval -> prompt ->
    LLM -> JSON-parser chain. ``invoke`` runs that chain for one Q/A pair.
    """

    def __init__(self, model: str = "gpt-4o", temperature: float = 0.1):
        self.llm = ChatOpenAI(model=model, temperature=temperature)
        self.parser = JsonOutputParser(pydantic_object=Answer)
        self.prompt = PromptTemplate(
            template=NEW_REFINE_SYSTEM_PROMPT_JSON,
            input_variables=["question", "answer", "context"],
            partial_variables={"format_instructions": self.parser.get_format_instructions()},
        )

        # Load the authoritative PDF and split it into overlapping chunks.
        self.pdf_loader = PyMuPDFLoader("refine.pdf")
        self.pdf_docs = self.pdf_loader.load()
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
        self.splits = self.text_splitter.split_documents(self.pdf_docs)

        # Index the chunks in memory; embeddings are computed once at startup.
        self.pdf_vectorstore = InMemoryVectorStore.from_documents(
            documents=self.splits, embedding=OpenAIEmbeddings()
        )
        self.pdf_retriever = self.pdf_vectorstore.as_retriever()

        # Pipeline: route "question" into retrieval, flatten the retrieved
        # Documents to plain text, then prompt -> LLM -> JSON parser.
        self.chain = (
            {
                "context": itemgetter("question") | self.pdf_retriever,
                "question": itemgetter("question"),
                "answer": itemgetter("answer"),
            }
            | RunnableLambda(self._flatten_context)
            | self.prompt
            | self.llm
            | self.parser
        )

    @staticmethod
    def _flatten_context(inputs):
        """Join retrieved Document objects into one newline-separated string."""
        return {
            "context": "\n".join(doc.page_content for doc in inputs["context"]),
            "question": inputs["question"],
            "answer": inputs["answer"],
        }

    def invoke(self, question: str, answer: str):
        """Run the refinement chain; returns the parser's JSON dict
        (expected keys: "enhanced_question", "enhanced_answer")."""
        return self.chain.invoke({"question": question, "answer": answer})
|
chroma_langchain_db/0fadc0a3-4d1d-4f84-8e80-2aaf4d3d5940/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7e95387811fdd2c605bdbe40ef66c6427e098a831868dd455a3033a5eaf874a3
|
| 3 |
+
size 62140000
|
chroma_langchain_db/0fadc0a3-4d1d-4f84-8e80-2aaf4d3d5940/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:73a6d751332fa17ddf2061d192fdfd6cdbd8cec7ee44b674a975082dc8b71982
|
| 3 |
+
size 100
|
chroma_langchain_db/0fadc0a3-4d1d-4f84-8e80-2aaf4d3d5940/index_metadata.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:24b1ebdaa3495251e55f78467b1c4675e3100547c42ba96603c02b2ba59a1121
|
| 3 |
+
size 288012
|
chroma_langchain_db/0fadc0a3-4d1d-4f84-8e80-2aaf4d3d5940/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4f2e700c214bf60718eade7cbed207636098e2fe9f13fc04b69c3f9957bd9940
|
| 3 |
+
size 20000
|
chroma_langchain_db/0fadc0a3-4d1d-4f84-8e80-2aaf4d3d5940/link_lists.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5619be57918b92ff190f6a1a193c472a026ed614ea305f95d5e4d1523961fe03
|
| 3 |
+
size 43052
|
chroma_langchain_db/chroma.sqlite3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e399b010878c1e88b70e4dff1babfbe01cff91fcbfea99f77817f3b6e66a9771
|
| 3 |
+
size 126881792
|
config.py
CHANGED
|
@@ -2,10 +2,7 @@ from langchain.chains.summarize.refine_prompts import REFINE_PROMPT
|
|
| 2 |
|
| 3 |
XML_SYSTEM_PROMPT= """You're a helpful AI assistant. Given a user question and some scientific literature
|
| 4 |
documents which highlight research on different roof cover materials (e.g., asphalt shingles, metal, tile)
|
| 5 |
-
and their performance against natural hazards(e.g., wind, hail),
|
| 6 |
-
question.
|
| 7 |
-
You’re a helpful AI assistant. Given a user question and scientific literature on various roof cover materials (e.g., asphalt shingles, metal, tile)
|
| 8 |
-
and their performance against natural hazards (e.g., wind, hail), provide clear, concise, and informed answers without unnecessary fluff.
|
| 9 |
|
| 10 |
When addressing questions about ‘what is the best roof,’ consider the following factors:
|
| 11 |
• Geography, aesthetic preferences, budget, frequency of weather-related hazards, roof cover performance, and how performance changes with age.
|
|
@@ -25,6 +22,29 @@ that justify the answer. Use the following format for your final output:
|
|
| 25 |
</cited_answer>
|
| 26 |
Here are the articles:{context}"""
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
REFINE_SYSTEM_PROMPT = (
|
| 29 |
"You are an assistant for question-answering tasks. "
|
| 30 |
"Use the following pieces of retrieved context to answer "
|
|
@@ -32,4 +52,35 @@ REFINE_SYSTEM_PROMPT = (
|
|
| 32 |
"If you cannot answer the question with the retrieved context, only say that 'Nothing' "
|
| 33 |
"\n\n"
|
| 34 |
"{context}"
|
| 35 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
XML_SYSTEM_PROMPT= """You're a helpful AI assistant. Given a user question and some scientific literature
|
| 4 |
documents which highlight research on different roof cover materials (e.g., asphalt shingles, metal, tile)
|
| 5 |
+
and their performance against natural hazards(e.g., wind, hail), provide clear, concise, and informed answers without unnecessary fluff.
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
When addressing questions about ‘what is the best roof,’ consider the following factors:
|
| 8 |
• Geography, aesthetic preferences, budget, frequency of weather-related hazards, roof cover performance, and how performance changes with age.
|
|
|
|
| 22 |
</cited_answer>
|
| 23 |
Here are the articles:{context}"""
|
| 24 |
|
| 25 |
+
# NEW_REFINE_SYSTEM_PROMPT = (
|
| 26 |
+
# "You are an assistant designed to ensure that answers are completely aligned with the authoritative content of a provided PDF. "
|
| 27 |
+
# "Before retrieving any context, first paraphrase the user's question to fully capture its intended meaning and naturally fit it into the context of the PDF. "
|
| 28 |
+
# "This is crucial because users sometimes provide partially paraphrased or incomplete questions. "
|
| 29 |
+
# "The questions generally pertain to scientific literature on roof cover materials (e.g., asphalt shingles, metal, tile) and their performance against natural hazards (e.g., wind, hail). "
|
| 30 |
+
# "Your task is then to take the refined question and an initial answer from another source, and refine that answer using the context retrieved from the PDF. "
|
| 31 |
+
# "If the retrieved context supports the initial answer, adjust it to fully match the PDF content. If the context contradicts or does not support the answer, modify it accordingly. "
|
| 32 |
+
# "Provide clear, concise, and informed answers without unnecessary fluff. If there is no supporting evidence, simply respond with ONLY one word 'Nothing'. "
|
| 33 |
+
# "IMPORTANT: In your final output, return only the refined answer text with no additional labels, headings, or repeated paraphrased questions."
|
| 34 |
+
# )
|
| 35 |
+
|
| 36 |
+
# NEW_REFINE_SYSTEM_PROMPT = (
|
| 37 |
+
# "You are an assistant designed to ensure that answers are completely aligned with the authoritative content of a provided PDF. "
|
| 38 |
+
# "Before retrieving any context, first paraphrase the user's question to fully capture its intended meaning and naturally fit it into the context of the PDF. "
|
| 39 |
+
# "This is crucial because users sometimes provide partially paraphrased or incomplete questions. "
|
| 40 |
+
# "The questions generally pertain to scientific literature on roof cover materials (e.g., asphalt shingles, metal, tile) and their performance against natural hazards (e.g., wind, hail). "
|
| 41 |
+
# "Your task is then to take the refined (paraphrased) question and an initial answer from another source, and refine that answer using the context retrieved from the PDF. "
|
| 42 |
+
# "If the retrieved context supports the initial answer, adjust it to fully match the PDF content. If the context contradicts or does not support the answer, modify it accordingly. "
|
| 43 |
+
# "Provide clear, concise, and informed answers without unnecessary fluff. If there is no supporting evidence, simply respond with 'Nothing'. "
|
| 44 |
+
# "IMPORTANT: Your final output must be a valid JSON object with exactly two keys: 'paraphrased_question' and 'answer'. "
|
| 45 |
+
# "Respond ONLY with the JSON object. Do not include any other text or formatting."
|
| 46 |
+
# )
|
| 47 |
+
|
| 48 |
REFINE_SYSTEM_PROMPT = (
|
| 49 |
"You are an assistant for question-answering tasks. "
|
| 50 |
"Use the following pieces of retrieved context to answer "
|
|
|
|
| 52 |
"If you cannot answer the question with the retrieved context, only say that 'Nothing' "
|
| 53 |
"\n\n"
|
| 54 |
"{context}"
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
# NEW_REFINE_SYSTEM_PROMPT_JSON = (
|
| 58 |
+
# "You are an assistant designed to ensure that answers are completely aligned with the authoritative content of a provided PDF. "
|
| 59 |
+
# "Before retrieving any context, first paraphrase the user's question to fully capture its intended meaning and naturally fit it into the context of the PDF. "
|
| 60 |
+
# "This is crucial because users sometimes provide partially paraphrased or incomplete questions. "
|
| 61 |
+
# "The questions generally pertain to scientific literature on roof cover materials (e.g., asphalt shingles, metal, tile) and their performance against natural hazards (e.g., wind, hail). "
|
| 62 |
+
# "Your task is then to take the refined (paraphrased) question and an initial answer from another source, and refine that answer using the context retrieved from the PDF. "
|
| 63 |
+
# "If the retrieved context supports the initial answer, adjust it to fully match the PDF content. If the context contradicts or does not support the answer, modify it accordingly. "
|
| 64 |
+
# "Provide clear, concise, and informed answers without unnecessary fluff. If there is no supporting evidence, simply respond with 'Nothing'. "
|
| 65 |
+
# "Here is the user question: {question} "
|
| 66 |
+
# "Here is the initial answer: {answer} "
|
| 67 |
+
# "Here is the retrieved context: {context} "
|
| 68 |
+
# "IMPORTANT: Your final output must be a valid JSON object with {format_instructions} "
|
| 69 |
+
# )
|
| 70 |
+
|
| 71 |
+
# Refinement prompt used by RefinementPipeline (PromptTemplate with
# question/answer/context variables; format_instructions is partial-filled
# from JsonOutputParser(pydantic_object=Answer)).
# BUGFIX: the final instruction previously demanded keys "paraphrased_question"
# and "answer", contradicting both the Answer schema's format instructions and
# step 2 above ("Nothing" as "enhanced_answer") — and the consumer reads
# result.get("enhanced_answer"). Keys corrected to match the schema.
NEW_REFINE_SYSTEM_PROMPT_JSON = """You are an assistant that ensures answers are fully aligned with the authoritative content of a provided PDF document. Before retrieving any context, first paraphrase the user's question to capture its complete intended meaning and seamlessly integrate it into the context of the PDF. This is essential because users sometimes submit partially paraphrased or incomplete questions, especially regarding scientific literature on roof cover materials (e.g., asphalt shingles, metal, tile) and their performance against natural hazards (e.g., wind, hail).

Your task is as follows:
1. Paraphrase the provided user question to fully clarify its intent.
2. Using the retrieved context from the PDF, refine the initial answer from another source:
- If the context supports the initial answer, adjust it so that it fully aligns with the PDF content.
- If the context contradicts or does not support the answer, modify the answer accordingly.
- If there is no supporting evidence, respond ONLY with "Nothing" as "enhanced_answer".
3. Provide a clear, concise, and informed answer without unnecessary fluff.

Inputs:
- User question: {question}
- Initial answer: {answer}
- Retrieved context: {context}

IMPORTANT: Your final output must be a valid JSON object with exactly two keys: "enhanced_question" and "enhanced_answer". {format_instructions}"""
|
helpers.py
CHANGED
|
@@ -32,7 +32,8 @@ def get_article_info(df: pd.DataFrame, file_name: str):
|
|
| 32 |
Given a DataFrame and a file name, return the corresponding
|
| 33 |
title and link from the CSV. Assumes file_name is unique in the DF.
|
| 34 |
"""
|
| 35 |
-
|
|
|
|
| 36 |
if row.empty:
|
| 37 |
# Fallback if not found
|
| 38 |
return "IBHS Website", "https://ibhs.org"
|
|
|
|
| 32 |
Given a DataFrame and a file name, return the corresponding
|
| 33 |
title and link from the CSV. Assumes file_name is unique in the DF.
|
| 34 |
"""
|
| 35 |
+
edited_file_name = file_name.replace("\\", "/").replace("Articles/", "").replace("Articles\\", "")
|
| 36 |
+
row = df[df["file_name"] == edited_file_name]
|
| 37 |
if row.empty:
|
| 38 |
# Fallback if not found
|
| 39 |
return "IBHS Website", "https://ibhs.org"
|
main.py
CHANGED
|
@@ -1,11 +1,3 @@
|
|
| 1 |
-
from langchain_openai import OpenAIEmbeddings
|
| 2 |
-
from langchain_openai import ChatOpenAI
|
| 3 |
-
from langchain_chroma import Chroma
|
| 4 |
-
import chromadb
|
| 5 |
-
from chromadb.config import Settings
|
| 6 |
-
from langchain_core.prompts import ChatPromptTemplate
|
| 7 |
-
from langchain_core.runnables import RunnablePassthrough
|
| 8 |
-
from langchain_core.output_parsers import XMLOutputParser
|
| 9 |
import gradio as gr
|
| 10 |
import pandas as pd
|
| 11 |
import logging
|
|
@@ -13,24 +5,17 @@ from langchain_core.exceptions import OutputParserException
|
|
| 13 |
import os
|
| 14 |
from dotenv import load_dotenv
|
| 15 |
import azure.cosmos.cosmos_client as cosmos_client
|
| 16 |
-
from langchain_community.document_loaders import PyPDFLoader
|
| 17 |
-
from langchain_core.vectorstores import InMemoryVectorStore
|
| 18 |
-
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 19 |
from langchain.chains import create_retrieval_chain
|
| 20 |
-
from langchain.chains.combine_documents import create_stuff_documents_chain
|
| 21 |
-
from langchain_core.runnables import RunnableLambda
|
| 22 |
import datetime
|
| 23 |
import uuid
|
| 24 |
-
from
|
| 25 |
-
from
|
| 26 |
|
|
|
|
| 27 |
|
| 28 |
load_dotenv()
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
K_VALUE = 5
|
| 32 |
-
|
| 33 |
-
xml_prompt = ChatPromptTemplate.from_messages([("system", XML_SYSTEM_PROMPT), ("human", "{input}")])
|
| 34 |
|
| 35 |
ENV = os.getenv('ENV')
|
| 36 |
HOST = os.getenv('ACCOUNT_HOST')
|
|
@@ -42,58 +27,8 @@ client = cosmos_client.CosmosClient(HOST, {'masterKey': MASTER_KEY}, user_agent=
|
|
| 42 |
database = client.get_database_client(DATABASE_ID)
|
| 43 |
container = database.get_container_client(CONTAINER_ID)
|
| 44 |
history_container = database.get_container_client(HISTORY_CONTAINER_ID)
|
| 45 |
-
llm = ChatOpenAI(model="gpt-4o", temperature=0)
|
| 46 |
-
df = pd.read_csv("articles_db.csv")
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
rag_chain_from_docs = (
|
| 50 |
-
RunnablePassthrough.assign(context=(lambda x: format_docs_xml(x["context"])))
|
| 51 |
-
| xml_prompt
|
| 52 |
-
| llm
|
| 53 |
-
| XMLOutputParser()
|
| 54 |
-
)
|
| 55 |
-
|
| 56 |
-
settings = Settings(persist_directory=PERSIST_DIRECTORY)
|
| 57 |
-
vectordb = Chroma(embedding_function=OpenAIEmbeddings(), persist_directory=PERSIST_DIRECTORY)
|
| 58 |
-
retriever = vectordb.as_retriever(search_type="mmr", search_kwargs={"k": K_VALUE})
|
| 59 |
-
retrieve_docs = (lambda x: x["input"]) | retriever
|
| 60 |
-
chain = RunnablePassthrough.assign(context=retrieve_docs).assign(
|
| 61 |
-
answer=rag_chain_from_docs
|
| 62 |
-
)
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
pdf_loader = PyPDFLoader("refine.pdf")
|
| 66 |
-
pdf_docs = pdf_loader.load()
|
| 67 |
-
|
| 68 |
-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
|
| 69 |
-
splits = text_splitter.split_documents(pdf_docs)
|
| 70 |
-
pdf_vectorstore = InMemoryVectorStore.from_documents(
|
| 71 |
-
documents=splits, embedding=OpenAIEmbeddings()
|
| 72 |
-
)
|
| 73 |
-
|
| 74 |
-
pdf_retriever = pdf_vectorstore.as_retriever()
|
| 75 |
-
|
| 76 |
-
refine_prompt = ChatPromptTemplate.from_messages(
|
| 77 |
-
[
|
| 78 |
-
("system", REFINE_SYSTEM_PROMPT),
|
| 79 |
-
("human", "{input}"),
|
| 80 |
-
]
|
| 81 |
-
)
|
| 82 |
-
|
| 83 |
-
question_answer_chain = create_stuff_documents_chain(llm, refine_prompt)
|
| 84 |
-
pdf_rag_chain = create_retrieval_chain(pdf_retriever, question_answer_chain)
|
| 85 |
-
|
| 86 |
-
def vectordb_search(query):
|
| 87 |
-
titles, links = [], []
|
| 88 |
-
question_search = retriever.invoke(query)
|
| 89 |
-
for item in question_search:
|
| 90 |
-
edited_item = item.metadata["source"].replace("\\", "/").replace("Articles/", "").replace("Articles\\", "")
|
| 91 |
-
title, link = get_article_info(df, edited_item)
|
| 92 |
-
if title not in titles:
|
| 93 |
-
titles.append(title)
|
| 94 |
-
links.append(link)
|
| 95 |
-
return "\n".join([f"- [{title}]({link})" for title, link in zip(titles, links)])
|
| 96 |
|
|
|
|
| 97 |
|
| 98 |
def initialize_session(session_id):
|
| 99 |
# If no session_id exists, generate a new one
|
|
@@ -111,47 +46,68 @@ def llm_response(query, session_id):
|
|
| 111 |
chat["partitionKey"] = "RoofingRoadmap"
|
| 112 |
chat["user"] = query
|
| 113 |
chat["env"] = ENV
|
|
|
|
| 114 |
|
| 115 |
-
|
|
|
|
|
|
|
| 116 |
try:
|
| 117 |
-
if 'f wave' in query.lower() or 'f-wave' in query.lower() or 'fwave' in query.lower():
|
| 118 |
-
query = query.replace('f wave', 'f-wave shingle').replace('f-wave', 'f-wave shingle').replace('fwave', 'f-wave shingle')
|
| 119 |
-
result = chain.invoke({"input": query})
|
| 120 |
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
title, link = get_article_info(df, edited_item)
|
| 131 |
-
if title not in titles:
|
| 132 |
-
titles.append(title)
|
| 133 |
-
# if link not in links:
|
| 134 |
-
links.append(link)
|
| 135 |
-
except (TypeError, KeyError, IndexError):
|
| 136 |
-
# Handle the error or simply pass if citation does not have the expected keys
|
| 137 |
-
continue
|
| 138 |
-
question_search = retriever.invoke(query)
|
| 139 |
-
for res_item in question_search:
|
| 140 |
-
edited_item = res_item.metadata["source"].replace("\\", "/").replace("Articles/", "").replace("Articles\\", "")
|
| 141 |
-
res_title, res_link = get_article_info(df, edited_item)
|
| 142 |
-
if res_title not in res_titles and res_title not in titles:
|
| 143 |
-
res_titles.append(res_title)
|
| 144 |
-
res_links.append(res_link)
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
except OutputParserException as e:
|
| 148 |
-
if pdf_answer['answer'] == 'Nothing':
|
| 149 |
answer = "Your search is beyond the scope of this tool at this time. Please explore the rest of [IBHS website](https://ibhs.org) to find research on this topic."
|
| 150 |
return answer
|
|
|
|
|
|
|
| 151 |
else:
|
| 152 |
-
answer =
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
finally:
|
|
|
|
|
|
|
| 155 |
chat["ai"] = answer
|
| 156 |
chat["timestamp"] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
| 157 |
container.create_item(body=chat)
|
|
@@ -172,8 +128,9 @@ def llm_response(query, session_id):
|
|
| 172 |
# Combine answer and citations for final markdown output
|
| 173 |
|
| 174 |
|
| 175 |
-
if not res_links:
|
| 176 |
-
|
|
|
|
| 177 |
else:
|
| 178 |
markdown_list += f"\n\n\nHere is a list of articles that can provide more information about your inquiry:\n"
|
| 179 |
markdown_list += "\n".join([f"- [{res_title}]({res_link})" for res_title, res_link in zip(res_titles, res_links)])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
| 3 |
import logging
|
|
|
|
| 5 |
import os
|
| 6 |
from dotenv import load_dotenv
|
| 7 |
import azure.cosmos.cosmos_client as cosmos_client
|
|
|
|
|
|
|
|
|
|
| 8 |
from langchain.chains import create_retrieval_chain
|
|
|
|
|
|
|
| 9 |
import datetime
|
| 10 |
import uuid
|
| 11 |
+
from LiteratureAgent import RoofCoverChatbot
|
| 12 |
+
from Refiner import RefinementPipeline
|
| 13 |
|
| 14 |
+
from helpers import get_article_info
|
| 15 |
|
| 16 |
load_dotenv()
|
| 17 |
+
refiner = RefinementPipeline()
|
| 18 |
+
literature_agent = RoofCoverChatbot()
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
ENV = os.getenv('ENV')
|
| 21 |
HOST = os.getenv('ACCOUNT_HOST')
|
|
|
|
| 27 |
database = client.get_database_client(DATABASE_ID)
|
| 28 |
container = database.get_container_client(CONTAINER_ID)
|
| 29 |
history_container = database.get_container_client(HISTORY_CONTAINER_ID)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
+
df = pd.read_csv("articles_db.csv")
|
| 32 |
|
| 33 |
def initialize_session(session_id):
|
| 34 |
# If no session_id exists, generate a new one
|
|
|
|
| 46 |
chat["partitionKey"] = "RoofingRoadmap"
|
| 47 |
chat["user"] = query
|
| 48 |
chat["env"] = ENV
|
| 49 |
+
answer = None
|
| 50 |
|
| 51 |
+
if 'f wave' in query.lower() or 'f-wave' in query.lower() or 'fwave' in query.lower():
|
| 52 |
+
query = query.replace('f wave', 'f-wave shingle').replace('f-wave', 'f-wave shingle').replace('fwave',
|
| 53 |
+
'f-wave shingle')
|
| 54 |
try:
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
+
response = literature_agent.get_response(query)
|
| 57 |
+
enhanced_query = refiner.invoke(question=query, answer=response)
|
| 58 |
+
|
| 59 |
+
try:
|
| 60 |
+
initial_answer = response['answer']['cited_answer'][0].get("answer", "Nothing")
|
| 61 |
+
except Exception as e:
|
| 62 |
+
initial_answer = "Nothing"
|
| 63 |
+
|
| 64 |
+
if enhanced_query.get("enhanced_answer") == "Nothing" and initial_answer == "Nothing":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
answer = "Your search is beyond the scope of this tool at this time. Please explore the rest of [IBHS website](https://ibhs.org) to find research on this topic."
|
| 66 |
return answer
|
| 67 |
+
if enhanced_query.get("enhanced_answer") != "Nothing":
|
| 68 |
+
answer = enhanced_query['enhanced_answer']
|
| 69 |
else:
|
| 70 |
+
answer = response
|
| 71 |
+
|
| 72 |
+
citations = response['answer']['cited_answer'][1].get('citations', [])
|
| 73 |
+
|
| 74 |
+
original_citations = []
|
| 75 |
+
if citations:
|
| 76 |
+
for citation in citations:
|
| 77 |
+
try:
|
| 78 |
+
# edited_item = citation['citation'][1]["source"].replace("\\", "/").replace("Articles/", "").replace("Articles\\", "")
|
| 79 |
+
original_citations.append(citation['citation'][1]["source"])
|
| 80 |
+
title, link = get_article_info(df, citation['citation'][1]["source"])
|
| 81 |
+
if title not in titles:
|
| 82 |
+
titles.append(title)
|
| 83 |
+
# if link not in links:
|
| 84 |
+
links.append(link)
|
| 85 |
+
except Exception as e:
|
| 86 |
+
continue
|
| 87 |
+
|
| 88 |
+
try:
|
| 89 |
+
question_search = literature_agent.get_extra_resources(query, original_citations)
|
| 90 |
+
except Exception as e:
|
| 91 |
+
question_search = []
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
if question_search:
|
| 95 |
+
for res_item in question_search:
|
| 96 |
+
|
| 97 |
+
res_title, res_link = get_article_info(df, res_item.metadata["source"])
|
| 98 |
+
if res_title not in res_titles and res_title not in titles:
|
| 99 |
+
res_titles.append(res_title)
|
| 100 |
+
res_links.append(res_link)
|
| 101 |
+
if len(res_titles) == 5:
|
| 102 |
+
break
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
except Exception as e:
|
| 106 |
+
answer = "Your search is beyond the scope of this tool at this time. Please explore the rest of [IBHS website](https://ibhs.org) to find research on this topic."
|
| 107 |
+
return answer
|
| 108 |
finally:
|
| 109 |
+
if answer is None:
|
| 110 |
+
answer = "Your search is beyond the scope of this tool at this time. Please explore the rest of [IBHS website](https://ibhs.org) to find research on this topic."
|
| 111 |
chat["ai"] = answer
|
| 112 |
chat["timestamp"] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
| 113 |
container.create_item(body=chat)
|
|
|
|
| 128 |
# Combine answer and citations for final markdown output
|
| 129 |
|
| 130 |
|
| 131 |
+
if not res_links and not links:
|
| 132 |
+
markdown_list += f"\n\n\nHere is a list of articles that can provide more information about your inquiry:\n"
|
| 133 |
+
markdown_list += "\n".join(["- [IBHS Website](https://ibhs.org)", "- [FORTIFIED Website](https://fortifiedhome.org/roof/)" ])
|
| 134 |
else:
|
| 135 |
markdown_list += f"\n\n\nHere is a list of articles that can provide more information about your inquiry:\n"
|
| 136 |
markdown_list += "\n".join([f"- [{res_title}]({res_link})" for res_title, res_link in zip(res_titles, res_links)])
|
refine.pdf
CHANGED
|
Binary files a/refine.pdf and b/refine.pdf differ
|
|
|
requirements.txt
CHANGED
|
@@ -22,4 +22,5 @@ docx2txt
|
|
| 22 |
azure-mgmt-storage
|
| 23 |
azure-identity
|
| 24 |
azure-storage-blob
|
| 25 |
-
azure-cosmos
|
|
|
|
|
|
| 22 |
azure-mgmt-storage
|
| 23 |
azure-identity
|
| 24 |
azure-storage-blob
|
| 25 |
+
azure-cosmos
|
| 26 |
+
fitz
|