IBHS committed on
Commit
fa8ee23
·
verified ·
1 Parent(s): 04aa1c8

Upload 13 files

Browse files
.gitattributes CHANGED
@@ -78,3 +78,4 @@ Articles/wind_loads_on_discontinuous_metal_roofing_ibhs.pdf filter=lfs diff=lfs
78
  Articles/wind_uplift_resistance_of_artificially_and_naturally_aged_asphalt_shingles.pdf filter=lfs diff=lfs merge=lfs -text
79
  Articles/wind_vulnerability_analysis_of_standing_seam_roof_system_considering_fatigue_damage.pdf filter=lfs diff=lfs merge=lfs -text
80
  chroma_store/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
 
 
78
  Articles/wind_uplift_resistance_of_artificially_and_naturally_aged_asphalt_shingles.pdf filter=lfs diff=lfs merge=lfs -text
79
  Articles/wind_vulnerability_analysis_of_standing_seam_roof_system_considering_fatigue_damage.pdf filter=lfs diff=lfs merge=lfs -text
80
  chroma_store/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
81
+ chroma_langchain_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
LiteratureAgent.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_openai import OpenAIEmbeddings
2
+ from langchain_openai import ChatOpenAI
3
+ from langchain_chroma import Chroma
4
+ from langchain_core.prompts import ChatPromptTemplate
5
+ from langchain_core.runnables import RunnablePassthrough
6
+ from langchain_core.output_parsers import XMLOutputParser
7
+ from langchain.chains import create_retrieval_chain
8
+ from langchain_core.documents import Document
9
+ from typing import List
10
+
11
+
12
# System prompt for the citation-producing RAG chain. The "{context}" placeholder
# at the end is filled with the XML-formatted article snippets at runtime.
# Fix: the original draft repeated the introductory paragraph twice (once ending
# "answer the user question." and again restated); the same deduplication was
# already applied to XML_SYSTEM_PROMPT in config.py, so this copy now matches.
XML_SYSTEM_PROMPT = """You're a helpful AI assistant. Given a user question and some scientific literature
documents which highlight research on different roof cover materials (e.g., asphalt shingles, metal, tile)
and their performance against natural hazards(e.g., wind, hail), provide clear, concise, and informed answers without unnecessary fluff.

When addressing questions about ‘what is the best roof,’ consider the following factors:
• Geography, aesthetic preferences, budget, frequency of weather-related hazards, roof cover performance, and how performance changes with age.
• For the insurance industry, the ‘best roof’ depends on the specific hazards (their location and frequency), performance expectations and predictability, and the cost of materials.

If none of the articles answer the question, simply say that there are no articles relevant to your inquiry.
Remember, you must return both an answer and citations. A citation consists of a VERBATIM quote that
justifies the answer and the ID and also Source Name of the quote article. Return a citation for every quote across all articles
that justify the answer. Use the following format for your final output:
<cited_answer>
<answer></answer>
<citations>
<citation><source_id></source_id><source></source><quote></quote></citation>
<citation><source_id></source_id><source></source><quote></quote></citation>
...
</citations>
</cited_answer>

If none of the articles answer the question, return:
<cited_answer>
<answer>Nothing</answer>
<citations/>
</cited_answer>

ALWAYS maintain valid XML structure with properly closed tags. Here are the articles:{context}"""
43
+
44
+
45
class RoofCoverChatbot:
    """RAG chatbot over the roofing-literature Chroma collection.

    Retrieves the top-K article chunks for a user question and asks the LLM to
    produce an XML answer with verbatim citations (output contract defined in
    XML_SYSTEM_PROMPT).
    """

    def __init__(self, model: str = "gpt-4o", temperature: float = 0.1):
        """
        Initialize the RoofCoverChatbot by setting up the retrieval chain,
        which uses scientific literature documents to generate an XML-formatted answer.

        :param model: OpenAI chat model name.
        :param temperature: Sampling temperature; kept low for factual answers.
        """
        # Create the XML prompt template.
        self.xml_prompt = ChatPromptTemplate.from_messages(
            [("system", XML_SYSTEM_PROMPT), ("human", "{input}")]
        )

        # Initialize the language model.
        self.llm = ChatOpenAI(model=model, temperature=temperature)

        # Chain that generates the answer from retrieved documents.
        # First step formats the retrieved context as XML for the system prompt.
        rag_chain_from_docs = (
            RunnablePassthrough.assign(
                context=(lambda x: self.format_docs_xml(x["context"]))
            )
            | self.xml_prompt
            | self.llm
            | XMLOutputParser()
        )

        embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

        # Persistent Chroma store; collection name and directory must match
        # the store built offline (chroma_langchain_db is LFS-tracked).
        self.vectordb = Chroma(
            collection_name="roofs_collection",
            embedding_function=embeddings,
            persist_directory="./chroma_langchain_db",
        )
        # Use similarity search to retrieve the top-K documents.
        self.retriever = self.vectordb.as_retriever(
            search_type="similarity", search_kwargs={"k": 5}
        )

        # This lambda extracts the "input" key for retrieval.
        retrieve_docs = (lambda x: x["input"]) | self.retriever

        # Build the final chain: retrieve documents, then generate an answer.
        self.chain = RunnablePassthrough.assign(context=retrieve_docs).assign(
            answer=rag_chain_from_docs
        )

    @staticmethod
    def format_docs_xml(docs: List[Document]) -> str:
        """
        Format a list of documents into XML snippets.

        Each document is wrapped in a <source id="..."> element containing its
        source metadata and page content.

        NOTE(review): the outer wrapper and the inner metadata element are both
        named <source>. The prompt's citation format tolerates this, but the
        nesting is ambiguous XML — confirm downstream expectations before
        renaming either tag.

        :param docs: Retrieved documents; each must carry a 'source' metadata key.
        :return: One "<sources>...</sources>" string for the system prompt.
        """
        formatted_docs = [
            (
                f"<source id=\"{i}\">\n"
                f"<source>{doc.metadata['source']}</source>\n"
                f"<article_snippet>{doc.page_content}</article_snippet>\n"
                f"</source>"
            )
            for i, doc in enumerate(docs)
        ]
        # Join with real newlines; clearer than embedding chr(10) in an f-string
        # and produces the identical string.
        joined = "\n".join(formatted_docs)
        return f"\n\n<sources>\n{joined}\n</sources>"

    def get_response(self, query: str) -> dict:
        """
        Return the chatbot response for the given query.

        Retrieves relevant documents and runs the XML chain to generate an
        answer with citations.

        :param query: The user question.
        :return: Dict with 'input', 'context' (retrieved docs) and 'answer'
            (the parsed XML answer with citations).
        """
        return self.chain.invoke({"input": query})

    def get_extra_resources(self, query: str, original_sources: List[str]):
        """
        Retrieve additional documents for a query, excluding already-cited sources.

        Builds a fresh retriever with a metadata filter so documents whose
        'source' metadata is in ``original_sources`` are not returned again.

        :param query: Query string processed by the retriever.
        :param original_sources: Source names to exclude via the $nin filter.
        :return: Documents obtained from the retriever for the query.
        """
        retriever = self.vectordb.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 8, "filter": {"source": {"$nin": original_sources}}},
        )

        result = retriever.invoke(query)
        return result
Refiner.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.output_parsers import JsonOutputParser
2
+ from langchain_core.runnables import RunnableLambda
3
+ from langchain_core.prompts import PromptTemplate
4
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
5
+ from pydantic import BaseModel, Field
6
+ from langchain_community.document_loaders import PyMuPDFLoader
7
+ from langchain_core.vectorstores import InMemoryVectorStore
8
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
9
+ from operator import itemgetter
10
+ from config import NEW_REFINE_SYSTEM_PROMPT_JSON
11
+
12
+
13
class Answer(BaseModel):
    """Schema for the refinement LLM's JSON output, validated by JsonOutputParser."""

    # Paraphrased user question produced by the refiner.
    enhanced_question: str = Field(description="Paraphrased question")
    # Refined answer aligned with the PDF content ("Nothing" when unsupported,
    # per NEW_REFINE_SYSTEM_PROMPT_JSON).
    enhanced_answer: str = Field(description="Enhanced answer")
16
+
17
+
18
class RefinementPipeline:
    """Refine an initial answer against an authoritative PDF document.

    Construction is heavyweight: it loads and chunks the PDF, embeds the
    chunks into an in-memory vector store (network calls to the OpenAI
    embeddings API), and wires a chain that retrieves context, fills the
    prompt, calls the LLM, and parses its JSON output against the ``Answer``
    schema.
    """

    def __init__(self, model: str = "gpt-4o", temperature: float = 0.1,
                 pdf_path: str = "refine.pdf"):
        """
        :param model: OpenAI chat model name.
        :param temperature: Sampling temperature; kept low for factual output.
        :param pdf_path: Path to the authoritative PDF. Defaults to the
            previously hard-coded "refine.pdf", so existing callers are
            unaffected.
        """
        self.llm = ChatOpenAI(model=model, temperature=temperature)
        self.parser = JsonOutputParser(pydantic_object=Answer)
        self.prompt = PromptTemplate(
            template=NEW_REFINE_SYSTEM_PROMPT_JSON,
            input_variables=["question", "answer", "context"],
            partial_variables={"format_instructions": self.parser.get_format_instructions()},
        )

        # Load and process the PDF.
        self.pdf_loader = PyMuPDFLoader(pdf_path)
        self.pdf_docs = self.pdf_loader.load()

        # Split the document into overlapping chunks for retrieval.
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
        self.splits = self.text_splitter.split_documents(self.pdf_docs)

        # Create an in-memory vector store from the document splits.
        self.pdf_vectorstore = InMemoryVectorStore.from_documents(
            documents=self.splits, embedding=OpenAIEmbeddings()
        )
        self.pdf_retriever = self.pdf_vectorstore.as_retriever()

        # Processing chain: retrieve context for the question, flatten the
        # retrieved documents into plain text, fill the prompt, call the LLM,
        # and parse the JSON response.
        self.chain = (
            {
                "context": itemgetter("question") | self.pdf_retriever,
                "question": itemgetter("question"),
                "answer": itemgetter("answer"),
            }
            | RunnableLambda(lambda x: {
                "context": "\n".join([doc.page_content for doc in x["context"]]),
                "question": x["question"],
                "answer": x["answer"],
            })
            | self.prompt
            | self.llm
            | self.parser
        )

    def invoke(self, question: str, answer: str):
        """Run the refinement chain for a question/initial-answer pair.

        :param question: The user question to paraphrase and refine against.
        :param answer: The initial answer from another source.
        :return: Parsed JSON with 'enhanced_question' and 'enhanced_answer' keys.
        """
        return self.chain.invoke({"question": question, "answer": answer})
chroma_langchain_db/0fadc0a3-4d1d-4f84-8e80-2aaf4d3d5940/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e95387811fdd2c605bdbe40ef66c6427e098a831868dd455a3033a5eaf874a3
3
+ size 62140000
chroma_langchain_db/0fadc0a3-4d1d-4f84-8e80-2aaf4d3d5940/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73a6d751332fa17ddf2061d192fdfd6cdbd8cec7ee44b674a975082dc8b71982
3
+ size 100
chroma_langchain_db/0fadc0a3-4d1d-4f84-8e80-2aaf4d3d5940/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24b1ebdaa3495251e55f78467b1c4675e3100547c42ba96603c02b2ba59a1121
3
+ size 288012
chroma_langchain_db/0fadc0a3-4d1d-4f84-8e80-2aaf4d3d5940/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f2e700c214bf60718eade7cbed207636098e2fe9f13fc04b69c3f9957bd9940
3
+ size 20000
chroma_langchain_db/0fadc0a3-4d1d-4f84-8e80-2aaf4d3d5940/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5619be57918b92ff190f6a1a193c472a026ed614ea305f95d5e4d1523961fe03
3
+ size 43052
chroma_langchain_db/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e399b010878c1e88b70e4dff1babfbe01cff91fcbfea99f77817f3b6e66a9771
3
+ size 126881792
config.py CHANGED
@@ -2,10 +2,7 @@ from langchain.chains.summarize.refine_prompts import REFINE_PROMPT
2
 
3
  XML_SYSTEM_PROMPT= """You're a helpful AI assistant. Given a user question and some scientific literature
4
  documents which highlight research on different roof cover materials (e.g., asphalt shingles, metal, tile)
5
- and their performance against natural hazards(e.g., wind, hail), answer the user
6
- question.
7
- You’re a helpful AI assistant. Given a user question and scientific literature on various roof cover materials (e.g., asphalt shingles, metal, tile)
8
- and their performance against natural hazards (e.g., wind, hail), provide clear, concise, and informed answers without unnecessary fluff.
9
 
10
  When addressing questions about ‘what is the best roof,’ consider the following factors:
11
  • Geography, aesthetic preferences, budget, frequency of weather-related hazards, roof cover performance, and how performance changes with age.
@@ -25,6 +22,29 @@ that justify the answer. Use the following format for your final output:
25
  </cited_answer>
26
  Here are the articles:{context}"""
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  REFINE_SYSTEM_PROMPT = (
29
  "You are an assistant for question-answering tasks. "
30
  "Use the following pieces of retrieved context to answer "
@@ -32,4 +52,35 @@ REFINE_SYSTEM_PROMPT = (
32
  "If you cannot answer the question with the retrieved context, only say that 'Nothing' "
33
  "\n\n"
34
  "{context}"
35
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  XML_SYSTEM_PROMPT= """You're a helpful AI assistant. Given a user question and some scientific literature
4
  documents which highlight research on different roof cover materials (e.g., asphalt shingles, metal, tile)
5
+ and their performance against natural hazards(e.g., wind, hail), provide clear, concise, and informed answers without unnecessary fluff.
 
 
 
6
 
7
  When addressing questions about ‘what is the best roof,’ consider the following factors:
8
  • Geography, aesthetic preferences, budget, frequency of weather-related hazards, roof cover performance, and how performance changes with age.
 
22
  </cited_answer>
23
  Here are the articles:{context}"""
24
 
25
+ # NEW_REFINE_SYSTEM_PROMPT = (
26
+ # "You are an assistant designed to ensure that answers are completely aligned with the authoritative content of a provided PDF. "
27
+ # "Before retrieving any context, first paraphrase the user's question to fully capture its intended meaning and naturally fit it into the context of the PDF. "
28
+ # "This is crucial because users sometimes provide partially paraphrased or incomplete questions. "
29
+ # "The questions generally pertain to scientific literature on roof cover materials (e.g., asphalt shingles, metal, tile) and their performance against natural hazards (e.g., wind, hail). "
30
+ # "Your task is then to take the refined question and an initial answer from another source, and refine that answer using the context retrieved from the PDF. "
31
+ # "If the retrieved context supports the initial answer, adjust it to fully match the PDF content. If the context contradicts or does not support the answer, modify it accordingly. "
32
+ # "Provide clear, concise, and informed answers without unnecessary fluff. If there is no supporting evidence, simply respond with ONLY one word 'Nothing'. "
33
+ # "IMPORTANT: In your final output, return only the refined answer text with no additional labels, headings, or repeated paraphrased questions."
34
+ # )
35
+
36
+ # NEW_REFINE_SYSTEM_PROMPT = (
37
+ # "You are an assistant designed to ensure that answers are completely aligned with the authoritative content of a provided PDF. "
38
+ # "Before retrieving any context, first paraphrase the user's question to fully capture its intended meaning and naturally fit it into the context of the PDF. "
39
+ # "This is crucial because users sometimes provide partially paraphrased or incomplete questions. "
40
+ # "The questions generally pertain to scientific literature on roof cover materials (e.g., asphalt shingles, metal, tile) and their performance against natural hazards (e.g., wind, hail). "
41
+ # "Your task is then to take the refined (paraphrased) question and an initial answer from another source, and refine that answer using the context retrieved from the PDF. "
42
+ # "If the retrieved context supports the initial answer, adjust it to fully match the PDF content. If the context contradicts or does not support the answer, modify it accordingly. "
43
+ # "Provide clear, concise, and informed answers without unnecessary fluff. If there is no supporting evidence, simply respond with 'Nothing'. "
44
+ # "IMPORTANT: Your final output must be a valid JSON object with exactly two keys: 'paraphrased_question' and 'answer'. "
45
+ # "Respond ONLY with the JSON object. Do not include any other text or formatting."
46
+ # )
47
+
48
  REFINE_SYSTEM_PROMPT = (
49
  "You are an assistant for question-answering tasks. "
50
  "Use the following pieces of retrieved context to answer "
 
52
  "If you cannot answer the question with the retrieved context, only say that 'Nothing' "
53
  "\n\n"
54
  "{context}"
55
+ )
56
+
57
+ # NEW_REFINE_SYSTEM_PROMPT_JSON = (
58
+ # "You are an assistant designed to ensure that answers are completely aligned with the authoritative content of a provided PDF. "
59
+ # "Before retrieving any context, first paraphrase the user's question to fully capture its intended meaning and naturally fit it into the context of the PDF. "
60
+ # "This is crucial because users sometimes provide partially paraphrased or incomplete questions. "
61
+ # "The questions generally pertain to scientific literature on roof cover materials (e.g., asphalt shingles, metal, tile) and their performance against natural hazards (e.g., wind, hail). "
62
+ # "Your task is then to take the refined (paraphrased) question and an initial answer from another source, and refine that answer using the context retrieved from the PDF. "
63
+ # "If the retrieved context supports the initial answer, adjust it to fully match the PDF content. If the context contradicts or does not support the answer, modify it accordingly. "
64
+ # "Provide clear, concise, and informed answers without unnecessary fluff. If there is no supporting evidence, simply respond with 'Nothing'. "
65
+ # "Here is the user question: {question} "
66
+ # "Here is the initial answer: {answer} "
67
+ # "Here is the retrieved context: {context} "
68
+ # "IMPORTANT: Your final output must be a valid JSON object with {format_instructions} "
69
+ # )
70
+
71
# Prompt for RefinementPipeline. Fix: the closing instruction previously asked
# for keys "paraphrased_question" and "answer", contradicting the `Answer`
# Pydantic model (enhanced_question / enhanced_answer) enforced by the
# JsonOutputParser format_instructions, the "Nothing" instruction in step 2
# below, and main.py's `enhanced_query.get("enhanced_answer")` lookup. The key
# names now agree everywhere.
NEW_REFINE_SYSTEM_PROMPT_JSON = """You are an assistant that ensures answers are fully aligned with the authoritative content of a provided PDF document. Before retrieving any context, first paraphrase the user's question to capture its complete intended meaning and seamlessly integrate it into the context of the PDF. This is essential because users sometimes submit partially paraphrased or incomplete questions, especially regarding scientific literature on roof cover materials (e.g., asphalt shingles, metal, tile) and their performance against natural hazards (e.g., wind, hail).

Your task is as follows:
1. Paraphrase the provided user question to fully clarify its intent.
2. Using the retrieved context from the PDF, refine the initial answer from another source:
- If the context supports the initial answer, adjust it so that it fully aligns with the PDF content.
- If the context contradicts or does not support the answer, modify the answer accordingly.
- If there is no supporting evidence, respond ONLY with "Nothing" as "enhanced_answer".
3. Provide a clear, concise, and informed answer without unnecessary fluff.

Inputs:
- User question: {question}
- Initial answer: {answer}
- Retrieved context: {context}

IMPORTANT: Your final output must be a valid JSON object with exactly two keys: "enhanced_question" and "enhanced_answer". {format_instructions}"""
helpers.py CHANGED
@@ -32,7 +32,8 @@ def get_article_info(df: pd.DataFrame, file_name: str):
32
  Given a DataFrame and a file name, return the corresponding
33
  title and link from the CSV. Assumes file_name is unique in the DF.
34
  """
35
- row = df[df["file_name"] == file_name]
 
36
  if row.empty:
37
  # Fallback if not found
38
  return "IBHS Website", "https://ibhs.org"
 
32
  Given a DataFrame and a file name, return the corresponding
33
  title and link from the CSV. Assumes file_name is unique in the DF.
34
  """
35
+ edited_file_name = file_name.replace("\\", "/").replace("Articles/", "").replace("Articles\\", "")
36
+ row = df[df["file_name"] == edited_file_name]
37
  if row.empty:
38
  # Fallback if not found
39
  return "IBHS Website", "https://ibhs.org"
main.py CHANGED
@@ -1,11 +1,3 @@
1
- from langchain_openai import OpenAIEmbeddings
2
- from langchain_openai import ChatOpenAI
3
- from langchain_chroma import Chroma
4
- import chromadb
5
- from chromadb.config import Settings
6
- from langchain_core.prompts import ChatPromptTemplate
7
- from langchain_core.runnables import RunnablePassthrough
8
- from langchain_core.output_parsers import XMLOutputParser
9
  import gradio as gr
10
  import pandas as pd
11
  import logging
@@ -13,24 +5,17 @@ from langchain_core.exceptions import OutputParserException
13
  import os
14
  from dotenv import load_dotenv
15
  import azure.cosmos.cosmos_client as cosmos_client
16
- from langchain_community.document_loaders import PyPDFLoader
17
- from langchain_core.vectorstores import InMemoryVectorStore
18
- from langchain_text_splitters import RecursiveCharacterTextSplitter
19
  from langchain.chains import create_retrieval_chain
20
- from langchain.chains.combine_documents import create_stuff_documents_chain
21
- from langchain_core.runnables import RunnableLambda
22
  import datetime
23
  import uuid
24
- from config import XML_SYSTEM_PROMPT, REFINE_SYSTEM_PROMPT
25
- from helpers import *
26
 
 
27
 
28
  load_dotenv()
29
- # Constants
30
- PERSIST_DIRECTORY = "chroma_store"
31
- K_VALUE = 5
32
-
33
- xml_prompt = ChatPromptTemplate.from_messages([("system", XML_SYSTEM_PROMPT), ("human", "{input}")])
34
 
35
  ENV = os.getenv('ENV')
36
  HOST = os.getenv('ACCOUNT_HOST')
@@ -42,58 +27,8 @@ client = cosmos_client.CosmosClient(HOST, {'masterKey': MASTER_KEY}, user_agent=
42
  database = client.get_database_client(DATABASE_ID)
43
  container = database.get_container_client(CONTAINER_ID)
44
  history_container = database.get_container_client(HISTORY_CONTAINER_ID)
45
- llm = ChatOpenAI(model="gpt-4o", temperature=0)
46
- df = pd.read_csv("articles_db.csv")
47
-
48
-
49
- rag_chain_from_docs = (
50
- RunnablePassthrough.assign(context=(lambda x: format_docs_xml(x["context"])))
51
- | xml_prompt
52
- | llm
53
- | XMLOutputParser()
54
- )
55
-
56
- settings = Settings(persist_directory=PERSIST_DIRECTORY)
57
- vectordb = Chroma(embedding_function=OpenAIEmbeddings(), persist_directory=PERSIST_DIRECTORY)
58
- retriever = vectordb.as_retriever(search_type="mmr", search_kwargs={"k": K_VALUE})
59
- retrieve_docs = (lambda x: x["input"]) | retriever
60
- chain = RunnablePassthrough.assign(context=retrieve_docs).assign(
61
- answer=rag_chain_from_docs
62
- )
63
-
64
-
65
- pdf_loader = PyPDFLoader("refine.pdf")
66
- pdf_docs = pdf_loader.load()
67
-
68
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
69
- splits = text_splitter.split_documents(pdf_docs)
70
- pdf_vectorstore = InMemoryVectorStore.from_documents(
71
- documents=splits, embedding=OpenAIEmbeddings()
72
- )
73
-
74
- pdf_retriever = pdf_vectorstore.as_retriever()
75
-
76
- refine_prompt = ChatPromptTemplate.from_messages(
77
- [
78
- ("system", REFINE_SYSTEM_PROMPT),
79
- ("human", "{input}"),
80
- ]
81
- )
82
-
83
- question_answer_chain = create_stuff_documents_chain(llm, refine_prompt)
84
- pdf_rag_chain = create_retrieval_chain(pdf_retriever, question_answer_chain)
85
-
86
- def vectordb_search(query):
87
- titles, links = [], []
88
- question_search = retriever.invoke(query)
89
- for item in question_search:
90
- edited_item = item.metadata["source"].replace("\\", "/").replace("Articles/", "").replace("Articles\\", "")
91
- title, link = get_article_info(df, edited_item)
92
- if title not in titles:
93
- titles.append(title)
94
- links.append(link)
95
- return "\n".join([f"- [{title}]({link})" for title, link in zip(titles, links)])
96
 
 
97
 
98
  def initialize_session(session_id):
99
  # If no session_id exists, generate a new one
@@ -111,47 +46,68 @@ def llm_response(query, session_id):
111
  chat["partitionKey"] = "RoofingRoadmap"
112
  chat["user"] = query
113
  chat["env"] = ENV
 
114
 
115
- pdf_answer = pdf_rag_chain.invoke({"input": f"{query}"})
 
 
116
  try:
117
- if 'f wave' in query.lower() or 'f-wave' in query.lower() or 'fwave' in query.lower():
118
- query = query.replace('f wave', 'f-wave shingle').replace('f-wave', 'f-wave shingle').replace('fwave', 'f-wave shingle')
119
- result = chain.invoke({"input": query})
120
 
121
- if pdf_answer['answer'] != 'Nothing':
122
- answer = pdf_answer['answer']
123
- else:
124
- answer = result['answer']['cited_answer'][0].get("answer", "No answer available.")
125
-
126
- citations = result['answer']['cited_answer'][1].get('citations', [])
127
- for citation in citations:
128
- try:
129
- edited_item = citation['citation'][1]["source"].replace("\\", "/").replace("Articles/", "").replace("Articles\\", "")
130
- title, link = get_article_info(df, edited_item)
131
- if title not in titles:
132
- titles.append(title)
133
- # if link not in links:
134
- links.append(link)
135
- except (TypeError, KeyError, IndexError):
136
- # Handle the error or simply pass if citation does not have the expected keys
137
- continue
138
- question_search = retriever.invoke(query)
139
- for res_item in question_search:
140
- edited_item = res_item.metadata["source"].replace("\\", "/").replace("Articles/", "").replace("Articles\\", "")
141
- res_title, res_link = get_article_info(df, edited_item)
142
- if res_title not in res_titles and res_title not in titles:
143
- res_titles.append(res_title)
144
- res_links.append(res_link)
145
-
146
-
147
- except OutputParserException as e:
148
- if pdf_answer['answer'] == 'Nothing':
149
  answer = "Your search is beyond the scope of this tool at this time. Please explore the rest of [IBHS website](https://ibhs.org) to find research on this topic."
150
  return answer
 
 
151
  else:
152
- answer = pdf_answer['answer']
153
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  finally:
 
 
155
  chat["ai"] = answer
156
  chat["timestamp"] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
157
  container.create_item(body=chat)
@@ -172,8 +128,9 @@ def llm_response(query, session_id):
172
  # Combine answer and citations for final markdown output
173
 
174
 
175
- if not res_links:
176
- return markdown_list
 
177
  else:
178
  markdown_list += f"\n\n\nHere is a list of articles that can provide more information about your inquiry:\n"
179
  markdown_list += "\n".join([f"- [{res_title}]({res_link})" for res_title, res_link in zip(res_titles, res_links)])
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import pandas as pd
3
  import logging
 
5
  import os
6
  from dotenv import load_dotenv
7
  import azure.cosmos.cosmos_client as cosmos_client
 
 
 
8
  from langchain.chains import create_retrieval_chain
 
 
9
  import datetime
10
  import uuid
11
+ from LiteratureAgent import RoofCoverChatbot
12
+ from Refiner import RefinementPipeline
13
 
14
+ from helpers import get_article_info
15
 
16
  load_dotenv()
17
+ refiner = RefinementPipeline()
18
+ literature_agent = RoofCoverChatbot()
 
 
 
19
 
20
  ENV = os.getenv('ENV')
21
  HOST = os.getenv('ACCOUNT_HOST')
 
27
  database = client.get_database_client(DATABASE_ID)
28
  container = database.get_container_client(CONTAINER_ID)
29
  history_container = database.get_container_client(HISTORY_CONTAINER_ID)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
+ df = pd.read_csv("articles_db.csv")
32
 
33
  def initialize_session(session_id):
34
  # If no session_id exists, generate a new one
 
46
  chat["partitionKey"] = "RoofingRoadmap"
47
  chat["user"] = query
48
  chat["env"] = ENV
49
+ answer = None
50
 
51
+ if 'f wave' in query.lower() or 'f-wave' in query.lower() or 'fwave' in query.lower():
52
+ query = query.replace('f wave', 'f-wave shingle').replace('f-wave', 'f-wave shingle').replace('fwave',
53
+ 'f-wave shingle')
54
  try:
 
 
 
55
 
56
+ response = literature_agent.get_response(query)
57
+ enhanced_query = refiner.invoke(question=query, answer=response)
58
+
59
+ try:
60
+ initial_answer = response['answer']['cited_answer'][0].get("answer", "Nothing")
61
+ except Exception as e:
62
+ initial_answer = "Nothing"
63
+
64
+ if enhanced_query.get("enhanced_answer") == "Nothing" and initial_answer == "Nothing":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  answer = "Your search is beyond the scope of this tool at this time. Please explore the rest of [IBHS website](https://ibhs.org) to find research on this topic."
66
  return answer
67
+ if enhanced_query.get("enhanced_answer") != "Nothing":
68
+ answer = enhanced_query['enhanced_answer']
69
  else:
70
+ answer = response
71
+
72
+ citations = response['answer']['cited_answer'][1].get('citations', [])
73
+
74
+ original_citations = []
75
+ if citations:
76
+ for citation in citations:
77
+ try:
78
+ # edited_item = citation['citation'][1]["source"].replace("\\", "/").replace("Articles/", "").replace("Articles\\", "")
79
+ original_citations.append(citation['citation'][1]["source"])
80
+ title, link = get_article_info(df, citation['citation'][1]["source"])
81
+ if title not in titles:
82
+ titles.append(title)
83
+ # if link not in links:
84
+ links.append(link)
85
+ except Exception as e:
86
+ continue
87
+
88
+ try:
89
+ question_search = literature_agent.get_extra_resources(query, original_citations)
90
+ except Exception as e:
91
+ question_search = []
92
+
93
+
94
+ if question_search:
95
+ for res_item in question_search:
96
+
97
+ res_title, res_link = get_article_info(df, res_item.metadata["source"])
98
+ if res_title not in res_titles and res_title not in titles:
99
+ res_titles.append(res_title)
100
+ res_links.append(res_link)
101
+ if len(res_titles) == 5:
102
+ break
103
+
104
+
105
+ except Exception as e:
106
+ answer = "Your search is beyond the scope of this tool at this time. Please explore the rest of [IBHS website](https://ibhs.org) to find research on this topic."
107
+ return answer
108
  finally:
109
+ if answer is None:
110
+ answer = "Your search is beyond the scope of this tool at this time. Please explore the rest of [IBHS website](https://ibhs.org) to find research on this topic."
111
  chat["ai"] = answer
112
  chat["timestamp"] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
113
  container.create_item(body=chat)
 
128
  # Combine answer and citations for final markdown output
129
 
130
 
131
+ if not res_links and not links:
132
+ markdown_list += f"\n\n\nHere is a list of articles that can provide more information about your inquiry:\n"
133
+ markdown_list += "\n".join(["- [IBHS Website](https://ibhs.org)", "- [FORTIFIED Website](https://fortifiedhome.org/roof/)" ])
134
  else:
135
  markdown_list += f"\n\n\nHere is a list of articles that can provide more information about your inquiry:\n"
136
  markdown_list += "\n".join([f"- [{res_title}]({res_link})" for res_title, res_link in zip(res_titles, res_links)])
refine.pdf CHANGED
Binary files a/refine.pdf and b/refine.pdf differ
 
requirements.txt CHANGED
@@ -22,4 +22,5 @@ docx2txt
22
  azure-mgmt-storage
23
  azure-identity
24
  azure-storage-blob
25
- azure-cosmos
 
 
22
  azure-mgmt-storage
23
  azure-identity
24
  azure-storage-blob
25
+ azure-cosmos
26
+ PyMuPDF  # provides the 'fitz' import used by PyMuPDFLoader; the PyPI package named 'fitz' is an unrelated project