IBHS committed on
Commit
04aa1c8
·
verified ·
1 Parent(s): 475ef85

Upload 3 files

Browse files
Files changed (3) hide show
  1. config.py +35 -0
  2. helpers.py +39 -0
  3. main.py +31 -103
config.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# config.py — prompt templates for the IBHS roofing-literature RAG assistant.
#
# NOTE(review): the original file imported REFINE_PROMPT from
# langchain.chains.summarize.refine_prompts but never used it anywhere in
# this 35-line module; the dead import has been removed.
#
# NOTE(review): the original XML_SYSTEM_PROMPT opened with TWO near-identical
# "You're a helpful AI assistant..." paragraphs (an accidental duplication);
# the more polished second variant is kept, the first removed.

# System prompt for the XML-citation chain. The model must answer strictly
# from the supplied articles and emit a <cited_answer> XML envelope with
# VERBATIM quotes as citations. `{context}` is substituted with the
# XML-formatted articles (see helpers.format_docs_xml).
XML_SYSTEM_PROMPT = """You’re a helpful AI assistant. Given a user question and scientific literature on various roof cover materials (e.g., asphalt shingles, metal, tile)
and their performance against natural hazards (e.g., wind, hail), provide clear, concise, and informed answers without unnecessary fluff.

When addressing questions about ‘what is the best roof,’ consider the following factors:
• Geography, aesthetic preferences, budget, frequency of weather-related hazards, roof cover performance, and how performance changes with age.
• For the insurance industry, the ‘best roof’ depends on the specific hazards (their location and frequency), performance expectations and predictability, and the cost of materials.

If none of the articles answer the question, simply say that there are no articles relevant to your inquiry.
Remember, you must return both an answer and citations. A citation consists of a VERBATIM quote that
justifies the answer and the ID and also Source Name of the quote article. Return a citation for every quote across all articles
that justify the answer. Use the following format for your final output:
<cited_answer>
<answer></answer>
<citations>
<citation><source_id></source_id><source></source><quote></quote></citation>
<citation><source_id></source_id><source></source><quote></quote></citation>
...
</citations>
</cited_answer>
Here are the articles:{context}"""

# System prompt for the PDF refine chain. The literal sentinel 'Nothing' is
# significant: main.py branches on pdf_answer['answer'] == 'Nothing' to decide
# whether to fall back to the article chain — do not reword it.
REFINE_SYSTEM_PROMPT = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. provide clear, concise, and informed answers without unnecessary fluff. "
    "If you cannot answer the question with the retrieved context, only say that 'Nothing' "
    "\n\n"
    "{context}"
)
helpers.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.documents import Document
2
+ from typing import List
3
+ import pandas as pd
4
+
5
def format_docs_xml(docs: "List[Document]") -> str:
    """Render retrieved documents as the XML <sources> block the XML prompt expects.

    Each document becomes a <source id="i"> element carrying its metadata
    ``source`` (file name) and its page content as an <article_snippet>.

    Fix vs. original: ``doc.metadata['source']`` raised KeyError for documents
    lacking a ``source`` key, and a ``None`` page_content leaked the string
    "None" into the prompt. Both are now handled defensively, matching the
    commented-out variant that shipped alongside the original.
    """
    formatted_docs = [
        f"<source id=\"{i}\">\n"
        f"<source>{doc.metadata.get('source', 'Unknown')}</source>\n"
        f"<article_snippet>{doc.page_content or ''}</article_snippet>\n"
        f"</source>"
        for i, doc in enumerate(docs)
    ]
    # chr(10) is "\n"; joining keeps one source element per line group.
    return f"\n\n<sources>\n{chr(10).join(formatted_docs)}\n</sources>"
11
+
12
+ # def format_docs_xml(docs: List[Document]) -> str:
13
+ # """
14
+ # Takes a list of Document objects and formats each into XML.
15
+ # """
16
+ # formatted_docs = []
17
+ # for i, doc in enumerate(docs):
18
+ # metadata_source = doc.metadata.get("source", "Unknown")
19
+ # snippet = doc.page_content or ""
20
+ # formatted = (
21
+ # f'<source id="{i}">\n'
22
+ # f'<source>{metadata_source}</source>\n'
23
+ # f'<article_snippet>{snippet}</article_snippet>\n'
24
+ # f'</source>'
25
+ # )
26
+ # formatted_docs.append(formatted)
27
+ # return f"\n\n<sources>\n{chr(10).join(formatted_docs)}\n</sources>"
28
+
29
+
30
def get_article_info(df: pd.DataFrame, file_name: str):
    """Look up the article title and link for *file_name* in the articles table.

    Expects *df* to carry ``file_name``, ``title`` and ``link`` columns with
    unique file names. Returns ``(title, link)``; when no row matches, falls
    back to the generic IBHS website entry instead of raising.
    """
    matches = df.loc[df["file_name"] == file_name]
    if len(matches) == 0:
        # Unknown file: point at the IBHS homepage rather than failing.
        return "IBHS Website", "https://ibhs.org"
    first = matches.iloc[0]
    return first["title"], first["link"]
main.py CHANGED
@@ -4,9 +4,6 @@ from langchain_chroma import Chroma
4
  import chromadb
5
  from chromadb.config import Settings
6
  from langchain_core.prompts import ChatPromptTemplate
7
- from typing import List
8
-
9
- from langchain_core.documents import Document
10
  from langchain_core.runnables import RunnablePassthrough
11
  from langchain_core.output_parsers import XMLOutputParser
12
  import gradio as gr
@@ -15,13 +12,7 @@ import logging
15
  from langchain_core.exceptions import OutputParserException
16
  import os
17
  from dotenv import load_dotenv
18
-
19
- from sympy.codegen.ast import continue_
20
-
21
  import azure.cosmos.cosmos_client as cosmos_client
22
- import azure.cosmos.exceptions as exceptions
23
- from azure.cosmos.partition_key import PartitionKey
24
-
25
  from langchain_community.document_loaders import PyPDFLoader
26
  from langchain_core.vectorstores import InMemoryVectorStore
27
  from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -30,38 +21,16 @@ from langchain.chains.combine_documents import create_stuff_documents_chain
30
  from langchain_core.runnables import RunnableLambda
31
  import datetime
32
  import uuid
33
- import hashlib
 
 
 
34
  load_dotenv()
35
  # Constants
36
  PERSIST_DIRECTORY = "chroma_store"
37
  K_VALUE = 5
38
 
39
- xml_system = """You're a helpful AI assistant. Given a user question and some scientific literature
40
- documents which highlight research on different roof cover materials (e.g., asphalt shingles, metal, tile)
41
- and their performance against natural hazards(e.g., wind, hail), answer the user
42
- question.
43
- You’re a helpful AI assistant. Given a user question and scientific literature on various roof cover materials (e.g., asphalt shingles, metal, tile)
44
- and their performance against natural hazards (e.g., wind, hail), provide clear, concise, and informed answers without unnecessary fluff.
45
-
46
- When addressing questions about ‘what is the best roof,’ consider the following factors:
47
- • Geography, aesthetic preferences, budget, frequency of weather-related hazards, roof cover performance, and how performance changes with age.
48
- • For the insurance industry, the ‘best roof’ depends on the specific hazards (their location and frequency), performance expectations and predictability, and the cost of materials.
49
-
50
- If none of the articles answer the question, simply say that there are no articles relevant to your inquiry.
51
- Remember, you must return both an answer and citations. A citation consists of a VERBATIM quote that
52
- justifies the answer and the ID and also Source Name of the quote article. Return a citation for every quote across all articles
53
- that justify the answer. Use the following format for your final output:
54
- <cited_answer>
55
- <answer></answer>
56
- <citations>
57
- <citation><source_id></source_id><source></source><quote></quote></citation>
58
- <citation><source_id></source_id><source></source><quote></quote></citation>
59
- ...
60
- </citations>
61
- </cited_answer>
62
- Here are the articles:{context}"""
63
-
64
- xml_prompt = ChatPromptTemplate.from_messages([("system", xml_system), ("human", "{input}")])
65
 
66
  ENV = os.getenv('ENV')
67
  HOST = os.getenv('ACCOUNT_HOST')
@@ -73,24 +42,8 @@ client = cosmos_client.CosmosClient(HOST, {'masterKey': MASTER_KEY}, user_agent=
73
  database = client.get_database_client(DATABASE_ID)
74
  container = database.get_container_client(CONTAINER_ID)
75
  history_container = database.get_container_client(HISTORY_CONTAINER_ID)
76
-
77
-
78
- def format_docs_xml(docs: List[Document]) -> str:
79
- formatted_docs = [
80
- f"<source id=\"{i}\">\n<source>{doc.metadata['source']}</source>\n<article_snippet>{doc.page_content}</article_snippet>\n</source>"
81
- for i, doc in enumerate(docs)
82
- ]
83
- return f"\n\n<sources>\n{chr(10).join(formatted_docs)}\n</sources>"
84
-
85
- def parse_output_with_error_handling(output):
86
- try:
87
- return XMLOutputParser()
88
- except Exception:
89
- # return {'answer':{'cited_answer': [{'answer': ''},{'citations': []}]}}
90
- return XMLOutputParser().parse("")
91
-
92
-
93
  llm = ChatOpenAI(model="gpt-4o", temperature=0)
 
94
 
95
 
96
  rag_chain_from_docs = (
@@ -120,18 +73,9 @@ pdf_vectorstore = InMemoryVectorStore.from_documents(
120
 
121
  pdf_retriever = pdf_vectorstore.as_retriever()
122
 
123
- refine_system_prompt = (
124
- "You are an assistant for question-answering tasks. "
125
- "Use the following pieces of retrieved context to answer "
126
- "the question. provide clear, concise, and informed answers without unnecessary fluff. "
127
- "If you cannot answer the question with the retrieved context, only say that 'Nothing' "
128
- "\n\n"
129
- "{context}"
130
- )
131
-
132
  refine_prompt = ChatPromptTemplate.from_messages(
133
  [
134
- ("system", refine_system_prompt),
135
  ("human", "{input}"),
136
  ]
137
  )
@@ -139,24 +83,14 @@ refine_prompt = ChatPromptTemplate.from_messages(
139
  question_answer_chain = create_stuff_documents_chain(llm, refine_prompt)
140
  pdf_rag_chain = create_retrieval_chain(pdf_retriever, question_answer_chain)
141
 
142
- def get_article_info(df, file_name):
143
- title = df[df["file_name"] == file_name]["title"].iloc[0]
144
- link = df[df["file_name"] == file_name]["link"].iloc[0]
145
- return title, link
146
-
147
-
148
- df = pd.read_csv("articles_db.csv")
149
-
150
-
151
  def vectordb_search(query):
152
  titles, links = [], []
153
  question_search = retriever.invoke(query)
154
  for item in question_search:
155
- edited_item = item.metadata["source"].replace("Articles/", "")
156
  title, link = get_article_info(df, edited_item)
157
  if title not in titles:
158
  titles.append(title)
159
- # if link not in links:
160
  links.append(link)
161
  return "\n".join([f"- [{title}]({link})" for title, link in zip(titles, links)])
162
 
@@ -169,7 +103,6 @@ def initialize_session(session_id):
169
 
170
 
171
  def llm_response(query, session_id):
172
- print("session_id", session_id)
173
  chat = {}
174
  titles, links, res_titles, res_links = [], [], [], []
175
  session_id = initialize_session(session_id)
@@ -184,34 +117,31 @@ def llm_response(query, session_id):
184
  if 'f wave' in query.lower() or 'f-wave' in query.lower() or 'fwave' in query.lower():
185
  query = query.replace('f wave', 'f-wave shingle').replace('f-wave', 'f-wave shingle').replace('fwave', 'f-wave shingle')
186
  result = chain.invoke({"input": query})
187
- if pdf_answer['answer'] == 'Nothing':
188
- answer = result['answer']['cited_answer'][0]["answer"]
189
- citations = result['answer']['cited_answer'][1]['citations']
190
- for citation in citations:
191
- try:
192
- edited_item = citation['citation'][1]["source"].replace("Articles/", "")
193
- title, link = get_article_info(df, edited_item)
194
- if title not in titles:
195
- titles.append(title)
196
- # if link not in links:
197
- links.append(link)
198
- except (TypeError, KeyError, IndexError):
199
- # Handle the error or simply pass if citation does not have the expected keys
200
- continue
201
- else:
202
- answer = pdf_answer['answer']
203
 
204
- if not result['answer']['cited_answer'][1]['citations']:
205
- answer_with_citations = f"{answer}"
206
  else:
207
- question_search = retriever.invoke(query)
208
- for res_item in question_search:
209
- edited_item = res_item.metadata["source"].replace("Articles/", "")
210
- res_title, res_link = get_article_info(df, edited_item)
211
- if res_title not in res_titles and res_title not in titles:
212
- res_titles.append(res_title)
213
- # if res_link not in res_links and res_link not in links:
214
- res_links.append(res_link)
 
 
 
 
 
 
 
 
 
 
 
 
 
215
 
216
 
217
  except OutputParserException as e:
@@ -252,7 +182,6 @@ def llm_response(query, session_id):
252
 
253
  def vote(value, data: gr.LikeData, session_id: str = None):
254
  session_id = initialize_session(session_id)
255
- print("session_id", session_id)
256
  chat_vote = {}
257
  chat_vote["id"] = str(uuid.uuid4())
258
  chat_vote["chat_id"] = session_id
@@ -278,7 +207,6 @@ def show_feedback_column(visible):
278
 
279
  def user_feedback(value, session_id):
280
  session_id = initialize_session(session_id)
281
- print("session_id", session_id)
282
  chat_feedback = {}
283
  chat_feedback["id"] = str(uuid.uuid4())
284
  chat_feedback["chat_id"] = session_id
 
4
  import chromadb
5
  from chromadb.config import Settings
6
  from langchain_core.prompts import ChatPromptTemplate
 
 
 
7
  from langchain_core.runnables import RunnablePassthrough
8
  from langchain_core.output_parsers import XMLOutputParser
9
  import gradio as gr
 
12
  from langchain_core.exceptions import OutputParserException
13
  import os
14
  from dotenv import load_dotenv
 
 
 
15
  import azure.cosmos.cosmos_client as cosmos_client
 
 
 
16
  from langchain_community.document_loaders import PyPDFLoader
17
  from langchain_core.vectorstores import InMemoryVectorStore
18
  from langchain_text_splitters import RecursiveCharacterTextSplitter
 
21
  from langchain_core.runnables import RunnableLambda
22
  import datetime
23
  import uuid
24
+ from config import XML_SYSTEM_PROMPT, REFINE_SYSTEM_PROMPT
25
+ from helpers import *
26
+
27
+
28
  load_dotenv()
29
  # Constants
30
  PERSIST_DIRECTORY = "chroma_store"
31
  K_VALUE = 5
32
 
33
+ xml_prompt = ChatPromptTemplate.from_messages([("system", XML_SYSTEM_PROMPT), ("human", "{input}")])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  ENV = os.getenv('ENV')
36
  HOST = os.getenv('ACCOUNT_HOST')
 
42
  database = client.get_database_client(DATABASE_ID)
43
  container = database.get_container_client(CONTAINER_ID)
44
  history_container = database.get_container_client(HISTORY_CONTAINER_ID)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  llm = ChatOpenAI(model="gpt-4o", temperature=0)
46
+ df = pd.read_csv("articles_db.csv")
47
 
48
 
49
  rag_chain_from_docs = (
 
73
 
74
  pdf_retriever = pdf_vectorstore.as_retriever()
75
 
 
 
 
 
 
 
 
 
 
76
  refine_prompt = ChatPromptTemplate.from_messages(
77
  [
78
+ ("system", REFINE_SYSTEM_PROMPT),
79
  ("human", "{input}"),
80
  ]
81
  )
 
83
  question_answer_chain = create_stuff_documents_chain(llm, refine_prompt)
84
  pdf_rag_chain = create_retrieval_chain(pdf_retriever, question_answer_chain)
85
 
 
 
 
 
 
 
 
 
 
86
def vectordb_search(query):
    """Return a markdown bullet list of "[title](link)" entries for the
    articles retrieved from the vector store for *query*, de-duplicated
    by title.

    Relies on module globals: `retriever` (Chroma retriever) and `df`
    (articles_db.csv as a DataFrame, consumed by get_article_info).
    """
    titles, links = [], []
    question_search = retriever.invoke(query)
    for item in question_search:
        # Normalise Windows separators, then strip the "Articles/" prefix so
        # the bare file name matches the df["file_name"] column.
        edited_item = item.metadata["source"].replace("\\", "/").replace("Articles/", "").replace("Articles\\", "")
        title, link = get_article_info(df, edited_item)
        if title not in titles:
            titles.append(title)
            # NOTE(review): links is appended in lockstep with titles so the
            # zip below pairs each title with its own link — confirm against
            # the committed indentation (the diff rendering is ambiguous; an
            # unconditional append would desynchronise the two lists).
            links.append(link)
    return "\n".join([f"- [{title}]({link})" for title, link in zip(titles, links)])
96
 
 
103
 
104
 
105
  def llm_response(query, session_id):
 
106
  chat = {}
107
  titles, links, res_titles, res_links = [], [], [], []
108
  session_id = initialize_session(session_id)
 
117
  if 'f wave' in query.lower() or 'f-wave' in query.lower() or 'fwave' in query.lower():
118
  query = query.replace('f wave', 'f-wave shingle').replace('f-wave', 'f-wave shingle').replace('fwave', 'f-wave shingle')
119
  result = chain.invoke({"input": query})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
+ if pdf_answer['answer'] != 'Nothing':
122
+ answer = pdf_answer['answer']
123
  else:
124
+ answer = result['answer']['cited_answer'][0].get("answer", "No answer available.")
125
+
126
+ citations = result['answer']['cited_answer'][1].get('citations', [])
127
+ for citation in citations:
128
+ try:
129
+ edited_item = citation['citation'][1]["source"].replace("\\", "/").replace("Articles/", "").replace("Articles\\", "")
130
+ title, link = get_article_info(df, edited_item)
131
+ if title not in titles:
132
+ titles.append(title)
133
+ # if link not in links:
134
+ links.append(link)
135
+ except (TypeError, KeyError, IndexError):
136
+ # Handle the error or simply pass if citation does not have the expected keys
137
+ continue
138
+ question_search = retriever.invoke(query)
139
+ for res_item in question_search:
140
+ edited_item = res_item.metadata["source"].replace("\\", "/").replace("Articles/", "").replace("Articles\\", "")
141
+ res_title, res_link = get_article_info(df, edited_item)
142
+ if res_title not in res_titles and res_title not in titles:
143
+ res_titles.append(res_title)
144
+ res_links.append(res_link)
145
 
146
 
147
  except OutputParserException as e:
 
182
 
183
  def vote(value, data: gr.LikeData, session_id: str = None):
184
  session_id = initialize_session(session_id)
 
185
  chat_vote = {}
186
  chat_vote["id"] = str(uuid.uuid4())
187
  chat_vote["chat_id"] = session_id
 
207
 
208
  def user_feedback(value, session_id):
209
  session_id = initialize_session(session_id)
 
210
  chat_feedback = {}
211
  chat_feedback["id"] = str(uuid.uuid4())
212
  chat_feedback["chat_id"] = session_id