IBHS committed on
Commit
04aa1c8
·
verified ·
1 Parent(s): 475ef85

Upload 3 files

Browse files
Files changed (3) hide show
  1. config.py +35 -0
  2. helpers.py +39 -0
  3. main.py +31 -103
config.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# config.py — prompt templates for the IBHS roofing-literature RAG assistant.
#
# NOTE(review): the original file imported REFINE_PROMPT from
# langchain.chains.summarize.refine_prompts but never used it anywhere in
# this 35-line module; the dead import has been removed.
#
# NOTE(review): the original XML_SYSTEM_PROMPT opened with TWO near-identical
# "You're a helpful AI assistant..." paragraphs (an accidental duplication);
# the more polished second variant is kept, the first removed.

# System prompt for the XML-citation chain. The model must answer strictly
# from the supplied articles and emit a <cited_answer> XML envelope with
# VERBATIM quotes as citations. `{context}` is substituted with the
# XML-formatted articles (see helpers.format_docs_xml).
XML_SYSTEM_PROMPT = """You’re a helpful AI assistant. Given a user question and scientific literature on various roof cover materials (e.g., asphalt shingles, metal, tile)
and their performance against natural hazards (e.g., wind, hail), provide clear, concise, and informed answers without unnecessary fluff.

When addressing questions about ‘what is the best roof,’ consider the following factors:
• Geography, aesthetic preferences, budget, frequency of weather-related hazards, roof cover performance, and how performance changes with age.
• For the insurance industry, the ‘best roof’ depends on the specific hazards (their location and frequency), performance expectations and predictability, and the cost of materials.

If none of the articles answer the question, simply say that there are no articles relevant to your inquiry.
Remember, you must return both an answer and citations. A citation consists of a VERBATIM quote that
justifies the answer and the ID and also Source Name of the quote article. Return a citation for every quote across all articles
that justify the answer. Use the following format for your final output:
<cited_answer>
<answer></answer>
<citations>
<citation><source_id></source_id><source></source><quote></quote></citation>
<citation><source_id></source_id><source></source><quote></quote></citation>
...
</citations>
</cited_answer>
Here are the articles:{context}"""

# System prompt for the PDF refine chain. The literal sentinel 'Nothing' is
# significant: main.py branches on pdf_answer['answer'] == 'Nothing' to decide
# whether to fall back to the article chain — do not reword it.
REFINE_SYSTEM_PROMPT = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. provide clear, concise, and informed answers without unnecessary fluff. "
    "If you cannot answer the question with the retrieved context, only say that 'Nothing' "
    "\n\n"
    "{context}"
)
helpers.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.documents import Document
2
+ from typing import List
3
+ import pandas as pd
4
+
5
def format_docs_xml(docs: "List[Document]") -> str:
    """Render retrieved documents as the XML <sources> block the XML prompt expects.

    Each document becomes a <source id="i"> element carrying its metadata
    ``source`` (file name) and its page content as an <article_snippet>.

    Fix vs. original: ``doc.metadata['source']`` raised KeyError for documents
    lacking a ``source`` key, and a ``None`` page_content leaked the string
    "None" into the prompt. Both are now handled defensively, matching the
    commented-out variant that shipped alongside the original.
    """
    formatted_docs = [
        f"<source id=\"{i}\">\n"
        f"<source>{doc.metadata.get('source', 'Unknown')}</source>\n"
        f"<article_snippet>{doc.page_content or ''}</article_snippet>\n"
        f"</source>"
        for i, doc in enumerate(docs)
    ]
    # chr(10) is "\n"; joining keeps one source element per line group.
    return f"\n\n<sources>\n{chr(10).join(formatted_docs)}\n</sources>"
11
+
12
+ # def format_docs_xml(docs: List[Document]) -> str:
13
+ # """
14
+ # Takes a list of Document objects and formats each into XML.
15
+ # """
16
+ # formatted_docs = []
17
+ # for i, doc in enumerate(docs):
18
+ # metadata_source = doc.metadata.get("source", "Unknown")
19
+ # snippet = doc.page_content or ""
20
+ # formatted = (
21
+ # f'<source id="{i}">\n'
22
+ # f'<source>{metadata_source}</source>\n'
23
+ # f'<article_snippet>{snippet}</article_snippet>\n'
24
+ # f'</source>'
25
+ # )
26
+ # formatted_docs.append(formatted)
27
+ # return f"\n\n<sources>\n{chr(10).join(formatted_docs)}\n</sources>"
28
+
29
+
30
def get_article_info(df: pd.DataFrame, file_name: str):
    """Look up the article title and link for *file_name* in the articles table.

    Expects *df* to carry ``file_name``, ``title`` and ``link`` columns with
    unique file names. Returns ``(title, link)``; when no row matches, falls
    back to the generic IBHS website entry instead of raising.
    """
    matches = df.loc[df["file_name"] == file_name]
    if len(matches) == 0:
        # Unknown file: point at the IBHS homepage rather than failing.
        return "IBHS Website", "https://ibhs.org"
    first = matches.iloc[0]
    return first["title"], first["link"]
main.py CHANGED
@@ -4,9 +4,6 @@ from langchain_chroma import Chroma
4
  import chromadb
5
  from chromadb.config import Settings
6
  from langchain_core.prompts import ChatPromptTemplate
7
- from typing import List
8
-
9
- from langchain_core.documents import Document
10
  from langchain_core.runnables import RunnablePassthrough
11
  from langchain_core.output_parsers import XMLOutputParser
12
  import gradio as gr
@@ -15,13 +12,7 @@ import logging
15
  from langchain_core.exceptions import OutputParserException
16
  import os
17
  from dotenv import load_dotenv
18
-
19
- from sympy.codegen.ast import continue_
20
-
21
  import azure.cosmos.cosmos_client as cosmos_client
22
- import azure.cosmos.exceptions as exceptions
23
- from azure.cosmos.partition_key import PartitionKey
24
-
25
  from langchain_community.document_loaders import PyPDFLoader
26
  from langchain_core.vectorstores import InMemoryVectorStore
27
  from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -30,38 +21,16 @@ from langchain.chains.combine_documents import create_stuff_documents_chain
30
  from langchain_core.runnables import RunnableLambda
31
  import datetime
32
  import uuid
33
- import hashlib
 
 
 
34
  load_dotenv()
35
  # Constants
36
  PERSIST_DIRECTORY = "chroma_store"
37
  K_VALUE = 5
38
 
39
- xml_system = """You're a helpful AI assistant. Given a user question and some scientific literature
40
- documents which highlight research on different roof cover materials (e.g., asphalt shingles, metal, tile)
41
- and their performance against natural hazards(e.g., wind, hail), answer the user
42
- question.
43
- You’re a helpful AI assistant. Given a user question and scientific literature on various roof cover materials (e.g., asphalt shingles, metal, tile)
44
- and their performance against natural hazards (e.g., wind, hail), provide clear, concise, and informed answers without unnecessary fluff.
45
-
46
- When addressing questions about ‘what is the best roof,’ consider the following factors:
47
- • Geography, aesthetic preferences, budget, frequency of weather-related hazards, roof cover performance, and how performance changes with age.
48
- • For the insurance industry, the ‘best roof’ depends on the specific hazards (their location and frequency), performance expectations and predictability, and the cost of materials.
49
-
50
- If none of the articles answer the question, simply say that there are no articles relevant to your inquiry.
51
- Remember, you must return both an answer and citations. A citation consists of a VERBATIM quote that
52
- justifies the answer and the ID and also Source Name of the quote article. Return a citation for every quote across all articles
53
- that justify the answer. Use the following format for your final output:
54
- <cited_answer>
55
- <answer></answer>
56
- <citations>
57
- <citation><source_id></source_id><source></source><quote></quote></citation>
58
- <citation><source_id></source_id><source></source><quote></quote></citation>
59
- ...
60
- </citations>
61
- </cited_answer>
62
- Here are the articles:{context}"""
63
-
64
- xml_prompt = ChatPromptTemplate.from_messages([("system", xml_system), ("human", "{input}")])
65
 
66
  ENV = os.getenv('ENV')
67
  HOST = os.getenv('ACCOUNT_HOST')
@@ -73,24 +42,8 @@ client = cosmos_client.CosmosClient(HOST, {'masterKey': MASTER_KEY}, user_agent=
73
  database = client.get_database_client(DATABASE_ID)
74
  container = database.get_container_client(CONTAINER_ID)
75
  history_container = database.get_container_client(HISTORY_CONTAINER_ID)
76
-
77
-
78
- def format_docs_xml(docs: List[Document]) -> str:
79
- formatted_docs = [
80
- f"<source id=\"{i}\">\n<source>{doc.metadata['source']}</source>\n<article_snippet>{doc.page_content}</article_snippet>\n</source>"
81
- for i, doc in enumerate(docs)
82
- ]
83
- return f"\n\n<sources>\n{chr(10).join(formatted_docs)}\n</sources>"
84
-
85
- def parse_output_with_error_handling(output):
86
- try:
87
- return XMLOutputParser()
88
- except Exception:
89
- # return {'answer':{'cited_answer': [{'answer': ''},{'citations': []}]}}
90
- return XMLOutputParser().parse("")
91
-
92
-
93
  llm = ChatOpenAI(model="gpt-4o", temperature=0)
 
94
 
95
 
96
  rag_chain_from_docs = (
@@ -120,18 +73,9 @@ pdf_vectorstore = InMemoryVectorStore.from_documents(
120
 
121
  pdf_retriever = pdf_vectorstore.as_retriever()
122
 
123
- refine_system_prompt = (
124
- "You are an assistant for question-answering tasks. "
125
- "Use the following pieces of retrieved context to answer "
126
- "the question. provide clear, concise, and informed answers without unnecessary fluff. "
127
- "If you cannot answer the question with the retrieved context, only say that 'Nothing' "
128
- "\n\n"
129
- "{context}"
130
- )
131
-
132
  refine_prompt = ChatPromptTemplate.from_messages(
133
  [
134
- ("system", refine_system_prompt),
135
  ("human", "{input}"),
136
  ]
137
  )
@@ -139,24 +83,14 @@ refine_prompt = ChatPromptTemplate.from_messages(
139
  question_answer_chain = create_stuff_documents_chain(llm, refine_prompt)
140
  pdf_rag_chain = create_retrieval_chain(pdf_retriever, question_answer_chain)
141
 
142
- def get_article_info(df, file_name):
143
- title = df[df["file_name"] == file_name]["title"].iloc[0]
144
- link = df[df["file_name"] == file_name]["link"].iloc[0]
145
- return title, link
146
-
147
-
148
- df = pd.read_csv("articles_db.csv")
149
-
150
-
151
  def vectordb_search(query):
152
  titles, links = [], []
153
  question_search = retriever.invoke(query)
154
  for item in question_search:
155
- edited_item = item.metadata["source"].replace("Articles/", "")
156
  title, link = get_article_info(df, edited_item)
157
  if title not in titles:
158
  titles.append(title)
159
- # if link not in links:
160
  links.append(link)
161
  return "\n".join([f"- [{title}]({link})" for title, link in zip(titles, links)])
162
 
@@ -169,7 +103,6 @@ def initialize_session(session_id):
169
 
170
 
171
  def llm_response(query, session_id):
172
- print("session_id", session_id)
173
  chat = {}
174
  titles, links, res_titles, res_links = [], [], [], []
175
  session_id = initialize_session(session_id)
@@ -184,34 +117,31 @@ def llm_response(query, session_id):
184
  if 'f wave' in query.lower() or 'f-wave' in query.lower() or 'fwave' in query.lower():
185
  query = query.replace('f wave', 'f-wave shingle').replace('f-wave', 'f-wave shingle').replace('fwave', 'f-wave shingle')
186
  result = chain.invoke({"input": query})
187
- if pdf_answer['answer'] == 'Nothing':
188
- answer = result['answer']['cited_answer'][0]["answer"]
189
- citations = result['answer']['cited_answer'][1]['citations']
190
- for citation in citations:
191
- try:
192
- edited_item = citation['citation'][1]["source"].replace("Articles/", "")
193
- title, link = get_article_info(df, edited_item)
194
- if title not in titles:
195
- titles.append(title)
196
- # if link not in links:
197
- links.append(link)
198
- except (TypeError, KeyError, IndexError):
199
- # Handle the error or simply pass if citation does not have the expected keys
200
- continue
201
- else:
202
- answer = pdf_answer['answer']
203
 
204
- if not result['answer']['cited_answer'][1]['citations']:
205
- answer_with_citations = f"{answer}"
206
  else:
207
- question_search = retriever.invoke(query)
208
- for res_item in question_search:
209
- edited_item = res_item.metadata["source"].replace("Articles/", "")
210
- res_title, res_link = get_article_info(df, edited_item)
211
- if res_title not in res_titles and res_title not in titles:
212
- res_titles.append(res_title)
213
- # if res_link not in res_links and res_link not in links:
214
- res_links.append(res_link)
 
 
 
 
 
 
 
 
 
 
 
 
 
215
 
216
 
217
  except OutputParserException as e:
@@ -252,7 +182,6 @@ def llm_response(query, session_id):
252
 
253
  def vote(value, data: gr.LikeData, session_id: str = None):
254
  session_id = initialize_session(session_id)
255
- print("session_id", session_id)
256
  chat_vote = {}
257
  chat_vote["id"] = str(uuid.uuid4())
258
  chat_vote["chat_id"] = session_id
@@ -278,7 +207,6 @@ def show_feedback_column(visible):
278
 
279
  def user_feedback(value, session_id):
280
  session_id = initialize_session(session_id)
281
- print("session_id", session_id)
282
  chat_feedback = {}
283
  chat_feedback["id"] = str(uuid.uuid4())
284
  chat_feedback["chat_id"] = session_id
 
4
  import chromadb
5
  from chromadb.config import Settings
6
  from langchain_core.prompts import ChatPromptTemplate
 
 
 
7
  from langchain_core.runnables import RunnablePassthrough
8
  from langchain_core.output_parsers import XMLOutputParser
9
  import gradio as gr
 
12
  from langchain_core.exceptions import OutputParserException
13
  import os
14
  from dotenv import load_dotenv
 
 
 
15
  import azure.cosmos.cosmos_client as cosmos_client
 
 
 
16
  from langchain_community.document_loaders import PyPDFLoader
17
  from langchain_core.vectorstores import InMemoryVectorStore
18
  from langchain_text_splitters import RecursiveCharacterTextSplitter
 
21
  from langchain_core.runnables import RunnableLambda
22
  import datetime
23
  import uuid
24
+ from config import XML_SYSTEM_PROMPT, REFINE_SYSTEM_PROMPT
25
+ from helpers import *
26
+
27
+
28
  load_dotenv()
29
  # Constants
30
  PERSIST_DIRECTORY = "chroma_store"
31
  K_VALUE = 5
32
 
33
+ xml_prompt = ChatPromptTemplate.from_messages([("system", XML_SYSTEM_PROMPT), ("human", "{input}")])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  ENV = os.getenv('ENV')
36
  HOST = os.getenv('ACCOUNT_HOST')
 
42
  database = client.get_database_client(DATABASE_ID)
43
  container = database.get_container_client(CONTAINER_ID)
44
  history_container = database.get_container_client(HISTORY_CONTAINER_ID)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  llm = ChatOpenAI(model="gpt-4o", temperature=0)
46
+ df = pd.read_csv("articles_db.csv")
47
 
48
 
49
  rag_chain_from_docs = (
 
73
 
74
  pdf_retriever = pdf_vectorstore.as_retriever()
75
 
 
 
 
 
 
 
 
 
 
76
  refine_prompt = ChatPromptTemplate.from_messages(
77
  [
78
+ ("system", REFINE_SYSTEM_PROMPT),
79
  ("human", "{input}"),
80
  ]
81
  )
 
83
  question_answer_chain = create_stuff_documents_chain(llm, refine_prompt)
84
  pdf_rag_chain = create_retrieval_chain(pdf_retriever, question_answer_chain)
85
 
 
 
 
 
 
 
 
 
 
86
def vectordb_search(query):
    """Return a markdown bullet list of "[title](link)" entries for the
    articles retrieved from the vector store for *query*, de-duplicated
    by title.

    Relies on module globals: `retriever` (Chroma retriever) and `df`
    (articles_db.csv as a DataFrame, consumed by get_article_info).
    """
    titles, links = [], []
    question_search = retriever.invoke(query)
    for item in question_search:
        # Normalise Windows separators, then strip the "Articles/" prefix so
        # the bare file name matches the df["file_name"] column.
        edited_item = item.metadata["source"].replace("\\", "/").replace("Articles/", "").replace("Articles\\", "")
        title, link = get_article_info(df, edited_item)
        if title not in titles:
            titles.append(title)
            # NOTE(review): links is appended in lockstep with titles so the
            # zip below pairs each title with its own link — confirm against
            # the committed indentation (the diff rendering is ambiguous; an
            # unconditional append would desynchronise the two lists).
            links.append(link)
    return "\n".join([f"- [{title}]({link})" for title, link in zip(titles, links)])
96
 
 
103
 
104
 
105
  def llm_response(query, session_id):
 
106
  chat = {}
107
  titles, links, res_titles, res_links = [], [], [], []
108
  session_id = initialize_session(session_id)
 
117
  if 'f wave' in query.lower() or 'f-wave' in query.lower() or 'fwave' in query.lower():
118
  query = query.replace('f wave', 'f-wave shingle').replace('f-wave', 'f-wave shingle').replace('fwave', 'f-wave shingle')
119
  result = chain.invoke({"input": query})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
+ if pdf_answer['answer'] != 'Nothing':
122
+ answer = pdf_answer['answer']
123
  else:
124
+ answer = result['answer']['cited_answer'][0].get("answer", "No answer available.")
125
+
126
+ citations = result['answer']['cited_answer'][1].get('citations', [])
127
+ for citation in citations:
128
+ try:
129
+ edited_item = citation['citation'][1]["source"].replace("\\", "/").replace("Articles/", "").replace("Articles\\", "")
130
+ title, link = get_article_info(df, edited_item)
131
+ if title not in titles:
132
+ titles.append(title)
133
+ # if link not in links:
134
+ links.append(link)
135
+ except (TypeError, KeyError, IndexError):
136
+ # Handle the error or simply pass if citation does not have the expected keys
137
+ continue
138
+ question_search = retriever.invoke(query)
139
+ for res_item in question_search:
140
+ edited_item = res_item.metadata["source"].replace("\\", "/").replace("Articles/", "").replace("Articles\\", "")
141
+ res_title, res_link = get_article_info(df, edited_item)
142
+ if res_title not in res_titles and res_title not in titles:
143
+ res_titles.append(res_title)
144
+ res_links.append(res_link)
145
 
146
 
147
  except OutputParserException as e:
 
182
 
183
  def vote(value, data: gr.LikeData, session_id: str = None):
184
  session_id = initialize_session(session_id)
 
185
  chat_vote = {}
186
  chat_vote["id"] = str(uuid.uuid4())
187
  chat_vote["chat_id"] = session_id
 
207
 
208
  def user_feedback(value, session_id):
209
  session_id = initialize_session(session_id)
 
210
  chat_feedback = {}
211
  chat_feedback["id"] = str(uuid.uuid4())
212
  chat_feedback["chat_id"] = session_id