Spaces:

Mjlehtim
/

ESG_analysis_tool

Sleeping

App Files Files Community

Mjlehtim committed on Sep 30, 2024

Commit

e2e4e28

verified ·

1 Parent(s): fc9f5e2

persist_directory (2) added

Browse files

Files changed (1) hide show

app.py +38 -42

app.py CHANGED Viewed

@@ -245,15 +245,18 @@ def create_vector_database_ESG():
     #len(docs)
     print(f"length of documents loaded: {len(documents)}")
     print(f"total number of document chunks generated :{len(docs)}")
     embed_model = HuggingFaceEmbeddings()
     vs = Chroma.from_documents(
         documents=docs,
         embedding=embed_model,
-        collection_name="rag",
     )
     doc_retriever_ESG = vs.as_retriever()
     index = VectorStoreIndex.from_documents(llama_parse_documents)
     query_engine = index.as_query_engine()
@@ -274,19 +277,25 @@ def create_vector_database_financials():
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=15)
     docs = text_splitter.split_documents(documents)
     embed_model = HuggingFaceEmbeddings()
     vs = Chroma.from_documents(
         documents=docs,
         embedding=embed_model,
-        collection_name="rag"
     )
     doc_retriever_financials = vs.as_retriever()
     index = VectorStoreIndex.from_documents(llama_parse_documents)
     query_engine_financials = index.as_query_engine()
-    print('Vector DB created successfully !')
     return doc_retriever_financials, query_engine_financials
 #--------------
@@ -328,6 +337,7 @@ for uploaded_file in uploaded_files_financials:
 #---------------
 def ESG_strategy():
     doc_retriever_ESG, _ = create_vector_database_ESG()
     prompt_template = """<|system|>
     You are a seasoned specialist in environmental, social and governance matters. You write expert analyses for institutional investors. Always use figures, nemerical and statistical data when possible. Output must have sub-headings in bold font and be fluent.<|end|>
     <|user|>
@@ -505,15 +515,8 @@ with strategies_container:
     with mrow1_col2:
         if "ESG_analysis_button_key" in st.session_state.results and st.session_state.results["ESG_analysis_button_key"]:
-            doc_retriever_ESG, query_engine = create_vector_database_ESG()
-            # Define the file path
-            file_path = os.path.join("data", "parsed_data_financials.pkl")
-            # Check if the file exists before running the function
-            if os.path.exists(file_path):
-                doc_retriever_financials, query_engine_financials = create_vector_database_financials()
-            else:
-                print(f"The file {file_path} does not exist. Skipping vector database creation.")
             memory = ConversationBufferMemory(memory_key="chat_history", k=3, return_messages=True)
             search = SerpAPIWrapper()
@@ -548,19 +551,17 @@ with strategies_container:
                 """
             )
-            # LCEL Chains with memory integration
-            if os.path.exists(file_path):
-                financials_chain = (
-                    {
-                        "context": doc_retriever_financials,
-                        # Lambda function now accepts one argument (even if unused)
-                        "chat_history": lambda _: format_chat_history(memory.load_memory_variables({})["chat_history"]),
-                        "question": RunnablePassthrough(),
-                    }
-                    | prompt_financials
-                    | llm_tool
-                    | StrOutputParser()
-                )
             ESG_chain = (
                 {
@@ -581,12 +582,11 @@ with strategies_container:
             description="Useful for answering questions about specific ESG figures, data and statistics.",
             )
-            if os.path.exists(file_path):
-                vector_query_tool_financials = Tool(
-                name="Vector Query Engine Financials",
-                func=lambda query: query_engine_financials.query(query),  # Use query_engine to query the vector database
-                description="Useful for answering questions about specific financial figures, data and statistics.",
-                )
             tools = [
                 Tool(
@@ -594,23 +594,19 @@ with strategies_container:
                     func=ESG_chain.invoke,
                     description="Useful for answering general questions about environmental, social, and governance (ESG) matters related to the company. ",
                 ),
                 Tool(
                     name="Search Tool",
                     func=search.run,
                     description="Useful when other tools do not provide the answer.",
                 ),
                 vector_query_tool_ESG,
-            ]
-            if os.path.exists(file_path):
-                tools.append(
-                    Tool(
-                    name="Financials QA System",
-                    func=financials_chain.invoke,
-                    description="Useful for answering general questions about financial or operational information concerning the company.",
-                ),
                 vector_query_tool_financials,
-                )
             # Initialize the agent with LCEL tools and memory
             agent = initialize_agent(

     #len(docs)
     print(f"length of documents loaded: {len(documents)}")
     print(f"total number of document chunks generated :{len(docs)}")
+    persist_directory = "./chroma_db_ESG"  # Specify directory for Chroma persistence
     embed_model = HuggingFaceEmbeddings()
     vs = Chroma.from_documents(
         documents=docs,
         embedding=embed_model,
+        collection_name="rag_ESG",
+        persist_directory=persist_directory  # Ensure persistence
     )
     doc_retriever_ESG = vs.as_retriever()
     index = VectorStoreIndex.from_documents(llama_parse_documents)
     query_engine = index.as_query_engine()
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=15)
     docs = text_splitter.split_documents(documents)
+    # Add a persist directory for Chroma DB
+    persist_directory = "./chroma_db_financials"  # Specify directory for persistence
     embed_model = HuggingFaceEmbeddings()
+    # Initialize Chroma with persistence
     vs = Chroma.from_documents(
         documents=docs,
         embedding=embed_model,
+        collection_name="rag_financials",  # Use a unique collection name
+        persist_directory=persist_directory  # Persist the data
     )
     doc_retriever_financials = vs.as_retriever()
+    # Build a VectorStore index for querying
     index = VectorStoreIndex.from_documents(llama_parse_documents)
     query_engine_financials = index.as_query_engine()
+    print('Vector DB for financials created successfully!')
     return doc_retriever_financials, query_engine_financials
 #--------------
 #---------------
 def ESG_strategy():
     doc_retriever_ESG, _ = create_vector_database_ESG()
     prompt_template = """<|system|>
     You are a seasoned specialist in environmental, social and governance matters. You write expert analyses for institutional investors. Always use figures, nemerical and statistical data when possible. Output must have sub-headings in bold font and be fluent.<|end|>
     <|user|>
     with mrow1_col2:
         if "ESG_analysis_button_key" in st.session_state.results and st.session_state.results["ESG_analysis_button_key"]:
+            doc_retriever_ESG, query_engine = create_vector_database_ESG()
+            doc_retriever_financials, query_engine_financials = create_vector_database_financials()
             memory = ConversationBufferMemory(memory_key="chat_history", k=3, return_messages=True)
             search = SerpAPIWrapper()
                 """
             )
+            financials_chain = (
+                {
+                    "context": doc_retriever_financials,
+                    # Lambda function now accepts one argument (even if unused)
+                    "chat_history": lambda _: format_chat_history(memory.load_memory_variables({})["chat_history"]),
+                    "question": RunnablePassthrough(),
+                }
+                | prompt_financials
+                | llm_tool
+                | StrOutputParser()
+            )
             ESG_chain = (
                 {
             description="Useful for answering questions about specific ESG figures, data and statistics.",
             )
+            vector_query_tool_financials = Tool(
+            name="Vector Query Engine Financials",
+            func=lambda query: query_engine_financials.query(query),  # Use query_engine to query the vector database
+            description="Useful for answering questions about specific financial figures, data and statistics.",
+            )
             tools = [
                 Tool(
                     func=ESG_chain.invoke,
                     description="Useful for answering general questions about environmental, social, and governance (ESG) matters related to the company. ",
                 ),
+                Tool(
+                    name="Financials QA System",
+                    func=financials_chain.invoke,
+                    description="Useful for answering general questions about financial or operational information concerning the company.",
+                ),
                 Tool(
                     name="Search Tool",
                     func=search.run,
                     description="Useful when other tools do not provide the answer.",
                 ),
                 vector_query_tool_ESG,
                 vector_query_tool_financials,
+            ]
             # Initialize the agent with LCEL tools and memory
             agent = initialize_agent(