Spaces:

bohmian
/

esg_countries_chatbot

Sleeping

App Files Community

bohmian committed on Feb 15, 2024

Commit

5e078a0

verified ·

1 Parent(s): 02f90d5

Update web_scrape_and_pdf_loader.py

Browse files

Files changed (1) hide show

web_scrape_and_pdf_loader.py +5 -6

web_scrape_and_pdf_loader.py CHANGED Viewed

@@ -114,15 +114,13 @@ def pdf_loader(url, country):
 # Same as above but for pdf in local directory
 def pdf_loader_local(pdf_filename, country):
     try:
-        with open(pdf_filename, 'wb') as f: # save the pdf locally first
-            f.write(response.content)
         loader = PyPDFLoader(pdf_filename) # then use langchain loader to load it
         raw_pdf_documents = loader.load()
         raw_pdf_documents = add_country_metadata(raw_pdf_documents, country)
         return raw_pdf_documents
     except Exception as e:
-        print(f"Failed to load for {url}")
         return False
 # If link is just a HTML page, use Langchain WebBaseLoader to convert it to raw documents.
@@ -182,7 +180,7 @@ def process_links_load_documents(all_links):
 # Note: If we are using a lot more data than can be stored in the RAM or when in production,
 # better to initialize a separate vector store in a server (Postgres or online solutions like Pinecone) before pushing the document chunks to it bit by bit.
-def setup_chromadb_vectorstore(all_documents, chunk_size, chunk_overlap, country):
     chromadb_dir = "chromadb"
     if not os.path.exists(chromadb_dir):
         os.makedirs(chromadb_dir)
@@ -192,7 +190,7 @@ def setup_chromadb_vectorstore(all_documents, chunk_size, chunk_overlap, country
         chunk_size=chunk_size, chunk_overlap=chunk_overlap
         )
     split_documents = text_splitter.split_documents(all_documents)
-    persist_directory = f"{chromadb_dir}/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"
     # Build the vector database using Chroma and persist it in a local directory
     chroma_db =  Chroma.from_documents(split_documents,
@@ -222,8 +220,9 @@ def setup_bm25_retriever(all_documents, chunk_size, chunk_overlap, country):
     split_documents = text_splitter.split_documents(all_documents)
     split_documents = [doc for doc in split_documents if doc.metadata['country']==country]
     bm25_retriever = BM25Retriever.from_documents(split_documents)
-    filename = f"{bm25_dir}/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}.pickle"
     with open(filename, 'wb') as handle:
         pickle.dump(bm25_retriever, handle)

 # Same as above but for pdf in local directory
 def pdf_loader_local(pdf_filename, country):
     try:
         loader = PyPDFLoader(pdf_filename) # then use langchain loader to load it
         raw_pdf_documents = loader.load()
         raw_pdf_documents = add_country_metadata(raw_pdf_documents, country)
         return raw_pdf_documents
     except Exception as e:
+        print(f"Failed to load for {pdf_filename} {e}")
         return False
 # If link is just a HTML page, use Langchain WebBaseLoader to convert it to raw documents.
 # Note: If we are using a lot more data than can be stored in the RAM or when in production,
 # better to initialize a separate vector store in a server (Postgres or online solutions like Pinecone) before pushing the document chunks to it bit by bit.
+def setup_chromadb_vectorstore(hf_embeddings, all_documents, chunk_size, chunk_overlap, country):
     chromadb_dir = "chromadb"
     if not os.path.exists(chromadb_dir):
         os.makedirs(chromadb_dir)
         chunk_size=chunk_size, chunk_overlap=chunk_overlap
         )
     split_documents = text_splitter.split_documents(all_documents)
+    persist_directory = f"{chromadb_dir}/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_"
     # Build the vector database using Chroma and persist it in a local directory
     chroma_db =  Chroma.from_documents(split_documents,
     split_documents = text_splitter.split_documents(all_documents)
     split_documents = [doc for doc in split_documents if doc.metadata['country']==country]
     bm25_retriever = BM25Retriever.from_documents(split_documents)
+    filename = f"{bm25_dir}/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_.pickle"
     with open(filename, 'wb') as handle:
         pickle.dump(bm25_retriever, handle)
+    return True # to let user know this process is done