Spaces:
Sleeping
Sleeping
Update web_scrape_and_pdf_loader.py
Browse files
web_scrape_and_pdf_loader.py
CHANGED
|
@@ -114,15 +114,13 @@ def pdf_loader(url, country):
|
|
| 114 |
# Same as above but for pdf in local directory
|
| 115 |
def pdf_loader_local(pdf_filename, country):
|
| 116 |
try:
|
| 117 |
-
with open(pdf_filename, 'wb') as f: # save the pdf locally first
|
| 118 |
-
f.write(response.content)
|
| 119 |
loader = PyPDFLoader(pdf_filename) # then use langchain loader to load it
|
| 120 |
raw_pdf_documents = loader.load()
|
| 121 |
raw_pdf_documents = add_country_metadata(raw_pdf_documents, country)
|
| 122 |
return raw_pdf_documents
|
| 123 |
|
| 124 |
except Exception as e:
|
| 125 |
-
print(f"Failed to load for {
|
| 126 |
return False
|
| 127 |
|
| 128 |
# If link is just a HTML page, use Langchain WebBaseLoader to convert it to raw documents.
|
|
@@ -182,7 +180,7 @@ def process_links_load_documents(all_links):
|
|
| 182 |
# Note: If we are using a lot more data than can be stored in the RAM or when in production,
|
| 183 |
# better to initialize a separate vector store in a server (Postgres or online solutions like Pinecone) before pushing the document chunks to it bit by bit.
|
| 184 |
|
| 185 |
-
def setup_chromadb_vectorstore(all_documents, chunk_size, chunk_overlap, country):
|
| 186 |
chromadb_dir = "chromadb"
|
| 187 |
if not os.path.exists(chromadb_dir):
|
| 188 |
os.makedirs(chromadb_dir)
|
|
@@ -192,7 +190,7 @@ def setup_chromadb_vectorstore(all_documents, chunk_size, chunk_overlap, country
|
|
| 192 |
chunk_size=chunk_size, chunk_overlap=chunk_overlap
|
| 193 |
)
|
| 194 |
split_documents = text_splitter.split_documents(all_documents)
|
| 195 |
-
persist_directory = f"{chromadb_dir}/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}"
|
| 196 |
|
| 197 |
# Build the vector database using Chroma and persist it in a local directory
|
| 198 |
chroma_db = Chroma.from_documents(split_documents,
|
|
@@ -222,8 +220,9 @@ def setup_bm25_retriever(all_documents, chunk_size, chunk_overlap, country):
|
|
| 222 |
split_documents = text_splitter.split_documents(all_documents)
|
| 223 |
split_documents = [doc for doc in split_documents if doc.metadata['country']==country]
|
| 224 |
bm25_retriever = BM25Retriever.from_documents(split_documents)
|
| 225 |
-
filename = f"{bm25_dir}/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}.pickle"
|
| 226 |
|
| 227 |
with open(filename, 'wb') as handle:
|
| 228 |
pickle.dump(bm25_retriever, handle)
|
| 229 |
|
|
|
|
|
|
| 114 |
# Same as above, but for a PDF in a local directory
|
| 115 |
def pdf_loader_local(pdf_filename, country):
|
| 116 |
try:
|
|
|
|
|
|
|
| 117 |
loader = PyPDFLoader(pdf_filename) # load the local PDF with the LangChain loader
|
| 118 |
raw_pdf_documents = loader.load()
|
| 119 |
raw_pdf_documents = add_country_metadata(raw_pdf_documents, country)
|
| 120 |
return raw_pdf_documents
|
| 121 |
|
| 122 |
except Exception as e:
|
| 123 |
+
print(f"Failed to load for {pdf_filename} {e}")
|
| 124 |
return False
|
| 125 |
|
| 126 |
# If link is just a HTML page, use Langchain WebBaseLoader to convert it to raw documents.
|
|
|
|
| 180 |
# Note: If we are using a lot more data than can be stored in RAM, or when running in production,
|
| 181 |
# it is better to initialize a separate vector store on a server (Postgres, or an online solution like Pinecone) and then push the document chunks to it bit by bit.
|
| 182 |
|
| 183 |
+
def setup_chromadb_vectorstore(hf_embeddings, all_documents, chunk_size, chunk_overlap, country):
|
| 184 |
chromadb_dir = "chromadb"
|
| 185 |
if not os.path.exists(chromadb_dir):
|
| 186 |
os.makedirs(chromadb_dir)
|
|
|
|
| 190 |
chunk_size=chunk_size, chunk_overlap=chunk_overlap
|
| 191 |
)
|
| 192 |
split_documents = text_splitter.split_documents(all_documents)
|
| 193 |
+
persist_directory = f"{chromadb_dir}/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_"
|
| 194 |
|
| 195 |
# Build the vector database using Chroma and persist it in a local directory
|
| 196 |
chroma_db = Chroma.from_documents(split_documents,
|
|
|
|
| 220 |
split_documents = text_splitter.split_documents(all_documents)
|
| 221 |
split_documents = [doc for doc in split_documents if doc.metadata['country']==country]
|
| 222 |
bm25_retriever = BM25Retriever.from_documents(split_documents)
|
| 223 |
+
filename = f"{bm25_dir}/new_{country}_chunk_{chunk_size}_overlap_{chunk_overlap}_.pickle"
|
| 224 |
|
| 225 |
with open(filename, 'wb') as handle:
|
| 226 |
pickle.dump(bm25_retriever, handle)
|
| 227 |
|
| 228 |
+
return True # to let user know this process is done
|