add pdf files
Browse files
utils.py
CHANGED
|
@@ -7,7 +7,7 @@ from langchain import HuggingFaceHub
|
|
| 7 |
from langchain.cache import InMemoryCache
|
| 8 |
from langchain.chains import ConversationalRetrievalChain
|
| 9 |
from langchain.chat_models import ChatOpenAI
|
| 10 |
-
from langchain.document_loaders import DirectoryLoader, TextLoader, UnstructuredHTMLLoader
|
| 11 |
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings
|
| 12 |
from langchain.memory import ConversationBufferWindowMemory
|
| 13 |
from langchain.prompts.chat import (
|
|
@@ -151,6 +151,10 @@ def search_index_from_docs(source_chunks):
|
|
| 151 |
return search_index
|
| 152 |
|
| 153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
def get_html_files():
|
| 155 |
loader = DirectoryLoader('docs', glob="**/*.html", loader_cls=UnstructuredHTMLLoader, recursive=True)
|
| 156 |
document_list = loader.load()
|
|
@@ -160,6 +164,7 @@ def get_html_files():
|
|
| 160 |
def fetch_data_for_embeddings():
|
| 161 |
document_list = get_text_files()
|
| 162 |
document_list.extend(get_html_files())
|
|
|
|
| 163 |
|
| 164 |
# use file_url_mapping to set metadata of document to url which has been set as the source
|
| 165 |
for document in document_list:
|
|
|
|
| 7 |
from langchain.cache import InMemoryCache
|
| 8 |
from langchain.chains import ConversationalRetrievalChain
|
| 9 |
from langchain.chat_models import ChatOpenAI
|
| 10 |
+
from langchain.document_loaders import DirectoryLoader, TextLoader, UnstructuredHTMLLoader, PyPDFDirectoryLoader
|
| 11 |
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings
|
| 12 |
from langchain.memory import ConversationBufferWindowMemory
|
| 13 |
from langchain.prompts.chat import (
|
|
|
|
| 151 |
return search_index
|
| 152 |
|
| 153 |
|
| 154 |
+
def get_pdf_files():
|
| 155 |
+
loader = PyPDFDirectoryLoader('docs', glob="**/*.pdf", recursive=True)
|
| 156 |
+
document_list = loader.load()
|
| 157 |
+
return document_list
|
| 158 |
def get_html_files():
|
| 159 |
loader = DirectoryLoader('docs', glob="**/*.html", loader_cls=UnstructuredHTMLLoader, recursive=True)
|
| 160 |
document_list = loader.load()
|
|
|
|
| 164 |
def fetch_data_for_embeddings():
|
| 165 |
document_list = get_text_files()
|
| 166 |
document_list.extend(get_html_files())
|
| 167 |
+
document_list.extend(get_pdf_files())
|
| 168 |
|
| 169 |
# use file_url_mapping to set metadata of document to url which has been set as the source
|
| 170 |
for document in document_list:
|