Spaces:

curiocities
/

chat-with-docs

Runtime error

App Files Community

gyroflaw committed on Jun 15, 2023

Commit

cdaa0b4

1 Parent(s): 0f14809

fix langchain deprecation messages and change docs

Browse files

Files changed (5) hide show

.gitignore +7 -1
.vscode/settings.json +6 -0
app.py +22 -18
example_docs/{Lonely Planet Japan.epub → pg70973-images.epub} +2 -2
init.py +52 -17

.gitignore CHANGED Viewed

@@ -4,4 +4,10 @@ __pycache__
 # ENV
 .env*
-!.env.example

 # ENV
 .env*
+!.env.example
+# Chromadb
+db
+# Misc
+temp

.vscode/settings.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "[python]": {
+    "editor.defaultFormatter": "ms-python.black-formatter"
+  },
+  "python.formatting.provider": "none"
+}

app.py CHANGED Viewed

@@ -4,15 +4,11 @@ import chromadb
 import openai
 import langchain
-from os.path import join, dirname
-from dotenv import load_dotenv
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.vectorstores import Chroma
-from langchain.text_splitter import TokenTextSplitter
-from langchain.llms import OpenAI
-from langchain.chains import ChatVectorDBChain
-from langchain.document_loaders import GutenbergLoader
 import gradio as gr
@@ -21,22 +17,30 @@ from init import create_vectorstore
 from config import (
     CHROMA_SETTINGS,
     PERSIST_DIRECTORY,
-    )
-create_vectorstore()
 def query(question):
-  embeddings = OpenAIEmbeddings()
-  db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
-  text_qa = ChatVectorDBChain.from_llm(
-        OpenAI(temperature=0, model_name="gpt-3.5-turbo"),
-        db,
-        return_source_documents=True,
     )
-  result = text_qa({"question": question})
-  return result["answer"]
 demo = gr.Interface(fn=query, inputs="text", outputs="text")
-demo.launch()

 import openai
 import langchain
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.vectorstores import Chroma
+from langchain.chat_models import ChatOpenAI
+from langchain.chains import ConversationalRetrievalChain
+from langchain.memory import ConversationBufferMemory
 import gradio as gr
 from config import (
     CHROMA_SETTINGS,
     PERSIST_DIRECTORY,
+)
 def query(question):
+    embeddings = OpenAIEmbeddings()
+    db = Chroma(
+        persist_directory=PERSIST_DIRECTORY,
+        embedding_function=embeddings,
+        client_settings=CHROMA_SETTINGS,
+    )
+    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+    text_qa = ConversationalRetrievalChain.from_llm(
+        ChatOpenAI(model_name="gpt-3.5-turbo"),
+        db.as_retriever(),
+        memory=memory,
     )
+    result = text_qa({"question": question})
+    return result["answer"]
 demo = gr.Interface(fn=query, inputs="text", outputs="text")
+create_vectorstore()
+demo.launch()

example_docs/{Lonely Planet Japan.epub → pg70973-images.epub} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:90832f22a09d1d1307342d8d81a1c40908797468b9a8fb3b4e6d89553589f9da
-size 58957457

 version https://git-lfs.github.com/spec/v1
+oid sha256:9dcd9db0ba476be13573a6408eebadc188b3607159b4578126fd12662e9641b9
+size 7922273

init.py CHANGED Viewed

@@ -28,12 +28,11 @@ from config import (
     PERSIST_DIRECTORY,
     CHUNK_SIZE,
     CHUNK_OVERLAP,
-    )
 # Map file extensions to document loaders and their arguments
 LOADER_MAPPING = {
     ".csv": (CSVLoader, {}),
-    # ".docx": (Docx2txtLoader, {}),
     ".doc": (UnstructuredWordDocumentLoader, {}),
     ".docx": (UnstructuredWordDocumentLoader, {}),
     ".enex": (EverNoteLoader, {}),
@@ -45,7 +44,6 @@ LOADER_MAPPING = {
     ".ppt": (UnstructuredPowerPointLoader, {}),
     ".pptx": (UnstructuredPowerPointLoader, {}),
     ".txt": (TextLoader, {"encoding": "utf8"}),
-    # Add more mappings for other file extensions and loaders as needed
 }
@@ -59,6 +57,7 @@ def load_single_document(file_path: str) -> List[Document]:
     raise ValueError(f"Unsupported file extension '{ext}'")
 def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
     """
     Loads all documents from the source documents directory, ignoring specified files
@@ -68,17 +67,24 @@ def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Docum
         all_files.extend(
             glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
         )
-    filtered_files = [file_path for file_path in all_files if file_path not in ignored_files]
     with Pool(processes=os.cpu_count()) as pool:
         results = []
-        with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
-            for i, docs in enumerate(pool.imap_unordered(load_single_document, filtered_files)):
                 results.extend(docs)
                 pbar.update()
     return results
 def process_documents(ignored_files: List[str] = []) -> List[Document]:
     """
     Load documents and split in chunks
@@ -87,26 +93,36 @@ def process_documents(ignored_files: List[str] = []) -> List[Document]:
     documents = load_documents(DOCUMENTS_PATH, ignored_files)
     if not documents:
         print("No new documents to load")
-        exit(0)
     print(f"Loaded {len(documents)} new documents from {DOCUMENTS_PATH}")
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
     texts = text_splitter.split_documents(documents)
     print(f"Split into {len(texts)} chunks of text (max. {CHUNK_SIZE} tokens each)")
     return texts
 def does_vectorstore_exist(persist_directory: str) -> bool:
     """
     Checks if vectorstore exists
     """
-    if os.path.exists(os.path.join(persist_directory, 'index')):
-        if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')):
-            list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin'))
-            list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl'))
             # At least 3 documents are needed in a working vectorstore
             if len(list_index_files) > 3:
                 return True
     return False
 def create_vectorstore():
     # Create embeddings
     embeddings = OpenAIEmbeddings()
@@ -114,17 +130,36 @@ def create_vectorstore():
     if does_vectorstore_exist(PERSIST_DIRECTORY):
         # Update and store locally vectorstore
         print(f"Appending to existing vectorstore at {PERSIST_DIRECTORY}")
-        db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
         collection = db.get()
-        texts = process_documents([metadata['source'] for metadata in collection['metadatas']])
-        print(f"Creating vectorstore. May take some minutes...")
         db.add_documents(texts)
     else:
         # Create and store locally vectorstore
         print("Creating new vectorstore")
         texts = process_documents()
-        print(f"Creating vectorstore. May take some minutes...")
-        db = Chroma.from_documents(texts, embeddings, persist_directory=PERSIST_DIRECTORY, client_settings=CHROMA_SETTINGS)
     db.persist()
     db = None

     PERSIST_DIRECTORY,
     CHUNK_SIZE,
     CHUNK_OVERLAP,
+)
 # Map file extensions to document loaders and their arguments
 LOADER_MAPPING = {
     ".csv": (CSVLoader, {}),
     ".doc": (UnstructuredWordDocumentLoader, {}),
     ".docx": (UnstructuredWordDocumentLoader, {}),
     ".enex": (EverNoteLoader, {}),
     ".ppt": (UnstructuredPowerPointLoader, {}),
     ".pptx": (UnstructuredPowerPointLoader, {}),
     ".txt": (TextLoader, {"encoding": "utf8"}),
 }
     raise ValueError(f"Unsupported file extension '{ext}'")
 def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
     """
     Loads all documents from the source documents directory, ignoring specified files
         all_files.extend(
             glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
         )
+    filtered_files = [
+        file_path for file_path in all_files if file_path not in ignored_files
+    ]
     with Pool(processes=os.cpu_count()) as pool:
         results = []
+        with tqdm(
+            total=len(filtered_files), desc="Loading new documents", ncols=80
+        ) as pbar:
+            for i, docs in enumerate(
+                pool.imap_unordered(load_single_document, filtered_files)
+            ):
                 results.extend(docs)
                 pbar.update()
     return results
 def process_documents(ignored_files: List[str] = []) -> List[Document]:
     """
     Load documents and split in chunks
     documents = load_documents(DOCUMENTS_PATH, ignored_files)
     if not documents:
         print("No new documents to load")
+        return []
     print(f"Loaded {len(documents)} new documents from {DOCUMENTS_PATH}")
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
+    )
     texts = text_splitter.split_documents(documents)
     print(f"Split into {len(texts)} chunks of text (max. {CHUNK_SIZE} tokens each)")
     return texts
 def does_vectorstore_exist(persist_directory: str) -> bool:
     """
     Checks if vectorstore exists
     """
+    if os.path.exists(os.path.join(persist_directory, "index")):
+        if os.path.exists(
+            os.path.join(persist_directory, "chroma-collections.parquet")
+        ) and os.path.exists(
+            os.path.join(persist_directory, "chroma-embeddings.parquet")
+        ):
+            list_index_files = glob.glob(os.path.join(persist_directory, "index/*.bin"))
+            list_index_files += glob.glob(
+                os.path.join(persist_directory, "index/*.pkl")
+            )
             # At least 3 documents are needed in a working vectorstore
             if len(list_index_files) > 3:
                 return True
     return False
 def create_vectorstore():
     # Create embeddings
     embeddings = OpenAIEmbeddings()
     if does_vectorstore_exist(PERSIST_DIRECTORY):
         # Update and store locally vectorstore
         print(f"Appending to existing vectorstore at {PERSIST_DIRECTORY}")
+        db = Chroma(
+            persist_directory=PERSIST_DIRECTORY,
+            embedding_function=embeddings,
+            client_settings=CHROMA_SETTINGS,
+        )
         collection = db.get()
+        texts = process_documents(
+            [metadata["source"] for metadata in collection["metadatas"]]
+        )
+        if not texts:
+            return
+        print(f"Creating embeddings. May take some minutes...")
         db.add_documents(texts)
     else:
         # Create and store locally vectorstore
         print("Creating new vectorstore")
         texts = process_documents()
+        if not texts:
+            return
+        print(f"Creating embeddings. May take some minutes...")
+        db = Chroma.from_documents(
+            texts,
+            embeddings,
+            persist_directory=PERSIST_DIRECTORY,
+            client_settings=CHROMA_SETTINGS,
+        )
     db.persist()
     db = None