gyroflaw committed on
Commit
cdaa0b4
·
1 Parent(s): 0f14809

fix langchain deprecation messages and change docs

Browse files
.gitignore CHANGED
@@ -4,4 +4,10 @@ __pycache__
4
 
5
  # ENV
6
  .env*
7
- !.env.example
 
 
 
 
 
 
 
4
 
5
  # ENV
6
  .env*
7
+ !.env.example
8
+
9
+ # Chromadb
10
+ db
11
+
12
+ # Misc
13
+ temp
.vscode/settings.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "[python]": {
3
+ "editor.defaultFormatter": "ms-python.black-formatter"
4
+ },
5
+ "python.formatting.provider": "none"
6
+ }
app.py CHANGED
@@ -4,15 +4,11 @@ import chromadb
4
  import openai
5
  import langchain
6
 
7
- from os.path import join, dirname
8
- from dotenv import load_dotenv
9
-
10
  from langchain.embeddings.openai import OpenAIEmbeddings
11
  from langchain.vectorstores import Chroma
12
- from langchain.text_splitter import TokenTextSplitter
13
- from langchain.llms import OpenAI
14
- from langchain.chains import ChatVectorDBChain
15
- from langchain.document_loaders import GutenbergLoader
16
 
17
  import gradio as gr
18
 
@@ -21,22 +17,30 @@ from init import create_vectorstore
21
  from config import (
22
  CHROMA_SETTINGS,
23
  PERSIST_DIRECTORY,
24
- )
25
 
26
- create_vectorstore()
27
 
28
  def query(question):
29
- embeddings = OpenAIEmbeddings()
30
- db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
31
- text_qa = ChatVectorDBChain.from_llm(
32
- OpenAI(temperature=0, model_name="gpt-3.5-turbo"),
33
- db,
34
- return_source_documents=True,
 
 
 
 
 
35
  )
36
- result = text_qa({"question": question})
 
 
37
 
38
- return result["answer"]
39
 
40
  demo = gr.Interface(fn=query, inputs="text", outputs="text")
41
 
42
- demo.launch()
 
 
 
 
4
  import openai
5
  import langchain
6
 
 
 
 
7
  from langchain.embeddings.openai import OpenAIEmbeddings
8
  from langchain.vectorstores import Chroma
9
+ from langchain.chat_models import ChatOpenAI
10
+ from langchain.chains import ConversationalRetrievalChain
11
+ from langchain.memory import ConversationBufferMemory
 
12
 
13
  import gradio as gr
14
 
 
17
  from config import (
18
  CHROMA_SETTINGS,
19
  PERSIST_DIRECTORY,
20
+ )
21
 
 
22
 
23
  def query(question):
24
+ embeddings = OpenAIEmbeddings()
25
+ db = Chroma(
26
+ persist_directory=PERSIST_DIRECTORY,
27
+ embedding_function=embeddings,
28
+ client_settings=CHROMA_SETTINGS,
29
+ )
30
+ memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
31
+ text_qa = ConversationalRetrievalChain.from_llm(
32
+ ChatOpenAI(model_name="gpt-3.5-turbo"),
33
+ db.as_retriever(),
34
+ memory=memory,
35
  )
36
+ result = text_qa({"question": question})
37
+
38
+ return result["answer"]
39
 
 
40
 
41
  demo = gr.Interface(fn=query, inputs="text", outputs="text")
42
 
43
+
44
+ create_vectorstore()
45
+
46
+ demo.launch()
example_docs/{Lonely Planet Japan.epub → pg70973-images.epub} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:90832f22a09d1d1307342d8d81a1c40908797468b9a8fb3b4e6d89553589f9da
3
- size 58957457
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9dcd9db0ba476be13573a6408eebadc188b3607159b4578126fd12662e9641b9
3
+ size 7922273
init.py CHANGED
@@ -28,12 +28,11 @@ from config import (
28
  PERSIST_DIRECTORY,
29
  CHUNK_SIZE,
30
  CHUNK_OVERLAP,
31
- )
32
 
33
  # Map file extensions to document loaders and their arguments
34
  LOADER_MAPPING = {
35
  ".csv": (CSVLoader, {}),
36
- # ".docx": (Docx2txtLoader, {}),
37
  ".doc": (UnstructuredWordDocumentLoader, {}),
38
  ".docx": (UnstructuredWordDocumentLoader, {}),
39
  ".enex": (EverNoteLoader, {}),
@@ -45,7 +44,6 @@ LOADER_MAPPING = {
45
  ".ppt": (UnstructuredPowerPointLoader, {}),
46
  ".pptx": (UnstructuredPowerPointLoader, {}),
47
  ".txt": (TextLoader, {"encoding": "utf8"}),
48
- # Add more mappings for other file extensions and loaders as needed
49
  }
50
 
51
 
@@ -59,6 +57,7 @@ def load_single_document(file_path: str) -> List[Document]:
59
 
60
  raise ValueError(f"Unsupported file extension '{ext}'")
61
 
 
62
  def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
63
  """
64
  Loads all documents from the source documents directory, ignoring specified files
@@ -68,17 +67,24 @@ def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Docum
68
  all_files.extend(
69
  glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
70
  )
71
- filtered_files = [file_path for file_path in all_files if file_path not in ignored_files]
 
 
72
 
73
  with Pool(processes=os.cpu_count()) as pool:
74
  results = []
75
- with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
76
- for i, docs in enumerate(pool.imap_unordered(load_single_document, filtered_files)):
 
 
 
 
77
  results.extend(docs)
78
  pbar.update()
79
 
80
  return results
81
 
 
82
  def process_documents(ignored_files: List[str] = []) -> List[Document]:
83
  """
84
  Load documents and split in chunks
@@ -87,26 +93,36 @@ def process_documents(ignored_files: List[str] = []) -> List[Document]:
87
  documents = load_documents(DOCUMENTS_PATH, ignored_files)
88
  if not documents:
89
  print("No new documents to load")
90
- exit(0)
91
  print(f"Loaded {len(documents)} new documents from {DOCUMENTS_PATH}")
92
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
 
 
93
  texts = text_splitter.split_documents(documents)
94
  print(f"Split into {len(texts)} chunks of text (max. {CHUNK_SIZE} tokens each)")
95
  return texts
96
 
 
97
  def does_vectorstore_exist(persist_directory: str) -> bool:
98
  """
99
  Checks if vectorstore exists
100
  """
101
- if os.path.exists(os.path.join(persist_directory, 'index')):
102
- if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')):
103
- list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin'))
104
- list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl'))
 
 
 
 
 
 
105
  # At least 3 documents are needed in a working vectorstore
106
  if len(list_index_files) > 3:
107
  return True
108
  return False
109
 
 
110
  def create_vectorstore():
111
  # Create embeddings
112
  embeddings = OpenAIEmbeddings()
@@ -114,17 +130,36 @@ def create_vectorstore():
114
  if does_vectorstore_exist(PERSIST_DIRECTORY):
115
  # Update and store locally vectorstore
116
  print(f"Appending to existing vectorstore at {PERSIST_DIRECTORY}")
117
- db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
 
 
 
 
118
  collection = db.get()
119
- texts = process_documents([metadata['source'] for metadata in collection['metadatas']])
120
- print(f"Creating vectorstore. May take some minutes...")
 
 
 
 
 
 
121
  db.add_documents(texts)
122
  else:
123
  # Create and store locally vectorstore
124
  print("Creating new vectorstore")
125
  texts = process_documents()
126
- print(f"Creating vectorstore. May take some minutes...")
127
- db = Chroma.from_documents(texts, embeddings, persist_directory=PERSIST_DIRECTORY, client_settings=CHROMA_SETTINGS)
 
 
 
 
 
 
 
 
 
128
  db.persist()
129
  db = None
130
 
 
28
  PERSIST_DIRECTORY,
29
  CHUNK_SIZE,
30
  CHUNK_OVERLAP,
31
+ )
32
 
33
  # Map file extensions to document loaders and their arguments
34
  LOADER_MAPPING = {
35
  ".csv": (CSVLoader, {}),
 
36
  ".doc": (UnstructuredWordDocumentLoader, {}),
37
  ".docx": (UnstructuredWordDocumentLoader, {}),
38
  ".enex": (EverNoteLoader, {}),
 
44
  ".ppt": (UnstructuredPowerPointLoader, {}),
45
  ".pptx": (UnstructuredPowerPointLoader, {}),
46
  ".txt": (TextLoader, {"encoding": "utf8"}),
 
47
  }
48
 
49
 
 
57
 
58
  raise ValueError(f"Unsupported file extension '{ext}'")
59
 
60
+
61
  def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
62
  """
63
  Loads all documents from the source documents directory, ignoring specified files
 
67
  all_files.extend(
68
  glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
69
  )
70
+ filtered_files = [
71
+ file_path for file_path in all_files if file_path not in ignored_files
72
+ ]
73
 
74
  with Pool(processes=os.cpu_count()) as pool:
75
  results = []
76
+ with tqdm(
77
+ total=len(filtered_files), desc="Loading new documents", ncols=80
78
+ ) as pbar:
79
+ for i, docs in enumerate(
80
+ pool.imap_unordered(load_single_document, filtered_files)
81
+ ):
82
  results.extend(docs)
83
  pbar.update()
84
 
85
  return results
86
 
87
+
88
  def process_documents(ignored_files: List[str] = []) -> List[Document]:
89
  """
90
  Load documents and split in chunks
 
93
  documents = load_documents(DOCUMENTS_PATH, ignored_files)
94
  if not documents:
95
  print("No new documents to load")
96
+ return []
97
  print(f"Loaded {len(documents)} new documents from {DOCUMENTS_PATH}")
98
+ text_splitter = RecursiveCharacterTextSplitter(
99
+ chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
100
+ )
101
  texts = text_splitter.split_documents(documents)
102
  print(f"Split into {len(texts)} chunks of text (max. {CHUNK_SIZE} tokens each)")
103
  return texts
104
 
105
+
106
  def does_vectorstore_exist(persist_directory: str) -> bool:
107
  """
108
  Checks if vectorstore exists
109
  """
110
+ if os.path.exists(os.path.join(persist_directory, "index")):
111
+ if os.path.exists(
112
+ os.path.join(persist_directory, "chroma-collections.parquet")
113
+ ) and os.path.exists(
114
+ os.path.join(persist_directory, "chroma-embeddings.parquet")
115
+ ):
116
+ list_index_files = glob.glob(os.path.join(persist_directory, "index/*.bin"))
117
+ list_index_files += glob.glob(
118
+ os.path.join(persist_directory, "index/*.pkl")
119
+ )
120
  # At least 3 documents are needed in a working vectorstore
121
  if len(list_index_files) > 3:
122
  return True
123
  return False
124
 
125
+
126
  def create_vectorstore():
127
  # Create embeddings
128
  embeddings = OpenAIEmbeddings()
 
130
  if does_vectorstore_exist(PERSIST_DIRECTORY):
131
  # Update and store locally vectorstore
132
  print(f"Appending to existing vectorstore at {PERSIST_DIRECTORY}")
133
+ db = Chroma(
134
+ persist_directory=PERSIST_DIRECTORY,
135
+ embedding_function=embeddings,
136
+ client_settings=CHROMA_SETTINGS,
137
+ )
138
  collection = db.get()
139
+ texts = process_documents(
140
+ [metadata["source"] for metadata in collection["metadatas"]]
141
+ )
142
+
143
+ if not texts:
144
+ return
145
+
146
+ print(f"Creating embeddings. May take some minutes...")
147
  db.add_documents(texts)
148
  else:
149
  # Create and store locally vectorstore
150
  print("Creating new vectorstore")
151
  texts = process_documents()
152
+
153
+ if not texts:
154
+ return
155
+
156
+ print(f"Creating embeddings. May take some minutes...")
157
+ db = Chroma.from_documents(
158
+ texts,
159
+ embeddings,
160
+ persist_directory=PERSIST_DIRECTORY,
161
+ client_settings=CHROMA_SETTINGS,
162
+ )
163
  db.persist()
164
  db = None
165