araeyn committed on
Commit
2cfa857
·
verified ·
1 Parent(s): 95e34ed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -0
app.py CHANGED
@@ -22,6 +22,8 @@ from langchain_community.chat_message_histories import ChatMessageHistory
22
  if not os.path.isdir('database'):
23
  os.system("unzip database.zip")
24
 
 
 
25
  loader = DirectoryLoader('./database', glob="./*.txt", loader_cls=TextLoader)
26
 
27
  documents = loader.load()
@@ -29,10 +31,22 @@ documents = loader.load()
29
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
30
  texts = text_splitter.split_documents(documents)
31
 
 
 
 
 
 
 
32
  persist_directory = 'db'
33
 
34
  embedding = HuggingFaceEmbeddings()
35
 
 
 
 
 
 
 
36
  vectordb = Chroma.from_documents(documents=texts,
37
  embedding=embedding,
38
  persist_directory=persist_directory)
@@ -40,9 +54,21 @@ vectordb = Chroma.from_documents(documents=texts,
40
  vectordb.persist()
41
  vectordb = None
42
 
 
 
 
 
 
 
43
  vectordb = Chroma(persist_directory=persist_directory,
44
  embedding_function=embedding)
45
 
 
 
 
 
 
 
46
  def format_docs(docs):
47
  return "\n\n".join(doc.page_content for doc in docs)
48
 
@@ -56,6 +82,12 @@ rag_chain = (
56
  | StrOutputParser()
57
  )
58
 
 
 
 
 
 
 
59
  contextualize_q_system_prompt = """Given a chat history and the latest user question \
60
  which might reference context in the chat history, formulate a standalone question \
61
  which can be understood without the chat history. Do NOT answer the question, \
 
22
  if not os.path.isdir('database'):
23
  os.system("unzip database.zip")
24
 
25
+ clean_up_tokenization_spaces = True
26
+
27
  loader = DirectoryLoader('./database', glob="./*.txt", loader_cls=TextLoader)
28
 
29
  documents = loader.load()
 
31
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
32
  texts = text_splitter.split_documents(documents)
33
 
34
+ print()
35
+ print("-------")
36
+ print("TextSplitter, DirectoryLoader")
37
+ print("-------")
38
+ print("--")
39
+
40
  persist_directory = 'db'
41
 
42
  embedding = HuggingFaceEmbeddings()
43
 
44
+ print()
45
+ print("-------")
46
+ print("Embeddings")
47
+ print("-------")
48
+ print("--")
49
+
50
  vectordb = Chroma.from_documents(documents=texts,
51
  embedding=embedding,
52
  persist_directory=persist_directory)
 
54
  vectordb.persist()
55
  vectordb = None
56
 
57
+ print()
58
+ print("-------")
59
+ print("Chroma1")
60
+ print("-------")
61
+ print("--")
62
+
63
  vectordb = Chroma(persist_directory=persist_directory,
64
  embedding_function=embedding)
65
 
66
+ print()
67
+ print("-------")
68
+ print("Chroma2")
69
+ print("-------")
70
+ print("--")
71
+
72
  def format_docs(docs):
73
  return "\n\n".join(doc.page_content for doc in docs)
74
 
 
82
  | StrOutputParser()
83
  )
84
 
85
+ print()
86
+ print("-------")
87
+ print("Retriever, Prompt, LLM, Rag_Chain")
88
+ print("-------")
89
+ print("--")
90
+
91
  contextualize_q_system_prompt = """Given a chat history and the latest user question \
92
  which might reference context in the chat history, formulate a standalone question \
93
  which can be understood without the chat history. Do NOT answer the question, \