mgreg555 committed on
Commit
5bc88f0
·
verified ·
1 Parent(s): 051beb5

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +158 -0
  2. constitution.pdf +0 -0
  3. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# -*- coding: utf-8 -*-
"""Doc_chat_vegleges.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1G34ZCuupJZxNy-CFxjMNIa4_I3jynKqC

# Setting up environment
"""

from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

# Get your API keys from openai, you will need to create an account.
# Here is the link to get the keys: https://platform.openai.com/account/billing/overview
import os

"""# Preprocessing document"""

# Location of the PDF file to index.
reader = PdfReader('constitution.pdf')

# Read and concatenate the text of every page. extract_text() can return
# None or '' (e.g. for image-only pages), so skip those.
raw_text = ''
for page in reader.pages:
    text = page.extract_text()
    if text:
        raw_text += text

# Split the text into overlapping chunks so that during information
# retrieval we don't hit the model's token size limits.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

"""## Setting up doc search"""

# Embed each chunk with OpenAI embeddings and build an in-memory FAISS
# similarity-search index over them.
embeddings = OpenAIEmbeddings()
doc_search = FAISS.from_texts(texts, embeddings)
"""# Setting up chatbot"""

from langchain.chains.question_answering import load_qa_chain
from langchain.memory import ConversationBufferWindowMemory
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI

# Prompt that restricts the model to answering only from the retrieved
# document chunks; {chat_history} is filled in by the window memory below.
template = """You are a chatbot having a conversation with a human.

Given the following extracted parts of a long document and a question, create a final answer based on the document ONLY and NOTHING else.
Any question outside of the document is irrelevant and you certainly don't know!

{context}

{chat_history}
Human: {human_input}
Chatbot:"""

prompt = PromptTemplate(
    input_variables=["chat_history", "human_input", "context"], template=template
)

# Keep only the last k=3 exchanges so the prompt stays within token limits.
memory = ConversationBufferWindowMemory(memory_key="chat_history", input_key="human_input", k=3)

# "stuff" chain: all retrieved chunks are stuffed into a single prompt.
chain = load_qa_chain(OpenAI(), chain_type="stuff", memory=memory, prompt=prompt)
"""## The chatbot should know the answer"""

# In-document question: the chain should answer from the retrieved chunks.
query = "Who wrote the constitution?"
docs = doc_search.similarity_search(query)

chain({"input_documents": docs, "human_input": query}, return_only_outputs=True)['output_text']

"""## The chatbot should not know the answer."""

# Out-of-document question: the prompt instructs the model to refuse.
query = "What is the capital of France?"
docs = doc_search.similarity_search(query)

chain({"input_documents": docs, "human_input": query}, return_only_outputs=True)['output_text']
"""# Demo

## Setting up methods
"""

def chat(query,history):
    """Answer one chat turn: retrieve relevant chunks, then run the QA chain.

    `history` is required by the gr.ChatInterface callback signature but is
    unused here — conversation state lives in the LangChain window memory.
    """
    relevant_docs = doc_search.similarity_search(query)
    result = chain(
        {"input_documents": relevant_docs, "human_input": query},
        return_only_outputs=True,
    )
    return result['output_text']
"""## Setting up UI with gradio"""

import gradio as gr

# Custom styling: white page background, light blue-green chat panels.
css = """
body {
    background-color: #FFFFFF; /* White background */
}

.gradio-chat-interface, .gradio-chat-input {
    background-color: #E0FFFF; /* Light blue-green background */
}
"""

# Build the chat UI around the `chat` callback and start the app.
demo = gr.ChatInterface(
    chat,
    chatbot=gr.Chatbot(height=500),
    title="Doc-chat",
    description="Ask about the constitution!",
    theme="soft",
    examples=["Who wrote the constitution?", "What is the capital of France?"],
    cache_examples=True,
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
    css=css,
)
demo.launch()
constitution.pdf ADDED
Binary file (414 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ langchain
2
+ openai
3
+ PyPDF2
4
+ faiss-cpu
5
+ tiktoken
6
+ langchain_openai
7
+ gradio