snehasquasher committed
Commit b0d4092 · 1 Parent(s): c9535bf

Upload folder using huggingface_hub
.github/workflows/huggingface.yaml ADDED
@@ -0,0 +1,20 @@
+ name: Sync to Hugging Face hub
+ on:
+   push:
+     branches: [main]
+
+   # to run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v3
+         with:
+           fetch-depth: 0
+           lfs: true
+       - name: Push to hub
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: git push https://snehasquasher:$HF_TOKEN@huggingface.co/spaces/snehasquasher/spur-mvp main
.github/workflows/update_space.yml ADDED
@@ -0,0 +1,28 @@
+ name: Run Python script
+
+ on:
+   push:
+     branches:
+       - main
+
+ jobs:
+   build:
+     runs-on: ubuntu-latest
+
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v2
+
+       - name: Set up Python
+         uses: actions/setup-python@v2
+         with:
+           python-version: '3.9'
+
+       - name: Install Gradio
+         run: python -m pip install gradio
+
+       - name: Log in to Hugging Face
+         run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
+
+       - name: Deploy to Spaces
+         run: gradio deploy
.gitignore ADDED
@@ -0,0 +1,11 @@
+ apiKey.py
+ logs
+ logs/*.*
+ __pycache__
+ __pycache__/*
+ *.pyc
+ chromadb/*
+ data
+ notiondb
+ .DS_Store
+ chroma.sqlite3
Constants.py ADDED
@@ -0,0 +1,11 @@
+ DB_TYPE = "notion"  # faiss or chromadb or notion
+ PERSIST_DIRECTORY = "./"
+ CHROMA_PERSIST_DIRECTORY = "./chromadb"
+ NOTION_PERSIST_DIRECTORY = "./notiondb"
+ COLLECTION_NAME = "chatdata"
+ CHROMA_COLLECTION_NAME = "LLMData"
+ NOTION_COLLECTION_NAME = "notionData"
+ DATA_DIRECTORY = "./data"
+ LOG_FILE = "./logs/output.log"
+ NOTION_DB = "0c3bfaa0a33c4038aeeb988c16f83abb"
+ #MAX_PAGES_TO_READ=
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 Harrison Chase
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,37 @@
  ---
- title: Spur Chatbot
- emoji: 🏃
- colorFrom: yellow
- colorTo: green
- sdk: gradio
- sdk_version: 3.41.2
+ title: spur-chatbot
  app_file: app.py
- pinned: false
+ sdk: gradio
+ sdk_version: 3.40.1
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Chat-Your-Data
+
+ Create a ChatGPT-like experience over your custom docs using [LangChain](https://github.com/langchain-ai/langchain).
+
+ See [this blog post](blogpost.md) for a more detailed explanation.
+
+ ## Step 0: Install requirements
+
+ `pip install -r requirements.txt`
+
+ ## Step 1: Set your OpenAI API key
+
+ ```sh
+ export OPENAI_API_KEY="Your OpenAI API key"
+ ```
+
+ ## Step 2: Query data
+
+ Custom prompts are used to ground the answers in the state of the union text file.
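+
+ In this fork the custom prompt actually grounds answers in the uploaded person documents; as an illustrative sketch of what that grounding template looks like (the full version lives in `query_data.py`):
+
+ ```py
+ template = """You are an AI assistant for answering questions about persons.
+ You are given the following extracted parts of a long document and a question. Provide a conversational answer.
+ Question: {question}
+ =========
+ {context}
+ =========
+ Answer in Markdown:"""
+ QA_PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])
+ ```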
+
+ ## Step 3: Running the Application
+
+ Run `python app.py` from the command line to interact with ChatGPT over your own data.
+
+ # Others
+
+ ## Notion Integration
 
+ ## Step 1: Set your Notion API Key
+ ```sh
+ export NOTION_API_KEY="Your Notion API Key"
+ ```
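+
+ ## Step 2: Select the Notion backend
+
+ A minimal sketch based on the defaults in `Constants.py`: keep `DB_TYPE` set to `"notion"` (this commit's default), then re-run ingestion so `ingest_data.py` pulls pages through `NotionDBLoader` into the local Chroma store.
+
+ ```py
+ # Constants.py -- "notion" routes ingest_data.py to NotionDBLoader
+ DB_TYPE = "notion"
+ ```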
__pycache__/Constants.cpython-310.pyc ADDED
Binary file (389 Bytes)
__pycache__/apiKey.cpython-310.pyc ADDED
Binary file (226 Bytes)
__pycache__/db_types.cpython-310.pyc ADDED
Binary file (417 Bytes)
__pycache__/ingest_data.cpython-310.pyc ADDED
Binary file (3.76 kB)
__pycache__/metadatainfo.cpython-310.pyc ADDED
Binary file (553 Bytes)
__pycache__/notionMetadataInfo.cpython-310.pyc ADDED
Binary file (450 Bytes)
__pycache__/query_data.cpython-310.pyc ADDED
Binary file (6.79 kB)
__pycache__/read_notion.cpython-310.pyc ADDED
Binary file (1.03 kB)
__pycache__/utilities.cpython-310.pyc ADDED
Binary file (921 Bytes)
apiKey.py ADDED
@@ -0,0 +1,2 @@
+ OPENAI_API_KEY="sk-<redacted>"
+ NOTION_API_KEY="secret_<redacted>"
app.py ADDED
@@ -0,0 +1,215 @@
+ import os
+ import sys
+ from typing import Optional, Tuple
+ from threading import Lock
+ import json
+ import shutil
+ import gradio as gr
+ from query_data import chain_options
+ from query_data import get_basic_qa_chain
+ from zipfile import ZipFile
+ from ingest_data import ingestData
+
+ from query_data import (get_basic_qa_chain,
+                         get_qa_with_sources_chain,
+                         get_custom_prompt_qa_chain,
+                         get_condense_prompt_qa_chain,
+                         get_retrievalqa_with_sources_chain)
+
+ from metadatainfo import metadata_field_info
+ from Constants import *
+ from apiKey import *
+
+ def set_openai_api_key(api_key: str):
+     """Set the api key and return chain.
+     If no api_key, then None is returned.
+     """
+     if api_key:
+         os.environ["OPENAI_API_KEY"] = api_key
+         chain = getChainSelectedByUser(chainType)
+         os.environ["OPENAI_API_KEY"] = ""
+         return chain
+     '''
+     os.environ["OPENAI_API_KEY"] = api_key
+     chain = get_basic_qa_chain()
+     return chain'''
+
+ def getChainSelectedByUser(chainType: gr.Dropdown):
+     chain = get_basic_qa_chain()
+
+     if chainType == "with_sources":
+         chain = get_qa_with_sources_chain()
+     elif chainType == "custom_prompt":
+         chain = get_custom_prompt_qa_chain()
+     elif chainType == "condense_prompt":
+         chain = get_condense_prompt_qa_chain()
+     elif chainType == "retrieval_sources_chain":
+         chain = get_retrievalqa_with_sources_chain()
+
+     return chain
+
+ class Logger:
+     def __init__(self, filename):
+         self.terminal = sys.stdout
+         self.log = open(filename, "w")
+
+     def write(self, message):
+         self.terminal.write(message)
+         self.log.write(message)
+
+     def flush(self):
+         self.terminal.flush()
+         self.log.flush()
+
+     def isatty(self):
+         return False
+
+ sys.stdout = Logger(LOG_FILE)
+
+ def read_logs():
+     sys.stdout.flush()
+     with open(LOG_FILE, "r") as f:
+         return f.read()
+
+ def upload_file(files):
+     file_paths = [file.name for file in files]
+     for f in file_paths:
+         print("moving file :" + f)
+         shutil.copy(f, DATA_DIRECTORY)
+     return file_paths
+
+ def ingest():
+     ingestData()
+
+ class ChatWrapper:
+
+     def __init__(self):
+         self.lock = Lock()
+
+     def __call__(
+         self, api_key: str, inp: str, history: Optional[Tuple[str, str]], chain, chainType
+     ):
+         """Execute the chat functionality."""
+         self.lock.acquire()
+         try:
+             history = history or []
+             # If chain is None, that is because no API key was provided.
+             if chain is None:
+                 '''os.environ["OPENAI_API_KEY"] = api_key
+                 chain = get_basic_qa_chain()'''
+                 history.append((inp, "Please paste your OpenAI key to use"))
+                 return history, history
+             # Set OpenAI key
+             import openai
+
+             openai.api_key = api_key
+             print("calling chain of type " + str(type(chain)))
+             # Run chain and append input.
+             results = chain({"question": inp})
+             # metadata=metadata_field_info,
+             # include_run_info=True)
+             print("result keys :")
+             print(*results, sep=" ")
+
+             output = results["answer"]
+
+             if chainType == "with_sources":
+                 print("document source count :" + str(len(results["source_documents"])))
+                 for s in results["source_documents"]:
+                     for key in s.metadata:
+                         output = output + "<br>" + key + ":" + s.metadata[key] + "<br>"
+
+             elif chainType == "retrieval_sources_chain":
+                 print("results")
+                 # output = output + "<br>" + "SOURCE:" + results["sources"]
+             history.append((inp, output))
+         except Exception as e:
+             raise e
+         finally:
+             self.lock.release()
+         return history, history
+
+ chat = ChatWrapper()
+
+ block = gr.Blocks(gr.themes.Soft(),
+                   analytics_enabled=True)
+
+ with block:
+     with gr.Row():
+         # api_key=OPENAI_API_KEY
+         gr.Markdown(
+             "<h3><center>Chat-Your-Data</center></h3>")
+
+         openai_api_key_textbox = gr.Textbox(
+             # value=api_key,
+             placeholder="Paste your OpenAI API key (sk-...)",
+             show_label=False,
+             lines=1,
+             type="password",
+         )
+     # set_openai_api_key(api_key)
+     chatbot = gr.Chatbot()
+
+
+     with gr.Row():
+         message = gr.Textbox(
+             value="ask me something about your data",
+             label="What's your question?",
+             placeholder="Ask questions about the most recent state of the union",
+             lines=1,
+         )
+         submit = gr.Button(value="Send", variant="secondary").style(
+             scale=1)
+
+     gr.Examples(
+         examples=[
+             "Who is Tanmay Chopra?",
+             "Which persons know about the topics LLM?",
+             "What did Navid say about LLM?",
+         ],
+         inputs=message,
+     )
+
+     with gr.Row():
+         chainType = gr.Dropdown(list(chain_options.keys()),
+                                 label="Chain Type", value="basic"
+                                 )
+
+     with gr.Accordion(label="show_logs"):
+         logs = gr.Textbox(label="Console")
+         block.load(read_logs, None, logs, every=1)
+
+     file_output = gr.File()
+     upload_button = gr.UploadButton("Click to Upload a File", file_types=[".docx", ".pdf", ".txt", ".json"], file_count="multiple")
+     files = upload_button.upload(upload_file, upload_button, file_output)
+     # gr.Gallery(files)
+     btn = gr.Button(value="Ingest")
+     btn.click(ingest)
+     gr.HTML("Demo application of a LangChain chain.")
+
+     gr.HTML(
+         "<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
+     )
+
+     state = gr.State()
+     agent_state = gr.State()
+
+     submit.click(chat, inputs=[openai_api_key_textbox, message,
+                                state, agent_state, chainType], outputs=[chatbot, state])
+     message.submit(chat, inputs=[
+         openai_api_key_textbox, message, state, agent_state, chainType], outputs=[chatbot, state])
+
+     openai_api_key_textbox.change(
+         set_openai_api_key,
+         inputs=[openai_api_key_textbox],
+         outputs=[agent_state],
+     )
+
+     chainType.change(
+         getChainSelectedByUser,
+         inputs=[chainType],
+         outputs=[agent_state],
+     )
+
+ block.queue().launch(debug=True)
assets/logo/logo.jpg ADDED
blogpost.md ADDED
@@ -0,0 +1,330 @@
+ **_Note: See the accompanying GitHub repo for this blogpost [here](https://github.com/hwchase17/chat-your-data)._**
+ **Note: Last updated by [Bill Chambers](http://billchambers.me/). August, 2023.**
+
+ ChatGPT has taken the world by storm. But while it's great for general-purpose knowledge, it only knows what it was trained on, which is generally available internet data from before 2021. It doesn't know about your private data, nor about recent sources of data.
+
+ Wouldn't it be useful if it did?
+
+ This blog post is a tutorial on how to set up your own version of ChatGPT over a specific corpus of data. There is an [accompanying GitHub repo](https://github.com/hwchase17/chat-your-data) that has the relevant code referenced in this post. Specifically, this deals with text data. For how to interact with other sources of data with a natural language layer, see the below tutorials:
+
+ * [SQL Database](https://python.langchain.com/docs/modules/chains/popular/sqlite)
+ * [APIs](https://python.langchain.com/docs/modules/chains/popular/api)
+
+ ## High Level Overview
+
+ At a high level, there are two components to setting up ChatGPT over your own data: (1) ingestion of the data, (2) a chatbot over the data. Let's talk a bit about the steps involved in each.
+
+ ### Ingestion of data
+
+ ![Diagram of ingestion process](https://blog.langchain.dev/content/images/2023/02/ingest.png)
+
+ Ingestion involves several steps. The steps are:
+
+ 1. **Load data sources to text**: this involves loading your data from arbitrary sources to text in a form that can be used downstream. This is one place where we hope the community will help out!
+ 2. **Chunk text**: this involves chunking the loaded text into smaller chunks. This is necessary because language models generally have a limit to the amount of text (tokens) they can deal with. "Chunk size" is something to be tuned over time.
+ 3. **Embed text**: this involves creating a numerical embedding for each chunk of text. This is necessary because we only want to select the most relevant chunks of text for a given question, and we will do this by finding the most similar chunks in the embedding space.
+ 4. **Load embeddings to vectorstore**: this involves putting embeddings and documents into a vectorstore. Vectorstores help us find the most similar chunks in the embedding space quickly and efficiently.
+
+ LangChain strives to be modular, so that each of these steps is straightforward to swap out with other components or approaches.
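+
+ As a concrete, illustrative example of that modularity, the `ingest_data.py` in this repo swaps the blog post's `CharacterTextSplitter` for a `RecursiveCharacterTextSplitter` without touching the rest of the pipeline:
+
+ ```py
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ # one-line swap; loading, embedding, and storage stay exactly the same
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+ documents = text_splitter.split_documents(raw_documents)
+ ```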
+
+ ### Querying of Data
+
+ ![Diagram of query process](https://blog.langchain.dev/content/images/2023/02/query.png)
+
+ This can also be broken down into a few steps. The high level steps are:
+
+ 1. **Get input from the user**: we'll use a web interface and a CLI interface to receive input from the user about the documents.
+ 2. **Combine that input with chat history**: we'll combine chat history and a new question into a single standalone question. This is often necessary because we want to allow for the ability to ask follow up questions (an important UX consideration).
+ 3. **Look up relevant documents**: using the vectorstore created during ingestion, we will look up relevant documents for the answer.
+ 4. **Generate a response**: given the standalone question and the relevant documents, we will use a language model to generate a response.
+
+ In this post, we'll explore some design decisions you have with history, prompts, and the chat experience. We won't touch on deployment, but for more information see [our deployment guide](https://python.langchain.com/docs/guides/deployments/).
+
+ ## Step by Step Details
+
+ This section dives into more detail on the steps necessary to ingest data.
+
+ ![Diagram of ingestion process](https://blog.langchain.dev/content/images/2023/02/ingest-1.png)
+
+ ### Load data
+
+ First, we need to load data into a standard format. In LangChain, a [`Document`](https://docs.langchain.com/docs/components/schema/document) consists of (1) the text itself, and (2) any metadata associated with that text (where it came from, etc.). This is often critical for understanding and communicating the context for testing or for the end user.
+
+ The community has contributed dozens of document loaders and we look forward to seeing more and more join the community. [See our documentation (and over 120 data loaders) for more information about document loaders](https://python.langchain.com/docs/integrations/document_loaders/). Please open a pull request or file an issue if you'd like to contribute (or request) a new document loader.
+
+ The snippet below contains the line of code responsible for loading the relevant documents.
+
+ ```py
+ print("Loading data...")
+ loader = UnstructuredFileLoader("state_of_the_union.txt")
+ raw_documents = loader.load()
+ ```
+
+ ### Split Text
+
+ Splitting documents into smaller units of text for input into the model is critical for getting relevant information back from our chatbot. When documents are too big, you'll feed irrelevant information to the model. Conversely, when they're too small, you won't include enough information and the model may be confused about what is actually relevant.
+
+ The chunk size isn't quite a science, so you'll have to experiment to see what gives good results.
+
+ ```py
+ print("Splitting text...")
+ text_splitter = CharacterTextSplitter(
+     separator="\n\n",
+     chunk_size=600,
+     chunk_overlap=100,
+     length_function=len,
+ )
+ documents = text_splitter.split_documents(raw_documents)
+ ```
+
+ ### Create embeddings and store in vectorstore
+
+ Now that we have small chunks of text, we need to create an embedding for each piece of text and store them all in a vectorstore. Embeddings are an efficient way of storing this text data so we can later query the store for the documents most relevant to a question.
+
+ Here we use OpenAI's embeddings and a [FAISS vectorstore](https://faiss.ai/index.html), and store the result as a Python pickle file for later use.
+
+ ```py
+ print("Creating vectorstore...")
+ embeddings = OpenAIEmbeddings()
+ vectorstore = FAISS.from_documents(documents, embeddings)
+ with open("vectorstore.pkl", "wb") as f:
+     pickle.dump(vectorstore, f)
+ ```
+
+ Run `python ingest_data.py` to create the vectorstore. This is necessary after changing how you split the text or loading new documents: if you're making changes, adding documents, or splitting text differently, you'll have to re-run it.
+
+ ## Query data
+
+ Now that we've ingested the data, we can use it in a chatbot interface. To do this, we will use the [ConversationalRetrievalChain](https://python.langchain.com/docs/use_cases/question_answering/how_to/chat_vector_db).
+
+ ![Diagram of ConversationalRetrievalChain](https://blog.langchain.dev/content/images/2023/02/query-1.png)
+
+ There are several different options when it comes to querying the data. Do you want to allow follow up questions? Do you want to include other user context? There are lots of design decisions, and below we'll discuss some of the most critical.
+
+ ### Do you want to have conversation history?
+
+ This is table stakes from a UX perspective because it allows for follow up questions. Adding memory is simple: you can use a built-in module.
+
+ ```py
+ llm = ChatOpenAI(model_name="gpt-4", temperature=0)
+ retriever = load_retriever()
+ memory = ConversationBufferMemory(
+     memory_key="chat_history", return_messages=True)
+ # model = RetrievalQA.from_llm(llm=llm, retriever=retriever)
+ # if you don't want memory, use the above; you will have to change
+ # the app.py or cli_app.py file to include `query` in the input instead of `question`
+ model = ConversationalRetrievalChain.from_llm(
+     llm=llm,
+     retriever=retriever,
+     memory=memory)
+ ```
+
+ Alternatively, you can specify memory and pass it into the model, tracking it on your own. Run this example from the GitHub repo with the following, then read the code in `query_data.py`.
+
+ ```sh
+ python cli_app.py
+
+ Which QA model would you like to work with? [basic/with_sources/custom_prompt/condense_prompt] (basic):
+ Chat with your docs!
+ ---------------
+ Your Question: (what did the president say about ketanji brown?):
+ Answer: The President nominated Ketanji Brown Jackson to serve on the United States Supreme Court, describing her as one of the nation's top legal minds who will continue Justice Breyer's legacy of excellence. He also mentioned that she
+ is a former top litigator in private practice, a former federal public defender, and comes from a family of public school educators and police officers. He referred to her as a consensus builder and noted that since her nomination, she
+ has received a broad range of support from various groups, including the Fraternal Order of Police and former judges appointed by both Democrats and Republicans.
+ ---------------
+ ```
+
+ ### Do you want to customize the QA prompt?
+
+ You can easily customize the QA prompt by passing in a prompt of your choice. This is similar in experience to most chains in LangChain. [Learn more about custom prompts here.](https://python.langchain.com/docs/use_cases/question_answering/how_to/vector_db_qa#return-source-documents)
+
+ ```py
+ template = """You are an AI assistant for answering questions about the most recent state of the union address.
+ You are given the following extracted parts of a long document and a question. Provide a conversational answer.
+ If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
+ If the question is not about the most recent state of the union, politely inform them that you are tuned to only answer questions about the most recent state of the union.
+ Lastly, answer the question as if you were a pirate from the south seas and are just coming back from a pirate expedition where you found a treasure chest full of gold doubloons.
+ Question: {question}
+ =========
+ {context}
+ =========
+ Answer in Markdown:"""
+
+ QA_PROMPT = PromptTemplate(template=template, input_variables=[
+     "question", "context"])
+ llm = ChatOpenAI(model_name="gpt-4", temperature=0)
+ retriever = load_retriever()
+ memory = ConversationBufferMemory(
+     memory_key="chat_history", return_messages=True)
+ model = ConversationalRetrievalChain.from_llm(
+     llm=llm,
+     retriever=retriever,
+     memory=memory,
+     combine_docs_chain_kwargs={"prompt": QA_PROMPT})
+ ```
+
+ Run this example from the GitHub repo with the following, then read the code in `query_data.py`.
+
+ ```sh
+ python cli_app.py
+ Which QA model would you like to work with? [basic/with_sources/custom_prompt/condense_prompt] (basic): custom_prompt
+ Chat with your docs!
+ ---------------
+ Your Question: (what did the president say about ketanji brown?):
+ Answer: Arr matey, the cap'n, I mean the President, he did speak of Ketanji Brown Jackson, he did. He nominated her to the United States Supreme Court, he did, just 4 days before his address. He spoke highly of her, he did, callin' her
+ one of the nation's top legal minds. He believes she'll continue Justice Breyer's legacy of excellence, he does.
+
+ She's been a top litigator in private practice, a federal public defender, and comes from a family of public school educators and police officers. She's a consensus builder, she is. Since her nomination, she's received support from all
+ over, from the Fraternal Order of Police to former judges appointed by both Democrats and Republicans. So, that's what the President had to say about Ketanji Brown Jackson, it is.
+ ---------------
+ Your Question: (what did the president say about ketanji brown?): who did she succeed?
+ Answer: Arr matey, ye be askin' about who Judge Ketanji Brown Jackson be succeedin'. From the words of the President himself, she be takin' over from Justice Breyer, continuin' his legacy of excellence on the United States Supreme
+ Court. Now, let's get back to countin' me gold doubloons, aye?
+ ---------------
+ ```
+
+ ### Do you expect long conversations?
+
+ If so, you're going to want to condense previous questions and history in order to add context into the prompt. If you embed the whole chat history along with the new question to look up relevant documents, you may pull in documents no longer relevant to the conversation (if the new question is not related at all). Therefore, this step of condensing the chat history and a new question into a standalone question is very important.
+
+ ```py
+ _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
+ You can assume the question about the most recent state of the union address.
+
+ Chat History:
+ {chat_history}
+ Follow Up Input: {question}
+ Standalone question:"""
+ CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)
+
+
+ llm = ChatOpenAI(model_name="gpt-4", temperature=0)
+ retriever = load_retriever()
+ memory = ConversationBufferMemory(
+     memory_key="chat_history", return_messages=True)
+ # see: https://github.com/langchain-ai/langchain/issues/5890
+ model = ConversationalRetrievalChain.from_llm(
+     llm=llm,
+     retriever=retriever,
+     memory=memory,
+     condense_question_prompt=CONDENSE_QUESTION_PROMPT,
+     combine_docs_chain_kwargs={"prompt": QA_PROMPT})  # includes the custom prompt as well
+ ```
+
+ Read the code in `query_data.py` for some example code to apply to your own projects.
+
+ ### Do you want the model to cite sources?
+
+ [LangChain can return the source documents behind an answer.](https://python.langchain.com/docs/use_cases/question_answering/how_to/vector_db_qa#return-source-documents) There's a lot you can do here: you can add your own metadata, your own sections, and other relevant information to return the most relevant metadata for your query.
+
+ ```py
+ llm = ChatOpenAI(model_name="gpt-4", temperature=0)
+ retriever = load_retriever()
+ history = []
+ model = ConversationalRetrievalChain.from_llm(
+     llm=llm,
+     retriever=retriever,
+     return_source_documents=True)
+
+ def model_func(question):
+     # bug: this doesn't work with the built-in memory
+     # see: https://github.com/langchain-ai/langchain/issues/5630
+     new_input = {"question": question['question'], "chat_history": history}
+     result = model(new_input)
+     history.append((question['question'], result['answer']))
+     return result
+
+ model_func({"question": "some question you have"})
+ # this is the same interface as all the other models.
+ ```
+
+ Run this example from the GitHub repo with the following, then read the code in `query_data.py`.
+
+ ```sh
+ python cli_app.py
+ Which QA model would you like to work with? [basic/with_sources/custom_prompt/condense_prompt] (basic): with_sources
+ Chat with your docs!
+ ---------------
+ Your Question: (what did the president say about ketanji brown?):
+ Answer: The President nominated Ketanji Brown Jackson to serve on the United States Supreme Court, describing her as one of the nation's top legal minds who will continue Justice Breyer's legacy of excellence. He also mentioned that she
+ is a former top litigator in private practice, a former federal public defender, and comes from a family of public school educators and police officers. Since her nomination, she has received a broad range of support, including from the
+ Fraternal Order of Police and former judges appointed by both Democrats and Republicans.
+ Sources:
+ state_of_the_union.txt
+ One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.
+
+ And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.
+ state_of_the_union.txt
+ As I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential.
+
+ While it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military
+ justice.
+ state_of_the_union.txt
+ But in my administration, the watchdogs have been welcomed back.
+
+ We’re going after the criminals who stole billions in relief money meant for small businesses and millions of Americans.
+
+ And tonight, I’m announcing that the Justice Department will name a chief prosecutor for pandemic fraud.
+
+ By the end of this year, the deficit will be down to less than half what it was before I took office.
+
+ The only president ever to cut the deficit by more than one trillion dollars in a single year.
+
+ Lowering your costs also means demanding more competition.
+ state_of_the_union.txt
+ A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of
+ support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans.
+
+ And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system.
+
+ We can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling.
+ ---------------
+ Your Question: (what did the president say about ketanji brown?): where did she work before?
+ Answer: Before her nomination to the United States Supreme Court, Ketanji Brown Jackson worked as a Circuit Court of Appeals Judge. She was also a former top litigator in private practice and a former federal public defender.
+ Sources:
+ state_of_the_union.txt
+ One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.
+
+ And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.
+ state_of_the_union.txt
+ A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of
+ support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans.
+
+ And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system.
+
+ We can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling.
+ state_of_the_union.txt
+ We cannot let this happen.
+
+ Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections.
+
+ Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for
+ your service.
+ state_of_the_union.txt
+ Vice President Harris and I ran for office with a new economic vision for America.
+
+ Invest in America. Educate Americans. Grow the workforce. Build the economy from the bottom up and the middle out, not from the top down.
+
+ Because we know that when the middle class grows, the poor have a ladder up and the wealthy do very well.
+
+ America used to have the best roads, bridges, and airports on Earth.
+
+ Now our infrastructure is ranked 13th in the world.
+
+ We won’t be able to compete for the jobs of the 21st Century if we don’t fix that.
+ ---------------
+ ```
+
+ ### Language Model
+
+ The final lever to pull is which language model you use to power your chatbot. In our example we use an OpenAI LLM, but it can easily be substituted with other language models that LangChain supports, or you can even write your own wrapper.
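+
+ A minimal sketch of such a substitution, assuming any chat model LangChain supports can stand in for the `ChatOpenAI` used throughout this post (here only the model name changes):
+
+ ```py
+ llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)  # was "gpt-4" above
+ model = ConversationalRetrievalChain.from_llm(
+     llm=llm,
+     retriever=load_retriever(),
+     memory=ConversationBufferMemory(memory_key="chat_history", return_messages=True))
+ ```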
+
+ ## Putting it all together
+
+ After making all the necessary customizations and running `python ingest_data.py`, you can now interact with the chatbot.
+
+ We've exposed a really simple interface for doing so: just run `python cli_app.py` to ask questions and get back answers right in the terminal. Try it out!
+
+ We also have an example of deploying this app via Gradio! You can do so by running `python app.py`. This can also easily be deployed to Hugging Face Spaces - see [example space here](https://huggingface.co/spaces/hwchase17/chat-your-data-state-of-the-union).
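+
+ For reference, the deploy itself is a single command from the project directory (this repo also automates it in `.github/workflows/update_space.yml`):
+
+ ```sh
+ gradio deploy
+ ```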
+
+ ![langchain hugging face spaces](https://blog.langchain.dev/content/images/2023/02/Screen-Shot-2023-02-07-at-9.01.42-AM.png)
chromaclient.py ADDED
@@ -0,0 +1,17 @@
+ import chromadb
+ import openai
+ from Constants import *
+ from langchain.embeddings import OpenAIEmbeddings
+ import os
+ from langchain.vectorstores import Chroma
+ openai.api_key = OPENAI_API_KEY
+ os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
+ # chroma_client = chromadb.PersistentClient(path=CHROMA_PERSIST_DIRECTORY, embeddings=OpenAIEmbeddings())
+
+ vectorstore = Chroma(persist_directory=CHROMA_PERSIST_DIRECTORY, collection_name=CHROMA_COLLECTION_NAME, embedding_function=OpenAIEmbeddings())
+ print("Chroma collection count : " + str(vectorstore._collection.count()))
+ # collection = chroma_client.get_collection(name="myname")
+ # results = collection.query(query_texts=["Who is Tanmay"], n_results=10)
+ # print(collection.get(include=["embeddings", "documents", "metadatas"]))
+ # collection.get(include=["embeddings", "documents", "metadatas"])
+
chromadb/cd6d665c-a1d1-4b9b-b8a5-cfa0f731d4d0/length.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fcefd9550ec0a4b4cf0550e48b6589a991eaf7590c089fdaafa4298dab5c6f90
+ size 4000
cli_app.py ADDED
@@ -0,0 +1,34 @@
+ import os
+ from query_data import chain_options
+ from rich.console import Console
+ from rich.prompt import Prompt
+ from Constants import *
+ from apiKey import *
+
+ os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
+ if __name__ == "__main__":
+     c = Console()
+     model = Prompt.ask("Which QA model would you like to work with?",
+                        choices=list(chain_options.keys()),
+                        default="basic")
+     chain = chain_options[model]()
+
+     c.print("[bold]Chat with your docs!")
+     c.print("[bold red]---------------")
+
+     while True:
+         default_question = "what did the president say about ketanji brown?"
+         question = Prompt.ask("Your Question: ", default=default_question)
+         # change this line if you're using RetrievalQA
+         # input = query
+         # output = result
+         result = chain({"question": question})
+         c.print("[green]Answer: [/green]" + result['answer'])
+
+         # include a bit more if we're using `with_sources`
+         if model == "with_sources" and result.get('source_documents', None):
+             c.print("[green]Sources: [/green]")
+             for doc in result['source_documents']:
+                 c.print(f"[bold underline green]{doc.metadata['source']}")
+                 c.print("[green]" + doc.page_content)
+         c.print("[bold red]---------------")
data/Evan Cover.docx ADDED
Binary file (7.56 kB)
data/Josua Krause.docx ADDED
Binary file (8.33 kB)
data/Navid.docx ADDED
Binary file (13.6 kB)
data/Neal Patel.docx ADDED
Binary file (7.36 kB)
data/Siva_values.docx ADDED
Binary file (15.7 kB)
data/Tanmay Chopra.docx ADDED
Binary file (13.1 kB)
data_back/.DS_Store ADDED
Binary file (6.15 kB)
data_back/Evan Cover.docx ADDED
Binary file (7.56 kB)
data_back/Josua Krause.docx ADDED
Binary file (8.33 kB)
data_back/Navid.docx ADDED
Binary file (13.6 kB)
data_back/Neal Patel.docx ADDED
Binary file (7.36 kB)
data_back/Siva_values.docx ADDED
Binary file (15.7 kB)
data_back/Tanmay Chopra.docx ADDED
Binary file (13.1 kB)
db_types.py ADDED
@@ -0,0 +1,6 @@
+ from enum import Enum
+
+ class DBTypes(Enum):
+     CHROMA = "chromadb"
+     FAISS = "faiss"
+     NOTION = "notion"
ingest_data.py ADDED
@@ -0,0 +1,242 @@
+ import os
+ import openai
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.document_loaders import UnstructuredFileLoader
+ from langchain.vectorstores.faiss import FAISS
+ from langchain.embeddings import OpenAIEmbeddings
+ from langchain.document_loaders import DirectoryLoader
+ from langchain.document_loaders import TextLoader
+ from langchain.document_loaders import CSVLoader
+ from langchain.document_loaders import PyPDFLoader
+ from langchain.document_loaders import UnstructuredWordDocumentLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
+ from langchain.vectorstores import Chroma
+ from langchain.document_loaders import NotionDBLoader
+ from langchain.vectorstores.utils import filter_complex_metadata
+ import pickle
+ from Constants import *
+ from apiKey import *
+ from db_types import *
+ from utilities import transform_complex_metadata
+
+ def createChromaFromNotiondb(documents, embeddings):
+     vectordb = Chroma(persist_directory=NOTION_PERSIST_DIRECTORY, embedding_function=embeddings,
+                       collection_name=NOTION_COLLECTION_NAME)
+     print("Checking for existing collection count " + str(vectordb._collection.count()))
+     if vectordb._collection.count() == 0:
+         print("Transforming notion collection " + NOTION_COLLECTION_NAME)
+         documents = transform_complex_metadata(documents)
+         print("Creating notion database")
+         vectordb = Chroma.from_documents(documents=documents, embedding=embeddings, persist_directory=NOTION_PERSIST_DIRECTORY, collection_name=NOTION_COLLECTION_NAME)
+         vectordb.persist()
+         print("Count of Notion collections: " + str(vectordb._collection.count()))
+     else:
+         print("Count of Notion collections: " + str(vectordb._collection.count()))
+
+ def createChromadb(documents, embeddings):
+     vectordb = Chroma(persist_directory=CHROMA_PERSIST_DIRECTORY, embedding_function=embeddings,
+                       collection_name=CHROMA_COLLECTION_NAME)
+     if vectordb._collection.count() == 0:
+         print("Creating chromadb")
+         vectordb = Chroma.from_documents(documents=documents, embedding=embeddings, persist_directory=CHROMA_PERSIST_DIRECTORY, collection_name=CHROMA_COLLECTION_NAME)
+         vectordb.persist()
+         print("Count of collections: " + str(vectordb._collection.count()))
+     else:
+         print("Count of collections: " + str(vectordb._collection.count()))
+
+ def createFaissVectorstore(documents, embeddings):
+     print("Creating vectorstore...")
+     vectorstore = FAISS.from_documents(documents, embeddings)
+     with open("myvectorstore.pkl", "wb") as f:
+         pickle.dump(vectorstore, f)
+
+ def enrichMetadata(docs):
+
+     for doc in docs:
+         for m in custom_meta_data:
+             if doc.metadata["source"] != "":
+                 if m.get("name") in doc.metadata["source"]:
+                     doc.metadata["name"] = m.get("name")
+                     doc.metadata["profile"] = m.get("profile")
+                     doc.metadata["creationYear"] = m.get("creationYear")
+                     doc.metadata["topics"] = m.get("topics")
+
+ class MyLoader:
+     def __init__(self, file_path, **kwargs):
+         if file_path.endswith('.docx'):
+             self.loader = UnstructuredWordDocumentLoader(file_path, **kwargs)
+         elif file_path.endswith('.pdf'):
+             self.loader = PyPDFLoader(file_path, **kwargs)
+         elif file_path.endswith('.csv'):
+             self.loader = CSVLoader(file_path, **kwargs)
+         else:
+             self.loader = TextLoader(file_path, **kwargs)
+
+     def load(self):
+         return self.loader.load()
+
+ custom_meta_data = [
+     {
+         "name": "Tanmay Chopra",
+         "profile": "https://www.linkedin.com/in/tanmayc98/",
+         "creationYear": "2023",
+         "topics": "Pinecone",
+     },
+     {
+         "name": "Neal Patel",
+         "profile": "https://www.linkedin.com/in/nealpatel112/",
+         "creationYear": "2023",
+         "topics": "Core - Model",
+     },
+     {
+         "name": "Navid",
+         "profile": "https://www.linkedin.com/in/Navid",
+         "creationYear": "2022",
+         "topics": "LLM",
+     },
+     {
+         "name": "Josua Krause",
+         "profile": "https://www.linkedin.com/in/Josua",
+         "creationYear": "2022",
+         "topics": "vector databases",
+     },
+     {
+         "name": "Jay Zhong",
+         "profile": "https://www.linkedin.com/in/Jay",
+         "creationYear": "2021",
+         "topics": "LLM",
+     },
+     {
+         "name": "Evan",
+         "profile": "https://www.linkedin.com/in/Evan",
+         "creationYear": "2021",
+         "topics": "OpenAI",
+     },
+     {
+         "name": "Siva_values",
+         "profile": "https://www.linkedin.com/Siva",
+         "creationYear": "2023",
+         "topics": "Personal goals"
+     },
+ ]
+
+ def ingestData():
+     os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
+     print("Loading data...")
+
+     embeddings = OpenAIEmbeddings()
+
+     if DB_TYPE == DBTypes['FAISS'].value or DB_TYPE == DBTypes['CHROMA'].value:
+         loader = DirectoryLoader(DATA_DIRECTORY, glob="**/*.*", loader_cls=MyLoader)
+         print("Loading directory")
+         docs = loader.load()
+
+         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+
+         enrichMetadata(docs)
+         print("splitting documents")
+         documents = text_splitter.split_documents(docs)
+         if DB_TYPE == DBTypes['FAISS'].value:
+             createFaissVectorstore(documents, embeddings)
+         elif DB_TYPE == DBTypes['CHROMA'].value:
+             createChromadb(documents, embeddings)
+     elif DB_TYPE == DBTypes['NOTION'].value:
+         loader = NotionDBLoader(
+             integration_token=NOTION_API_KEY,
+             database_id=NOTION_DB,
+             request_timeout_sec=30,  # optional, defaults to 10
+         )
+
+         documents = loader.load()
+         createChromaFromNotiondb(documents, embeddings)
+
+ #ingestData()
logs/output.log ADDED
@@ -0,0 +1,7 @@
+ Running on local URL: http://127.0.0.1:7860
+ Running on public URL: https://5ecacbc10380802821.gradio.live
+
+ This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
+ Keyboard interruption in main thread... closing server.
+ Killing tunnel 127.0.0.1:7860 <> https://5ecacbc10380802821.gradio.live
+ Keyboard interruption in main thread... closing server.
metadatainfo.py ADDED
@@ -0,0 +1,27 @@
+ from langchain.chains.query_constructor.base import AttributeInfo
+ metadata_field_info = [
+     AttributeInfo(
+         name="source",
+         description="Document path",
+         type="str",
+     ),
+     AttributeInfo(
+         name="name",
+         description="Name of the person",
+         type="str",
+     ),
+     AttributeInfo(
+         name="profile",
+         description="Linkedin profile",
+         type="str",
+     ),
+     AttributeInfo(
+         name="creationYear",
+         description="creation Year",
+         type="str",
+     ),
+     AttributeInfo(
+         name="topics",
+         description="The topics the person discussed",
+         type="str",
+     ),
+ ]
myvectorstore.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5d94de10425e826e311340ed98b6bc9c176cbbb650c87b14fdf610e697f23b84
+ size 122738
notionMetadataInfo.py ADDED
@@ -0,0 +1,23 @@
+ from langchain.chains.query_constructor.base import AttributeInfo
+ notion_metadata_field_info = [
+     AttributeInfo(
+         name="source",
+         description="source",
+         type="str",
+     ),
+     AttributeInfo(
+         name="name",
+         description="Name",
+         type="str",
+     ),
+     AttributeInfo(
+         name="id",
+         description="Id of the person",
+         type="str",
+     ),
+     AttributeInfo(
+         name="tags",
+         description="tags",
+         type="str",
+     ),
+ ]
notiondb/chroma.sqlite3 ADDED
Binary file (98.3 kB)
old/app_copy.py ADDED
@@ -0,0 +1,140 @@
+ import os
+ from typing import Optional, Tuple
+ from threading import Lock
+ from query_data import chain_options
+
+ import gradio as gr
+
+ from query_data import get_basic_qa_chain
+
+
+ def set_openai_api_key(api_key: str):
+     """Set the api key and return chain.
+     If no api_key, then None is returned.
+     """
+     os.environ["OPENAI_API_KEY"] = "sk-<redacted>"
+     if api_key:
+         # os.environ["OPENAI_API_KEY"] = api_key
+         chain = get_basic_qa_chain
+         # os.environ["OPENAI_API_KEY"] = ""
+         return chain
+
+ def chatFlag(message):
+     return message
+ '''
+ def getChainSelectedByUser() :
+     chain = get_basic_qa_chain
+
+     if (chainType == "get_qa_with_sources_chain" ):
+         chain = get_qa_with_sources_chain
+     elif (chainType == "get_custom_prompt_qa_chain"):
+         chain = get_custom_prompt_qa_chain
+     elif (chainType == "get_condense_prompt_qa_chain"):
+         chain = get_condense_prompt_qa_chain
+     elif (chainType == "get_retrievalqa_with_sources_chain"):
+         chain = get_retrievalqa_with_sources_chain
+
+     print("landed")
+     print("chainType" + chainType.value)
+
+     return chain
+ '''
+ class ChatWrapper:
+
+     def __init__(self):
+         self.lock = Lock()
+
+     def __call__(
+         self, api_key: str, inp: str, history: Optional[Tuple[str, str]], chain
+     ):
+         """Execute the chat functionality."""
+         self.lock.acquire()
+         try:
+             history = history or []
+             # If chain is None, that is because no API key was provided.
+             if chain is None:
+                 history.append((inp, "Please paste your OpenAI key to use"))
+                 return history, history
+             # Set OpenAI key
+             import openai
+             openai.api_key = api_key
+             # Run chain and append input.
+             output = chain({"question": inp})["answer"]
+             history.append((inp, output))
+         except Exception as e:
+             raise e
+         finally:
+             self.lock.release()
+         return history, history
+
+
+ chat = ChatWrapper()
+
+ block = gr.Blocks(css=".gradio-container {background-color: lightblue}",
+                   )
+
+ with block:
+     with gr.Row():
+         gr.Markdown(
+             "<h3><center>Chat-Your-Data</center></h3>")
+
+         openai_api_key_textbox = gr.Textbox(
+             placeholder="",
+             show_label=False,
+             lines=1,
+             type="password",
+         )
+     chatbot = gr.Chatbot()
+
+     with gr.Row():
+         message = gr.Textbox(
+             label="What's your question?",
+             placeholder="Ask questions about the uploaded documents",
+             lines=1,
+         )
+         submit = gr.Button(value="Send", variant="secondary").style(
+             full_width=False)
+
+     gr.Examples(
+         examples=[
+             "What did the president say about Ketanji Brown Jackson?",
+             "Did he mention Stephen Breyer?",
+             "What was his stance on Ukraine?",
+         ],
+         inputs=message,
+     )
+
+     with gr.Row():
+         chainType = gr.Dropdown(list(chain_options.keys()),
+                                 label="Chain Type",
+                                 value="get_retrievalqa_with_sources_chain",
+                                 )
+
+     gr.HTML("Mine your data by AI.")
+
+     gr.HTML(
+         "<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
+     )
+
+     state = gr.State()
+     agent_state = gr.State()
+
+     submit.click(chat, inputs=[openai_api_key_textbox, message,
+                                state, agent_state], outputs=[chatbot, state])
+     message.submit(chat, inputs=[
+         openai_api_key_textbox, message, state, agent_state], outputs=[chatbot, state])
+
+     openai_api_key_textbox.change(
+         set_openai_api_key,
+         inputs=[openai_api_key_textbox],
+         outputs=[agent_state],
+     )
+
+     # chainType.change(getChainSelectedByUser(), inputs=[chainType.value],
+     #                  outputs=[agent_state])
+
+ block.launch(debug=True)
query_data.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+
3
+ from Constants import *
4
+ from langchain.chains import (ConversationalRetrievalChain, RetrievalQA,
5
+ RetrievalQAWithSourcesChain)
6
+ from langchain.chains.query_constructor.base import AttributeInfo
7
+ from langchain.chat_models import ChatOpenAI
8
+ from langchain.memory import ConversationBufferMemory
9
+ from langchain.prompts.chat import (ChatPromptTemplate,
10
+ HumanMessagePromptTemplate,
11
+ SystemMessagePromptTemplate)
12
+ from langchain.prompts.prompt import PromptTemplate
13
+ #from langchain.retrievers.self_query import BaseTranslator
14
+ from langchain.retrievers.self_query.base import SelfQueryRetriever
15
+ from langchain.chains.query_constructor.ir import Visitor
16
+ from langchain.vectorstores import Chroma
17
+ from langchain.vectorstores.base import VectorStoreRetriever
18
+ from metadatainfo import metadata_field_info
19
+ from notionMetadataInfo import notion_metadata_field_info
20
+ from langchain.embeddings import OpenAIEmbeddings
21
+ from typing import Any, List, Optional, Sequence, Union
22
+ import chromadb
23
+ from db_types import *
24
+
25
+ _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
26
+ You can assume the question about persons.
27
+
28
+ Chat History:
29
+ {chat_history}
30
+ Follow Up Input: {question}
31
+ Standalone question:"""
32
+ CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)
33
+
34
+ template = """You are an AI assistant for answering questions about persons.
35
+ You are given the following extracted parts of a long document and a question. Provide a conversational answer.
36
+ If you don't know the answer, do not try to makeup the answer from other sources. If the answer is found quote the source of the answer as SOURCE:
37
+ Also include Topics in the answers as "TOPICS": Also include tags in the answers as "TAGS":
38
+ Question: {question}
39
+ =========
40
+ {context}
41
+ =========
42
+ Answer in Markdown:"""
43
+ QA_PROMPT = PromptTemplate(template=template, input_variables=[
44
+ "question", "context"])
45
+
46
+ class MyVisitor(Visitor) :
47
+
48
+ def visit_operation(self, op) -> Any:
49
+ print ("in operation")
50
+ return op
51
+ def visit_comparison(self, comparison) -> Any:
52
+ print("in comparison")
53
+ return comparison
54
+ def visit_structured_query(self, arg2) -> Any:
55
+ print("in structured query "+ arg2.query)
56
+ return self, arg2
57
+
58
+
59
+ def load_retriever():
60
+
61
+ retriever = VectorStoreRetriever(vectorstore=get_vectorstore(),dict=metadata_field_info)
62
+ return retriever
63
+
+ def get_vectorstore():
+     print("Reading from vectorstore " + DB_TYPE)
+     custom_meta_data_info = metadata_field_info  # schema for self-query filters; Notion overrides below
+     if DB_TYPE == DBTypes['FAISS'].value:
+         print("reading faiss vectorstore")
+         vectorstore_path = PERSIST_DIRECTORY + "myvectorstore.pkl"
+         with open(vectorstore_path, "rb") as f:
+             vectorstore = pickle.load(f)
+     elif DB_TYPE == DBTypes['NOTION'].value:
+         print("reading from Notion...")
+         custom_meta_data_info = notion_metadata_field_info
+         vectorstore = Chroma(persist_directory=NOTION_PERSIST_DIRECTORY,
+                              embedding_function=OpenAIEmbeddings(),
+                              collection_name=NOTION_COLLECTION_NAME)
+         print("Notion collection count : " + str(vectorstore._collection.count()))
+     else:
+         vectorstore = Chroma(persist_directory=CHROMA_PERSIST_DIRECTORY,
+                              collection_name=CHROMA_COLLECTION_NAME,
+                              embedding_function=OpenAIEmbeddings())
+         print("Chroma collection count : " + str(vectorstore._collection.count()))
+     return vectorstore
+
+ def get_basic_qa_chain():
+     llm = ChatOpenAI(model_name="gpt-4", temperature=0)
+     retriever = load_retriever()
+     memory = ConversationBufferMemory(
+         memory_key="chat_history", return_messages=True)
+     model = ConversationalRetrievalChain.from_llm(
+         llm=llm,
+         retriever=retriever,
+         memory=memory,
+         verbose=True)
+     return model
+
+
+ def get_custom_prompt_qa_chain():
+     llm = ChatOpenAI(model_name="gpt-4", temperature=0)
+     retriever = load_retriever()
+     memory = ConversationBufferMemory(
+         memory_key="chat_history", return_messages=True)
+     # see: https://github.com/langchain-ai/langchain/issues/6635
+     # see: https://github.com/langchain-ai/langchain/issues/1497
+     model = ConversationalRetrievalChain.from_llm(
+         llm=llm,
+         retriever=retriever,
+         memory=memory,
+         combine_docs_chain_kwargs={"prompt": QA_PROMPT})
+     return model
+
+
+ def get_condense_prompt_qa_chain():
+     llm = ChatOpenAI(model_name="gpt-4", temperature=0)
+     retriever = load_retriever()
+     memory = ConversationBufferMemory(
+         memory_key="chat_history", return_messages=True)
+     # see: https://github.com/langchain-ai/langchain/issues/5890
+     model = ConversationalRetrievalChain.from_llm(
+         llm=llm,
+         retriever=retriever,
+         memory=memory,
+         condense_question_prompt=CONDENSE_QUESTION_PROMPT,
+         combine_docs_chain_kwargs={"prompt": QA_PROMPT})
+     return model
+
+ def get_retrievalqa_with_sources_chain():
+     system_template = """Use the following pieces of context to answer the user's question.
+     Take note of the sources and include them in the answer in the format: "SOURCES: source1 source2", using "SOURCES" in capital letters regardless of the number of sources.
+     Also include topics in the answer as "TOPICS", tags as "TAGS", and creationYear as "YEAR". If you don't know the answer, just say "I do not know"; don't try to make up an answer.
+     ----------------
+     {summaries}"""
+     messages = [
+         SystemMessagePromptTemplate.from_template(system_template),
+         HumanMessagePromptTemplate.from_template("{question}")
+     ]
+
+     prompt = ChatPromptTemplate.from_messages(messages)
+     chain_type_kwargs = {"prompt": prompt}
+
+     document_content_description = "Personal files"
+     llm = ChatOpenAI(model_name="gpt-4", temperature=0)
+     vectorstore = get_vectorstore()
+     # The self-query retriever lets the LLM turn metadata mentions in the
+     # question into vector-store filters built from metadata_field_info.
+     retriever = SelfQueryRetriever.from_llm(
+         llm,
+         vectorstore,
+         document_content_description,
+         metadata_field_info,
+         # structured_query_translator=MyVisitor(),  # optional debugging translator
+         verbose=True,
+         enable_limit=True,
+     )
+
+     def model_func(question):
+         # app.py passes {"question": ...}; accept either a dict or a raw string.
+         query = question["question"] if isinstance(question, dict) else question
+         chain = RetrievalQAWithSourcesChain.from_chain_type(
+             llm,
+             chain_type="stuff",
+             retriever=retriever,
+             chain_type_kwargs=chain_type_kwargs,
+         )
+         return chain({"question": query})
+
+     return model_func
+
+ def get_qa_with_sources_chain():
+     llm = ChatOpenAI(model_name="gpt-4", temperature=0)
+     retriever = load_retriever()
+     history = []
+     model = ConversationalRetrievalChain.from_llm(
+         llm=llm,
+         retriever=retriever,
+         return_source_documents=True,
+         verbose=True)
+
+     def model_func(question):
+         # bug: return_source_documents doesn't work with the built-in memory,
+         # so the chat history is threaded through by hand instead.
+         # see: https://github.com/langchain-ai/langchain/issues/5630
+         new_input = {"question": question['question'], "chat_history": history}
+         result = model(new_input)
+         history.append((question['question'], result['answer']))
+         return result
+
+     return model_func
+
+
+ chain_options = {
+     "basic": get_basic_qa_chain,
+     "with_sources": get_qa_with_sources_chain,
+     "custom_prompt": get_custom_prompt_qa_chain,
+     "condense_prompt": get_condense_prompt_qa_chain,
+     "retrieval_sources_chain": get_retrievalqa_with_sources_chain,
+ }
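`chain_options` is the lookup table the UI uses to build a chain from a dropdown choice. A minimal sketch of driving it directly, outside Gradio (the question strings are illustrative):

```python
from query_data import chain_options

# "basic" returns a ConversationalRetrievalChain with its own memory.
chain = chain_options["basic"]()
result = chain({"question": "Who is mentioned most often in my notes?"})
print(result["answer"])

# "with_sources" returns a plain function that threads history by hand
# and also returns the retrieved documents.
sourced = chain_options["with_sources"]()
result = sourced({"question": "Where does that answer come from?"})
print(result["answer"])
print(result["source_documents"])
```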
read_notion.py ADDED
@@ -0,0 +1,47 @@
+ import requests
+ from apiKey import *
+ from Constants import *
+
+ NOTION_TOKEN = NOTION_API_KEY
+ DATABASE_ID = NOTION_DB
+
+ headers = {
+     "Authorization": "Bearer " + NOTION_TOKEN,
+     "Content-Type": "application/json",
+     "Notion-Version": "2022-06-28",
+ }
+
+ def get_pages(num_pages=None):
+     """
+     If num_pages is None, get all pages, otherwise just the defined number.
+     """
+     url = f"https://api.notion.com/v1/databases/{DATABASE_ID}/query"
+
+     get_all = num_pages is None
+     # MAX_PAGES_TO_READ is not defined in Constants.py, so fall back to
+     # Notion's maximum page_size of 100 when fetching everything.
+     page_size = 100 if get_all else num_pages
+
+     payload = {"page_size": page_size}
+     response = requests.post(url, json=payload, headers=headers)
+     data = response.json()
+
+     # Uncomment this to dump all data to a file:
+     # import json
+     # with open('db.json', 'w', encoding='utf8') as f:
+     #     json.dump(data, f, ensure_ascii=False, indent=4)
+
+     results = data["results"]
+     # Notion paginates: replay next_cursor as start_cursor until has_more is False.
+     while data["has_more"] and get_all:
+         payload = {"page_size": page_size, "start_cursor": data["next_cursor"]}
+         response = requests.post(url, json=payload, headers=headers)
+         data = response.json()
+         results.extend(data["results"])
+
+     return results
+
+ #get_pages()
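`get_pages` follows Notion's cursor pagination: each response carries `has_more` and `next_cursor`, and the cursor is replayed as `start_cursor` until the database is exhausted. A minimal sketch of calling it (only the page `id` field is assumed here, since the database schema isn't shown):

```python
from read_notion import get_pages

pages = get_pages()  # all pages, 100 per request
print(f"fetched {len(pages)} pages")

first_ten = get_pages(num_pages=10)  # single request, first 10 pages only
for page in first_ten:
    # Each result is a full Notion page object; its properties depend
    # on the database schema, so only the id is printed here.
    print(page["id"])
```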
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ langchain
+ openai
+ faiss-cpu
+ unstructured
+ tiktoken
+ rich  # for console formatting
+ gradio
+ chromadb  # Chroma vector store used by query_data.py
+ requests  # Notion API calls in read_notion.py
utilities.py ADDED
@@ -0,0 +1,31 @@
+ from enum import Enum
+ from typing import List, Tuple, Type
+
+ import numpy as np
+
+ from langchain.docstore.document import Document
+
+
+ def transform_complex_metadata(
+     documents: List[Document],
+     *,
+     allowed_types: Tuple[Type, ...] = (str, bool, int, float)
+ ) -> List[Document]:
+     """Filter out metadata types that are not supported for a vector store."""
+     updated_documents = []
+     for document in documents:
+         transformed_metadata = {}
+         for key, value in document.metadata.items():
+             if isinstance(value, allowed_types):
+                 transformed_metadata[key] = value
+             elif isinstance(value, list):
+                 # Flatten lists to a comma-joined string; str() guards
+                 # against non-string list items.
+                 transformed_metadata[key] = ','.join(str(v) for v in value)
+             # any other unsupported type is silently dropped
+         document.metadata = transformed_metadata
+         updated_documents.append(document)
+
+     return updated_documents
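`transform_complex_metadata` exists because Chroma only accepts scalar metadata values: lists are flattened into comma-joined strings and anything else is dropped. A short sketch of the behavior on a made-up document:

```python
from langchain.docstore.document import Document

from utilities import transform_complex_metadata

doc = Document(
    page_content="example",
    metadata={"tags": ["family", "travel"], "year": 2023, "extra": {"a": 1}},
)

cleaned = transform_complex_metadata([doc])[0]
# the list is joined, the int kept, the dict silently dropped:
# {'tags': 'family,travel', 'year': 2023}
print(cleaned.metadata)
```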
vectorstore.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:00e5530b85a9588de9a81eb6e60c633d21774b26e8a8fcb0b6be85dfa95167f5
+ size 523469