maclenn77 committed on
Commit
9dddd1e
·
unverified ·
1 Parent(s): 39103b5

Create embeddings with OpenAI (#6)

Browse files
Files changed (3) hide show
  1. app.py +55 -11
  2. requirements.txt +1 -1
  3. wk_flow_requirements.txt +4 -1
app.py CHANGED
@@ -1,15 +1,36 @@
1
  """ A simple example of Streamlit. """
2
  from datetime import datetime as Date
 
 
 
3
  import chromadb
 
4
  import fitz
5
  import streamlit as st
 
 
 
6
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  # from openai import OpenAI
8
 
9
  chroma_client = chromadb.PersistentClient(path="tmp/chroma")
10
  chroma_client.heartbeat()
11
 
12
- collection = chroma_client.get_or_create_collection("pdf-explainer")
 
 
13
 
14
  # Query ChromaDb
15
  query = st.text_input("Query ChromaDb", value="", placeholder="Enter query")
@@ -25,29 +46,52 @@ if st.button("Search"):
25
  + "..."
26
  + "**Source:** "
27
  + results["metadatas"][0][idx]["source"]
 
 
28
  )
29
 
30
 
31
  pdf = st.file_uploader("Upload a file", type="pdf")
32
 
33
-
34
- if st.button("Save"):
35
- if pdf is not None:
36
- with fitz.open(stream=pdf.read(), filetype="pdf") as doc: # open document
37
- text = chr(12).join([page.get_text() for page in doc])
38
- st.write(text[0:200])
39
  collection.add(
40
  documents=[text],
41
  metadatas=[{"source": pdf.name}],
42
  ids=[pdf.name + str(Date.now())],
43
  )
44
- else:
45
- st.write("Please upload a file of type: pdf")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
 
48
  if st.button("Chroma data collection"):
49
  st.write(collection)
50
 
51
  if st.button("Delete Chroma Collection"):
52
- chroma_client.delete_collection(collection.name)
53
- st.write("Deleted Chroma Collection")
 
 
 
 
1
  """ A simple example of Streamlit. """
2
  from datetime import datetime as Date
3
+ import textwrap
4
+ import os
5
+ import tiktoken
6
  import chromadb
7
+ from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
8
  import fitz
9
  import streamlit as st
10
+ import openai
11
+ from dotenv import load_dotenv
12
+ from openai import OpenAI
13
 
14
+ load_dotenv()
15
+
16
+ if os.getenv("OPENAI_API_KEY") is None:
17
+ st.error("Please set OPENAI_API_KEY environment variable")
18
+ st.stop()
19
+ else:
20
+ openai.api_key = os.getenv("OPENAI_API_KEY")
21
+
22
+ client = OpenAI()
23
+ embedding_function = OpenAIEmbeddingFunction(
24
+ api_key=openai.api_key, model_name="text-embedding-ada-002"
25
+ )
26
  # from openai import OpenAI
27
 
28
  chroma_client = chromadb.PersistentClient(path="tmp/chroma")
29
  chroma_client.heartbeat()
30
 
31
+ collection = chroma_client.get_or_create_collection(
32
+ name="pdf-explainer", embedding_function=embedding_function
33
+ )
34
 
35
  # Query ChromaDb
36
  query = st.text_input("Query ChromaDb", value="", placeholder="Enter query")
 
46
  + "..."
47
  + "**Source:** "
48
  + results["metadatas"][0][idx]["source"]
49
+ + " **Tokens:** "
50
+ + str(results["metadatas"][0][idx]["num_tokens"])
51
  )
52
 
53
 
54
  pdf = st.file_uploader("Upload a file", type="pdf")
55
 
56
+ if pdf is not None:
57
+ with fitz.open(stream=pdf.read(), filetype="pdf") as doc: # open document
58
+ text = chr(12).join([page.get_text() for page in doc])
59
+ st.write(text[0:200])
60
+ if st.button("Add to collection"):
 
61
  collection.add(
62
  documents=[text],
63
  metadatas=[{"source": pdf.name}],
64
  ids=[pdf.name + str(Date.now())],
65
  )
66
+ if st.button("Save chunks"):
67
+ with st.spinner("Saving chunks..."):
68
+ chunks = textwrap.wrap(text, 24000)
69
+ for idx, chunk in enumerate(chunks):
70
+ encoding = tiktoken.get_encoding("cl100k_base")
71
+ num_tokens = len(encoding.encode(chunk))
72
+ response = (
73
+ client.embeddings.create(
74
+ input=chunk, model="text-embedding-ada-002"
75
+ )
76
+ .data[0]
77
+ .embedding
78
+ )
79
+ collection.add(
80
+ embeddings=[response],
81
+ documents=[chunk],
82
+ metadatas=[{"source": pdf.name, "num_tokens": num_tokens}],
83
+ ids=[pdf.name + str(idx)],
84
+ )
85
+ else:
86
+ st.write("Please upload a file of type: pdf")
87
 
88
 
89
  if st.button("Chroma data collection"):
90
  st.write(collection)
91
 
92
  if st.button("Delete Chroma Collection"):
93
+ try:
94
+ chroma_client.delete_collection(collection.name)
95
+ except AttributeError:
96
+ st.error("Collection erased.")
97
+
requirements.txt CHANGED
@@ -3,6 +3,6 @@ tiktoken
3
  langchain
4
  pymupdf
5
  pypdf
6
- chromadb
7
  sentence_transformers
8
  streamlit
 
3
  langchain
4
  pymupdf
5
  pypdf
6
+ chromadb>='0.4.18'
7
  sentence_transformers
8
  streamlit
wk_flow_requirements.txt CHANGED
@@ -1,4 +1,7 @@
1
  streamlit
2
  pymupdf
 
 
3
  pylint
4
- chromadb
 
 
1
  streamlit
2
  pymupdf
3
+ openai
4
+ tiktoken
5
  pylint
6
+ langchain
7
+ chromadb>='0.4.18'