maclenn77 committed on
Commit
39103b5
·
unverified ·
1 Parent(s): 7eeda1a

Save pdf content in chroma collection (#5)

Browse files
Files changed (5) hide show
  1. .gitignore +2 -0
  2. README.md +1 -0
  3. app.py +40 -5
  4. requirements.txt +2 -0
  5. wk_flow_requirements.txt +2 -1
.gitignore CHANGED
@@ -158,3 +158,5 @@ cython_debug/
158
  # and can be added to the global gitignore or merged into this file. For a more nuclear
159
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
  #.idea/
 
 
 
158
  # and can be added to the global gitignore or merged into this file. For a more nuclear
159
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
  #.idea/
161
+
162
+ tmp/
README.md CHANGED
@@ -24,5 +24,6 @@ Run streamlit run app.py
24
 
25
  - Streamlit
26
  - HuggingFace
 
27
  - pymupdf for pdf extraction
28
  - An OpenAI API key
 
24
 
25
  - Streamlit
26
  - HuggingFace
27
+ - ChromaDB
28
  - pymupdf for pdf extraction
29
  - An OpenAI API key
app.py CHANGED
@@ -1,18 +1,53 @@
1
  """ A simple example of Streamlit. """
2
- import streamlit as st
 
3
  import fitz
 
4
 
5
- # from tika import parser
6
  # from openai import OpenAI
7
 
8
- # client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  pdf = st.file_uploader("Upload a file", type="pdf")
11
 
12
- if st.button("Extract text"):
 
13
  if pdf is not None:
14
  with fitz.open(stream=pdf.read(), filetype="pdf") as doc: # open document
15
  text = chr(12).join([page.get_text() for page in doc])
16
- st.write(text)
 
 
 
 
 
17
  else:
18
  st.write("Please upload a file of type: pdf")
 
 
 
 
 
 
 
 
 
1
  """ A simple example of Streamlit. """
2
+ from datetime import datetime as Date
3
+ import chromadb
4
  import fitz
5
+ import streamlit as st
6
 
 
7
  # from openai import OpenAI
8
 
9
+ chroma_client = chromadb.PersistentClient(path="tmp/chroma")
10
+ chroma_client.heartbeat()
11
+
12
+ collection = chroma_client.get_or_create_collection("pdf-explainer")
13
+
14
+ # Query ChromaDb
15
+ query = st.text_input("Query ChromaDb", value="", placeholder="Enter query")
16
+ if st.button("Search"):
17
+ results = collection.query(
18
+ query_texts=[query],
19
+ n_results=3,
20
+ )
21
+
22
+ for idx, result in enumerate(results["documents"][0]):
23
+ st.markdown(
24
+ result[0:150]
25
+ + "..."
26
+ + "**Source:** "
27
+ + results["metadatas"][0][idx]["source"]
28
+ )
29
+
30
 
31
  pdf = st.file_uploader("Upload a file", type="pdf")
32
 
33
+
34
+ if st.button("Save"):
35
  if pdf is not None:
36
  with fitz.open(stream=pdf.read(), filetype="pdf") as doc: # open document
37
  text = chr(12).join([page.get_text() for page in doc])
38
+ st.write(text[0:200])
39
+ collection.add(
40
+ documents=[text],
41
+ metadatas=[{"source": pdf.name}],
42
+ ids=[pdf.name + str(Date.now())],
43
+ )
44
  else:
45
  st.write("Please upload a file of type: pdf")
46
+
47
+
48
+ if st.button("Chroma data collection"):
49
+ st.write(collection)
50
+
51
+ if st.button("Delete Chroma Collection"):
52
+ chroma_client.delete_collection(collection.name)
53
+ st.write("Deleted Chroma Collection")
requirements.txt CHANGED
@@ -1,6 +1,8 @@
1
  openai
 
2
  langchain
3
  pymupdf
 
4
  chromadb
5
  sentence_transformers
6
  streamlit
 
1
  openai
2
+ tiktoken
3
  langchain
4
  pymupdf
5
+ pypdf
6
  chromadb
7
  sentence_transformers
8
  streamlit
wk_flow_requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  streamlit
2
  pymupdf
3
- pylint
 
 
1
  streamlit
2
  pymupdf
3
+ pylint
4
+ chromadb