Spaces:
Runtime error
Runtime error
Save pdf content in chroma collection (#5)
Browse files
- .gitignore +2 -0
- README.md +1 -0
- app.py +40 -5
- requirements.txt +2 -0
- wk_flow_requirements.txt +2 -1
.gitignore
CHANGED
|
@@ -158,3 +158,5 @@ cython_debug/
|
|
| 158 |
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 159 |
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 160 |
#.idea/
|
|
|
|
|
|
|
|
|
| 158 |
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 159 |
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 160 |
#.idea/
|
| 161 |
+
|
| 162 |
+
tmp/
|
README.md
CHANGED
|
@@ -24,5 +24,6 @@ Run streamlit run app.py
|
|
| 24 |
|
| 25 |
- Streamlit
|
| 26 |
- HuggingFace
|
|
|
|
| 27 |
- pymupdf for pdf extraction
|
| 28 |
- An OpenAI API key
|
|
|
|
| 24 |
|
| 25 |
- Streamlit
|
| 26 |
- HuggingFace
|
| 27 |
+
- ChromaDB
|
| 28 |
- pymupdf for pdf extraction
|
| 29 |
- An OpenAI API key
|
app.py
CHANGED
|
@@ -1,18 +1,53 @@
|
|
| 1 |
""" A simple example of Streamlit. """
|
| 2 |
-
import streamlit as st
|
|
|
|
| 3 |
import fitz
|
|
|
|
| 4 |
|
| 5 |
-
# from tika import parser
|
| 6 |
# from openai import OpenAI
|
| 7 |
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
pdf = st.file_uploader("Upload a file", type="pdf")
|
| 11 |
|
| 12 |
-
|
|
|
|
| 13 |
if pdf is not None:
|
| 14 |
with fitz.open(stream=pdf.read(), filetype="pdf") as doc: # open document
|
| 15 |
text = chr(12).join([page.get_text() for page in doc])
|
| 16 |
-
st.write(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
else:
|
| 18 |
st.write("Please upload a file of type: pdf")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
""" A simple example of Streamlit. """
|
| 2 |
+
from datetime import datetime as Date
|
| 3 |
+
import chromadb
|
| 4 |
import fitz
|
| 5 |
+
import streamlit as st
|
| 6 |
|
|
|
|
| 7 |
# from openai import OpenAI
|
| 8 |
|
| 9 |
+
chroma_client = chromadb.PersistentClient(path="tmp/chroma")
|
| 10 |
+
chroma_client.heartbeat()
|
| 11 |
+
|
| 12 |
+
collection = chroma_client.get_or_create_collection("pdf-explainer")
|
| 13 |
+
|
| 14 |
+
# Query ChromaDb
|
| 15 |
+
query = st.text_input("Query ChromaDb", value="", placeholder="Enter query")
|
| 16 |
+
if st.button("Search"):
|
| 17 |
+
results = collection.query(
|
| 18 |
+
query_texts=[query],
|
| 19 |
+
n_results=3,
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
for idx, result in enumerate(results["documents"][0]):
|
| 23 |
+
st.markdown(
|
| 24 |
+
result[0:150]
|
| 25 |
+
+ "..."
|
| 26 |
+
+ "**Source:** "
|
| 27 |
+
+ results["metadatas"][0][idx]["source"]
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
|
| 31 |
pdf = st.file_uploader("Upload a file", type="pdf")
|
| 32 |
|
| 33 |
+
|
| 34 |
+
if st.button("Save"):
|
| 35 |
if pdf is not None:
|
| 36 |
with fitz.open(stream=pdf.read(), filetype="pdf") as doc: # open document
|
| 37 |
text = chr(12).join([page.get_text() for page in doc])
|
| 38 |
+
st.write(text[0:200])
|
| 39 |
+
collection.add(
|
| 40 |
+
documents=[text],
|
| 41 |
+
metadatas=[{"source": pdf.name}],
|
| 42 |
+
ids=[pdf.name + str(Date.now())],
|
| 43 |
+
)
|
| 44 |
else:
|
| 45 |
st.write("Please upload a file of type: pdf")
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
if st.button("Chroma data collection"):
|
| 49 |
+
st.write(collection)
|
| 50 |
+
|
| 51 |
+
if st.button("Delete Chroma Collection"):
|
| 52 |
+
chroma_client.delete_collection(collection.name)
|
| 53 |
+
st.write("Deleted Chroma Collection")
|
requirements.txt
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
openai
|
|
|
|
| 2 |
langchain
|
| 3 |
pymupdf
|
|
|
|
| 4 |
chromadb
|
| 5 |
sentence_transformers
|
| 6 |
streamlit
|
|
|
|
| 1 |
openai
|
| 2 |
+
tiktoken
|
| 3 |
langchain
|
| 4 |
pymupdf
|
| 5 |
+
pypdf
|
| 6 |
chromadb
|
| 7 |
sentence_transformers
|
| 8 |
streamlit
|
wk_flow_requirements.txt
CHANGED
|
@@ -1,3 +1,4 @@
|
|
| 1 |
streamlit
|
| 2 |
pymupdf
|
| 3 |
-
pylint
|
|
|
|
|
|
| 1 |
streamlit
|
| 2 |
pymupdf
|
| 3 |
+
pylint
|
| 4 |
+
chromadb
|