Spaces:
Runtime error
Runtime error
fix: Extract pdf content (#4)
Browse files
* Change the PDF extractor library to one that can be used with HF
- README.md +2 -2
- app.py +9 -3
- requirements.txt +1 -1
- wk_flow_requirements.txt +1 -1
README.md
CHANGED
|
@@ -24,5 +24,5 @@ Run streamlit run app.py
|
|
| 24 |
|
| 25 |
- Streamlit
|
| 26 |
- HuggingFace
|
| 27 |
-
-
|
| 28 |
-
-
|
|
|
|
| 24 |
|
| 25 |
- Streamlit
|
| 26 |
- HuggingFace
|
| 27 |
+
- PyMuPDF for PDF extraction
|
| 28 |
+
- An OpenAI API key
|
app.py
CHANGED
|
@@ -1,12 +1,18 @@
|
|
| 1 |
""" A simple example of Streamlit. """
|
| 2 |
import streamlit as st
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
pdf = st.file_uploader("Upload a file", type="pdf")
|
| 6 |
|
| 7 |
if st.button("Extract text"):
|
| 8 |
if pdf is not None:
|
| 9 |
-
|
| 10 |
-
|
|
|
|
| 11 |
else:
|
| 12 |
st.write("Please upload a file of type: pdf")
|
|
|
|
| 1 |
""" A simple example of Streamlit. """
|
| 2 |
import streamlit as st
|
| 3 |
+
import fitz
|
| 4 |
+
|
| 5 |
+
# from tika import parser
|
| 6 |
+
# from openai import OpenAI
|
| 7 |
+
|
| 8 |
+
# client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
| 9 |
|
| 10 |
pdf = st.file_uploader("Upload a file", type="pdf")
|
| 11 |
|
| 12 |
if st.button("Extract text"):
|
| 13 |
if pdf is not None:
|
| 14 |
+
with fitz.open(stream=pdf.read(), filetype="pdf") as doc: # open document
|
| 15 |
+
text = chr(12).join([page.get_text() for page in doc])
|
| 16 |
+
st.write(text)
|
| 17 |
else:
|
| 18 |
st.write("Please upload a file of type: pdf")
|
requirements.txt
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
openai
|
| 2 |
langchain
|
| 3 |
-
|
| 4 |
chromadb
|
| 5 |
sentence_transformers
|
| 6 |
streamlit
|
|
|
|
| 1 |
openai
|
| 2 |
langchain
|
| 3 |
+
pymupdf
|
| 4 |
chromadb
|
| 5 |
sentence_transformers
|
| 6 |
streamlit
|
wk_flow_requirements.txt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
streamlit
|
| 2 |
-
|
| 3 |
pylint
|
|
|
|
| 1 |
streamlit
|
| 2 |
+
pymupdf
|
| 3 |
pylint
|