Spaces:

maclenn77
/

pdf-explainer

Runtime error

maclenn77 committed on Dec 8, 2023

Commit

7eeda1a

unverified ·

1 Parent(s): 7a95605

fix: Extract pdf content (#4)

* Change PDF extractor library to be used with HF

Files changed (4) hide show

README.md CHANGED Viewed

@@ -24,5 +24,5 @@ Run streamlit run app.py
 - Streamlit
 - HuggingFace
-- Tika: For extracting pdf text
-- Java Runtime

 - Streamlit
 - HuggingFace
+- PyMuPDF for PDF text extraction
+- An OpenAI API key

app.py CHANGED Viewed

@@ -1,12 +1,18 @@
 """ A simple example of Streamlit. """
 import streamlit as st
-from tika import parser
 pdf = st.file_uploader("Upload a file", type="pdf")
 if st.button("Extract text"):
     if pdf is not None:
-        extracted_text = parser.from_file(pdf)
-        st.write(extracted_text["content"])
     else:
         st.write("Please upload a file of type: pdf")

 """ A simple example of Streamlit. """
 import streamlit as st
+import fitz
+# from tika import parser
+# from openai import OpenAI
+# client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 pdf = st.file_uploader("Upload a file", type="pdf")
 if st.button("Extract text"):
     if pdf is not None:
+        with fitz.open(stream=pdf.read(), filetype="pdf") as doc:  # open document
+            text = chr(12).join([page.get_text() for page in doc])
+            st.write(text)
     else:
         st.write("Please upload a file of type: pdf")

requirements.txt CHANGED Viewed

@@ -1,6 +1,6 @@
 openai
 langchain
-tika
 chromadb
 sentence_transformers
 streamlit

 openai
 langchain
+pymupdf
 chromadb
 sentence_transformers
 streamlit

wk_flow_requirements.txt CHANGED Viewed

@@ -1,3 +1,3 @@
 streamlit
-tika
 pylint

 streamlit
+pymupdf
 pylint