maclenn77 committed on
Commit
7eeda1a
·
unverified ·
1 Parent(s): 7a95605

fix: Extract pdf content (#4)

Browse files

* Change PDF extractor library to be used with HF

Files changed (4) hide show
  1. README.md +2 -2
  2. app.py +9 -3
  3. requirements.txt +1 -1
  4. wk_flow_requirements.txt +1 -1
README.md CHANGED
@@ -24,5 +24,5 @@ Run streamlit run app.py
24
 
25
  - Streamlit
26
  - HuggingFace
27
- - Tika: For extracting pdf text
28
- - Java Runtime
 
24
 
25
  - Streamlit
26
  - HuggingFace
27
+ - PyMuPDF: For extracting pdf text
28
+ - An OpenAI API key
app.py CHANGED
@@ -1,12 +1,18 @@
1
  """ A simple example of Streamlit. """
2
  import streamlit as st
3
- from tika import parser
 
 
 
 
 
4
 
5
  pdf = st.file_uploader("Upload a file", type="pdf")
6
 
7
  if st.button("Extract text"):
8
  if pdf is not None:
9
- extracted_text = parser.from_file(pdf)
10
- st.write(extracted_text["content"])
 
11
  else:
12
  st.write("Please upload a file of type: pdf")
 
1
  """ A simple example of Streamlit. """
2
  import streamlit as st
3
+ import fitz
4
+
5
+ # from tika import parser
6
+ # from openai import OpenAI
7
+
8
+ # client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
9
 
10
  pdf = st.file_uploader("Upload a file", type="pdf")
11
 
12
  if st.button("Extract text"):
13
  if pdf is not None:
14
+ with fitz.open(stream=pdf.read(), filetype="pdf") as doc: # open document
15
+ text = chr(12).join([page.get_text() for page in doc])
16
+ st.write(text)
17
  else:
18
  st.write("Please upload a file of type: pdf")
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  openai
2
  langchain
3
- tika
4
  chromadb
5
  sentence_transformers
6
  streamlit
 
1
  openai
2
  langchain
3
+ pymupdf
4
  chromadb
5
  sentence_transformers
6
  streamlit
wk_flow_requirements.txt CHANGED
@@ -1,3 +1,3 @@
1
  streamlit
2
- tika
3
  pylint
 
1
  streamlit
2
+ pymupdf
3
  pylint