Asanaly committed on
Commit
441273a
·
verified ·
1 Parent(s): f591344

Update pdf_reader.py

Browse files
Files changed (1) hide show
  1. pdf_reader.py +13 -7
pdf_reader.py CHANGED
@@ -1,10 +1,16 @@
1
- import fitz # PyMuPDF
 
2
 
3
- def extract_text_from_pdf(pdf_bytes):
4
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
5
- text = ""
 
6
 
7
- for page in doc:
8
- text += page.get_text()
 
 
9
 
10
- return text
 
 
 
1
+ import io
2
+ from PyPDF2 import PdfReader
3
 
4
def extract_text_from_pdf(pdf_bytes: bytes) -> str:
    """Return the concatenated text of every page of an in-memory PDF.

    The bytes are wrapped in an ``io.BytesIO`` stream and parsed with
    ``PdfReader``. Each page that yields non-empty text contributes that
    text followed by a newline; pages that yield nothing are skipped.

    Extraction is best-effort: if parsing or text extraction fails, the
    failure is logged with its traceback and an empty string is returned
    rather than propagating the error.

    Args:
        pdf_bytes: Raw contents of a PDF document.

    Returns:
        The extracted text, or ``""`` when nothing could be extracted.
    """
    # Imported locally so the block does not depend on a file-level import.
    import logging

    try:
        reader = PdfReader(io.BytesIO(pdf_bytes))
        page_texts = (page.extract_text() for page in reader.pages)
        # Skip pages with no extractable text (None or empty string).
        return "".join(f"{page_text}\n" for page_text in page_texts if page_text)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt/SystemExit still propagate; keep the original
        # fall-back of returning an empty string, but record the cause.
        logging.getLogger(__name__).exception("Failed to extract text from PDF")
        return ""