Spaces:
Runtime error
Runtime error
Liam Dyer
committed on
rebuild pdf reader after ocr
Browse files
app.py
CHANGED
|
@@ -18,11 +18,14 @@ def convert(pdf_file):
|
|
| 18 |
out_pdf_file = pdf_file.replace(".pdf", "_ocr.pdf")
|
| 19 |
ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
|
| 20 |
pdf_file = out_pdf_file
|
|
|
|
| 21 |
|
| 22 |
# Extract text
|
| 23 |
full_text = ""
|
| 24 |
for idx, page in enumerate(reader.pages):
|
| 25 |
-
|
|
|
|
|
|
|
| 26 |
|
| 27 |
# Extract metadata
|
| 28 |
metadata = {
|
|
|
|
| 18 |
out_pdf_file = pdf_file.replace(".pdf", "_ocr.pdf")
|
| 19 |
ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
|
| 20 |
pdf_file = out_pdf_file
|
| 21 |
+
reader = PdfReader(pdf_file)
|
| 22 |
|
| 23 |
# Extract text
|
| 24 |
full_text = ""
|
| 25 |
for idx, page in enumerate(reader.pages):
|
| 26 |
+
text = page.extract_text()
|
| 27 |
+
if len(text) > 0:
|
| 28 |
+
full_text += f"\n\n---- Page {idx} ----\n\n" + page.extract_text()
|
| 29 |
|
| 30 |
# Extract metadata
|
| 31 |
metadata = {
|