Spaces:
Runtime error
Runtime error
Liam Dyer
committed on
extract metadata before ocr
Browse files
app.py
CHANGED
|
@@ -8,6 +8,15 @@ import ocrmypdf
|
|
| 8 |
def convert(pdf_file):
|
| 9 |
reader = PdfReader(pdf_file)
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
# Check if there are any images
|
| 12 |
image_count = 0
|
| 13 |
for page in reader.pages:
|
|
@@ -27,15 +36,6 @@ def convert(pdf_file):
|
|
| 27 |
if len(text) > 0:
|
| 28 |
full_text += f"---- Page {idx} ----\n" + page.extract_text() + "\n\n"
|
| 29 |
|
| 30 |
-
# Extract metadata
|
| 31 |
-
metadata = {
|
| 32 |
-
"author": reader.metadata.author,
|
| 33 |
-
"creator": reader.metadata.creator,
|
| 34 |
-
"producer": reader.metadata.producer,
|
| 35 |
-
"subject": reader.metadata.subject,
|
| 36 |
-
"title": reader.metadata.title,
|
| 37 |
-
}
|
| 38 |
-
|
| 39 |
return full_text.strip(), metadata
|
| 40 |
|
| 41 |
|
|
|
|
| 8 |
def convert(pdf_file):
|
| 9 |
reader = PdfReader(pdf_file)
|
| 10 |
|
| 11 |
+
# Extract metadata
|
| 12 |
+
metadata = {
|
| 13 |
+
"author": reader.metadata.author,
|
| 14 |
+
"creator": reader.metadata.creator,
|
| 15 |
+
"producer": reader.metadata.producer,
|
| 16 |
+
"subject": reader.metadata.subject,
|
| 17 |
+
"title": reader.metadata.title,
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
# Check if there are any images
|
| 21 |
image_count = 0
|
| 22 |
for page in reader.pages:
|
|
|
|
| 36 |
if len(text) > 0:
|
| 37 |
full_text += f"---- Page {idx} ----\n" + page.extract_text() + "\n\n"
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
return full_text.strip(), metadata
|
| 40 |
|
| 41 |
|