Spaces:
Sleeping
Sleeping
Initial Commit 5.1.0
Browse files
app.py
CHANGED
|
@@ -14,6 +14,7 @@ import docx
|
|
| 14 |
from docx.shared import Inches
|
| 15 |
import logging
|
| 16 |
import base64
|
|
|
|
| 17 |
|
| 18 |
# Setup
|
| 19 |
API_KEY = os.getenv("PDF_API_KEY")
|
|
@@ -131,6 +132,7 @@ def extract_pdf_to_html(file) -> tuple[str, docx.Document]:
|
|
| 131 |
|
| 132 |
with pdfplumber.open(file) as pdf:
|
| 133 |
for page_num, page in enumerate(pdf.pages):
|
|
|
|
| 134 |
page_title = f"Page {page_num + 1}"
|
| 135 |
toc.append(f"<li><a href='#page{page_num+1}'>{page_title}</a></li>")
|
| 136 |
html_output += f"<h2 id='page{page_num+1}'>{page_title}</h2>\n"
|
|
@@ -167,6 +169,6 @@ def extract_pdf_to_html(file) -> tuple[str, docx.Document]:
|
|
| 167 |
docx_output.add_picture(buffer, width=Inches(5))
|
| 168 |
except Exception:
|
| 169 |
pass
|
| 170 |
-
|
| 171 |
full_html = f"<ul>{''.join(toc)}</ul>\n" + html_output
|
| 172 |
return full_html, docx_output
|
|
|
|
| 14 |
from docx.shared import Inches
|
| 15 |
import logging
|
| 16 |
import base64
|
| 17 |
+
import time
|
| 18 |
|
| 19 |
# Setup
|
| 20 |
API_KEY = os.getenv("PDF_API_KEY")
|
|
|
|
| 132 |
|
| 133 |
with pdfplumber.open(file) as pdf:
|
| 134 |
for page_num, page in enumerate(pdf.pages):
|
| 135 |
+
start = time.time()
|
| 136 |
page_title = f"Page {page_num + 1}"
|
| 137 |
toc.append(f"<li><a href='#page{page_num+1}'>{page_title}</a></li>")
|
| 138 |
html_output += f"<h2 id='page{page_num+1}'>{page_title}</h2>\n"
|
|
|
|
| 169 |
docx_output.add_picture(buffer, width=Inches(5))
|
| 170 |
except Exception:
|
| 171 |
pass
|
| 172 |
+
logger.info(f"Processed page {page_num + 1} in {time.time() - start:.2f}s")
|
| 173 |
full_html = f"<ul>{''.join(toc)}</ul>\n" + html_output
|
| 174 |
return full_html, docx_output
|