Spaces:

madankn79
/

pdf2htmlv51

Sleeping

madankn79 committed on Jun 11, 2025

Commit

a085c86

1 Parent(s): fe9658d

Initial Commit 5.1.0

Files changed (1) hide show

app.py CHANGED Viewed

@@ -14,6 +14,7 @@ import docx
 from docx.shared import Inches
 import logging
 import base64
 # Setup
 API_KEY = os.getenv("PDF_API_KEY")
@@ -131,6 +132,7 @@ def extract_pdf_to_html(file) -> tuple[str, docx.Document]:
     with pdfplumber.open(file) as pdf:
         for page_num, page in enumerate(pdf.pages):
             page_title = f"Page {page_num + 1}"
             toc.append(f"<li><a href='#page{page_num+1}'>{page_title}</a></li>")
             html_output += f"<h2 id='page{page_num+1}'>{page_title}</h2>\n"
@@ -167,6 +169,6 @@ def extract_pdf_to_html(file) -> tuple[str, docx.Document]:
                     docx_output.add_picture(buffer, width=Inches(5))
                 except Exception:
                     pass
     full_html = f"<ul>{''.join(toc)}</ul>\n" + html_output
     return full_html, docx_output

 from docx.shared import Inches
 import logging
 import base64
+import time
 # Setup
 API_KEY = os.getenv("PDF_API_KEY")
     with pdfplumber.open(file) as pdf:
         for page_num, page in enumerate(pdf.pages):
+            start = time.time()
             page_title = f"Page {page_num + 1}"
             toc.append(f"<li><a href='#page{page_num+1}'>{page_title}</a></li>")
             html_output += f"<h2 id='page{page_num+1}'>{page_title}</h2>\n"
                     docx_output.add_picture(buffer, width=Inches(5))
                 except Exception:
                     pass
+            logger.info(f"Processed page {page_num + 1} in {time.time() - start:.2f}s")
     full_html = f"<ul>{''.join(toc)}</ul>\n" + html_output
     return full_html, docx_output