Spaces:
Runtime error
Runtime error
Cleans math rendering and TOC header stuff
Browse files
app.py
CHANGED
|
@@ -26,10 +26,20 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
| 26 |
model.to(device)
|
| 27 |
|
| 28 |
def process_pdf_to_html(pdf_file, title, author):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
pdf_path = pdf_file.name
|
| 30 |
doc = fitz.open(pdf_path)
|
| 31 |
num_pages = len(doc)
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
all_text = ""
|
| 34 |
cover_img_html = ""
|
| 35 |
|
|
@@ -101,28 +111,33 @@ def process_pdf_to_html(pdf_file, title, author):
|
|
| 101 |
|
| 102 |
print(f"Decoded content for page {page_num}: {decoded}")
|
| 103 |
|
| 104 |
-
|
| 105 |
-
|
| 106 |
def convert_latex(text):
    r"""Convert LaTeX math spans in ``text`` to MathML markup.

    Inline spans ``\( ... \)`` and display spans ``\[ ... \]`` are each handed
    to ``latex_to_mathml``. A span whose conversion raises is kept in place,
    HTML-escaped, so a single bad formula does not abort the whole page.

    Args:
        text: Page text that may contain LaTeX math delimiters.

    Returns:
        ``text`` with every matched math span replaced.
    """
    import re

    def replacer(match):
        try:
            # latex_to_mathml (latex2mathml's ``convert``) already returns a
            # complete <math ...> element; wrapping it again would nest
            # <math> inside <math>, which is invalid MathML.
            return latex_to_mathml(match.group(1))
        except Exception:
            # Best-effort fallback: keep the original LaTeX, escaped for HTML.
            return html.escape(match.group(0))

    # Convert \( ... \)
    text = re.sub(r'\\\((.*?)\\\)', replacer, text)
    # Convert \[ ... \]; display math commonly spans several lines, so let
    # '.' match newlines as well.
    text = re.sub(r'\\\[(.*?)\\\]', replacer, text, flags=re.DOTALL)
    return text
|
| 118 |
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
if page_num == 1:
|
| 124 |
cover_img_html = f'<img src="data:image/png;base64,{image_base64}" alt="cover" style="max-width:100%; height:auto;"><hr>'
|
| 125 |
|
|
|
|
| 126 |
mathjax_script = """
|
| 127 |
<script type="text/javascript" id="MathJax-script" async
|
| 128 |
src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">
|
|
|
|
| 26 |
model.to(device)
|
| 27 |
|
| 28 |
def process_pdf_to_html(pdf_file, title, author):
|
| 29 |
+
import re
|
| 30 |
+
import markdown2
|
| 31 |
+
from latex2mathml.converter import convert as latex_to_mathml
|
| 32 |
+
|
| 33 |
pdf_path = pdf_file.name
|
| 34 |
doc = fitz.open(pdf_path)
|
| 35 |
num_pages = len(doc)
|
| 36 |
|
| 37 |
+
# Extract TOC as a dict: {page_number: [(level, title), ...]}
|
| 38 |
+
toc_entries = doc.get_toc()
|
| 39 |
+
toc_by_page = {}
|
| 40 |
+
for level, title, page in toc_entries:
|
| 41 |
+
toc_by_page.setdefault(page, []).append((level, title))
|
| 42 |
+
|
| 43 |
all_text = ""
|
| 44 |
cover_img_html = ""
|
| 45 |
|
|
|
|
| 111 |
|
| 112 |
print(f"Decoded content for page {page_num}: {decoded}")
|
| 113 |
|
| 114 |
+
# Convert inline and block LaTeX math to MathML
|
|
|
|
| 115 |
def convert_latex(text):
    r"""Convert LaTeX math spans in ``text`` to MathML markup.

    Inline spans ``\( ... \)`` and display spans ``\[ ... \]`` are each handed
    to ``latex_to_mathml``. A span whose conversion raises is kept in place,
    HTML-escaped, so a single bad formula does not abort the whole page.

    Args:
        text: Page text that may contain LaTeX math delimiters.

    Returns:
        ``text`` with every matched math span replaced.
    """

    def replacer(match):
        try:
            # latex_to_mathml (latex2mathml's ``convert``) already returns a
            # complete <math ...> element; wrapping it again would nest
            # <math> inside <math>, which is invalid MathML.
            return latex_to_mathml(match.group(1))
        except Exception:
            # Best-effort fallback: keep the original LaTeX, escaped for HTML.
            return html.escape(match.group(0))

    text = re.sub(r'\\\((.*?)\\\)', replacer, text)
    # Display math commonly spans several lines, so let '.' match newlines.
    text = re.sub(r'\\\[(.*?)\\\]', replacer, text, flags=re.DOTALL)
    return text
|
| 124 |
|
| 125 |
+
math_converted = convert_latex(decoded)
|
| 126 |
+
markdown_converted = markdown2.markdown(math_converted)
|
| 127 |
+
html_page = markdown_converted.replace("\n", "<br>")
|
| 128 |
+
|
| 129 |
+
# Add TOC-derived headers if present on this page
|
| 130 |
+
if page_num in toc_by_page:
|
| 131 |
+
for level, header in toc_by_page[page_num]:
|
| 132 |
+
tag = f"h{min(level, 6)}" # Limit to h6
|
| 133 |
+
html_page = f"<{tag}>{html.escape(header)}</{tag}>\n" + html_page
|
| 134 |
+
|
| 135 |
+
all_text += f"<div>{html_page}</div>\n"
|
| 136 |
|
| 137 |
if page_num == 1:
|
| 138 |
cover_img_html = f'<img src="data:image/png;base64,{image_base64}" alt="cover" style="max-width:100%; height:auto;"><hr>'
|
| 139 |
|
| 140 |
+
# MathJax fallback in case MathML fails (some browsers prefer it)
|
| 141 |
mathjax_script = """
|
| 142 |
<script type="text/javascript" id="MathJax-script" async
|
| 143 |
src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js">
|