Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -40,6 +40,19 @@ def clean_page_headers(text):
|
|
| 40 |
cleaned.append(line)
|
| 41 |
return "\n".join(cleaned)
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
def process_pdf_to_html(pdf_file, title, author):
|
| 44 |
pdf_path = pdf_file.name
|
| 45 |
doc = fitz.open(pdf_path)
|
|
@@ -117,15 +130,13 @@ def process_pdf_to_html(pdf_file, title, author):
|
|
| 117 |
print(f"Decoded content for page {page_num}: {decoded}")
|
| 118 |
|
| 119 |
cleaned_text = clean_page_headers(decoded)
|
|
|
|
|
|
|
|
|
|
| 120 |
mathml_converted = convert_latex_to_mathml(cleaned_text)
|
| 121 |
markdown_converted = markdown2.markdown(mathml_converted)
|
| 122 |
html_page = markdown_converted.replace("\n", "<br>")
|
| 123 |
|
| 124 |
-
if page_num in toc_by_page:
|
| 125 |
-
for level, header in toc_by_page[page_num]:
|
| 126 |
-
tag = f"h{min(level, 6)}"
|
| 127 |
-
html_page = f"<{tag}>{html.escape(header)}</{tag}>\n" + html_page
|
| 128 |
-
|
| 129 |
all_text += f"<div>{html_page}</div>\n"
|
| 130 |
|
| 131 |
if page_num == 1:
|
|
|
|
| 40 |
cleaned.append(line)
|
| 41 |
return "\n".join(cleaned)
|
| 42 |
|
| 43 |
+
def replace_headers_in_text(text, page_headers):
    """Turn the lines of *text* that match TOC entries into HTML headings.

    For every ``(level, header)`` pair, the first line whose stripped content
    equals the stripped header (case-insensitively) is replaced by
    ``<hN>header</hN>``, with the header text HTML-escaped. A header with no
    matching line is placed at the top of the text instead.

    Args:
        text: Page text; lines are separated by ``"\n"``.
        page_headers: Iterable of ``(level, header)`` pairs for the page.

    Returns:
        The text with heading markup applied, joined with ``"\n"``.
    """
    lines = text.split("\n")
    # Headings with no matching line, kept in the order they were given so
    # that they are not reversed when placed at the top of the page.
    unmatched = []
    for level, header in page_headers:
        title = header.strip()
        if not title:
            # A blank title would match the first empty line and replace it
            # with an empty heading element, so it is ignored.
            continue
        # HTML only defines h1..h6, so clamp the level on both sides.
        tag = f"h{max(1, min(level, 6))}"
        heading = f"<{tag}>{html.escape(title)}</{tag}>"
        pattern = re.compile(re.escape(title), re.IGNORECASE)
        for idx, line in enumerate(lines):
            if pattern.fullmatch(line.strip()):
                lines[idx] = heading
                break  # only replace first match
        else:
            # No line matched this header: fall back to inserting it on top.
            unmatched.append(heading)
    return "\n".join(unmatched + lines)
|
| 55 |
+
|
| 56 |
def process_pdf_to_html(pdf_file, title, author):
|
| 57 |
pdf_path = pdf_file.name
|
| 58 |
doc = fitz.open(pdf_path)
|
|
|
|
| 130 |
print(f"Decoded content for page {page_num}: {decoded}")
|
| 131 |
|
| 132 |
cleaned_text = clean_page_headers(decoded)
|
| 133 |
+
if page_num in toc_by_page:
|
| 134 |
+
cleaned_text = replace_headers_in_text(cleaned_text, toc_by_page[page_num])
|
| 135 |
+
|
| 136 |
mathml_converted = convert_latex_to_mathml(cleaned_text)
|
| 137 |
markdown_converted = markdown2.markdown(mathml_converted)
|
| 138 |
html_page = markdown_converted.replace("\n", "<br>")
|
| 139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
all_text += f"<div>{html_page}</div>\n"
|
| 141 |
|
| 142 |
if page_num == 1:
|