ZhouChuYue
committed on
Commit
·
8255ade
1
Parent(s):
1238b33
URL fetch support and LaTeX rendering in Markdown
Browse files
- app.py +53 -6
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -5,9 +5,32 @@ A unified HTML parser optimized for extracting mathematical content.
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
import gradio as gr
|
|
|
|
| 8 |
from ultradata_math_parser import GeneralParser
|
| 9 |
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
def parse_html(
|
| 12 |
html_content: str,
|
| 13 |
base_url: str = "",
|
|
@@ -341,16 +364,28 @@ with gr.Blocks(title="UltraData Math Parser") as demo:
|
|
| 341 |
with gr.Column(scale=1):
|
| 342 |
gr.HTML('<div class="section-header">📥 Input</div>')
|
| 343 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
html_input = gr.Textbox(
|
| 345 |
label="HTML Content",
|
| 346 |
-
placeholder="Paste your HTML content here...",
|
| 347 |
-
lines=
|
| 348 |
-
max_lines=
|
| 349 |
value=EXAMPLE_HTML,
|
| 350 |
)
|
| 351 |
|
| 352 |
base_url_input = gr.Textbox(
|
| 353 |
-
label="Base URL (
|
| 354 |
placeholder="https://example.com/page",
|
| 355 |
lines=1,
|
| 356 |
)
|
|
@@ -396,6 +431,12 @@ with gr.Blocks(title="UltraData Math Parser") as demo:
|
|
| 396 |
markdown_output = gr.Markdown(
|
| 397 |
label="Markdown Preview",
|
| 398 |
elem_classes=["markdown-box"],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 399 |
)
|
| 400 |
with gr.TabItem("π Plain Text"):
|
| 401 |
text_output = gr.Textbox(
|
|
@@ -415,6 +456,12 @@ with gr.Blocks(title="UltraData Math Parser") as demo:
|
|
| 415 |
)
|
| 416 |
|
| 417 |
# Event handlers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 418 |
parse_btn.click(
|
| 419 |
fn=process_input,
|
| 420 |
inputs=[html_input, base_url_input, process_math, include_tables, enable_forum, html_type],
|
|
@@ -422,11 +469,11 @@ with gr.Blocks(title="UltraData Math Parser") as demo:
|
|
| 422 |
)
|
| 423 |
|
| 424 |
def clear_all():
|
| 425 |
-
return "", "", "", "", "", ""
|
| 426 |
|
| 427 |
clear_btn.click(
|
| 428 |
fn=clear_all,
|
| 429 |
-
outputs=[html_input, base_url_input, title_output, html_output, text_output, markdown_output],
|
| 430 |
)
|
| 431 |
|
| 432 |
# Footer info
|
|
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
import gradio as gr
|
| 8 |
+
import requests
|
| 9 |
from ultradata_math_parser import GeneralParser
|
| 10 |
|
| 11 |
|
| 12 |
+
def fetch_url_content(url: str) -> tuple:
|
| 13 |
+
"""Fetch HTML content from a URL."""
|
| 14 |
+
if not url or not url.strip():
|
| 15 |
+
return "", "Please enter a URL"
|
| 16 |
+
|
| 17 |
+
url = url.strip()
|
| 18 |
+
if not url.startswith(("http://", "https://")):
|
| 19 |
+
url = "https://" + url
|
| 20 |
+
|
| 21 |
+
try:
|
| 22 |
+
headers = {
|
| 23 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
| 24 |
+
}
|
| 25 |
+
response = requests.get(url, headers=headers, timeout=15)
|
| 26 |
+
response.raise_for_status()
|
| 27 |
+
return response.text, url
|
| 28 |
+
except requests.exceptions.Timeout:
|
| 29 |
+
return "", f"Request timed out for {url}"
|
| 30 |
+
except requests.exceptions.RequestException as e:
|
| 31 |
+
return "", f"Failed to fetch URL: {str(e)}"
|
| 32 |
+
|
| 33 |
+
|
| 34 |
def parse_html(
|
| 35 |
html_content: str,
|
| 36 |
base_url: str = "",
|
|
|
|
| 364 |
with gr.Column(scale=1):
|
| 365 |
gr.HTML('<div class="section-header">📥 Input</div>')
|
| 366 |
|
| 367 |
+
with gr.Tabs():
|
| 368 |
+
with gr.TabItem("π URL"):
|
| 369 |
+
url_input = gr.Textbox(
|
| 370 |
+
label="URL",
|
| 371 |
+
placeholder="Enter URL to fetch (e.g., https://example.com/math-article)",
|
| 372 |
+
lines=1,
|
| 373 |
+
)
|
| 374 |
+
fetch_btn = gr.Button("π₯ Fetch URL", variant="secondary")
|
| 375 |
+
|
| 376 |
+
with gr.TabItem("π HTML"):
|
| 377 |
+
pass # HTML input will be below, shared between tabs
|
| 378 |
+
|
| 379 |
html_input = gr.Textbox(
|
| 380 |
label="HTML Content",
|
| 381 |
+
placeholder="Paste your HTML content here or fetch from URL above...",
|
| 382 |
+
lines=12,
|
| 383 |
+
max_lines=25,
|
| 384 |
value=EXAMPLE_HTML,
|
| 385 |
)
|
| 386 |
|
| 387 |
base_url_input = gr.Textbox(
|
| 388 |
+
label="Base URL (Auto-filled from URL fetch)",
|
| 389 |
placeholder="https://example.com/page",
|
| 390 |
lines=1,
|
| 391 |
)
|
|
|
|
| 431 |
markdown_output = gr.Markdown(
|
| 432 |
label="Markdown Preview",
|
| 433 |
elem_classes=["markdown-box"],
|
| 434 |
+
latex_delimiters=[
|
| 435 |
+
{"left": "$$", "right": "$$", "display": True},
|
| 436 |
+
{"left": "$", "right": "$", "display": False},
|
| 437 |
+
{"left": "\\[", "right": "\\]", "display": True},
|
| 438 |
+
{"left": "\\(", "right": "\\)", "display": False},
|
| 439 |
+
],
|
| 440 |
)
|
| 441 |
with gr.TabItem("π Plain Text"):
|
| 442 |
text_output = gr.Textbox(
|
|
|
|
| 456 |
)
|
| 457 |
|
| 458 |
# Event handlers
|
| 459 |
+
fetch_btn.click(
|
| 460 |
+
fn=fetch_url_content,
|
| 461 |
+
inputs=[url_input],
|
| 462 |
+
outputs=[html_input, base_url_input],
|
| 463 |
+
)
|
| 464 |
+
|
| 465 |
parse_btn.click(
|
| 466 |
fn=process_input,
|
| 467 |
inputs=[html_input, base_url_input, process_math, include_tables, enable_forum, html_type],
|
|
|
|
| 469 |
)
|
| 470 |
|
| 471 |
def clear_all():
|
| 472 |
+
return "", "", "", "", "", "", ""
|
| 473 |
|
| 474 |
clear_btn.click(
|
| 475 |
fn=clear_all,
|
| 476 |
+
outputs=[url_input, html_input, base_url_input, title_output, html_output, text_output, markdown_output],
|
| 477 |
)
|
| 478 |
|
| 479 |
# Footer info
|
requirements.txt
CHANGED
|
@@ -7,3 +7,4 @@ numpy
|
|
| 7 |
py_asciimath
|
| 8 |
urllib3
|
| 9 |
tldextract
|
|
|
|
|
|
| 7 |
py_asciimath
|
| 8 |
urllib3
|
| 9 |
tldextract
|
| 10 |
+
requests
|