ZhouChuYue committed on
Commit
8255ade
·
1 Parent(s): 1238b33

URL fetch support and LaTeX rendering in Markdown

Browse files
Files changed (2) hide show
  1. app.py +53 -6
  2. requirements.txt +1 -0
app.py CHANGED
@@ -5,9 +5,32 @@ A unified HTML parser optimized for extracting mathematical content.
5
  """
6
 
7
  import gradio as gr
 
8
  from ultradata_math_parser import GeneralParser
9
 
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  def parse_html(
12
  html_content: str,
13
  base_url: str = "",
@@ -341,16 +364,28 @@ with gr.Blocks(title="UltraData Math Parser") as demo:
341
  with gr.Column(scale=1):
342
  gr.HTML('<div class="section-header">📥 Input</div>')
343
 
 
 
 
 
 
 
 
 
 
 
 
 
344
  html_input = gr.Textbox(
345
  label="HTML Content",
346
- placeholder="Paste your HTML content here...",
347
- lines=15,
348
- max_lines=30,
349
  value=EXAMPLE_HTML,
350
  )
351
 
352
  base_url_input = gr.Textbox(
353
- label="Base URL (Optional)",
354
  placeholder="https://example.com/page",
355
  lines=1,
356
  )
@@ -396,6 +431,12 @@ with gr.Blocks(title="UltraData Math Parser") as demo:
396
  markdown_output = gr.Markdown(
397
  label="Markdown Preview",
398
  elem_classes=["markdown-box"],
 
 
 
 
 
 
399
  )
400
  with gr.TabItem("📄 Plain Text"):
401
  text_output = gr.Textbox(
@@ -415,6 +456,12 @@ with gr.Blocks(title="UltraData Math Parser") as demo:
415
  )
416
 
417
  # Event handlers
 
 
 
 
 
 
418
  parse_btn.click(
419
  fn=process_input,
420
  inputs=[html_input, base_url_input, process_math, include_tables, enable_forum, html_type],
@@ -422,11 +469,11 @@ with gr.Blocks(title="UltraData Math Parser") as demo:
422
  )
423
 
424
  def clear_all():
425
- return "", "", "", "", "", ""
426
 
427
  clear_btn.click(
428
  fn=clear_all,
429
- outputs=[html_input, base_url_input, title_output, html_output, text_output, markdown_output],
430
  )
431
 
432
  # Footer info
 
5
  """
6
 
7
  import gradio as gr
8
+ import requests
9
  from ultradata_math_parser import GeneralParser
10
 
11
 
12
def fetch_url_content(url: str) -> tuple:
    """Fetch HTML content from a URL.

    Args:
        url: Address to download. Surrounding whitespace is ignored and
            ``https://`` is prepended when no http(s) scheme is present.

    Returns:
        A ``(html, info)`` pair of strings. On success ``html`` is the decoded
        response body and ``info`` is the final URL after redirects, suitable
        as the base for resolving relative links. On failure ``html`` is empty
        and ``info`` is a human-readable error message.

    NOTE(review): the fetch button wires this pair to the HTML and Base URL
    textboxes, so a failure message ends up displayed in the Base URL field.
    NOTE(review): the URL is user-supplied and requested server-side; consider
    rejecting loopback/private addresses if this app is publicly reachable.
    """
    if not url or not url.strip():
        return "", "Please enter a URL"

    url = url.strip()
    # URL schemes are case-insensitive; "HTTP://host" must not get a second one.
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    # Keep the try body limited to the calls that can actually raise.
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
    except requests.exceptions.Timeout:
        return "", f"Request timed out for {url}"
    except requests.exceptions.RequestException as e:
        return "", f"Failed to fetch URL: {e}"

    # When a text/* response declares no charset, requests assumes ISO-8859-1,
    # which garbles UTF-8 pages (common for math symbols). Fall back to the
    # encoding detected from the body in that case.
    content_type = response.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        response.encoding = response.apparent_encoding

    # Report the post-redirect URL so relative links resolve correctly.
    return response.text, response.url
32
+
33
+
34
  def parse_html(
35
  html_content: str,
36
  base_url: str = "",
 
364
  with gr.Column(scale=1):
365
  gr.HTML('<div class="section-header">📥 Input</div>')
366
 
367
+ with gr.Tabs():
368
+ with gr.TabItem("🔗 URL"):
369
+ url_input = gr.Textbox(
370
+ label="URL",
371
+ placeholder="Enter URL to fetch (e.g., https://example.com/math-article)",
372
+ lines=1,
373
+ )
374
+ fetch_btn = gr.Button("📥 Fetch URL", variant="secondary")
375
+
376
+ with gr.TabItem("📝 HTML"):
377
+ pass # HTML input will be below, shared between tabs
378
+
379
  html_input = gr.Textbox(
380
  label="HTML Content",
381
+ placeholder="Paste your HTML content here or fetch from URL above...",
382
+ lines=12,
383
+ max_lines=25,
384
  value=EXAMPLE_HTML,
385
  )
386
 
387
  base_url_input = gr.Textbox(
388
+ label="Base URL (Auto-filled from URL fetch)",
389
  placeholder="https://example.com/page",
390
  lines=1,
391
  )
 
431
  markdown_output = gr.Markdown(
432
  label="Markdown Preview",
433
  elem_classes=["markdown-box"],
434
+ latex_delimiters=[
435
+ {"left": "$$", "right": "$$", "display": True},
436
+ {"left": "$", "right": "$", "display": False},
437
+ {"left": "\\[", "right": "\\]", "display": True},
438
+ {"left": "\\(", "right": "\\)", "display": False},
439
+ ],
440
  )
441
  with gr.TabItem("📄 Plain Text"):
442
  text_output = gr.Textbox(
 
456
  )
457
 
458
  # Event handlers
459
+ fetch_btn.click(
460
+ fn=fetch_url_content,
461
+ inputs=[url_input],
462
+ outputs=[html_input, base_url_input],
463
+ )
464
+
465
  parse_btn.click(
466
  fn=process_input,
467
  inputs=[html_input, base_url_input, process_math, include_tables, enable_forum, html_type],
 
469
  )
470
 
471
  def clear_all():
472
+ return "", "", "", "", "", "", ""
473
 
474
  clear_btn.click(
475
  fn=clear_all,
476
+ outputs=[url_input, html_input, base_url_input, title_output, html_output, text_output, markdown_output],
477
  )
478
 
479
  # Footer info
requirements.txt CHANGED
@@ -7,3 +7,4 @@ numpy
7
  py_asciimath
8
  urllib3
9
  tldextract
 
 
7
  py_asciimath
8
  urllib3
9
  tldextract
10
+ requests