import traceback

import gradio as gr
import requests
from markdownify import markdownify


# Timeout, in seconds, applied to the outgoing HTTP request.
DEFAULT_TIMEOUT = 15

# Headers sent with the fetch; identifies this tool via its User-Agent.
HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)'}


def html_to_markdown_converter(url: str, html_input: str) -> str:
    """Convert HTML, fetched from a URL or pasted directly, into Markdown.

    The URL takes priority: when ``url`` is non-blank after stripping,
    ``html_input`` is ignored.

    Args:
        url: Address to download the HTML from. ``None`` or a blank string
            means no URL was given.
        html_input: Raw HTML to convert when no URL was given. ``None`` or a
            blank string means no HTML was given.

    Returns:
        The Markdown produced by ``markdownify`` on success. When nothing was
        supplied, the fetch fails, the fetched/pasted content is empty, or the
        conversion fails, a Markdown-formatted message describing the problem
        is returned instead. Exceptions raised while processing are caught
        and reported through that returned message.
    """
    url = url.strip() if url else ""
    html_input = html_input.strip() if html_input else ""

    # Outer boundary: this function is used as a UI callback, so anything
    # unforeseen is turned into a readable message instead of propagating.
    try:
        if url:
            source = f"URL ({url})"
            print(f"Attempting to fetch HTML from URL: {url}")
            try:
                response = requests.get(
                    url,
                    timeout=DEFAULT_TIMEOUT,
                    headers=HEADERS,
                    allow_redirects=True,
                )
                # Turn 4xx/5xx responses into exceptions handled below.
                response.raise_for_status()
                # Decode using the encoding detected from the body, falling
                # back to UTF-8 when detection yields nothing.
                response.encoding = response.apparent_encoding or 'utf-8'
                html_content = response.text
                # len() of the decoded text counts characters, not bytes.
                print(f"Successfully fetched {len(html_content)} characters from URL.")
            except requests.exceptions.Timeout:
                return f"❌ **Error:** Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
            except requests.exceptions.RequestException as e:
                print(f"Request failed: {e}")
                return f"❌ **Error:** Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
            except Exception as e:
                print(f"An unexpected error occurred during fetch: {e}")
                return f"❌ **Error:** An unexpected error occurred while fetching the URL.\n```\n{traceback.format_exc()}\n```"
        elif html_input:
            source = "Direct HTML Input"
            print(f"Using direct HTML input ({len(html_input)} characters).")
            html_content = html_input
        else:
            return "⚠️ Please provide a URL or paste HTML content in the fields above."

        # A successful fetch can still yield an empty body.
        if not html_content:
            return f"⚠️ No HTML content found from {source}."

        print(f"Attempting to convert HTML from {source} to Markdown...")
        try:
            markdown_output = markdownify(html_content, heading_style="ATX")
            print(f"Conversion successful. Markdown length: {len(markdown_output)}")
            return markdown_output
        except Exception as e:
            print(f"Markdown conversion failed: {e}")
            return f"❌ **Error:** Failed to convert HTML to Markdown.\n```\n{traceback.format_exc()}\n```"
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return f"❌ **Error:** An unexpected error occurred during processing.\n```\n{traceback.format_exc()}\n```"


# Static text shown around the interface: page title, intro, and footer notes.
title = "HTML to Markdown Converter"

# Introductory Markdown explaining the two input options and their priority.
description = """
Enter a URL **or** paste HTML code directly into the text box below.
The tool will fetch the HTML (if URL is provided) and convert it into Markdown.
The converted Markdown will be displayed below. Priority is given to the URL input if both fields are filled.
"""

# Footer Markdown describing which libraries do the work.
article = """
**How it works:**
1. Uses the `requests` library to fetch content from URLs.
2. Uses the `markdownify` library to convert HTML source code into Markdown text.
3. The output is displayed in a rendered Markdown format.

**Note on 'Beautification':** The `markdownify` library aims to produce clean, standard Markdown. The rendering in the output box provides visual clarity. No additional styling rules are applied beyond standard Markdown conversion.
"""


# Input components, listed in the order the converter function takes them.
url_input = gr.Textbox(
    label="Enter URL (gets priority)",
    placeholder="e.g., https://en.wikipedia.org/wiki/Markdown",
)
html_input_area = gr.Textbox(
    label="Or Paste HTML Code Here",
    lines=10,
    placeholder="e.g., <h1>Hello</h1><p>This is <b>bold</b>.</p>",
)

# Output component: a Markdown component, so the returned string (result or
# error message) is displayed as rendered Markdown.
markdown_output_display = gr.Markdown(label="Converted Markdown Output")


# Wire the converter function to the input/output components.
# NOTE(review): `allow_flagging` may be deprecated in newer Gradio releases in
# favour of `flagging_mode` - confirm against the installed version.
iface = gr.Interface(
    fn=html_to_markdown_converter,
    inputs=[url_input, html_input_area],
    outputs=markdown_output_display,
    title=title,
    description=description,
    article=article,
    allow_flagging='never',
    # Each example is a [url, html] pair matching the order of `inputs`.
    examples=[
        ["https://gradio.app/quickstart/", ""],
        ["", "<h2>Example HTML</h2><p>Convert <em>this</em> snippet.</p><ul><li>Item 1</li><li>Item 2</li></ul>"],
        # Requests a 20 s delay, longer than the 15 s fetch timeout.
        ["https://httpbin.org/delay/20", ""],
        # Presumably meant to demonstrate the request-failure message.
        ["https://invalid-url-that-does-not-exist-probably.xyz", ""],
    ],
)


# Start the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()