# scrape2MD / app.py
# 13ze's picture
# Update app.py
# 9dacd92 verified
import gradio as gr
import requests
from markdownify import markdownify
import traceback # To help format potential errors
# Settings shared by every outgoing HTTP request made by this app.
DEFAULT_TIMEOUT = 15  # seconds
# Identify the tool to the servers it fetches from (be polite).
HEADERS = {
    "User-Agent": "GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)",
}
def html_to_markdown_converter(url: str, html_input: str) -> str:
    """Convert HTML, fetched from a URL or pasted directly, into Markdown.

    The URL takes priority: when ``url`` is non-blank the page is downloaded
    with ``requests`` and ``html_input`` is ignored. Otherwise ``html_input``
    is converted as-is. Failures are not raised to the caller; they are
    returned as Markdown-formatted messages so they can be shown in the
    output component.

    Args:
        url: Address of the page to fetch. ``None`` or a blank string means
            "not provided".
        html_input: Raw HTML to convert when no URL is given. ``None`` or a
            blank string means "not provided".

    Returns:
        The Markdown produced by ``markdownify`` (ATX-style headings), or a
        message starting with ``❌`` (something failed) or ``❓`` (there was
        nothing to convert).
    """
    # Treat None and whitespace-only values as "not provided".
    url = url.strip() if url else ""
    html_input = html_input.strip() if html_input else ""

    try:
        # --- Step 1: Get HTML content ---
        if url:
            source = f"URL ({url})"
            print(f"Attempting to fetch HTML from URL: {url}")
            try:
                response = requests.get(url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True)
                response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
                # Honour a charset the server declared explicitly; only fall
                # back to content-based detection (then utf-8) when the
                # Content-Type header names none. Overriding a declared
                # charset with a guess can mis-decode correctly labelled pages.
                content_type = response.headers.get("Content-Type") or ""
                if "charset" not in content_type.lower():
                    response.encoding = response.apparent_encoding or "utf-8"
                html_content = response.text
                # len() of the decoded text counts characters, not bytes.
                print(f"Successfully fetched {len(html_content)} characters from URL.")
            except requests.exceptions.Timeout:
                return f"❌ **Error:** Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
            except requests.exceptions.RequestException as e:
                # Covers connection errors, invalid URLs and HTTP error statuses.
                print(f"Request failed: {e}")
                return f"❌ **Error:** Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
            except Exception as e:
                print(f"An unexpected error occurred during fetch: {e}")
                return f"❌ **Error:** An unexpected error occurred while fetching the URL.\n```\n{traceback.format_exc()}\n```"
        elif html_input:
            source = "Direct HTML Input"
            print(f"Using direct HTML input ({len(html_input)} characters).")
            html_content = html_input
        else:
            return "❓ Please provide a URL or paste HTML content in the fields above."

        # --- Step 2: Convert to Markdown ---
        # A successful fetch can still yield an empty body.
        if not html_content:
            return f"❓ No HTML content found from {source}."

        print(f"Attempting to convert HTML from {source} to Markdown...")
        try:
            markdown_output = markdownify(html_content, heading_style="ATX")
            print(f"Conversion successful. Markdown length: {len(markdown_output)}")
            return markdown_output
        except Exception as e:
            print(f"Markdown conversion failed: {e}")
            # Return the error in a Markdown code block for readability.
            return f"❌ **Error:** Failed to convert HTML to Markdown.\n```\n{traceback.format_exc()}\n```"
    except Exception as e:
        # Last-resort boundary: anything not handled above (for example a
        # failing print) is still reported as text instead of being raised.
        print(f"An unexpected error occurred: {e}")
        return f"❌ **Error:** An unexpected error occurred during processing.\n```\n{traceback.format_exc()}\n```"
# --- Gradio interface text ---
# Heading shown at the top of the page.
title = "HTML to Markdown Converter"

# Short usage instructions; starts and ends with a newline.
description = "\n".join(
    [
        "",
        "Enter a URL **or** paste HTML code directly into the text box below.",
        "The tool will fetch the HTML (if URL is provided) and convert it into Markdown.",
        "The converted Markdown will be displayed below. Priority is given to the URL input if both fields are filled.",
        "",
    ]
)

# Longer explanatory text; starts and ends with a newline.
article = "\n".join(
    [
        "",
        "**How it works:**",
        "1. Uses the `requests` library to fetch content from URLs.",
        "2. Uses the `markdownify` library to convert HTML source code into Markdown text.",
        "3. The output is displayed in a rendered Markdown format.",
        "**Note on 'Beautification':** The `markdownify` library aims to produce clean, standard Markdown. "
        "The rendering in the output box provides visual clarity. "
        "No additional styling rules are applied beyond standard Markdown conversion.",
        "",
    ]
)
# Input widgets: one textbox for the URL and a 10-line textbox for raw HTML.
url_input = gr.Textbox(
    placeholder="e.g., https://en.wikipedia.org/wiki/Markdown",
    label="Enter URL (gets priority)",
)
html_input_area = gr.Textbox(
    placeholder="e.g., <h1>Hello</h1><p>This is <b>bold</b>.</p>",
    label="Or Paste HTML Code Here",
    lines=10,
)

# Output widget: a Markdown component that receives the converter's result.
markdown_output_display = gr.Markdown(label="Converted Markdown Output")
# Wire the converter function and its widgets into one Gradio Interface.
# Each example row is a [url, html] pair matching the order of `inputs`.
iface = gr.Interface(
    fn=html_to_markdown_converter,
    inputs=[url_input, html_input_area],
    outputs=markdown_output_display,
    examples=[
        # Fetch a page by URL.
        ["https://gradio.app/quickstart/", ""],
        # Convert pasted HTML directly.
        ["", "<h2>Example HTML</h2><p>Convert <em>this</em> snippet.</p><ul><li>Item 1</li><li>Item 2</li></ul>"],
        # Slow endpoint (might time out).
        ["https://httpbin.org/delay/20", ""],
        # Host that is expected not to exist.
        ["https://invalid-url-that-does-not-exist-probably.xyz", ""],
    ],
    title=title,
    description=description,
    article=article,
    allow_flagging="never",
)

# Start the app when run as a script (local testing or Hugging Face Spaces).
if __name__ == "__main__":
    iface.launch()