"""Gradio app that converts HTML (fetched from a URL or pasted in) to Markdown."""

import traceback  # Used to include stack traces in user-visible error reports

import gradio as gr
import requests
from markdownify import markdownify

# Settings applied to every outbound request.
DEFAULT_TIMEOUT = 15  # seconds
# Identify this client to the remote server (be polite).
HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)'}


def _error_block(summary: str, details: object) -> str:
    """Return a Markdown error message followed by a fenced block with *details*."""
    return f"❌ **Error:** {summary}\n```\n{details}\n```"


def _fetch_html(url: str) -> str:
    """Download *url* and return the decoded response body.

    Raises:
        requests.exceptions.RequestException: on timeouts, connection problems,
            invalid URLs, or a 4xx/5xx status (via ``raise_for_status``).
    """
    response = requests.get(
        url,
        timeout=DEFAULT_TIMEOUT,
        headers=HEADERS,
        allow_redirects=True,
    )
    response.raise_for_status()
    # Decode with the encoding detected from the body; fall back to UTF-8 when
    # detection yields nothing.
    response.encoding = response.apparent_encoding or 'utf-8'
    return response.text


def html_to_markdown_converter(url: str, html_input: str) -> str:
    """Convert HTML, taken from a URL or from pasted text, to Markdown.

    The URL takes priority: when ``url`` is non-empty after stripping, the page
    is fetched and ``html_input`` is ignored.

    Args:
        url: Address of the page to fetch. May be empty or ``None``.
        html_input: Raw HTML to convert when no URL is given. May be empty or
            ``None``.

    Returns:
        The Markdown produced by ``markdownify`` on success. On failure, or when
        neither input is provided, a Markdown-formatted message describing the
        problem. Errors are reported through the return value, not raised.
    """
    # Normalise inputs so that None and whitespace-only values count as empty.
    url = url.strip() if url else ""
    html_input = html_input.strip() if html_input else ""

    try:
        # --- Step 1: Get HTML content ---
        if url:
            source = f"URL ({url})"
            print(f"Attempting to fetch HTML from URL: {url}")
            # NOTE(review): the URL is user-supplied and fetched server-side;
            # consider whether internal/private addresses need to be blocked
            # in the deployment environment.
            try:
                html_content = _fetch_html(url)
            except requests.exceptions.Timeout:
                return (
                    f"❌ **Error:** Request timed out after {DEFAULT_TIMEOUT} "
                    f"seconds trying to fetch URL: `{url}`"
                )
            except requests.exceptions.RequestException as e:
                print(f"Request failed: {e}")
                return _error_block(
                    f"Failed to fetch content from URL: `{url}`", e
                )
            except Exception as e:
                print(f"An unexpected error occurred during fetch: {e}")
                return _error_block(
                    "An unexpected error occurred while fetching the URL.",
                    traceback.format_exc(),
                )
            # len() of a str counts characters, not bytes.
            print(f"Successfully fetched {len(html_content)} characters from URL.")
        elif html_input:
            source = "Direct HTML Input"
            print(f"Using direct HTML input ({len(html_input)} characters).")
            html_content = html_input
        else:
            return "❓ Please provide a URL or paste HTML content in the fields above."

        # --- Step 2: Convert to Markdown ---
        # A successful fetch can still return an empty body.
        if not html_content:
            return f"❓ No HTML content found from {source}."

        print(f"Attempting to convert HTML from {source} to Markdown...")
        try:
            # ATX style renders headings as "# Heading".
            markdown_output = markdownify(html_content, heading_style="ATX")
        except Exception as e:
            print(f"Markdown conversion failed: {e}")
            # Return the error in a Markdown code block for readability.
            return _error_block(
                "Failed to convert HTML to Markdown.", traceback.format_exc()
            )
        print(f"Conversion successful. Markdown length: {len(markdown_output)}")
        # The output component is gr.Markdown, so the text is rendered as-is.
        return markdown_output
    except Exception as e:
        # Last-resort boundary so the UI always receives a string.
        print(f"An unexpected error occurred: {e}")
        return _error_block(
            "An unexpected error occurred during processing.",
            traceback.format_exc(),
        )


# --- Gradio Interface ---
title = "HTML to Markdown Converter"
description = """
Enter a URL **or** paste HTML code directly into the text box below.
The tool will fetch the HTML (if URL is provided) and convert it into Markdown.
The converted Markdown will be displayed below.
Priority is given to the URL input if both fields are filled.
"""
article = """
**How it works:**
1. Uses the `requests` library to fetch content from URLs.
2. Uses the `markdownify` library to convert HTML source code into Markdown text.
3. The output is displayed in a rendered Markdown format.

**Note on 'Beautification':**
The `markdownify` library aims to produce clean, standard Markdown. The rendering in the output box provides visual clarity.
No additional styling rules are applied beyond standard Markdown conversion.
"""

# Define input components
url_input = gr.Textbox(
    label="Enter URL (gets priority)",
    placeholder="e.g., https://en.wikipedia.org/wiki/Markdown",
)
# NOTE(review): the HTML tags in this placeholder and in the direct-HTML example
# below appear to have been stripped from the source text; the markup here is a
# reconstruction -- confirm against the intended samples.
html_input_area = gr.Textbox(
    label="Or Paste HTML Code Here",
    lines=10,
    placeholder="e.g., <h1>Hello</h1><p>This is <b>bold</b>.</p>",
)

# Define output component
markdown_output_display = gr.Markdown(label="Converted Markdown Output")

# Create the Gradio interface
iface = gr.Interface(
    fn=html_to_markdown_converter,
    inputs=[url_input, html_input_area],
    outputs=markdown_output_display,
    title=title,
    description=description,
    article=article,
    allow_flagging='never',
    examples=[
        # Example using URL
        ["https://gradio.app/quickstart/", ""],
        # Example using direct HTML
        ["", "<h1>Example HTML</h1><p>Convert this snippet.</p>"],
        # Example slow URL (might timeout)
        ["https://httpbin.org/delay/20", ""],
        # Example invalid URL
        ["https://invalid-url-that-does-not-exist-probably.xyz", ""],
    ],
)

# Launch the app (for local testing or Hugging Face Spaces)
if __name__ == "__main__":
    iface.launch()