# md1 / app.py
# hf1agideia's picture
# Create app.py
# 904a38a verified
import gradio as gr
import requests
from markdownify import markdownify
import traceback
from readability import Document
from bs4 import BeautifulSoup
# Global settings
DEFAULT_TIMEOUT = 15  # seconds
HEADERS = {
    "User-Agent": "GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)",
}
# Placeholder that readability-lxml's Document.title() yields for documents
# without a usable <title>; it must never be rendered as a Markdown heading.
_NO_TITLE_PLACEHOLDER = "[no-title]"


class _UserFacingError(Exception):
    """Carry a ready-to-display message from a helper back to the entry point."""


def _fetch_html(url: str) -> str:
    """Download *url* and return the decoded response body.

    Args:
        url: Absolute URL, already carrying an ``http://`` or ``https://`` scheme.

    Returns:
        The response text.

    Raises:
        _UserFacingError: On any failure; its message is the text shown to the user.
    """
    # NOTE(review): the URL is user-supplied and fetched from the server, so it
    # can target internal/private addresses (SSRF). Consider restricting hosts.
    try:
        response = requests.get(
            url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True
        )
        response.raise_for_status()
        # Decode with the encoding detected from the body; fall back to UTF-8
        # when detection yields nothing.
        response.encoding = response.apparent_encoding or 'utf-8'
        return response.text
    except (requests.exceptions.MissingSchema, requests.exceptions.InvalidURL):
        raise _UserFacingError(
            f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
        ) from None
    except requests.exceptions.Timeout:
        raise _UserFacingError(
            f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds for URL: `{url}`"
        ) from None
    except requests.exceptions.RequestException as exc:
        raise _UserFacingError(
            f"❌ Error: Failed to fetch content from URL: `{url}`\n```\n{exc}\n```"
        ) from exc
    except Exception as exc:
        raise _UserFacingError(
            f"❌ Unexpected error fetching URL.\n```\n{traceback.format_exc()}\n```"
        ) from exc


def _extract_main_content(html_content: str):
    """Try to isolate the main article with Readability.

    Args:
        html_content: Full HTML document or fragment.

    Returns:
        A ``(html, title)`` pair. ``html`` is the Readability summary, or the
        unmodified input when the summary has no text or extraction fails.
        ``title`` is the extracted title, or ``""`` when none is available.
    """
    print("Trying Readability content extraction...")
    title = ""
    try:
        doc = Document(html_content)
        raw_title = doc.title().strip()
        # Drop the library's "no title" placeholder so it is not emitted as "# [no-title]".
        title = "" if raw_title == _NO_TITLE_PLACEHOLDER else raw_title
        summary_html = doc.summary()
        # The summary may be markup-only; only accept it when it carries text.
        if not BeautifulSoup(summary_html, 'html.parser').text.strip():
            print("Readability returned empty summary. Using full HTML.")
            return html_content, title
        print(f"Extracted title: {title}")
        return summary_html, title
    except Exception as exc:
        # Best-effort step: log the reason and continue with the full document.
        print(f"Readability failed ({exc!r}). Using full HTML.")
        return html_content, title


def html_to_markdown_converter(url: str, html_input: str, use_readability: bool = True) -> str:
    """Convert HTML (fetched from a URL or pasted directly) to Markdown.

    Args:
        url: Page address; takes priority over ``html_input`` when non-empty.
            ``https://`` is prepended when no http(s) scheme is present.
        html_input: Raw HTML used when ``url`` is empty.
        use_readability: When true (default), try to extract the main content
            with Readability before converting.

    Returns:
        The Markdown text (prefixed with ``# <title>`` when a title was
        extracted), or a human-readable status/error message. Errors are
        reported through the return value rather than raised.
    """
    url = url.strip() if url else ""
    html_input = html_input.strip() if html_input else ""

    try:
        # --- Obtain the HTML content ---
        if url:
            # Case-insensitive check so "HTTP://..." is not given a second scheme.
            if not url.lower().startswith(('http://', 'https://')):
                url = 'https://' + url
                print(f"Prepended https:// => {url}")
            source = f"URL ({url})"
            print(f"Fetching HTML from URL: {url}")
            html_content = _fetch_html(url)
            print(f"Fetched {len(html_content)} chars.")
        elif html_input:
            source = "Direct HTML Input"
            print(f"Using direct HTML input ({len(html_input)} chars).")
            html_content = html_input
        else:
            return "❓ Please provide a URL or paste HTML content above."

        if not html_content:
            return f"❓ No HTML content found from {source}."

        # --- Readability extraction ---
        processed_html, article_title = html_content, ""
        if use_readability:
            processed_html, article_title = _extract_main_content(html_content)

        # --- Markdown conversion ---
        if not processed_html.strip():
            return "❓ Processed HTML is empty."
        print(f"Converting HTML ({len(processed_html)} chars) to Markdown...")
        try:
            markdown_text = markdownify(processed_html, heading_style="ATX", bullets='*')
        except Exception:
            return f"❌ Markdown conversion failed.\n```\n{traceback.format_exc()}\n```"
        print(f"Markdown generated ({len(markdown_text)} chars).")

        final_output = f"# {article_title}\n\n{markdown_text}" if article_title else markdown_text
        if not final_output.strip():
            return "ℹ️ Conversion resulted in empty Markdown."
        return final_output
    except _UserFacingError as err:
        # Helpers already formatted the message for display.
        return str(err)
    except Exception:
        return f"❌ Unexpected processing error.\n```\n{traceback.format_exc()}\n```"
# --- Gradio UI ---

# Static texts rendered around the form (title, intro, footer article).
title = "HTML to Markdown Converter (Smart Extraction)"
description = (
    "\n"
    "Enter a URL **or** paste HTML code below.\n"
    "This tool uses Mozilla's Readability to extract the main content and converts it to Markdown.\n"
)
article = (
    "\n"
    "**How it works:**\n"
    "- Fetches the HTML using `requests`\n"
    "- Extracts main content using `readability-lxml`\n"
    "- Converts to Markdown using `markdownify`\n"
)

# Input widgets: the URL box wins over the pasted HTML when both are filled.
url_input = gr.Textbox(
    label="Enter URL (takes priority)",
    placeholder="e.g., en.wikipedia.org/wiki/Markdown",
)
html_input = gr.Textbox(
    label="Or Paste HTML Code Here",
    lines=10,
    placeholder="<h1>Hello</h1><p>Example content.</p>",
)

# Read-only output widget with a copy button.
markdown_output = gr.Textbox(
    label="Converted Markdown Output",
    lines=20,
    interactive=False,
    show_copy_button=True,
)

# Each row is [url, html]; exactly one of the two is filled per example.
example_rows = [
    ["https://gradio.app/quickstart/", ""],
    ["https://en.wikipedia.org/wiki/Python_(programming_language)", ""],
    ["https://www.bbc.com/news", ""],
    ["", "<body><main><h1>Main Title</h1><p>Article content here.</p></main></body>"],
    ["https://httpbin.org/delay/5", ""],
    ["invalid-url", ""],
    ["", "<p>Just a simple paragraph.</p>"],
]

iface = gr.Interface(
    fn=html_to_markdown_converter,
    inputs=[url_input, html_input],
    outputs=markdown_output,
    title=title,
    description=description,
    article=article,
    examples=example_rows,
    cache_examples=False,
    allow_flagging="never",
)

# Start the web app only when executed as a script.
if __name__ == "__main__":
    iface.launch()