# Page header captured with this file: Spaces: Sleeping | File size: 5,434 Bytes | 904a38a
import gradio as gr
import requests
from markdownify import markdownify
import traceback
from readability import Document
from bs4 import BeautifulSoup
# Global settings
# Request headers; identifies this tool through its User-Agent string.
HEADERS = {
    "User-Agent": "GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)",
}
# Request timeout, in seconds.
DEFAULT_TIMEOUT = 15
def html_to_markdown_converter(url: str, html_input: str) -> str:
    """Convert HTML, fetched from a URL or pasted directly, into Markdown.

    A non-empty ``url`` takes priority over ``html_input``. Readability is
    used to extract the main content; when it fails or yields no text, the
    full HTML is converted instead.

    Args:
        url: Address of the page to fetch. ``https://`` is prepended when the
            value has no ``http://``/``https://`` scheme. May be empty or None.
        html_input: Raw HTML used when ``url`` is empty. May be empty or None.

    Returns:
        The Markdown text, prefixed with a ``# <title>`` heading when
        Readability reported a real document title. Errors during fetching,
        extraction or conversion are reported in the returned string rather
        than raised.
    """
    # readability-lxml returns this literal from ``Document.title()`` when the
    # document has no usable <title>; it must not be rendered as a heading.
    no_title_placeholder = "[no-title]"

    html_content = ""
    source = ""
    use_readability = True  # local switch: set to False to always convert the full HTML

    url = url.strip() if url else ""
    html_input = html_input.strip() if html_input else ""

    try:
        # --- Obtain the HTML content ---
        if url:
            # Compare the scheme case-insensitively so that "HTTP://host" is
            # not rewritten to "https://HTTP://host".
            if not url.lower().startswith(('http://', 'https://')):
                url = 'https://' + url
                print(f"Prepended https:// => {url}")
            # Built after normalisation so messages show the URL actually fetched.
            source = f"URL ({url})"
            print(f"Fetching HTML from URL: {url}")
            try:
                # NOTE(review): the URL comes straight from the caller and is
                # fetched with redirects enabled; if this runs on a host with
                # access to internal services, consider restricting targets.
                response = requests.get(url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True)
                response.raise_for_status()
                # Prefer the encoding detected from the body over the header default.
                response.encoding = response.apparent_encoding or 'utf-8'
                html_content = response.text
                print(f"Fetched {len(html_content)} bytes.")
            except (requests.exceptions.MissingSchema, requests.exceptions.InvalidURL):
                return f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
            except requests.exceptions.Timeout:
                return f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds for URL: `{url}`"
            except requests.exceptions.RequestException as e:
                return f"❌ Error: Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
            except Exception:
                return f"❌ Unexpected error fetching URL.\n```\n{traceback.format_exc()}\n```"
        elif html_input:
            source = "Direct HTML Input"
            print(f"Using direct HTML input ({len(html_input)} bytes).")
            html_content = html_input
        else:
            return "❓ Please provide a URL or paste HTML content above."

        # --- Main-content extraction with Readability ---
        if not html_content:
            return f"❓ No HTML content found from {source}."

        processed_html = html_content
        article_title = ""
        if use_readability:
            print("Trying Readability content extraction...")
            try:
                doc = Document(html_content)
                title_candidate = (doc.title() or "").strip()
                if title_candidate != no_title_placeholder:
                    article_title = title_candidate
                summary_html = doc.summary()
                # Only accept the summary when it still contains visible text.
                if BeautifulSoup(summary_html, 'html.parser').text.strip():
                    processed_html = summary_html
                    print(f"Extracted title: {article_title}")
                else:
                    print("Readability returned empty summary. Using full HTML.")
            except Exception:
                # Best effort: fall back to the full HTML, but keep the cause
                # visible in the logs instead of discarding it.
                print(f"Readability failed. Using full HTML.\n{traceback.format_exc()}")

        # --- Conversion to Markdown ---
        if not processed_html.strip():
            return "❓ Processed HTML is empty."

        print(f"Converting HTML ({len(processed_html)} chars) to Markdown...")
        try:
            markdown_output = markdownify(processed_html, heading_style="ATX", bullets='*')
            print(f"Markdown generated ({len(markdown_output)} chars).")
            final_output = f"# {article_title}\n\n{markdown_output}" if article_title else markdown_output
            if not final_output.strip():
                return "ℹ️ Conversion resulted in empty Markdown."
            return final_output
        except Exception:
            return f"❌ Markdown conversion failed.\n```\n{traceback.format_exc()}\n```"
    except Exception:
        # Last-resort boundary so the UI always receives a string.
        return f"❌ Unexpected processing error.\n```\n{traceback.format_exc()}\n```"
# Gradio UI

# Static text shown around the form.
title = "HTML to Markdown Converter (Smart Extraction)"
description = (
    "\n"
    "Enter a URL **or** paste HTML code below.\n"
    "This tool uses Mozilla's Readability to extract the main content and converts it to Markdown.\n"
)
article = (
    "\n"
    "**How it works:**\n"
    "- Fetches the HTML using `requests`\n"
    "- Extracts main content using `readability-lxml`\n"
    "- Converts to Markdown using `markdownify`\n"
)

# Input and output components, in the order the converter function expects them.
url_input = gr.Textbox(
    placeholder="e.g., en.wikipedia.org/wiki/Markdown",
    label="Enter URL (takes priority)",
)
html_input = gr.Textbox(
    placeholder="<h1>Hello</h1><p>Example content.</p>",
    lines=10,
    label="Or Paste HTML Code Here",
)
markdown_output = gr.Textbox(
    show_copy_button=True,
    interactive=False,
    lines=20,
    label="Converted Markdown Output",
)

# Each example row is [url, html]; exactly one of the two is filled in.
_example_rows = [
    ["https://gradio.app/quickstart/", ""],
    ["https://en.wikipedia.org/wiki/Python_(programming_language)", ""],
    ["https://www.bbc.com/news", ""],
    ["", "<body><main><h1>Main Title</h1><p>Article content here.</p></main></body>"],
    ["https://httpbin.org/delay/5", ""],
    ["invalid-url", ""],
    ["", "<p>Just a simple paragraph.</p>"],
]

iface = gr.Interface(
    fn=html_to_markdown_converter,
    inputs=[url_input, html_input],
    outputs=markdown_output,
    examples=_example_rows,
    cache_examples=False,
    allow_flagging="never",
    title=title,
    description=description,
    article=article,
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()
|