Spaces:

hf1agideia
/

md1

Sleeping

App Files Files Community

md1 / app.py

hf1agideia

Create app.py

904a38a verified 9 months ago

raw

history blame contribute delete

5.43 kB

	import gradio as gr
	import requests
	from markdownify import markdownify
	import traceback
	from readability import Document
	from bs4 import BeautifulSoup

	# Global settings used when fetching pages.
	DEFAULT_TIMEOUT = 15  # timeout, in seconds, passed to outgoing HTTP requests
	HEADERS = {
	    # Identifies this tool to the servers it downloads from.
	    "User-Agent": "GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)",
	}

	def html_to_markdown_converter(url: str, html_input: str) -> str:
	    """Convert HTML, fetched from a URL or pasted directly, into Markdown.

	    A non-empty ``url`` takes priority: the page is downloaded with
	    ``requests`` and ``html_input`` is ignored. Otherwise ``html_input`` is
	    used as the HTML. The HTML is first run through Readability to isolate
	    the main content; if that step fails or yields no text, the full HTML
	    is converted instead.

	    Args:
	        url: Address of the page to fetch. ``https://`` is prepended when
	            no ``http://``/``https://`` scheme is present. Surrounding
	            whitespace is ignored.
	        html_input: Raw HTML, used only when ``url`` is empty.

	    Returns:
	        The Markdown text, prefixed with a ``# <title>`` heading when the
	        document has a real title. If nothing could be converted, a
	        human-readable message starting with ❌, ❓ or ℹ️ is returned
	        instead. Errors are reported in the returned string rather than
	        raised.
	    """
	    try:
	        # Normalised inside the try so that a non-string argument is
	        # reported as a message like every other failure.
	        url = url.strip() if url else ""
	        html_input = html_input.strip() if html_input else ""

	        # --- Obtain the HTML ---
	        if url:
	            # NOTE(security): the URL comes straight from the caller and is
	            # fetched server-side, so internal hosts are reachable (SSRF).
	            # Restrict targets if this runs with private network access.
	            print(f"Fetching HTML from URL: {url}")
	            try:
	                # Case-insensitive so "HTTP://host" is not rewritten to
	                # "https://HTTP://host".
	                if not url.lower().startswith(('http://', 'https://')):
	                    url = 'https://' + url
	                    print(f"Prepended https:// => {url}")

	                response = requests.get(
	                    url,
	                    timeout=DEFAULT_TIMEOUT,
	                    headers=HEADERS,
	                    allow_redirects=True,
	                )
	                response.raise_for_status()
	                # Decode with the encoding detected from the body, falling
	                # back to UTF-8 when detection gives nothing.
	                response.encoding = response.apparent_encoding or 'utf-8'
	                html_content = response.text
	                print(f"Fetched {len(html_content)} chars.")
	            except (requests.exceptions.MissingSchema, requests.exceptions.InvalidURL):
	                return f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
	            except requests.exceptions.Timeout:
	                return f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds for URL: `{url}`"
	            except requests.exceptions.RequestException as e:
	                return f"❌ Error: Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
	            except Exception:
	                return f"❌ Unexpected error fetching URL.\n```\n{traceback.format_exc()}\n```"
	            # Set after normalisation so it names the URL actually fetched.
	            source = f"URL ({url})"
	        elif html_input:
	            source = "Direct HTML Input"
	            print(f"Using direct HTML input ({len(html_input)} chars).")
	            html_content = html_input
	        else:
	            return "❓ Please provide a URL or paste HTML content above."

	        if not html_content:
	            return f"❓ No HTML content found from {source}."

	        # --- Main-content extraction with Readability (best effort) ---
	        processed_html = html_content
	        article_title = ""
	        print("Trying Readability content extraction...")
	        try:
	            doc = Document(html_content)
	            raw_title = (doc.title() or "").strip()
	            # readability-lxml reports a missing <title> as the literal
	            # "[no-title]"; treat that as "no title" so it never ends up
	            # as the Markdown heading.
	            if raw_title != "[no-title]":
	                article_title = raw_title
	            summary_html = doc.summary()
	            if BeautifulSoup(summary_html, 'html.parser').text.strip():
	                processed_html = summary_html
	                print(f"Extracted title: {article_title}")
	            else:
	                print("Readability returned empty summary. Using full HTML.")
	        except Exception as exc:
	            # Extraction is optional: log why it failed and convert the
	            # full HTML instead of aborting.
	            print(f"Readability failed ({exc!r}). Using full HTML.")

	        # --- Conversion to Markdown ---
	        if not processed_html.strip():
	            return "❓ Processed HTML is empty."

	        print(f"Converting HTML ({len(processed_html)} chars) to Markdown...")
	        try:
	            markdown_output = markdownify(processed_html, heading_style="ATX", bullets='*')
	        except Exception:
	            return f"❌ Markdown conversion failed.\n```\n{traceback.format_exc()}\n```"
	        print(f"Markdown generated ({len(markdown_output)} chars).")

	        if article_title:
	            final_output = f"# {article_title}\n\n{markdown_output}"
	        else:
	            final_output = markdown_output

	        if not final_output.strip():
	            return "ℹ️ Conversion resulted in empty Markdown."
	        return final_output

	    except Exception:
	        # Last-resort boundary: the caller always gets a string back.
	        return f"❌ Unexpected processing error.\n```\n{traceback.format_exc()}\n```"

	# --- Gradio UI ---
	# Texts shown around the form.
	title = "HTML to Markdown Converter (Smart Extraction)"
	description = """
	Enter a URL or paste HTML code below.
	This tool uses Mozilla's Readability to extract the main content and converts it to Markdown.
	"""
	article = """
	How it works:
	- Fetches the HTML using `requests`
	- Extracts main content using `readability-lxml`
	- Converts to Markdown using `markdownify`
	"""

	# Form widgets: two text inputs and one read-only output box.
	url_input = gr.Textbox(
	    label="Enter URL (takes priority)",
	    placeholder="e.g., en.wikipedia.org/wiki/Markdown",
	)
	html_input = gr.Textbox(
	    label="Or Paste HTML Code Here",
	    lines=10,
	    placeholder="<h1>Hello</h1><p>Example content.</p>",
	)
	markdown_output = gr.Textbox(
	    label="Converted Markdown Output",
	    lines=20,
	    interactive=False,
	    show_copy_button=True,
	)

	# One row per example, in the same order as the inputs: [url, html].
	_example_rows = [
	    ["https://gradio.app/quickstart/", ""],
	    ["https://en.wikipedia.org/wiki/Python_(programming_language)", ""],
	    ["https://www.bbc.com/news", ""],
	    ["", "<body><main><h1>Main Title</h1><p>Article content here.</p></main></body>"],
	    ["https://httpbin.org/delay/5", ""],
	    ["invalid-url", ""],
	    ["", "<p>Just a simple paragraph.</p>"],
	]

	# Wire the converter function to the widgets.
	iface = gr.Interface(
	    fn=html_to_markdown_converter,
	    inputs=[url_input, html_input],
	    outputs=markdown_output,
	    title=title,
	    description=description,
	    article=article,
	    examples=_example_rows,
	    cache_examples=False,
	    allow_flagging="never",
	)

	# Start the web app only when this file is run as a script.
	if __name__ == "__main__":
	    iface.launch()