"""Gradio app that converts HTML (fetched from a URL or pasted in) to Markdown."""

import traceback

import gradio as gr
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify
from readability import Document

# Global settings
DEFAULT_TIMEOUT = 15  # seconds
HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)'}

# Placeholder that readability-lxml returns from Document.title() when the
# document has no usable <title>; it must not be rendered as a heading.
_NO_TITLE_PLACEHOLDER = "[no-title]"


def html_to_markdown_converter(url: str, html_input: str) -> str:
    """Convert HTML (from a URL or from direct input) to Markdown.

    The URL takes priority when both arguments are non-empty. The main
    content is extracted with Readability first; if that fails or yields
    nothing, the full HTML is converted instead.

    Args:
        url: Address to fetch. ``https://`` is prepended when no scheme
            is given. May be empty or ``None``.
        html_input: Raw HTML used when ``url`` is empty. May be empty or
            ``None``.

    Returns:
        The generated Markdown (prefixed with a ``# title`` heading when a
        title was extracted), or a human-readable status/error message.
        Exceptions are never propagated to the caller; they are reported
        in the returned string.
    """
    html_content = ""
    source = ""
    use_readability = True

    url = url.strip() if url else ""
    html_input = html_input.strip() if html_input else ""

    try:
        # --- Obtain the HTML content ---
        if url:
            source = f"URL ({url})"
            print(f"Fetching HTML from URL: {url}")
            # NOTE(review): the URL is user-supplied and fetched server-side;
            # consider blocking internal/private addresses if this app is
            # exposed publicly.
            try:
                if not url.startswith(('http://', 'https://')):
                    url = 'https://' + url
                    print(f"Prepended https:// => {url}")
                response = requests.get(
                    url,
                    timeout=DEFAULT_TIMEOUT,
                    headers=HEADERS,
                    allow_redirects=True,
                )
                response.raise_for_status()
                # Prefer the detected encoding over the header-declared one.
                response.encoding = response.apparent_encoding or 'utf-8'
                html_content = response.text
                print(f"Fetched {len(html_content)} bytes.")
            except (requests.exceptions.MissingSchema, requests.exceptions.InvalidURL):
                return f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
            except requests.exceptions.Timeout:
                return f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds for URL: `{url}`"
            except requests.exceptions.RequestException as e:
                return f"❌ Error: Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
            except Exception:
                return f"❌ Unexpected error fetching URL.\n```\n{traceback.format_exc()}\n```"
        elif html_input:
            source = "Direct HTML Input"
            print(f"Using direct HTML input ({len(html_input)} bytes).")
            html_content = html_input
        else:
            return "❓ Please provide a URL or paste HTML content above."

        # --- Extraction with Readability ---
        if not html_content:
            return f"❓ No HTML content found from {source}."

        processed_html = html_content
        article_title = ""
        if use_readability:
            print("Trying Readability content extraction...")
            try:
                doc = Document(html_content)
                article_title = doc.title().strip()
                if article_title == _NO_TITLE_PLACEHOLDER:
                    # No real title: avoid emitting a "# [no-title]" heading.
                    article_title = ""
                processed_html_summary = doc.summary()
                soup = BeautifulSoup(processed_html_summary, 'html.parser')
                if not soup.text.strip():
                    print("Readability returned empty summary. Using full HTML.")
                else:
                    processed_html = processed_html_summary
                    print(f"Extracted title: {article_title}")
            except Exception as exc:
                # Best effort: fall back to the full HTML, but keep the reason
                # visible in the logs instead of discarding it.
                print(f"Readability failed ({exc}). Using full HTML.")

        # --- Conversion to Markdown ---
        if not processed_html.strip():
            return "❓ Processed HTML is empty."

        print(f"Converting HTML ({len(processed_html)} chars) to Markdown...")
        try:
            markdown_output = markdownify(processed_html, heading_style="ATX", bullets='*')
            print(f"Markdown generated ({len(markdown_output)} chars).")
            final_output = (
                f"# {article_title}\n\n{markdown_output}" if article_title else markdown_output
            )
            if not final_output.strip():
                return "ℹ️ Conversion resulted in empty Markdown."
            return final_output
        except Exception:
            return f"❌ Markdown conversion failed.\n```\n{traceback.format_exc()}\n```"

    except Exception:
        # Last-resort boundary so the UI always receives a string.
        return f"❌ Unexpected processing error.\n```\n{traceback.format_exc()}\n```"


# Gradio UI
title = "HTML to Markdown Converter (Smart Extraction)"
description = """
Enter a URL **or** paste HTML code below.
This tool uses Mozilla's Readability to extract the main content and converts it to Markdown.
"""
article = """
**How it works:**
- Fetches the HTML using `requests`
- Extracts main content using `readability-lxml`
- Converts to Markdown using `markdownify`
"""

# NOTE(review): the sample HTML strings below (placeholder and examples) appear
# to have lost their markup (e.g. <h1>/<p> tags) somewhere upstream; only the
# text content is preserved here. Restore the intended tags — TODO confirm.
url_input = gr.Textbox(
    label="Enter URL (takes priority)",
    placeholder="e.g., en.wikipedia.org/wiki/Markdown",
)
html_input = gr.Textbox(
    label="Or Paste HTML Code Here",
    lines=10,
    placeholder="\n\nHello\n\nExample content.\n\n",
)
markdown_output = gr.Textbox(
    label="Converted Markdown Output",
    lines=20,
    interactive=False,
    show_copy_button=True,
)

iface = gr.Interface(
    fn=html_to_markdown_converter,
    inputs=[url_input, html_input],
    outputs=markdown_output,
    title=title,
    description=description,
    article=article,
    examples=[
        ["https://gradio.app/quickstart/", ""],
        ["https://en.wikipedia.org/wiki/Python_(programming_language)", ""],
        ["https://www.bbc.com/news", ""],
        ["", "\n\nMain Title\n\nArticle content here.\n\n"],
        ["https://httpbin.org/delay/5", ""],
        ["invalid-url", ""],
        ["", "\n\nJust a simple paragraph.\n\n"],
    ],
    cache_examples=False,
    allow_flagging="never",
)

if __name__ == "__main__":
    iface.launch()