# Spaces: Sleeping
| import gradio as gr | |
| import requests | |
| from markdownify import markdownify | |
| import traceback | |
| from readability import Document | |
| from bs4 import BeautifulSoup | |
# Global settings
# Upper bound, in seconds, passed as the `timeout` of the outgoing HTTP request.
DEFAULT_TIMEOUT = 15
# Headers sent with every outgoing HTTP request; identifies this tool to servers.
HEADERS = {
    "User-Agent": "GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)",
}
# Placeholder that readability-lxml's Document.title() returns when the
# document has no usable <title>; it must never be rendered as a heading.
_READABILITY_NO_TITLE = "[no-title]"


def _fetch_html(url: str) -> str:
    """Download *url* and return the decoded response body.

    Errors raised by ``requests`` (connection failures, timeouts, HTTP error
    statuses via ``raise_for_status``) propagate to the caller.
    """
    # NOTE(security): the URL is user-supplied and fetched server-side, so this
    # request can reach internal addresses (SSRF). Restrict target hosts before
    # deploying anywhere that has access to private services.
    response = requests.get(url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True)
    response.raise_for_status()
    # Decode with the encoding detected from the body, falling back to UTF-8.
    response.encoding = response.apparent_encoding or 'utf-8'
    return response.text


def _extract_main_content(html_content: str) -> tuple:
    """Return ``(html, title)`` with the main article isolated by Readability.

    Falls back to the full *html_content* when Readability fails or yields a
    summary with no text. *title* is ``""`` when the document has no title.
    """
    print("Trying Readability content extraction...")
    title = ""
    try:
        doc = Document(html_content)
        raw_title = (doc.title() or "").strip()
        # Drop the library's "no title" placeholder instead of emitting it as "# [no-title]".
        if raw_title != _READABILITY_NO_TITLE:
            title = raw_title
        summary_html = doc.summary()
        # The summary can be markup-only; check for actual text before using it.
        if BeautifulSoup(summary_html, 'html.parser').text.strip():
            print(f"Extracted title: {title}")
            return summary_html, title
        print("Readability returned empty summary. Using full HTML.")
    except Exception as exc:
        # Best effort: extraction problems must not block conversion, but the
        # cause is logged rather than silently discarded.
        print(f"Readability failed ({exc!r}). Using full HTML.")
    return html_content, title


def html_to_markdown_converter(url: str, html_input: str) -> str:
    """Convert HTML, fetched from a URL or pasted directly, to Markdown.

    The URL takes priority when both inputs are given. The main content is
    extracted with Readability when possible; otherwise the full HTML is
    converted. When a document title is found it is prepended as an H1.

    Args:
        url: Page address. ``https://`` is prepended when no http(s) scheme
            is present. May be empty or ``None``.
        html_input: Raw HTML used when *url* is empty. May be empty or ``None``.

    Returns:
        The Markdown text, or a human-readable message (prefixed with an
        emoji marker) describing why no Markdown could be produced. This
        function does not raise; all failures are reported in the return value.
    """
    url = url.strip() if url else ""
    html_input = html_input.strip() if html_input else ""

    try:
        # --- Obtain the HTML content ---
        if url:
            # Case-insensitive check so "HTTP://host" is not double-prefixed.
            if not url.lower().startswith(('http://', 'https://')):
                url = 'https://' + url
                print(f"Prepended https:// => {url}")
            # Recorded after normalization so messages show the URL actually fetched.
            source = f"URL ({url})"
            print(f"Fetching HTML from URL: {url}")
            try:
                html_content = _fetch_html(url)
            except (requests.exceptions.MissingSchema, requests.exceptions.InvalidURL):
                return f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
            except requests.exceptions.Timeout:
                return f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds for URL: `{url}`"
            except requests.exceptions.RequestException as e:
                return f"❌ Error: Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
            except Exception:
                return f"❌ Unexpected error fetching URL.\n```\n{traceback.format_exc()}\n```"
            print(f"Fetched {len(html_content)} chars.")
        elif html_input:
            source = "Direct HTML Input"
            print(f"Using direct HTML input ({len(html_input)} chars).")
            html_content = html_input
        else:
            return "❓ Please provide a URL or paste HTML content above."

        if not html_content:
            return f"❓ No HTML content found from {source}."

        # --- Extract the main content with Readability ---
        processed_html, article_title = _extract_main_content(html_content)

        # --- Convert to Markdown ---
        if not processed_html.strip():
            return "❓ Processed HTML is empty."
        print(f"Converting HTML ({len(processed_html)} chars) to Markdown...")
        try:
            markdown_output = markdownify(processed_html, heading_style="ATX", bullets='*')
            print(f"Markdown generated ({len(markdown_output)} chars).")
            final_output = f"# {article_title}\n\n{markdown_output}" if article_title else markdown_output
            if not final_output.strip():
                return "ℹ️ Conversion resulted in empty Markdown."
            return final_output
        except Exception:
            return f"❌ Markdown conversion failed.\n```\n{traceback.format_exc()}\n```"
    except Exception:
        # Last-resort boundary: the UI callback must always return a string.
        return f"❌ Unexpected processing error.\n```\n{traceback.format_exc()}\n```"
# --- Gradio UI ---

# Static text shown by the interface: heading, intro, and footer.
title = "HTML to Markdown Converter (Smart Extraction)"
description = """
Enter a URL **or** paste HTML code below.
This tool uses Mozilla's Readability to extract the main content and converts it to Markdown.
"""
article = """
**How it works:**
- Fetches the HTML using `requests`
- Extracts main content using `readability-lxml`
- Converts to Markdown using `markdownify`
"""

# Input and output widgets, in the order the converter function expects them.
url_input = gr.Textbox(
    label="Enter URL (takes priority)",
    placeholder="e.g., en.wikipedia.org/wiki/Markdown",
)
html_input = gr.Textbox(
    label="Or Paste HTML Code Here",
    lines=10,
    placeholder="<h1>Hello</h1><p>Example content.</p>",
)
markdown_output = gr.Textbox(
    label="Converted Markdown Output",
    lines=20,
    interactive=False,
    show_copy_button=True,
)

# Each example row is [url, html]; exactly one of the two is filled in.
_EXAMPLES = [
    ["https://gradio.app/quickstart/", ""],
    ["https://en.wikipedia.org/wiki/Python_(programming_language)", ""],
    ["https://www.bbc.com/news", ""],
    ["", "<body><main><h1>Main Title</h1><p>Article content here.</p></main></body>"],
    # Presumably demonstrates a slow response (the path suggests a 5-second delay).
    ["https://httpbin.org/delay/5", ""],
    # Scheme-less value; exercises the converter's URL error handling.
    ["invalid-url", ""],
    ["", "<p>Just a simple paragraph.</p>"],
]

iface = gr.Interface(
    fn=html_to_markdown_converter,
    inputs=[url_input, html_input],
    outputs=markdown_output,
    title=title,
    description=description,
    article=article,
    examples=_EXAMPLES,
    cache_examples=False,
    allow_flagging="never",
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()