Spaces:

hf1agideia
/

md1

Sleeping

App Files Community

hf1agideia committed on Apr 21, 2025

Commit

904a38a

verified ·

1 Parent(s): 9b3e1b2

Create app.py

Browse files

Files changed (1) hide show

app.py +135 -0

app.py ADDED Viewed

	@@ -0,0 +1,135 @@

+import gradio as gr
+import requests
+from markdownify import markdownify
+import traceback
+from readability import Document
+from bs4 import BeautifulSoup
+# Configurações globais
+DEFAULT_TIMEOUT = 15  # segundos
+HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)'}
+def html_to_markdown_converter(url: str, html_input: str) -> str:
+    """
+    Converte HTML (via URL ou input direto) para Markdown.
+    Tenta extrair o conteúdo principal com Readability.
+    """
+    html_content = ""
+    source = ""
+    use_readability = True
+    url = url.strip() if url else ""
+    html_input = html_input.strip() if html_input else ""
+    try:
+        # --- Obtenção do conteúdo HTML ---
+        if url:
+            source = f"URL ({url})"
+            print(f"Fetching HTML from URL: {url}")
+            try:
+                if not url.startswith(('http://', 'https://')):
+                    url = 'https://' + url
+                    print(f"Prepended https:// => {url}")
+                response = requests.get(url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True)
+                response.raise_for_status()
+                response.encoding = response.apparent_encoding or 'utf-8'
+                html_content = response.text
+                print(f"Fetched {len(html_content)} bytes.")
+            except (requests.exceptions.MissingSchema, requests.exceptions.InvalidURL):
+                return f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
+            except requests.exceptions.Timeout:
+                return f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds for URL: `{url}`"
+            except requests.exceptions.RequestException as e:
+                return f"❌ Error: Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
+            except Exception as e:
+                return f"❌ Unexpected error fetching URL.\n```\n{traceback.format_exc()}\n```"
+        elif html_input:
+            source = "Direct HTML Input"
+            print(f"Using direct HTML input ({len(html_input)} bytes).")
+            html_content = html_input
+        else:
+            return "❓ Please provide a URL or paste HTML content above."
+        # --- Extração com Readability ---
+        if not html_content:
+            return f"❓ No HTML content found from {source}."
+        processed_html = html_content
+        article_title = ""
+        if use_readability:
+            print("Trying Readability content extraction...")
+            try:
+                doc = Document(html_content)
+                article_title = doc.title().strip()
+                processed_html_summary = doc.summary()
+                soup = BeautifulSoup(processed_html_summary, 'html.parser')
+                if not soup.text.strip():
+                    print("Readability returned empty summary. Using full HTML.")
+                else:
+                    processed_html = processed_html_summary
+                    print(f"Extracted title: {article_title}")
+            except Exception as e:
+                print("Readability failed. Using full HTML.")
+        # --- Conversão para Markdown ---
+        if not processed_html.strip():
+            return "❓ Processed HTML is empty."
+        print(f"Converting HTML ({len(processed_html)} chars) to Markdown...")
+        try:
+            markdown_output = markdownify(processed_html, heading_style="ATX", bullets='*')
+            print(f"Markdown generated ({len(markdown_output)} chars).")
+            final_output = f"# {article_title}\n\n{markdown_output}" if article_title else markdown_output
+            if not final_output.strip():
+                return "ℹ️ Conversion resulted in empty Markdown."
+            return final_output
+        except Exception:
+            return f"❌ Markdown conversion failed.\n```\n{traceback.format_exc()}\n```"
+    except Exception:
+        return f"❌ Unexpected processing error.\n```\n{traceback.format_exc()}\n```"
+# Gradio UI
+title = "HTML to Markdown Converter (Smart Extraction)"
+description = """
+Enter a URL **or** paste HTML code below.
+This tool uses Mozilla's Readability to extract the main content and converts it to Markdown.
+"""
+article = """
+**How it works:**
+- Fetches the HTML using `requests`
+- Extracts main content using `readability-lxml`
+- Converts to Markdown using `markdownify`
+"""
+url_input = gr.Textbox(label="Enter URL (takes priority)", placeholder="e.g., en.wikipedia.org/wiki/Markdown")
+html_input = gr.Textbox(label="Or Paste HTML Code Here", lines=10, placeholder="<h1>Hello</h1><p>Example content.</p>")
+markdown_output = gr.Textbox(label="Converted Markdown Output", lines=20, interactive=False, show_copy_button=True)
+iface = gr.Interface(
+    fn=html_to_markdown_converter,
+    inputs=[url_input, html_input],
+    outputs=markdown_output,
+    title=title,
+    description=description,
+    article=article,
+    examples=[
+        ["https://gradio.app/quickstart/", ""],
+        ["https://en.wikipedia.org/wiki/Python_(programming_language)", ""],
+        ["https://www.bbc.com/news", ""],
+        ["", "<body><main><h1>Main Title</h1><p>Article content here.</p></main></body>"],
+        ["https://httpbin.org/delay/5", ""],
+        ["invalid-url", ""],
+        ["", "<p>Just a simple paragraph.</p>"]
+    ],
+    cache_examples=False,
+    allow_flagging="never"
+)
+if __name__ == "__main__":
+    iface.launch()