File size: 5,434 Bytes
904a38a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import gradio as gr
import requests
from markdownify import markdownify
import traceback
from readability import Document
from bs4 import BeautifulSoup

# Global configuration
DEFAULT_TIMEOUT = 15  # seconds — per-request timeout for requests.get
HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)'}  # identify the tool to remote servers

def html_to_markdown_converter(url: str, html_input: str) -> str:
    """
    Convert HTML (fetched from a URL or pasted directly) to Markdown.

    The URL takes priority over pasted HTML. Readability is used to
    extract the main article content first; if extraction fails or yields
    an empty summary, the full HTML document is converted instead.

    Args:
        url: Page to fetch. A missing scheme is defaulted to ``https://``.
        html_input: Raw HTML, used only when no URL is provided.

    Returns:
        The converted Markdown (prefixed with an ``# <title>`` heading when
        Readability found one), or a user-facing emoji-prefixed error/status
        message. Errors are returned as strings rather than raised so that
        Gradio can display them in the output textbox.
    """
    html_content = ""
    source = ""
    use_readability = True

    url = url.strip() if url else ""
    html_input = html_input.strip() if html_input else ""

    try:
        # --- Acquire the HTML content ---
        if url:
            source = f"URL ({url})"
            print(f"Fetching HTML from URL: {url}")
            try:
                # Be forgiving about bare hostnames like "example.com".
                if not url.startswith(('http://', 'https://')):
                    url = 'https://' + url
                    print(f"Prepended https:// => {url}")

                response = requests.get(url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True)
                response.raise_for_status()
                # apparent_encoding guesses from the body bytes; fall back to utf-8
                # so response.text never decodes with a bogus charset header.
                response.encoding = response.apparent_encoding or 'utf-8'
                html_content = response.text
                print(f"Fetched {len(html_content)} bytes.")
            except (requests.exceptions.MissingSchema, requests.exceptions.InvalidURL):
                return f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
            except requests.exceptions.Timeout:
                return f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds for URL: `{url}`"
            except requests.exceptions.RequestException as e:
                return f"❌ Error: Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
            except Exception:
                # Last-resort guard; the traceback is the only useful detail here.
                return f"❌ Unexpected error fetching URL.\n```\n{traceback.format_exc()}\n```"
        elif html_input:
            source = "Direct HTML Input"
            print(f"Using direct HTML input ({len(html_input)} bytes).")
            html_content = html_input
        else:
            return "❓ Please provide a URL or paste HTML content above."

        # --- Main-content extraction with Readability ---
        if not html_content:
            return f"❓ No HTML content found from {source}."

        processed_html = html_content
        article_title = ""

        if use_readability:
            print("Trying Readability content extraction...")
            try:
                doc = Document(html_content)
                article_title = doc.title().strip()
                processed_html_summary = doc.summary()
                # Readability can return a structurally valid but textually
                # empty summary; in that case keep the full document.
                soup = BeautifulSoup(processed_html_summary, 'html.parser')
                if not soup.text.strip():
                    print("Readability returned empty summary. Using full HTML.")
                else:
                    processed_html = processed_html_summary
                    print(f"Extracted title: {article_title}")
            except Exception as e:
                # Extraction is best-effort: log the cause and fall back
                # to converting the full HTML instead of failing the request.
                print(f"Readability failed ({e}). Using full HTML.")

        # --- Markdown conversion ---
        if not processed_html.strip():
            return "❓ Processed HTML is empty."

        print(f"Converting HTML ({len(processed_html)} chars) to Markdown...")
        try:
            markdown_output = markdownify(processed_html, heading_style="ATX", bullets='*')
            print(f"Markdown generated ({len(markdown_output)} chars).")

            # Prepend the Readability title as a top-level heading when present.
            final_output = f"# {article_title}\n\n{markdown_output}" if article_title else markdown_output

            if not final_output.strip():
                return "ℹ️ Conversion resulted in empty Markdown."

            return final_output
        except Exception:
            return f"❌ Markdown conversion failed.\n```\n{traceback.format_exc()}\n```"

    except Exception:
        return f"❌ Unexpected processing error.\n```\n{traceback.format_exc()}\n```"

# Gradio UI
# Static copy rendered above (title, description) and below (article) the interface.
title = "HTML to Markdown Converter (Smart Extraction)"
description = """
Enter a URL **or** paste HTML code below.
This tool uses Mozilla's Readability to extract the main content and converts it to Markdown.
"""
article = """
**How it works:**
- Fetches the HTML using `requests`
- Extracts main content using `readability-lxml`
- Converts to Markdown using `markdownify`
"""

# Input/output components; the two inputs map positionally onto
# html_to_markdown_converter(url, html_input).
url_input = gr.Textbox(label="Enter URL (takes priority)", placeholder="e.g., en.wikipedia.org/wiki/Markdown")
html_input = gr.Textbox(label="Or Paste HTML Code Here", lines=10, placeholder="<h1>Hello</h1><p>Example content.</p>")
markdown_output = gr.Textbox(label="Converted Markdown Output", lines=20, interactive=False, show_copy_button=True)

iface = gr.Interface(
    fn=html_to_markdown_converter,
    inputs=[url_input, html_input],
    outputs=markdown_output,
    title=title,
    description=description,
    article=article,
    # Examples deliberately include failure cases (slow endpoint, invalid URL)
    # so users can see the error-message behavior.
    examples=[
        ["https://gradio.app/quickstart/", ""],
        ["https://en.wikipedia.org/wiki/Python_(programming_language)", ""],
        ["https://www.bbc.com/news", ""],
        ["", "<body><main><h1>Main Title</h1><p>Article content here.</p></main></body>"],
        ["https://httpbin.org/delay/5", ""],
        ["invalid-url", ""],
        ["", "<p>Just a simple paragraph.</p>"]
    ],
    # Not cached: several examples hit the network or intentionally fail.
    cache_examples=False,
    # NOTE(review): allow_flagging is deprecated in Gradio 4.x in favor of
    # flagging_mode — confirm against the pinned gradio version before changing.
    allow_flagging="never"
)

# Launch the app only when run as a script (not when imported, e.g. by a Space runner).
if __name__ == "__main__":
    iface.launch()