File size: 5,434 Bytes
904a38a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import gradio as gr
import requests
from markdownify import markdownify
import traceback
from readability import Document
from bs4 import BeautifulSoup

# Global configuration
DEFAULT_TIMEOUT = 15  # seconds — per-request timeout for requests.get
HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)'}  # identify the tool to remote servers

def html_to_markdown_converter(url: str, html_input: str) -> str:
    """
    Convert HTML (fetched from a URL or pasted directly) to Markdown.

    The URL takes priority over pasted HTML. Readability is used to
    extract the main article content first; if extraction fails or yields
    an empty summary, the full HTML document is converted instead.

    Args:
        url: Page to fetch. A missing scheme is defaulted to ``https://``.
        html_input: Raw HTML, used only when no URL is provided.

    Returns:
        The converted Markdown (prefixed with an ``# <title>`` heading when
        Readability found one), or a user-facing emoji-prefixed error/status
        message. Errors are returned as strings rather than raised so that
        Gradio can display them in the output textbox.
    """
    html_content = ""
    source = ""
    use_readability = True

    url = url.strip() if url else ""
    html_input = html_input.strip() if html_input else ""

    try:
        # --- Acquire the HTML content ---
        if url:
            source = f"URL ({url})"
            print(f"Fetching HTML from URL: {url}")
            try:
                # Be forgiving about bare hostnames like "example.com".
                if not url.startswith(('http://', 'https://')):
                    url = 'https://' + url
                    print(f"Prepended https:// => {url}")

                response = requests.get(url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True)
                response.raise_for_status()
                # apparent_encoding guesses from the body bytes; fall back to utf-8
                # so response.text never decodes with a bogus charset header.
                response.encoding = response.apparent_encoding or 'utf-8'
                html_content = response.text
                print(f"Fetched {len(html_content)} bytes.")
            except (requests.exceptions.MissingSchema, requests.exceptions.InvalidURL):
                return f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
            except requests.exceptions.Timeout:
                return f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds for URL: `{url}`"
            except requests.exceptions.RequestException as e:
                return f"❌ Error: Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
            except Exception:
                # Last-resort guard; the traceback is the only useful detail here.
                return f"❌ Unexpected error fetching URL.\n```\n{traceback.format_exc()}\n```"
        elif html_input:
            source = "Direct HTML Input"
            print(f"Using direct HTML input ({len(html_input)} bytes).")
            html_content = html_input
        else:
            return "❓ Please provide a URL or paste HTML content above."

        # --- Main-content extraction with Readability ---
        if not html_content:
            return f"❓ No HTML content found from {source}."

        processed_html = html_content
        article_title = ""

        if use_readability:
            print("Trying Readability content extraction...")
            try:
                doc = Document(html_content)
                article_title = doc.title().strip()
                processed_html_summary = doc.summary()
                # Readability can return a structurally valid but textually
                # empty summary; in that case keep the full document.
                soup = BeautifulSoup(processed_html_summary, 'html.parser')
                if not soup.text.strip():
                    print("Readability returned empty summary. Using full HTML.")
                else:
                    processed_html = processed_html_summary
                    print(f"Extracted title: {article_title}")
            except Exception as e:
                # Extraction is best-effort: log the cause and fall back
                # to converting the full HTML instead of failing the request.
                print(f"Readability failed ({e}). Using full HTML.")

        # --- Markdown conversion ---
        if not processed_html.strip():
            return "❓ Processed HTML is empty."

        print(f"Converting HTML ({len(processed_html)} chars) to Markdown...")
        try:
            markdown_output = markdownify(processed_html, heading_style="ATX", bullets='*')
            print(f"Markdown generated ({len(markdown_output)} chars).")

            # Prepend the Readability title as a top-level heading when present.
            final_output = f"# {article_title}\n\n{markdown_output}" if article_title else markdown_output

            if not final_output.strip():
                return "ℹ️ Conversion resulted in empty Markdown."

            return final_output
        except Exception:
            return f"❌ Markdown conversion failed.\n```\n{traceback.format_exc()}\n```"

    except Exception:
        return f"❌ Unexpected processing error.\n```\n{traceback.format_exc()}\n```"

# Gradio UI
# Static copy rendered above (title, description) and below (article) the interface.
title = "HTML to Markdown Converter (Smart Extraction)"
description = """
Enter a URL **or** paste HTML code below.
This tool uses Mozilla's Readability to extract the main content and converts it to Markdown.
"""
article = """
**How it works:**
- Fetches the HTML using `requests`
- Extracts main content using `readability-lxml`
- Converts to Markdown using `markdownify`
"""

# Input/output components; the two inputs map positionally onto
# html_to_markdown_converter(url, html_input).
url_input = gr.Textbox(label="Enter URL (takes priority)", placeholder="e.g., en.wikipedia.org/wiki/Markdown")
html_input = gr.Textbox(label="Or Paste HTML Code Here", lines=10, placeholder="<h1>Hello</h1><p>Example content.</p>")
markdown_output = gr.Textbox(label="Converted Markdown Output", lines=20, interactive=False, show_copy_button=True)

iface = gr.Interface(
    fn=html_to_markdown_converter,
    inputs=[url_input, html_input],
    outputs=markdown_output,
    title=title,
    description=description,
    article=article,
    # Examples deliberately include failure cases (slow endpoint, invalid URL)
    # so users can see the error-message behavior.
    examples=[
        ["https://gradio.app/quickstart/", ""],
        ["https://en.wikipedia.org/wiki/Python_(programming_language)", ""],
        ["https://www.bbc.com/news", ""],
        ["", "<body><main><h1>Main Title</h1><p>Article content here.</p></main></body>"],
        ["https://httpbin.org/delay/5", ""],
        ["invalid-url", ""],
        ["", "<p>Just a simple paragraph.</p>"]
    ],
    # Not cached: several examples hit the network or intentionally fail.
    cache_examples=False,
    # NOTE(review): allow_flagging is deprecated in Gradio 4.x in favor of
    # flagging_mode — confirm against the pinned gradio version before changing.
    allow_flagging="never"
)

# Launch the app only when run as a script (not when imported, e.g. by a Space runner).
if __name__ == "__main__":
    iface.launch()