hf1agideia committed on
Commit
904a38a
·
verified ·
1 Parent(s): 9b3e1b2

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +135 -0
app.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ from markdownify import markdownify
4
+ import traceback
5
+ from readability import Document
6
+ from bs4 import BeautifulSoup
7
+
8
+ # Global settings
9
+ DEFAULT_TIMEOUT = 15 # seconds; passed as the timeout of the outbound requests.get call
10
+ HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)'} # sent with the outbound fetch
11
+
12
def html_to_markdown_converter(url: str, html_input: str) -> str:
    """Convert HTML, fetched from a URL or pasted directly, into Markdown.

    A non-empty ``url`` takes priority over ``html_input``. The HTML is run
    through Readability to isolate the main content; when that fails or
    yields no text, the full HTML is converted instead. When a real page
    title is found it is emitted as a leading ``#`` heading.

    Args:
        url: Address of the page to fetch. ``https://`` is prepended when the
            value has no ``http://``/``https://`` scheme (checked
            case-insensitively). May be empty or ``None``.
        html_input: Raw HTML used when ``url`` is empty. May be empty or
            ``None``.

    Returns:
        The generated Markdown, or a human-readable status message (prefixed
        with an emoji) explaining why no Markdown was produced. Failures are
        reported through the return value rather than raised.
    """
    url = url.strip() if url else ""
    html_input = html_input.strip() if html_input else ""

    try:
        # --- Obtain the HTML ---
        if url:
            # Case-insensitive so "HTTP://host" is not turned into
            # "https://HTTP://host".
            if not url.lower().startswith(('http://', 'https://')):
                url = 'https://' + url
                print(f"Prepended https:// => {url}")
            source = f"URL ({url})"
            html_content, fetch_error = _fetch_html(url)
            if fetch_error is not None:
                return fetch_error
        elif html_input:
            source = "Direct HTML Input"
            print(f"Using direct HTML input ({len(html_input)} characters).")
            html_content = html_input
        else:
            return "❓ Please provide a URL or paste HTML content above."

        if not html_content:
            return f"❓ No HTML content found from {source}."

        # --- Extract the main content with Readability ---
        article_title, processed_html = _extract_main_content(html_content)

        # --- Convert to Markdown ---
        if not processed_html.strip():
            return "❓ Processed HTML is empty."

        print(f"Converting HTML ({len(processed_html)} chars) to Markdown...")
        try:
            markdown_output = markdownify(processed_html, heading_style="ATX", bullets='*')
            print(f"Markdown generated ({len(markdown_output)} chars).")

            final_output = f"# {article_title}\n\n{markdown_output}" if article_title else markdown_output

            if not final_output.strip():
                return "ℹ️ Conversion resulted in empty Markdown."

            return final_output
        except Exception:
            return f"❌ Markdown conversion failed.\n```\n{traceback.format_exc()}\n```"

    except Exception:
        # Top-level boundary for the UI callback: report instead of raising.
        return f"❌ Unexpected processing error.\n```\n{traceback.format_exc()}\n```"


# Placeholder that readability-lxml's Document.title() returns when the page
# has no usable <title>; it must not be rendered as a heading.
_READABILITY_NO_TITLE = "[no-title]"


def _fetch_html(url):
    """Download ``url`` and return ``(html, error_message)``; one of them is None."""
    print(f"Fetching HTML from URL: {url}")
    try:
        response = requests.get(url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True)
        response.raise_for_status()
        # Decode with the detected encoding, falling back to UTF-8.
        response.encoding = response.apparent_encoding or 'utf-8'
        html = response.text
    except (requests.exceptions.MissingSchema, requests.exceptions.InvalidURL):
        return None, f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
    except requests.exceptions.Timeout:
        return None, f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds for URL: `{url}`"
    except requests.exceptions.RequestException as e:
        return None, f"❌ Error: Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
    except Exception:
        return None, f"❌ Unexpected error fetching URL.\n```\n{traceback.format_exc()}\n```"
    print(f"Fetched {len(html)} characters.")
    return html, None


def _extract_main_content(html):
    """Return ``(title, html_to_convert)``, falling back to the full ``html``."""
    print("Trying Readability content extraction...")
    title = ""
    try:
        doc = Document(html)
        title = (doc.title() or "").strip()
        if title == _READABILITY_NO_TITLE:
            title = ""
        summary_html = doc.summary()
        if BeautifulSoup(summary_html, 'html.parser').text.strip():
            print(f"Extracted title: {title}")
            return title, summary_html
        print("Readability returned empty summary. Using full HTML.")
    except Exception as exc:
        # Extraction is best-effort: log the cause and convert the full HTML.
        print(f"Readability failed ({exc!r}). Using full HTML.")
    return title, html
96
+
97
# Gradio UI: static text shown on the page.
title = "HTML to Markdown Converter (Smart Extraction)"
description = (
    "\n"
    "Enter a URL **or** paste HTML code below.\n"
    "This tool uses Mozilla's Readability to extract the main content and converts it to Markdown.\n"
)
article = (
    "\n"
    "**How it works:**\n"
    "- Fetches the HTML using `requests`\n"
    "- Extracts main content using `readability-lxml`\n"
    "- Converts to Markdown using `markdownify`\n"
)

# Input and output widgets; the URL box wins when both inputs are filled.
url_input = gr.Textbox(
    label="Enter URL (takes priority)",
    placeholder="e.g., en.wikipedia.org/wiki/Markdown",
)
html_input = gr.Textbox(
    label="Or Paste HTML Code Here",
    lines=10,
    placeholder="<h1>Hello</h1><p>Example content.</p>",
)
markdown_output = gr.Textbox(
    label="Converted Markdown Output",
    lines=20,
    interactive=False,
    show_copy_button=True,
)

# Each example row is [url, html], in the order the inputs are declared.
_EXAMPLE_ROWS = [
    ["https://gradio.app/quickstart/", ""],
    ["https://en.wikipedia.org/wiki/Python_(programming_language)", ""],
    ["https://www.bbc.com/news", ""],
    ["", "<body><main><h1>Main Title</h1><p>Article content here.</p></main></body>"],
    ["https://httpbin.org/delay/5", ""],
    ["invalid-url", ""],
    ["", "<p>Just a simple paragraph.</p>"],
]

iface = gr.Interface(
    fn=html_to_markdown_converter,
    inputs=[url_input, html_input],
    outputs=markdown_output,
    title=title,
    description=description,
    article=article,
    examples=_EXAMPLE_ROWS,
    cache_examples=False,
    allow_flagging="never",
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()