13ze committed on
Commit
9dacd92
·
verified ·
1 Parent(s): 6ee0735

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +109 -57
app.py CHANGED
@@ -1,70 +1,122 @@
1
  import gradio as gr
2
- from markdownify import markdownify as md
3
- from bs4 import BeautifulSoup
4
- from playwright.sync_api import sync_playwright
5
- import re
6
 
7
- def is_url(text: str) -> bool:
8
- return text.strip().lower().startswith(("http://", "https://"))
 
9
 
10
- def beautify_markdown(markdown_text: str) -> str:
11
- markdown_text = re.sub(r'\n{3,}', '\n\n', markdown_text)
12
- markdown_text = re.sub(r'[ \t]+$', '', markdown_text, flags=re.MULTILINE)
13
- return markdown_text.strip()
 
 
 
14
 
15
- def convert_to_markdown(input_text: str, strip_tags: list[str], request: gr.Request):
16
- if not input_text.strip():
17
- return "# Por favor, insira uma URL ou HTML."
18
 
19
- if is_url(input_text):
20
- try:
21
- user_agent = request.headers.get("user-agent", "")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
- with sync_playwright() as p:
24
- browser = p.chromium.launch(
25
- args=["--single-process", "--no-zygote", "--no-sandbox",
26
- "--disable-gpu", "--disable-dev-shm-usage", "--headless=new"]
27
- )
28
- context = browser.new_context(user_agent=user_agent)
29
- page = context.new_page()
30
- response = page.goto(url=input_text)
31
- content = page.content()
32
- title = page.title()
33
- browser.close()
34
 
35
- soup = BeautifulSoup(content, "html.parser")
36
- for tag in ["script", "style"]:
37
- for t in soup.find_all(tag):
38
- t.decompose()
39
 
40
- html_part = soup.find("main") or soup.find("body")
41
- markdown = md(str(html_part), strip=strip_tags)
42
- return beautify_markdown(f"# {title}\n\n{markdown}")
43
- except Exception as e:
44
- return f"# Erro ao carregar a URL\n\n```\n{e}\n```"
45
- else:
46
  try:
47
- markdown = md(input_text, heading_style="ATX")
48
- return beautify_markdown(markdown)
 
 
 
 
 
49
  except Exception as e:
50
- return f"# Erro ao converter HTML\n\n```\n{e}\n```"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
- gr.Interface(
53
- fn=convert_to_markdown,
54
- inputs=[
55
- gr.Code(label="URL ou HTML", language="html"),
56
- gr.CheckboxGroup(
57
- label="Ignorar tags (válido apenas para URL)",
58
- choices=["a", "img", "noscript"],
59
- value=[]
60
- )
61
- ],
62
- outputs=gr.Code(label="Markdown", language="markdown"),
63
- title="URL ou HTML → Markdown",
64
- description="Cole uma URL ou HTML abaixo. A conversão será feita automaticamente.",
65
- allow_flagging="never",
 
 
 
 
 
 
 
 
 
 
 
 
66
  examples=[
67
- ["https://www.exemplo.com", []],
68
- ['<h1>Título</h1><p>Texto <strong>negrito</strong></p>', []],
 
 
69
  ]
70
- ).launch()
 
 
 
 
 
1
  import gradio as gr
2
+ import requests
3
+ from markdownify import markdownify
4
+ import traceback # To help format potential errors
 
5
 
6
+ # Configure requests with a timeout and user-agent
7
+ DEFAULT_TIMEOUT = 15 # seconds
8
+ HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)'} # Be polite
9
 
10
def html_to_markdown_converter(url: str, html_input: str) -> str:
    """Convert HTML, fetched from a URL or pasted directly, into Markdown.

    The URL takes priority: when ``url`` is non-empty, ``html_input`` is
    ignored, even if fetching the URL fails.

    Args:
        url: Address of the page to fetch. Must start with ``http://`` or
            ``https://``. May be empty or ``None``.
        html_input: Raw HTML to convert when no URL is given. May be empty
            or ``None``.

    Returns:
        The Markdown text on success. Otherwise a human-readable message
        (prefixed with a question mark or cross emoji) describing what went
        wrong; fetch and conversion failures are reported in the returned
        string rather than raised.
    """
    # An untouched field may arrive as None; normalise both inputs to str.
    url = url.strip() if url else ""
    html_input = html_input.strip() if html_input else ""

    if not url and not html_input:
        return "❓ Please provide a URL or paste HTML content in the fields above."

    try:
        # --- Step 1: Get HTML Content ---
        if url:
            source = f"URL ({url})"

            # Reject anything that is not plain HTTP(S) up front, so the user
            # gets a clear message instead of a library-specific exception.
            if not url.lower().startswith(("http://", "https://")):
                return (
                    "❌ **Error:** Only `http://` and `https://` URLs are "
                    f"supported: `{url}`"
                )

            print(f"Attempting to fetch HTML from URL: {url}")
            try:
                response = requests.get(
                    url,
                    timeout=DEFAULT_TIMEOUT,
                    headers=HEADERS,
                    allow_redirects=True,
                )
                response.raise_for_status()  # HTTPError for 4xx / 5xx

                # Trust a charset the server declared explicitly. Only fall
                # back to content sniffing when none was sent; overriding a
                # declared charset with a guess can garble valid pages.
                content_type = response.headers.get("Content-Type", "")
                if "charset=" not in content_type.lower():
                    response.encoding = response.apparent_encoding or "utf-8"

                html_content = response.text
                print(f"Successfully fetched {len(html_content)} characters from URL.")
            except requests.exceptions.Timeout:
                return f"❌ **Error:** Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
            except requests.exceptions.RequestException as e:
                print(f"Request failed: {e}")
                return f"❌ **Error:** Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
            except Exception as e:
                print(f"An unexpected error occurred during fetch: {e}")
                return f"❌ **Error:** An unexpected error occurred while fetching the URL.\n```\n{traceback.format_exc()}\n```"
        else:
            source = "Direct HTML Input"
            print(f"Using direct HTML input ({len(html_input)} characters).")
            html_content = html_input

        # --- Step 2: Convert to Markdown ---
        # A successful request can still come back with an empty body.
        if not html_content:
            return f"❓ No HTML content found from {source}."

        print(f"Attempting to convert HTML from {source} to Markdown...")
        try:
            # ATX style emits "#"-prefixed headings.
            markdown_output = markdownify(html_content, heading_style="ATX")
            print(f"Conversion successful. Markdown length: {len(markdown_output)}")
            return markdown_output
        except Exception as e:
            print(f"Markdown conversion failed: {e}")
            # Return the error in a Markdown code block for readability.
            return f"❌ **Error:** Failed to convert HTML to Markdown.\n```\n{traceback.format_exc()}\n```"

    except Exception as e:
        # Last-resort boundary: report anything unexpected to the user
        # instead of letting it propagate to the UI framework.
        print(f"An unexpected error occurred: {e}")
        return f"❌ **Error:** An unexpected error occurred during processing.\n```\n{traceback.format_exc()}\n```"
72
+
73
# --- Gradio Interface ---

# Input widgets: a single-line field for the URL and a taller box for raw HTML.
url_input = gr.Textbox(
    placeholder="e.g., https://en.wikipedia.org/wiki/Markdown",
    label="Enter URL (gets priority)",
)
html_input_area = gr.Textbox(
    placeholder="e.g., <h1>Hello</h1><p>This is <b>bold</b>.</p>",
    lines=10,
    label="Or Paste HTML Code Here",
)

# Output widget: shows the string returned by the converter.
markdown_output_display = gr.Markdown(label="Converted Markdown Output")

# Static page copy shown around the form.
title = "HTML to Markdown Converter"
description = """
Enter a URL **or** paste HTML code directly into the text box below.
The tool will fetch the HTML (if URL is provided) and convert it into Markdown.
The converted Markdown will be displayed below. Priority is given to the URL input if both fields are filled.
"""
article = """
**How it works:**
1. Uses the `requests` library to fetch content from URLs.
2. Uses the `markdownify` library to convert HTML source code into Markdown text.
3. The output is displayed in a rendered Markdown format.

**Note on 'Beautification':** The `markdownify` library aims to produce clean, standard Markdown. The rendering in the output box provides visual clarity. No additional styling rules are applied beyond standard Markdown conversion.
"""

# Wire the converter to its widgets. Each example row is [url, html] and
# covers one path: URL fetch, pasted HTML, a slow URL, and an unresolvable host.
iface = gr.Interface(
    fn=html_to_markdown_converter,
    inputs=[url_input, html_input_area],
    outputs=markdown_output_display,
    title=title,
    description=description,
    article=article,
    allow_flagging="never",
    examples=[
        # Example using URL
        ["https://gradio.app/quickstart/", ""],
        # Example using direct HTML
        ["", "<h2>Example HTML</h2><p>Convert <em>this</em> snippet.</p><ul><li>Item 1</li><li>Item 2</li></ul>"],
        # Example slow URL (might timeout)
        ["https://httpbin.org/delay/20", ""],
        # Example invalid URL
        ["https://invalid-url-that-does-not-exist-probably.xyz", ""],
    ],
)

# Launch the app (for local testing or Hugging Face Spaces)
if __name__ == "__main__":
    iface.launch()