Spaces:

13ze
/

scrape2MD

Sleeping

App Files Files Community

13ze committed on Apr 19, 2025

Commit

9dacd92

verified ·

1 Parent(s): 6ee0735

Update app.py

Browse files

Files changed (1) hide show

app.py +109 -57

app.py CHANGED Viewed

@@ -1,70 +1,122 @@
 import gradio as gr
-from markdownify import markdownify as md
-from bs4 import BeautifulSoup
-from playwright.sync_api import sync_playwright
-import re
-def is_url(text: str) -> bool:
-    return text.strip().lower().startswith(("http://", "https://"))
-def beautify_markdown(markdown_text: str) -> str:
-    markdown_text = re.sub(r'\n{3,}', '\n\n', markdown_text)
-    markdown_text = re.sub(r'[ \t]+$', '', markdown_text, flags=re.MULTILINE)
-    return markdown_text.strip()
-def convert_to_markdown(input_text: str, strip_tags: list[str], request: gr.Request):
-    if not input_text.strip():
-        return "# Por favor, insira uma URL ou HTML."
-    if is_url(input_text):
-        try:
-            user_agent = request.headers.get("user-agent", "")
-            with sync_playwright() as p:
-                browser = p.chromium.launch(
-                    args=["--single-process", "--no-zygote", "--no-sandbox",
-                          "--disable-gpu", "--disable-dev-shm-usage", "--headless=new"]
-                )
-                context = browser.new_context(user_agent=user_agent)
-                page = context.new_page()
-                response = page.goto(url=input_text)
-                content = page.content()
-                title = page.title()
-                browser.close()
-            soup = BeautifulSoup(content, "html.parser")
-            for tag in ["script", "style"]:
-                for t in soup.find_all(tag):
-                    t.decompose()
-            html_part = soup.find("main") or soup.find("body")
-            markdown = md(str(html_part), strip=strip_tags)
-            return beautify_markdown(f"# {title}\n\n{markdown}")
-        except Exception as e:
-            return f"# Erro ao carregar a URL\n\n```\n{e}\n```"
-    else:
         try:
-            markdown = md(input_text, heading_style="ATX")
-            return beautify_markdown(markdown)
         except Exception as e:
-            return f"# Erro ao converter HTML\n\n```\n{e}\n```"
-gr.Interface(
-    fn=convert_to_markdown,
-    inputs=[
-        gr.Code(label="URL ou HTML", language="html"),
-        gr.CheckboxGroup(
-            label="Ignorar tags (válido apenas para URL)",
-            choices=["a", "img", "noscript"],
-            value=[]
-        )
-    ],
-    outputs=gr.Code(label="Markdown", language="markdown"),
-    title="URL ou HTML → Markdown",
-    description="Cole uma URL ou HTML abaixo. A conversão será feita automaticamente.",
-    allow_flagging="never",
     examples=[
-        ["https://www.exemplo.com", []],
-        ['<h1>Título</h1><p>Texto <strong>negrito</strong></p>', []],
     ]
-).launch()

 import gradio as gr
+import requests
+from markdownify import markdownify
+import traceback # To help format potential errors
+# Configure requests with a timeout and user-agent
+DEFAULT_TIMEOUT = 15 # seconds
+HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)'} # Be polite
+def html_to_markdown_converter(url: str, html_input: str) -> str:
+    """
+    Converts HTML (from URL or direct input) to Markdown.
+    Prioritizes URL input if provided.
+    """
+    html_content = ""
+    source = ""
+    # Clean up inputs
+    url = url.strip() if url else ""
+    html_input = html_input.strip() if html_input else ""
+    try:
+        # --- Step 1: Get HTML Content ---
+        if url:
+            source = f"URL ({url})"
+            print(f"Attempting to fetch HTML from URL: {url}")
+            try:
+                response = requests.get(url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True)
+                response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
+                # Try to decode using apparent encoding, fallback to utf-8
+                response.encoding = response.apparent_encoding or 'utf-8'
+                html_content = response.text
+                print(f"Successfully fetched {len(html_content)} bytes from URL.")
+            except requests.exceptions.Timeout:
+                return f"❌ **Error:** Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
+            except requests.exceptions.RequestException as e:
+                print(f"Request failed: {e}")
+                return f"❌ **Error:** Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
+            except Exception as e:
+                print(f"An unexpected error occurred during fetch: {e}")
+                return f"❌ **Error:** An unexpected error occurred while fetching the URL.\n```\n{traceback.format_exc()}\n```"
+        elif html_input:
+            source = "Direct HTML Input"
+            print(f"Using direct HTML input ({len(html_input)} bytes).")
+            html_content = html_input
+        else:
+            return "❓ Please provide a URL or paste HTML content in the fields above."
+        # --- Step 2: Convert to Markdown ---
+        if not html_content:
+            return f"❓ No HTML content found from {source}."
+        print(f"Attempting to convert HTML from {source} to Markdown...")
         try:
+            # Use markdownify to convert
+            # You can pass options here if needed, e.g., heading_style="ATX"
+            markdown_output = markdownify(html_content, heading_style="ATX")
+            print(f"Conversion successful. Markdown length: {len(markdown_output)}")
+            # The markdown_output is already "beautified" in the sense of standard Markdown.
+            # The gr.Markdown component will render it nicely.
+            return markdown_output
         except Exception as e:
+            print(f"Markdown conversion failed: {e}")
+            # Return error in a Markdown code block for readability
+            return f"❌ **Error:** Failed to convert HTML to Markdown.\n```\n{traceback.format_exc()}\n```"
+    except Exception as e:
+        # Catch any unexpected errors in the overall logic
+        print(f"An unexpected error occurred: {e}")
+        return f"❌ **Error:** An unexpected error occurred during processing.\n```\n{traceback.format_exc()}\n```"
+# --- Gradio Interface ---
+title = "HTML to Markdown Converter"
+description = """
+Enter a URL **or** paste HTML code directly into the text box below.
+The tool will fetch the HTML (if URL is provided) and convert it into Markdown.
+The converted Markdown will be displayed below. Priority is given to the URL input if both fields are filled.
+"""
+article = """
+**How it works:**
+1.  Uses the `requests` library to fetch content from URLs.
+2.  Uses the `markdownify` library to convert HTML source code into Markdown text.
+3.  The output is displayed in a rendered Markdown format.
+**Note on 'Beautification':** The `markdownify` library aims to produce clean, standard Markdown. The rendering in the output box provides visual clarity. No additional styling rules are applied beyond standard Markdown conversion.
+"""
+# Define input components
+url_input = gr.Textbox(
+    label="Enter URL (gets priority)",
+    placeholder="e.g., https://en.wikipedia.org/wiki/Markdown"
+)
+html_input_area = gr.Textbox(
+    label="Or Paste HTML Code Here",
+    lines=10,
+    placeholder="e.g., <h1>Hello</h1><p>This is <b>bold</b>.</p>"
+)
+# Define output component
+markdown_output_display = gr.Markdown(label="Converted Markdown Output")
+# Create the Gradio interface
+iface = gr.Interface(
+    fn=html_to_markdown_converter,
+    inputs=[url_input, html_input_area],
+    outputs=markdown_output_display,
+    title=title,
+    description=description,
+    article=article,
+    allow_flagging='never',
     examples=[
+        ["https://gradio.app/quickstart/", ""], # Example using URL
+        ["", "<h2>Example HTML</h2><p>Convert <em>this</em> snippet.</p><ul><li>Item 1</li><li>Item 2</li></ul>"], # Example using direct HTML
+        ["https://httpbin.org/delay/20", ""], # Example slow URL (might timeout)
+        ["https://invalid-url-that-does-not-exist-probably.xyz", ""] # Example invalid URL
     ]
+)
+# Launch the app (for local testing or Hugging Face Spaces)
+if __name__ == "__main__":
+    iface.launch()