13ze committed on
Commit
9d2b078
·
verified ·
1 Parent(s): 9f2b249

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -91
app.py CHANGED
@@ -4,27 +4,26 @@ from markdownify import markdownify
4
  import traceback # To help format potential errors
5
  from readability import Document
6
  from bs4 import BeautifulSoup
 
7
 
8
  # Configure requests with a timeout and user-agent
9
- DEFAULT_TIMEOUT = 20 # Increased timeout slightly for potentially slower sites
10
  HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)'}
11
 
12
- # Função MODIFICADA para retornar APENAS a string de resultado
13
  def html_to_markdown_converter(url: str, html_input: str) -> str:
14
  """
15
  Converts HTML (from URL or direct input) to Markdown.
16
  Attempts to extract main content using readability.
 
17
  Returns the resulting Markdown string or an error message.
18
  """
19
  html_content = ""
20
  source = ""
21
- use_readability = True # Flag to control if readability is used
22
 
23
- # Clean up inputs
24
  url = url.strip() if url else ""
25
  html_input = html_input.strip() if html_input else ""
26
 
27
- # --- Start processing ---
28
  try:
29
  # --- Step 1: Get HTML Content ---
30
  if url:
@@ -41,123 +40,135 @@ def html_to_markdown_converter(url: str, html_input: str) -> str:
41
  html_content = response.text
42
  print(f"Successfully fetched {len(html_content)} bytes from URL.")
43
  except requests.exceptions.MissingSchema:
44
- error_msg = f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
45
- print(error_msg)
46
- return error_msg
47
  except requests.exceptions.Timeout:
48
- error_msg = f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
49
- print(error_msg)
50
- return error_msg
51
  except requests.exceptions.RequestException as e:
52
- print(f"Request failed: {e}")
53
- error_msg = f"❌ Error: Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
54
- return error_msg
55
  except Exception as e:
56
- print(f"An unexpected error occurred during fetch: {e}")
57
- error_msg = f"❌ Error: An unexpected error occurred while fetching the URL.\n```\n{traceback.format_exc()}\n```"
58
- return error_msg
59
 
60
  elif html_input:
61
  source = "Direct HTML Input"
62
  print(f"Using direct HTML input ({len(html_input)} bytes).")
63
  html_content = html_input
64
  else:
65
- info_msg = "❓ Please provide a URL or paste HTML content in the fields above."
66
- return info_msg
67
 
68
- # --- Step 2: Extract Main Content (using Readability) ---
69
- if not html_content:
70
- error_msg = f" No HTML content found from {source}."
71
- print(error_msg)
72
- return error_msg
 
 
 
 
 
 
 
73
 
74
- processed_html = html_content
75
- article_title = ""
76
  if use_readability:
77
  print("Attempting to extract main content using Readability...")
78
  try:
79
- # Add basic cleaning before readability for potentially problematic tags
80
- soup_pre = BeautifulSoup(html_content, 'html.parser')
81
- for tag in soup_pre(['script', 'style', 'iframe', 'svg', 'noscript']):
82
- tag.decompose()
83
- cleaned_html_for_readability = str(soup_pre)
84
-
85
- doc = Document(cleaned_html_for_readability) # Use cleaned HTML
86
- article_title = doc.title()
87
  processed_html_summary = doc.summary()
88
- soup = BeautifulSoup(processed_html_summary, 'html.parser')
89
- if not soup.text.strip():
90
- print("Readability summary was empty. Falling back to full HTML.")
91
- processed_html = html_content # Fallback to original if summary empty
92
- article_title = ""
 
93
  else:
94
- processed_html = processed_html_summary
95
- print(f"Readability extracted title: '{article_title}'. Using summary.")
 
 
96
  except Exception as e:
97
- print(f"Readability processing failed: {e}. Falling back to full HTML.")
98
- processed_html = html_content # Fallback on error
99
- article_title = ""
100
- else:
101
- print("Skipping Readability step.")
102
- processed_html = html_content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
  # --- Step 3: Convert the Processed HTML to Markdown ---
105
  if not processed_html.strip():
106
- error_msg = f"❓ The HTML content (after potential processing) appears to be empty."
107
- print(error_msg)
108
- return error_msg
109
 
110
- print(f"Attempting to convert processed HTML (length: {len(processed_html)}) to Markdown...")
111
  try:
112
- # Using markdownify options to potentially strip unwanted tags that readability missed
113
  markdown_output = markdownify(
114
  processed_html,
115
  heading_style="ATX",
116
- bullets='*',
117
- strip=['a', 'img'] if not article_title else [], # Optional: remove links/images if title wasn't found (less likely main content)
118
- escape_codes=True # Ensure code blocks are escaped properly
119
- )
120
- print(f"Conversion successful. Markdown length: {len(markdown_output)}")
121
-
122
- # Prepend title if found and readability summary was used
123
- if article_title and processed_html != html_content:
124
- final_output = f"# {article_title}\n\n{markdown_output}"
125
  else:
126
- final_output = markdown_output
 
127
 
128
- if not final_output.strip():
129
- info_msg = f"ℹ️ The conversion resulted in empty Markdown."
130
- print(info_msg)
131
- return info_msg
132
 
133
- # SUCCESS: Return the final markdown string
134
- return final_output.strip() # Strip leading/trailing whitespace from final output
135
 
136
  except Exception as e:
137
- print(f"Markdown conversion failed: {e}")
138
- error_msg = f"❌ Error: Failed to convert HTML to Markdown.\n```\n{traceback.format_exc()}\n```"
139
- return error_msg
140
 
141
  except Exception as e:
142
- print(f"An unexpected error occurred in the main handler: {e}")
143
- error_msg = f"❌ Error: An unexpected error occurred during processing.\n```\n{traceback.format_exc()}\n```"
144
- return error_msg
145
 
146
 
147
  # --- Gradio Interface (Standard) ---
148
  title = "HTML to Markdown Converter (Smart Extraction)"
149
  description = """
150
  Enter a URL **or** paste HTML code directly into the text box below.
151
- The tool attempts to extract the main article content using Mozilla's Readability library and converts it to Markdown.
152
  The resulting Markdown code is displayed below. Use the **copy icon** (📋) in the output box to copy the code.
153
  """
154
  article = """
155
  **How it works:**
156
- 1. Uses `requests` to fetch content from URLs.
157
- 2. Uses `readability-lxml` to attempt extracting the main article content. Falls back to full HTML if needed. Some basic pre-cleaning is done before Readability.
158
- 3. Uses `markdownify` to convert the processed HTML into Markdown.
159
- 4. The **raw Markdown code** is displayed in the output text box below.
160
- 5. Click the standard **copy icon** (📋) provided by Gradio in the top-right corner of the output box to copy the Markdown code.
 
 
 
161
  """
162
 
163
  # Define input components
@@ -188,23 +199,18 @@ iface = gr.Interface(
188
  description=description,
189
  article=article,
190
  allow_flagging='never',
191
- # --- UPDATED EXAMPLES ---
192
  examples=[
193
  ["https://psychedelic.com.br/profissoes-boneca-barbie/", ""],
194
  ["https://agideia.com.br/tutoriais/ai-inteligencia-artificial/integre-uma-ia-gratuita-gemma-2b-ao-seu-site-wordpress-usando-google-colab-e-cloudflare/", ""],
195
- ["", "<h1>Título Simples</h1>\n<p>Este é um parágrafo de exemplo com <strong>texto em negrito</strong> e <em>texto em itálico</em>.</p>\n<ul>\n<li>Item 1</li>\n<li>Item 2</li>\n</ul>"]
 
 
196
  ],
197
- # --- END OF UPDATED EXAMPLES ---
198
- cache_examples=False # Keep False as we fetch live URLs
199
  )
200
 
201
  # Launch the app
202
  if __name__ == "__main__":
203
- # Reminder: requirements.txt should be:
204
- # gradio
205
- # requests
206
- # markdownify
207
- # beautifulsoup4
208
- # readability-lxml
209
- # lxml[html_clean]
210
  iface.launch()
 
4
  import traceback # To help format potential errors
5
  from readability import Document
6
  from bs4 import BeautifulSoup
7
+ import re # Import regex for potentially cleaning readability titles
8
 
9
  # Configure requests with a timeout and user-agent
10
+ DEFAULT_TIMEOUT = 20
11
  HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)'}
12
 
 
13
  def html_to_markdown_converter(url: str, html_input: str) -> str:
14
  """
15
  Converts HTML (from URL or direct input) to Markdown.
16
  Attempts to extract main content using readability.
17
+ Uses readability title, falls back to first H1 if needed, and prevents duplication.
18
  Returns the resulting Markdown string or an error message.
19
  """
20
  html_content = ""
21
  source = ""
22
+ use_readability = True
23
 
 
24
  url = url.strip() if url else ""
25
  html_input = html_input.strip() if html_input else ""
26
 
 
27
  try:
28
  # --- Step 1: Get HTML Content ---
29
  if url:
 
40
  html_content = response.text
41
  print(f"Successfully fetched {len(html_content)} bytes from URL.")
42
  except requests.exceptions.MissingSchema:
43
+ return f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
 
 
44
  except requests.exceptions.Timeout:
45
+ return f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
 
 
46
  except requests.exceptions.RequestException as e:
47
+ return f"❌ Error: Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
 
 
48
  except Exception as e:
49
+ return f"❌ Error: An unexpected error occurred while fetching the URL.\n```\n{traceback.format_exc()}\n```"
 
 
50
 
51
  elif html_input:
52
  source = "Direct HTML Input"
53
  print(f"Using direct HTML input ({len(html_input)} bytes).")
54
  html_content = html_input
55
  else:
56
+ return "❓ Please provide a URL or paste HTML content in the fields above."
 
57
 
58
+ # --- Pre-cleaning before Readability ---
59
+ if not html_content: return f"❓ No HTML content found from {source}."
60
+ print("Pre-cleaning HTML...")
61
+ soup_pre = BeautifulSoup(html_content, 'html.parser')
62
+ for tag in soup_pre(['script', 'style', 'iframe', 'svg', 'noscript', 'header', 'footer', 'nav', 'aside']): # More aggressive cleaning
63
+ tag.decompose()
64
+ cleaned_html = str(soup_pre) # Use this cleaned version going forward
65
+
66
+ # --- Step 2: Extract Main Content and Title (using Readability) ---
67
+ processed_html = cleaned_html # Default to cleaned HTML
68
+ readability_title = None
69
+ final_title = None # <<< Title to be used in the final output
70
 
 
 
71
  if use_readability:
72
  print("Attempting to extract main content using Readability...")
73
  try:
74
+ doc = Document(cleaned_html) # Use cleaned HTML
75
+ readability_title = doc.title()
 
 
 
 
 
 
76
  processed_html_summary = doc.summary()
77
+
78
+ # Check if readability summary is valid
79
+ soup_summary_check = BeautifulSoup(processed_html_summary, 'html.parser')
80
+ if soup_summary_check.text.strip():
81
+ processed_html = processed_html_summary # Use summary if valid
82
+ print(f"Readability extracted title: '{readability_title}'. Using summary.")
83
  else:
84
+ print("Readability summary was empty. Falling back to cleaned full HTML.")
85
+ # processed_html remains cleaned_html
86
+ readability_title = None # Discard title if summary failed
87
+
88
  except Exception as e:
89
+ print(f"Readability processing failed: {e}. Falling back to cleaned full HTML.")
90
+ # processed_html remains cleaned_html
91
+ readability_title = None
92
+
93
+ # --- Title Decision Logic ---
94
+ # Priority 1: Readability title (if good)
95
+ if readability_title and len(readability_title) > 3 and not readability_title.startswith('[') : # Basic check for valid title
96
+ final_title = readability_title.strip()
97
+ print(f"Using Readability title: '{final_title}'")
98
+
99
+ # Priority 2: Fallback to first H1 from CLEANED HTML if no good Readability title
100
+ if not final_title:
101
+ print("Readability title not suitable or not found. Looking for H1 fallback...")
102
+ soup_for_h1 = BeautifulSoup(cleaned_html, 'html.parser')
103
+ h1_tag = soup_for_h1.find('h1')
104
+ if h1_tag:
105
+ h1_text = h1_tag.get_text(strip=True)
106
+ if h1_text:
107
+ final_title = h1_text
108
+ print(f"Using H1 fallback title: '{final_title}'")
109
+
110
+ # --- Prevent Title Duplication in Content ---
111
+ if final_title:
112
+ print(f"Checking for title duplication in processed HTML (first H1)...")
113
+ soup_proc = BeautifulSoup(processed_html, 'html.parser')
114
+ first_h1_in_proc = soup_proc.find('h1')
115
+ if first_h1_in_proc:
116
+ h1_proc_text = first_h1_in_proc.get_text(strip=True)
117
+ # Check if the H1 text in content matches the final title we decided on
118
+ if h1_proc_text == final_title:
119
+ print(f"Found matching H1 ('{h1_proc_text}') in content. Removing it to prevent duplication.")
120
+ first_h1_in_proc.decompose() # Remove the H1 tag
121
+ processed_html = str(soup_proc) # Update the HTML string to be converted
122
 
123
  # --- Step 3: Convert the Processed HTML to Markdown ---
124
  if not processed_html.strip():
125
+ return f"❓ The HTML content (after processing) appears to be empty."
 
 
126
 
127
+ print(f"Attempting to convert final processed HTML (length: {len(processed_html)}) to Markdown...")
128
  try:
 
129
  markdown_output = markdownify(
130
  processed_html,
131
  heading_style="ATX",
132
+ bullets='*'
133
+ ).strip() # Strip whitespace from markdown output
134
+
135
+ # Assemble final output
136
+ if final_title:
137
+ # Prepend the decided title if one exists
138
+ final_markdown = f"# {final_title}\n\n{markdown_output}"
 
 
139
  else:
140
+ # Otherwise, just use the converted markdown
141
+ final_markdown = markdown_output
142
 
143
+ if not final_markdown.strip():
144
+ return f"ℹ️ The conversion resulted in empty Markdown."
 
 
145
 
146
+ return final_markdown.strip() # Return final cleaned string
 
147
 
148
  except Exception as e:
149
+ return f"❌ Error: Failed to convert HTML to Markdown.\n```\n{traceback.format_exc()}\n```"
 
 
150
 
151
  except Exception as e:
152
+ return f"❌ Error: An unexpected error occurred during processing.\n```\n{traceback.format_exc()}\n```"
 
 
153
 
154
 
155
  # --- Gradio Interface (Standard) ---
156
  title = "HTML to Markdown Converter (Smart Extraction)"
157
  description = """
158
  Enter a URL **or** paste HTML code directly into the text box below.
159
+ The tool attempts to extract the main article content, identifies a title (using page title or first H1 as fallback), and converts it to Markdown.
160
  The resulting Markdown code is displayed below. Use the **copy icon** (📋) in the output box to copy the code.
161
  """
162
  article = """
163
  **How it works:**
164
+ 1. Fetches HTML from URL or uses pasted input.
165
+ 2. Performs basic cleaning (removes scripts, styles, headers, footers, etc.).
166
+ 3. Uses `readability-lxml` to extract the main content and attempt to find a page title.
167
+ 4. **Title Logic:** Prefers the title found by `readability`. If none is found or it seems invalid, it looks for the first `<h1>` tag in the cleaned HTML as a fallback.
168
+ 5. **Deduplication:** If a title is determined, the tool checks if the *first* `<h1>` tag within the extracted main content matches this title. If so, it removes that `<h1>` tag *before* conversion to prevent the title appearing twice.
169
+ 6. Uses `markdownify` to convert the processed HTML (potentially without its first H1) into Markdown.
170
+ 7. Prepends the determined title (if any) to the final Markdown output.
171
+ 8. Displays the raw Markdown code in the output box with a copy button.
172
  """
173
 
174
  # Define input components
 
199
  description=description,
200
  article=article,
201
  allow_flagging='never',
 
202
  examples=[
203
  ["https://psychedelic.com.br/profissoes-boneca-barbie/", ""],
204
  ["https://agideia.com.br/tutoriais/ai-inteligencia-artificial/integre-uma-ia-gratuita-gemma-2b-ao-seu-site-wordpress-usando-google-colab-e-cloudflare/", ""],
205
+ ["", "<h1>Título Simples</h1>\n<p>Este é um parágrafo de exemplo com <strong>texto em negrito</strong> e <em>texto em itálico</em>.</p>\n<ul>\n<li>Item 1</li>\n<li>Item 2</li>\n</ul>"],
206
+ # Add an example without H1 to test no-title scenario
207
+ ["", "<p>Um parágrafo sem título H1.</p><div><p>Outro conteúdo.</p></div>"]
208
  ],
209
+ cache_examples=False
 
210
  )
211
 
212
  # Launch the app
213
  if __name__ == "__main__":
214
+ # Reminder: requirements.txt includes:
215
+ # gradio, requests, markdownify, beautifulsoup4, readability-lxml, lxml[html_clean]
 
 
 
 
 
216
  iface.launch()