File size: 17,971 Bytes
48fcfd2
 
 
1809cad
48fcfd2
 
1809cad
 
 
 
48fcfd2
1809cad
 
 
 
 
 
 
 
 
 
 
48fcfd2
1809cad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48fcfd2
1809cad
 
 
 
48fcfd2
 
1809cad
48fcfd2
1809cad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48fcfd2
1809cad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48fcfd2
1809cad
 
 
 
 
 
 
 
 
9d2b078
1809cad
 
 
 
 
 
 
 
9d2b078
1809cad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d2b078
1809cad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d2b078
 
 
 
 
1809cad
 
 
 
 
 
 
9d2b078
1809cad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d2b078
 
 
 
1809cad
 
 
 
 
 
 
 
 
48fcfd2
1809cad
 
 
48fcfd2
1809cad
 
 
 
 
 
 
48fcfd2
1809cad
 
 
 
48fcfd2
1809cad
 
 
48fcfd2
1809cad
 
48fcfd2
 
1809cad
 
48fcfd2
fd8df2d
1809cad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba11ba7
fd8df2d
 
1809cad
 
 
 
fd8df2d
 
1809cad
 
 
 
 
 
 
 
 
 
 
1d79c85
 
1809cad
 
 
 
6166ec0
1809cad
6166ec0
bd8c0ac
 
 
6166ec0
 
 
 
bd8c0ac
1809cad
7b911da
 
1809cad
9d2b078
1809cad
ba11ba7
6166ec0
9d2b078
6166ec0
 
 
fd8df2d
1809cad
6166ec0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
import gradio as gr
import requests
from markdownify import markdownify
import traceback
from readability import Document
from bs4 import BeautifulSoup
import logging
import socket
import ipaddress
from urllib.parse import urlparse

# --- Configuration Constants ---
DEFAULT_TIMEOUT = 20 # seconds; applied to the whole HTTP request
HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.2 (+https://hf.space)'} # Updated version
MAX_CONTENT_SIZE_BYTES = 10 * 1024 * 1024  # 10 MB cap on fetched/pasted HTML
MIN_TITLE_LENGTH = 4  # Readability titles shorter than this are discarded
# Boilerplate tags stripped from the HTML before any content extraction/conversion.
PRECLEAN_TAGS_TO_REMOVE = [
    'script', 'style', 'iframe', 'svg', 'noscript', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'input', 'textarea', 'select', 'option', 'label'
]
GENERIC_ERROR_MESSAGE = "❌ Error: An unexpected internal error occurred. Please check logs or try again later."
SOURCE_URL_PREFIX = "URL" # Identifier for URL source
SOURCE_DIRECT_INPUT = "Direct HTML Input" # Identifier for direct input

# --- Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- Helper Functions ---

def _is_ip_allowed(hostname: str) -> bool:
    """Verifica se o IP resolvido do hostname é permitido (não privado/local)."""
    try:
        addr_info = socket.getaddrinfo(hostname, None)
        ip_addr_str = addr_info[0][4][0]
        ip_addr = ipaddress.ip_address(ip_addr_str)
        if ip_addr.is_private or ip_addr.is_loopback or ip_addr.is_link_local:
            logging.warning(f"Blocked attempt to access internal/private IP: {ip_addr_str} for hostname {hostname}")
            return False
        logging.info(f"Hostname {hostname} resolved to allowed public IP {ip_addr_str}.")
        return True
    except socket.gaierror as e:
        logging.error(f"Could not resolve hostname: {hostname} - {e}")
        return False
    except Exception as e:
        logging.error(f"Unexpected error during IP validation for {hostname}: {e}", exc_info=True)
        return False

def _fetch_and_clean_html(url: str, html_input: str) -> tuple[str | None, str | None, str | None]:
    """
    Busca HTML da URL ou usa input direto, faz pré-limpeza.
    Retorna uma tupla: (cleaned_html, source_description, error_message)
    Retorna (None, source, error_message) em caso de erro.
    Retorna (None, None, error_message) se nenhuma entrada foi fornecida.
    """
    html_content = ""
    source = None # Initialize source

    if url:
        source = f"{SOURCE_URL_PREFIX} ({url})" # Use constant prefix
        logging.info(f"Attempting to fetch HTML from URL: {url}")
        try:
            # ... (mesma lógica de fetch, validação de IP, tamanho, etc.)...
            # 1. Prepend Scheme
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url
                logging.info(f"Scheme missing, prepended https://. New URL: {url}")
            # 2. Validate URL structure and check for forbidden IPs
            parsed_url = urlparse(url)
            if not parsed_url.scheme or not parsed_url.netloc:
                 raise ValueError("Invalid URL structure.")
            if not _is_ip_allowed(parsed_url.hostname):
                 # Pass source back even on error
                 return None, source, f"❌ Error: Access to this URL's IP address is not allowed for security reasons."
            # 3. Fetch content
            response = requests.get(url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True, stream=True)
            response.raise_for_status()
            # 4. Check Content-Length
            content_length = response.headers.get('Content-Length')
            if content_length and int(content_length) > MAX_CONTENT_SIZE_BYTES:
                logging.warning(f"Content-Length {content_length} exceeds limit for URL: {url}")
                return None, source, f"❌ Error: Content exceeds maximum allowed size ({MAX_CONTENT_SIZE_BYTES // 1024 // 1024}MB)."
            # 5. Read content
            response.encoding = response.apparent_encoding or 'utf-8'
            html_content = response.text
            if len(html_content.encode(response.encoding, errors='ignore')) > MAX_CONTENT_SIZE_BYTES * 1.1:
                logging.warning(f"Decoded content size exceeds limit for URL: {url}")
                return None, source, f"❌ Error: Decoded content exceeds estimated maximum size."
            logging.info(f"Successfully fetched {len(html_content)} bytes from {url}.")

        except ValueError as e:
             logging.error(f"Invalid URL provided: {url} - {e}")
             return None, source, f"❌ Error: Invalid URL format: `{url}`."
        except requests.exceptions.MissingSchema:
            logging.error(f"Invalid URL (Missing Schema): {url}")
            return None, source, f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
        except requests.exceptions.Timeout:
            logging.warning(f"Request timed out for URL: {url}")
            return None, source, f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
        except requests.exceptions.RequestException as e:
            logging.error(f"Failed to fetch URL: {url} - {e}")
            return None, source, f"❌ Error: Failed to fetch content from URL: `{url}`\nDetails: {e}"
        except Exception as e:
            logging.error(f"Unexpected error fetching URL {url}: {traceback.format_exc()}")
            return None, source, GENERIC_ERROR_MESSAGE

    elif html_input:
        source = SOURCE_DIRECT_INPUT # Use constant
        logging.info(f"Using {source} ({len(html_input)} bytes).")
        if len(html_input) > MAX_CONTENT_SIZE_BYTES * 1.2:
             logging.warning(f"Direct HTML input size {len(html_input)} exceeds limit.")
             # Pass source back even on error
             return None, source, f"❌ Error: Pasted HTML exceeds maximum allowed size."
        html_content = html_input
    else:
        # No input provided
        return None, None, "❓ Please provide a URL or paste HTML content in the fields above."

    # --- Pre-cleaning ---
    if not html_content: # Should only happen if logic above fails unexpectedly
        logging.error("Reached pre-cleaning stage with no HTML content.")
        return None, source, f"❓ No HTML content found from {source}."

    logging.info("Pre-cleaning HTML...")
    try:
        soup_pre = BeautifulSoup(html_content, 'lxml')
        for tag in soup_pre(PRECLEAN_TAGS_TO_REMOVE):
            tag.decompose()
        cleaned_html = str(soup_pre)
        logging.info(f"HTML pre-cleaned. Size reduced to {len(cleaned_html)} bytes.")
        # Return cleaned_html, source, and None for error message
        return cleaned_html, source, None
    except Exception as e:
        logging.error(f"Error during HTML pre-cleaning: {traceback.format_exc()}")
        # Pass source back even on error
        return None, source, "❌ Error: Failed during HTML pre-cleaning step."


# **MODIFIED**
def _extract_content_and_title(cleaned_html: str, source: str) -> tuple[str | None, str | None]:
    """
    Extract the main content with Readability (URLs ONLY) and determine the title.

    Returns (processed_html, final_title); final_title is None when no
    suitable title was found.
    """
    processed_html = cleaned_html  # Default to cleaned HTML (important for Direct Input)
    readability_title = None
    final_title = None
    use_readability = True  # Internal flag, could become a user option later

    # Run Readability ONLY when requested AND the source is a URL.
    if use_readability and source and source.startswith(SOURCE_URL_PREFIX):
        logging.info("Source is URL. Attempting to extract main content using Readability...")
        try:
            doc = Document(cleaned_html)
            readability_title = doc.title()
            processed_html_summary = doc.summary()
            soup_summary_check = BeautifulSoup(processed_html_summary, 'lxml')
            if soup_summary_check.text.strip():
                processed_html = processed_html_summary  # Use summary ONLY IF valid AND source is URL
                logging.info(f"Readability extracted title: '{readability_title}'. Using summary content for URL.")
            else:
                logging.warning("Readability summary was empty for URL. Falling back to cleaned full HTML.")
                readability_title = None  # Discard title if summary failed
                # processed_html remains cleaned_html
        except Exception as e:
            logging.warning(f"Readability processing failed for URL: {e}. Falling back to cleaned full HTML.")
            readability_title = None
            # processed_html remains cleaned_html
    elif source == SOURCE_DIRECT_INPUT:
        logging.info("Source is Direct HTML Input. Skipping Readability content extraction.")
        # processed_html is already set to cleaned_html, which is correct.
        readability_title = None  # Ensure no accidental title carry-over
    else:
        logging.warning(f"Source type '{source}' unknown or missing, skipping Readability.")
        readability_title = None

    # --- Title Decision Logic ---
    # Priority 1: Readability title (only possible if source was URL and
    # Readability ran).
    # FIX: strip before validating length, so whitespace padding cannot
    # satisfy MIN_TITLE_LENGTH (previously len() was taken on the raw title).
    candidate_title = readability_title.strip() if readability_title else ""
    if len(candidate_title) >= MIN_TITLE_LENGTH and not candidate_title.startswith('['):
        final_title = candidate_title
        logging.info(f"Using Readability title: '{final_title}'")

    # Priority 2: fall back to the first H1 from the CLEANED HTML (runs for
    # BOTH URL and Direct Input when no Readability title was accepted).
    if not final_title:
        # Log differently depending on the source type.
        if source and source.startswith(SOURCE_URL_PREFIX):
            logging.info("Readability title not suitable or not found for URL. Looking for H1 fallback in cleaned HTML...")
        else:  # Includes Direct Input and unknowns
            logging.info("Looking for H1 title in cleaned HTML...")

        try:
            soup_for_h1 = BeautifulSoup(cleaned_html, 'lxml')
            h1_tag = soup_for_h1.find('h1')
            if h1_tag:
                h1_text = h1_tag.get_text(strip=True)
                if h1_text:
                    final_title = h1_text
                    logging.info(f"Using H1 fallback title: '{final_title}'")
                else:
                    logging.info("Found H1 tag but it was empty.")
            else:
                logging.info("No H1 tag found in cleaned HTML for fallback title.")
        except Exception as e:
            logging.error(f"Error searching for H1 fallback title: {traceback.format_exc()}")

    # Return the HTML to convert (Readability summary or cleaned_html) and the title.
    return processed_html, final_title


def _convert_to_markdown(processed_html: str, final_title: str | None) -> tuple[str | None, str | None]:
    """
    Remove título duplicado do HTML processado (se necessário) e converte para Markdown.
    Retorna (final_markdown, None) ou (None, error_message).
    """
    # ... (mesma lógica de verificação de H1 duplicado e conversão com markdownify) ...
    html_to_convert = processed_html

    if final_title:
        logging.info(f"Checking for title duplication (first H1 in processed content)...")
        try:
            soup_proc = BeautifulSoup(processed_html, 'lxml')
            first_h1_in_proc = soup_proc.find('h1')
            if first_h1_in_proc:
                h1_proc_text = first_h1_in_proc.get_text(strip=True)
                if h1_proc_text == final_title:
                    logging.info(f"Found matching H1 ('{h1_proc_text}') in content. Removing it to prevent duplication.")
                    first_h1_in_proc.decompose()
                    html_to_convert = str(soup_proc)
                else:
                    logging.info(f"First H1 content ('{h1_proc_text}') does not match final title ('{final_title}'). Keeping H1.")
            else:
                logging.info("No H1 found in processed content to check for duplication.")
        except Exception as e:
            logging.error(f"Error during title duplication check: {traceback.format_exc()}")

    if not html_to_convert.strip():
        logging.warning("HTML content (after processing) is empty. Cannot convert.")
        return None, f"❓ The HTML content (after processing) appears to be empty."

    logging.info(f"Attempting to convert final processed HTML (length: {len(html_to_convert)}) to Markdown...")
    try:
        markdown_output = markdownify(
            html_to_convert,
            heading_style="ATX",
            bullets='*'
        ).strip()

        if final_title:
            final_markdown = f"# {final_title}\n\n{markdown_output}"
        else:
            final_markdown = markdown_output

        if not final_markdown.strip():
            logging.warning("Markdown conversion resulted in empty output.")
            return None, f"ℹ️ The conversion resulted in empty Markdown."

        logging.info(f"Successfully converted to Markdown (length: {len(final_markdown)}).")
        return final_markdown.strip(), None

    except Exception as e:
        logging.error(f"Failed to convert HTML to Markdown: {traceback.format_exc()}")
        return None, "❌ Error: Failed during the final Markdown conversion step."


# --- Main Gradio Function (Orchestrator) ---
# **MODIFIED**
def html_to_markdown_converter(url: str, html_input: str) -> str:
    """
    Convert HTML (fetched from a URL or pasted directly) to Markdown.

    Orchestrates the fetch/clean, content-extraction and conversion helpers
    and returns either the Markdown text or a user-facing error message.
    """
    url = url.strip() if url else ""
    html_input = html_input.strip() if html_input else ""

    try:
        # Stage 1: fetch (or accept) the HTML and strip boilerplate tags.
        cleaned_html, source, fetch_error = _fetch_and_clean_html(url, html_input)
        if fetch_error:
            return fetch_error
        if cleaned_html is None or source is None:
            # Defensive: the helper should always supply an error message here.
            logging.error("Fetching/cleaning returned None HTML/source without error message.")
            return GENERIC_ERROR_MESSAGE

        # Stage 2: pick the content to convert and decide on a title.
        processed_html, final_title = _extract_content_and_title(cleaned_html, source)
        if processed_html is None:
            logging.error("Processed HTML became None unexpectedly after extraction step.")
            return GENERIC_ERROR_MESSAGE

        # Stage 3: convert to Markdown (title prepended when present).
        final_markdown, conversion_error = _convert_to_markdown(processed_html, final_title)
        return conversion_error if conversion_error else final_markdown

    except Exception:
        # Last-resort guard so the UI never sees a raw traceback.
        logging.error(f"FATAL: Unexpected error in main converter function: {traceback.format_exc()}")
        return GENERIC_ERROR_MESSAGE


# --- Gradio UI copy: interface title, short description, and the "how it
# works" article shown below the interface. ---
title = "Smart Scrape Any URL or Website to Markdown [Expert CPU Mode]"
description = """
Enter a URL **or** paste HTML code directly into the text box below.
- For **URLs**, the tool attempts to extract the main article content using `readability` before converting.
- For **Pasted HTML**, the tool converts the *entire* provided HTML (after basic cleaning) without using `readability`'s content extraction.
It identifies a title (page title or first H1 fallback) and converts to Markdown. Includes security checks and size limits.
Use the **copy icon** (📋) in the output box to copy the code.
"""
article = """
**How it works (v1.2):**
1.  **Input:** Accepts URL or direct HTML.
2.  **Fetch/Clean:** Gets HTML, performs security checks (IP block, size limit), removes basic tags (`<script>`, `<nav>`, etc.). Determines if source is URL or Direct Input.
3.  **Content Processing:**
    *   **If Source is URL:** Attempts `readability-lxml` extraction (`doc.summary()`). Falls back to cleaned HTML if extraction fails/is empty.
    *   **If Source is Direct Input:** **Skips** `readability-lxml` extraction. Uses the cleaned HTML directly.
4.  **Title Logic:** Tries Readability title (if URL source). Falls back to first `<h1>` in *cleaned* HTML otherwise.
5.  **Deduplication:** Removes the first `<h1>` from the *processed content* if it matches the determined title.
6.  **Conversion:** Uses `markdownify` to convert the final processed HTML to Markdown.
7.  **Output:** Prepends title (if found) and returns Markdown or error message.
8.  **Logging:** Uses Python's `logging`.
"""

# Define input/output components.
# FIX: the previous code called gr.Textbox(...) with a literal Ellipsis —
# unfilled placeholders that would be passed as the first positional argument.
# Provide real, labelled components instead.
url_input = gr.Textbox(
    label="URL",
    placeholder="https://example.com/article (scheme optional)",
    lines=1,
)
html_input_area = gr.Textbox(
    label="Or paste HTML directly",
    placeholder="<html>...</html>",
    lines=8,
)
markdown_output_textbox = gr.Textbox(
    label="Markdown Output",
    lines=20,
    show_copy_button=True,  # the description promises a copy icon on the output box
)

# Create the Gradio interface wiring the converter to the components above.
iface = gr.Interface(
    fn=html_to_markdown_converter,
    inputs=[url_input, html_input_area],
    outputs=markdown_output_textbox,
    title=title,
    description=description,
    article=article,
    allow_flagging='never',
    examples=[
        # Examples using URLs (should use Readability)
        ["https://psychedelic.com.br/profissoes-boneca-barbie/", ""],
        ["https://agideia.com.br/tutoriais/ai-inteligencia-artificial/integre-uma-ia-gratuita-gemma-2b-ao-seu-site-wordpress-usando-google-colab-e-cloudflare/", ""],
        # Example with direct HTML INCLUDING list (Readability is skipped for direct input)
        ["", "<h1>Título Simples</h1>\n<p>Este é um parágrafo de exemplo com <strong>texto em negrito</strong> e <em>texto em itálico</em>.</p>\n<ul>\n<li>Item 1</li>\n<li>Item 2</li>\n</ul>"],
        # Example direct HTML without H1 (no title will be prepended)
        ["", "<p>Um parágrafo sem título H1.</p><div><p>Outro conteúdo.</p></div>"]
    ],
    cache_examples=False
)

# Launch the app only when executed as a script (not on import).
if __name__ == "__main__":
    # Reminder: requirements: gradio, requests, markdownify, beautifulsoup4, readability-lxml, lxml
    iface.launch()