File size: 17,971 Bytes
48fcfd2
 
 
1809cad
48fcfd2
 
1809cad
 
 
 
48fcfd2
1809cad
 
 
 
 
 
 
 
 
 
 
48fcfd2
1809cad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48fcfd2
1809cad
 
 
 
48fcfd2
 
1809cad
48fcfd2
1809cad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48fcfd2
1809cad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48fcfd2
1809cad
 
 
 
 
 
 
 
 
9d2b078
1809cad
 
 
 
 
 
 
 
9d2b078
1809cad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d2b078
1809cad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d2b078
 
 
 
 
1809cad
 
 
 
 
 
 
9d2b078
1809cad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d2b078
 
 
 
1809cad
 
 
 
 
 
 
 
 
48fcfd2
1809cad
 
 
48fcfd2
1809cad
 
 
 
 
 
 
48fcfd2
1809cad
 
 
 
48fcfd2
1809cad
 
 
48fcfd2
1809cad
 
48fcfd2
 
1809cad
 
48fcfd2
fd8df2d
1809cad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba11ba7
fd8df2d
 
1809cad
 
 
 
fd8df2d
 
1809cad
 
 
 
 
 
 
 
 
 
 
1d79c85
 
1809cad
 
 
 
6166ec0
1809cad
6166ec0
bd8c0ac
 
 
6166ec0
 
 
 
bd8c0ac
1809cad
7b911da
 
1809cad
9d2b078
1809cad
ba11ba7
6166ec0
9d2b078
6166ec0
 
 
fd8df2d
1809cad
6166ec0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
import gradio as gr
import requests
from markdownify import markdownify
import traceback
from readability import Document
from bs4 import BeautifulSoup
import logging
import socket
import ipaddress
from urllib.parse import urlparse

# --- Configuration Constants ---
DEFAULT_TIMEOUT = 20 # seconds; applied to the whole HTTP request
HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.2 (+https://hf.space)'} # Updated version
MAX_CONTENT_SIZE_BYTES = 10 * 1024 * 1024  # 10 MB cap on fetched/pasted HTML
MIN_TITLE_LENGTH = 4  # Readability titles shorter than this are discarded
# Boilerplate tags stripped from the HTML before any content extraction/conversion.
PRECLEAN_TAGS_TO_REMOVE = [
    'script', 'style', 'iframe', 'svg', 'noscript', 'header', 'footer', 'nav', 'aside', 'form', 'button', 'input', 'textarea', 'select', 'option', 'label'
]
GENERIC_ERROR_MESSAGE = "❌ Error: An unexpected internal error occurred. Please check logs or try again later."
SOURCE_URL_PREFIX = "URL" # Identifier for URL source
SOURCE_DIRECT_INPUT = "Direct HTML Input" # Identifier for direct input

# --- Logging Setup ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- Helper Functions ---

def _is_ip_allowed(hostname: str) -> bool:
    """Verifica se o IP resolvido do hostname é permitido (não privado/local)."""
    try:
        addr_info = socket.getaddrinfo(hostname, None)
        ip_addr_str = addr_info[0][4][0]
        ip_addr = ipaddress.ip_address(ip_addr_str)
        if ip_addr.is_private or ip_addr.is_loopback or ip_addr.is_link_local:
            logging.warning(f"Blocked attempt to access internal/private IP: {ip_addr_str} for hostname {hostname}")
            return False
        logging.info(f"Hostname {hostname} resolved to allowed public IP {ip_addr_str}.")
        return True
    except socket.gaierror as e:
        logging.error(f"Could not resolve hostname: {hostname} - {e}")
        return False
    except Exception as e:
        logging.error(f"Unexpected error during IP validation for {hostname}: {e}", exc_info=True)
        return False

def _fetch_and_clean_html(url: str, html_input: str) -> tuple[str | None, str | None, str | None]:
    """
    Busca HTML da URL ou usa input direto, faz pré-limpeza.
    Retorna uma tupla: (cleaned_html, source_description, error_message)
    Retorna (None, source, error_message) em caso de erro.
    Retorna (None, None, error_message) se nenhuma entrada foi fornecida.
    """
    html_content = ""
    source = None # Initialize source

    if url:
        source = f"{SOURCE_URL_PREFIX} ({url})" # Use constant prefix
        logging.info(f"Attempting to fetch HTML from URL: {url}")
        try:
            # ... (mesma lógica de fetch, validação de IP, tamanho, etc.)...
            # 1. Prepend Scheme
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url
                logging.info(f"Scheme missing, prepended https://. New URL: {url}")
            # 2. Validate URL structure and check for forbidden IPs
            parsed_url = urlparse(url)
            if not parsed_url.scheme or not parsed_url.netloc:
                 raise ValueError("Invalid URL structure.")
            if not _is_ip_allowed(parsed_url.hostname):
                 # Pass source back even on error
                 return None, source, f"❌ Error: Access to this URL's IP address is not allowed for security reasons."
            # 3. Fetch content
            response = requests.get(url, timeout=DEFAULT_TIMEOUT, headers=HEADERS, allow_redirects=True, stream=True)
            response.raise_for_status()
            # 4. Check Content-Length
            content_length = response.headers.get('Content-Length')
            if content_length and int(content_length) > MAX_CONTENT_SIZE_BYTES:
                logging.warning(f"Content-Length {content_length} exceeds limit for URL: {url}")
                return None, source, f"❌ Error: Content exceeds maximum allowed size ({MAX_CONTENT_SIZE_BYTES // 1024 // 1024}MB)."
            # 5. Read content
            response.encoding = response.apparent_encoding or 'utf-8'
            html_content = response.text
            if len(html_content.encode(response.encoding, errors='ignore')) > MAX_CONTENT_SIZE_BYTES * 1.1:
                logging.warning(f"Decoded content size exceeds limit for URL: {url}")
                return None, source, f"❌ Error: Decoded content exceeds estimated maximum size."
            logging.info(f"Successfully fetched {len(html_content)} bytes from {url}.")

        except ValueError as e:
             logging.error(f"Invalid URL provided: {url} - {e}")
             return None, source, f"❌ Error: Invalid URL format: `{url}`."
        except requests.exceptions.MissingSchema:
            logging.error(f"Invalid URL (Missing Schema): {url}")
            return None, source, f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
        except requests.exceptions.Timeout:
            logging.warning(f"Request timed out for URL: {url}")
            return None, source, f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
        except requests.exceptions.RequestException as e:
            logging.error(f"Failed to fetch URL: {url} - {e}")
            return None, source, f"❌ Error: Failed to fetch content from URL: `{url}`\nDetails: {e}"
        except Exception as e:
            logging.error(f"Unexpected error fetching URL {url}: {traceback.format_exc()}")
            return None, source, GENERIC_ERROR_MESSAGE

    elif html_input:
        source = SOURCE_DIRECT_INPUT # Use constant
        logging.info(f"Using {source} ({len(html_input)} bytes).")
        if len(html_input) > MAX_CONTENT_SIZE_BYTES * 1.2:
             logging.warning(f"Direct HTML input size {len(html_input)} exceeds limit.")
             # Pass source back even on error
             return None, source, f"❌ Error: Pasted HTML exceeds maximum allowed size."
        html_content = html_input
    else:
        # No input provided
        return None, None, "❓ Please provide a URL or paste HTML content in the fields above."

    # --- Pre-cleaning ---
    if not html_content: # Should only happen if logic above fails unexpectedly
        logging.error("Reached pre-cleaning stage with no HTML content.")
        return None, source, f"❓ No HTML content found from {source}."

    logging.info("Pre-cleaning HTML...")
    try:
        soup_pre = BeautifulSoup(html_content, 'lxml')
        for tag in soup_pre(PRECLEAN_TAGS_TO_REMOVE):
            tag.decompose()
        cleaned_html = str(soup_pre)
        logging.info(f"HTML pre-cleaned. Size reduced to {len(cleaned_html)} bytes.")
        # Return cleaned_html, source, and None for error message
        return cleaned_html, source, None
    except Exception as e:
        logging.error(f"Error during HTML pre-cleaning: {traceback.format_exc()}")
        # Pass source back even on error
        return None, source, "❌ Error: Failed during HTML pre-cleaning step."


# **MODIFIED**
def _extract_content_and_title(cleaned_html: str, source: str) -> tuple[str | None, str | None]:
    """
    Extract the main content with Readability (URLs ONLY) and determine the title.

    Returns (processed_html, final_title); final_title is None when no
    suitable title was found.
    """
    processed_html = cleaned_html  # Default to cleaned HTML (important for Direct Input)
    readability_title = None
    final_title = None
    use_readability = True  # Internal flag, could become a user option later

    # Run Readability ONLY when requested AND the source is a URL.
    if use_readability and source and source.startswith(SOURCE_URL_PREFIX):
        logging.info("Source is URL. Attempting to extract main content using Readability...")
        try:
            doc = Document(cleaned_html)
            readability_title = doc.title()
            processed_html_summary = doc.summary()
            soup_summary_check = BeautifulSoup(processed_html_summary, 'lxml')
            if soup_summary_check.text.strip():
                processed_html = processed_html_summary  # Use summary ONLY IF valid AND source is URL
                logging.info(f"Readability extracted title: '{readability_title}'. Using summary content for URL.")
            else:
                logging.warning("Readability summary was empty for URL. Falling back to cleaned full HTML.")
                readability_title = None  # Discard title if summary failed
                # processed_html remains cleaned_html
        except Exception as e:
            logging.warning(f"Readability processing failed for URL: {e}. Falling back to cleaned full HTML.")
            readability_title = None
            # processed_html remains cleaned_html
    elif source == SOURCE_DIRECT_INPUT:
        logging.info("Source is Direct HTML Input. Skipping Readability content extraction.")
        # processed_html is already set to cleaned_html, which is correct.
        readability_title = None  # Ensure no accidental title carry-over
    else:
        logging.warning(f"Source type '{source}' unknown or missing, skipping Readability.")
        readability_title = None

    # --- Title Decision Logic ---
    # Priority 1: Readability title (only possible if source was URL and
    # Readability ran).
    # FIX: strip before validating length, so whitespace padding cannot
    # satisfy MIN_TITLE_LENGTH (previously len() was taken on the raw title).
    candidate_title = readability_title.strip() if readability_title else ""
    if len(candidate_title) >= MIN_TITLE_LENGTH and not candidate_title.startswith('['):
        final_title = candidate_title
        logging.info(f"Using Readability title: '{final_title}'")

    # Priority 2: fall back to the first H1 from the CLEANED HTML (runs for
    # BOTH URL and Direct Input when no Readability title was accepted).
    if not final_title:
        # Log differently depending on the source type.
        if source and source.startswith(SOURCE_URL_PREFIX):
            logging.info("Readability title not suitable or not found for URL. Looking for H1 fallback in cleaned HTML...")
        else:  # Includes Direct Input and unknowns
            logging.info("Looking for H1 title in cleaned HTML...")

        try:
            soup_for_h1 = BeautifulSoup(cleaned_html, 'lxml')
            h1_tag = soup_for_h1.find('h1')
            if h1_tag:
                h1_text = h1_tag.get_text(strip=True)
                if h1_text:
                    final_title = h1_text
                    logging.info(f"Using H1 fallback title: '{final_title}'")
                else:
                    logging.info("Found H1 tag but it was empty.")
            else:
                logging.info("No H1 tag found in cleaned HTML for fallback title.")
        except Exception as e:
            logging.error(f"Error searching for H1 fallback title: {traceback.format_exc()}")

    # Return the HTML to convert (Readability summary or cleaned_html) and the title.
    return processed_html, final_title


def _convert_to_markdown(processed_html: str, final_title: str | None) -> tuple[str | None, str | None]:
    """
    Remove título duplicado do HTML processado (se necessário) e converte para Markdown.
    Retorna (final_markdown, None) ou (None, error_message).
    """
    # ... (mesma lógica de verificação de H1 duplicado e conversão com markdownify) ...
    html_to_convert = processed_html

    if final_title:
        logging.info(f"Checking for title duplication (first H1 in processed content)...")
        try:
            soup_proc = BeautifulSoup(processed_html, 'lxml')
            first_h1_in_proc = soup_proc.find('h1')
            if first_h1_in_proc:
                h1_proc_text = first_h1_in_proc.get_text(strip=True)
                if h1_proc_text == final_title:
                    logging.info(f"Found matching H1 ('{h1_proc_text}') in content. Removing it to prevent duplication.")
                    first_h1_in_proc.decompose()
                    html_to_convert = str(soup_proc)
                else:
                    logging.info(f"First H1 content ('{h1_proc_text}') does not match final title ('{final_title}'). Keeping H1.")
            else:
                logging.info("No H1 found in processed content to check for duplication.")
        except Exception as e:
            logging.error(f"Error during title duplication check: {traceback.format_exc()}")

    if not html_to_convert.strip():
        logging.warning("HTML content (after processing) is empty. Cannot convert.")
        return None, f"❓ The HTML content (after processing) appears to be empty."

    logging.info(f"Attempting to convert final processed HTML (length: {len(html_to_convert)}) to Markdown...")
    try:
        markdown_output = markdownify(
            html_to_convert,
            heading_style="ATX",
            bullets='*'
        ).strip()

        if final_title:
            final_markdown = f"# {final_title}\n\n{markdown_output}"
        else:
            final_markdown = markdown_output

        if not final_markdown.strip():
            logging.warning("Markdown conversion resulted in empty output.")
            return None, f"ℹ️ The conversion resulted in empty Markdown."

        logging.info(f"Successfully converted to Markdown (length: {len(final_markdown)}).")
        return final_markdown.strip(), None

    except Exception as e:
        logging.error(f"Failed to convert HTML to Markdown: {traceback.format_exc()}")
        return None, "❌ Error: Failed during the final Markdown conversion step."


# --- Main Gradio Function (Orchestrator) ---
# **MODIFIED**
def html_to_markdown_converter(url: str, html_input: str) -> str:
    """
    Convert HTML (fetched from a URL or pasted directly) to Markdown.

    Orchestrates the fetch/clean, content-extraction and conversion helpers
    and returns either the Markdown text or a user-facing error message.
    """
    url = url.strip() if url else ""
    html_input = html_input.strip() if html_input else ""

    try:
        # Stage 1: fetch (or accept) the HTML and strip boilerplate tags.
        cleaned_html, source, fetch_error = _fetch_and_clean_html(url, html_input)
        if fetch_error:
            return fetch_error
        if cleaned_html is None or source is None:
            # Defensive: the helper should always supply an error message here.
            logging.error("Fetching/cleaning returned None HTML/source without error message.")
            return GENERIC_ERROR_MESSAGE

        # Stage 2: pick the content to convert and decide on a title.
        processed_html, final_title = _extract_content_and_title(cleaned_html, source)
        if processed_html is None:
            logging.error("Processed HTML became None unexpectedly after extraction step.")
            return GENERIC_ERROR_MESSAGE

        # Stage 3: convert to Markdown (title prepended when present).
        final_markdown, conversion_error = _convert_to_markdown(processed_html, final_title)
        return conversion_error if conversion_error else final_markdown

    except Exception:
        # Last-resort guard so the UI never sees a raw traceback.
        logging.error(f"FATAL: Unexpected error in main converter function: {traceback.format_exc()}")
        return GENERIC_ERROR_MESSAGE


# --- Gradio UI copy: interface title, short description, and the "how it
# works" article shown below the interface. ---
title = "Smart Scrape Any URL or Website to Markdown [Expert CPU Mode]"
description = """
Enter a URL **or** paste HTML code directly into the text box below.
- For **URLs**, the tool attempts to extract the main article content using `readability` before converting.
- For **Pasted HTML**, the tool converts the *entire* provided HTML (after basic cleaning) without using `readability`'s content extraction.
It identifies a title (page title or first H1 fallback) and converts to Markdown. Includes security checks and size limits.
Use the **copy icon** (📋) in the output box to copy the code.
"""
article = """
**How it works (v1.2):**
1.  **Input:** Accepts URL or direct HTML.
2.  **Fetch/Clean:** Gets HTML, performs security checks (IP block, size limit), removes basic tags (`<script>`, `<nav>`, etc.). Determines if source is URL or Direct Input.
3.  **Content Processing:**
    *   **If Source is URL:** Attempts `readability-lxml` extraction (`doc.summary()`). Falls back to cleaned HTML if extraction fails/is empty.
    *   **If Source is Direct Input:** **Skips** `readability-lxml` extraction. Uses the cleaned HTML directly.
4.  **Title Logic:** Tries Readability title (if URL source). Falls back to first `<h1>` in *cleaned* HTML otherwise.
5.  **Deduplication:** Removes the first `<h1>` from the *processed content* if it matches the determined title.
6.  **Conversion:** Uses `markdownify` to convert the final processed HTML to Markdown.
7.  **Output:** Prepends title (if found) and returns Markdown or error message.
8.  **Logging:** Uses Python's `logging`.
"""

# Define input/output components.
# FIX: the previous code called gr.Textbox(...) with a literal Ellipsis —
# unfilled placeholders that would be passed as the first positional argument.
# Provide real, labelled components instead.
url_input = gr.Textbox(
    label="URL",
    placeholder="https://example.com/article (scheme optional)",
    lines=1,
)
html_input_area = gr.Textbox(
    label="Or paste HTML directly",
    placeholder="<html>...</html>",
    lines=8,
)
markdown_output_textbox = gr.Textbox(
    label="Markdown Output",
    lines=20,
    show_copy_button=True,  # the description promises a copy icon on the output box
)

# Create the Gradio interface wiring the converter to the components above.
iface = gr.Interface(
    fn=html_to_markdown_converter,
    inputs=[url_input, html_input_area],
    outputs=markdown_output_textbox,
    title=title,
    description=description,
    article=article,
    allow_flagging='never',
    examples=[
        # Examples using URLs (should use Readability)
        ["https://psychedelic.com.br/profissoes-boneca-barbie/", ""],
        ["https://agideia.com.br/tutoriais/ai-inteligencia-artificial/integre-uma-ia-gratuita-gemma-2b-ao-seu-site-wordpress-usando-google-colab-e-cloudflare/", ""],
        # Example with direct HTML INCLUDING list (Readability is skipped for direct input)
        ["", "<h1>Título Simples</h1>\n<p>Este é um parágrafo de exemplo com <strong>texto em negrito</strong> e <em>texto em itálico</em>.</p>\n<ul>\n<li>Item 1</li>\n<li>Item 2</li>\n</ul>"],
        # Example direct HTML without H1 (no title will be prepended)
        ["", "<p>Um parágrafo sem título H1.</p><div><p>Outro conteúdo.</p></div>"]
    ],
    cache_examples=False
)

# Launch the app only when executed as a script (not on import).
if __name__ == "__main__":
    # Reminder: requirements: gradio, requests, markdownify, beautifulsoup4, readability-lxml, lxml
    iface.launch()