13ze committed on
Commit
9d2b078
·
verified ·
1 Parent(s): 9f2b249

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -91
app.py CHANGED
@@ -4,27 +4,26 @@ from markdownify import markdownify
4
  import traceback # To help format potential errors
5
  from readability import Document
6
  from bs4 import BeautifulSoup
 
7
 
8
  # Configure requests with a timeout and user-agent
9
- DEFAULT_TIMEOUT = 20 # Increased timeout slightly for potentially slower sites
10
  HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)'}
11
 
12
- # Função MODIFICADA para retornar APENAS a string de resultado
13
  def html_to_markdown_converter(url: str, html_input: str) -> str:
14
  """
15
  Converts HTML (from URL or direct input) to Markdown.
16
  Attempts to extract main content using readability.
 
17
  Returns the resulting Markdown string or an error message.
18
  """
19
  html_content = ""
20
  source = ""
21
- use_readability = True # Flag to control if readability is used
22
 
23
- # Clean up inputs
24
  url = url.strip() if url else ""
25
  html_input = html_input.strip() if html_input else ""
26
 
27
- # --- Start processing ---
28
  try:
29
  # --- Step 1: Get HTML Content ---
30
  if url:
@@ -41,123 +40,135 @@ def html_to_markdown_converter(url: str, html_input: str) -> str:
41
  html_content = response.text
42
  print(f"Successfully fetched {len(html_content)} bytes from URL.")
43
  except requests.exceptions.MissingSchema:
44
- error_msg = f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
45
- print(error_msg)
46
- return error_msg
47
  except requests.exceptions.Timeout:
48
- error_msg = f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
49
- print(error_msg)
50
- return error_msg
51
  except requests.exceptions.RequestException as e:
52
- print(f"Request failed: {e}")
53
- error_msg = f"❌ Error: Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
54
- return error_msg
55
  except Exception as e:
56
- print(f"An unexpected error occurred during fetch: {e}")
57
- error_msg = f"❌ Error: An unexpected error occurred while fetching the URL.\n```\n{traceback.format_exc()}\n```"
58
- return error_msg
59
 
60
  elif html_input:
61
  source = "Direct HTML Input"
62
  print(f"Using direct HTML input ({len(html_input)} bytes).")
63
  html_content = html_input
64
  else:
65
- info_msg = "❓ Please provide a URL or paste HTML content in the fields above."
66
- return info_msg
67
 
68
- # --- Step 2: Extract Main Content (using Readability) ---
69
- if not html_content:
70
- error_msg = f" No HTML content found from {source}."
71
- print(error_msg)
72
- return error_msg
 
 
 
 
 
 
 
73
 
74
- processed_html = html_content
75
- article_title = ""
76
  if use_readability:
77
  print("Attempting to extract main content using Readability...")
78
  try:
79
- # Add basic cleaning before readability for potentially problematic tags
80
- soup_pre = BeautifulSoup(html_content, 'html.parser')
81
- for tag in soup_pre(['script', 'style', 'iframe', 'svg', 'noscript']):
82
- tag.decompose()
83
- cleaned_html_for_readability = str(soup_pre)
84
-
85
- doc = Document(cleaned_html_for_readability) # Use cleaned HTML
86
- article_title = doc.title()
87
  processed_html_summary = doc.summary()
88
- soup = BeautifulSoup(processed_html_summary, 'html.parser')
89
- if not soup.text.strip():
90
- print("Readability summary was empty. Falling back to full HTML.")
91
- processed_html = html_content # Fallback to original if summary empty
92
- article_title = ""
 
93
  else:
94
- processed_html = processed_html_summary
95
- print(f"Readability extracted title: '{article_title}'. Using summary.")
 
 
96
  except Exception as e:
97
- print(f"Readability processing failed: {e}. Falling back to full HTML.")
98
- processed_html = html_content # Fallback on error
99
- article_title = ""
100
- else:
101
- print("Skipping Readability step.")
102
- processed_html = html_content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
  # --- Step 3: Convert the Processed HTML to Markdown ---
105
  if not processed_html.strip():
106
- error_msg = f"❓ The HTML content (after potential processing) appears to be empty."
107
- print(error_msg)
108
- return error_msg
109
 
110
- print(f"Attempting to convert processed HTML (length: {len(processed_html)}) to Markdown...")
111
  try:
112
- # Using markdownify options to potentially strip unwanted tags that readability missed
113
  markdown_output = markdownify(
114
  processed_html,
115
  heading_style="ATX",
116
- bullets='*',
117
- strip=['a', 'img'] if not article_title else [], # Optional: remove links/images if title wasn't found (less likely main content)
118
- escape_codes=True # Ensure code blocks are escaped properly
119
- )
120
- print(f"Conversion successful. Markdown length: {len(markdown_output)}")
121
-
122
- # Prepend title if found and readability summary was used
123
- if article_title and processed_html != html_content:
124
- final_output = f"# {article_title}\n\n{markdown_output}"
125
  else:
126
- final_output = markdown_output
 
127
 
128
- if not final_output.strip():
129
- info_msg = f"ℹ️ The conversion resulted in empty Markdown."
130
- print(info_msg)
131
- return info_msg
132
 
133
- # SUCCESS: Return the final markdown string
134
- return final_output.strip() # Strip leading/trailing whitespace from final output
135
 
136
  except Exception as e:
137
- print(f"Markdown conversion failed: {e}")
138
- error_msg = f"❌ Error: Failed to convert HTML to Markdown.\n```\n{traceback.format_exc()}\n```"
139
- return error_msg
140
 
141
  except Exception as e:
142
- print(f"An unexpected error occurred in the main handler: {e}")
143
- error_msg = f"❌ Error: An unexpected error occurred during processing.\n```\n{traceback.format_exc()}\n```"
144
- return error_msg
145
 
146
 
147
  # --- Gradio Interface (Standard) ---
148
  title = "HTML to Markdown Converter (Smart Extraction)"
149
  description = """
150
  Enter a URL **or** paste HTML code directly into the text box below.
151
- The tool attempts to extract the main article content using Mozilla's Readability library and converts it to Markdown.
152
  The resulting Markdown code is displayed below. Use the **copy icon** (📋) in the output box to copy the code.
153
  """
154
  article = """
155
  **How it works:**
156
- 1. Uses `requests` to fetch content from URLs.
157
- 2. Uses `readability-lxml` to attempt extracting the main article content. Falls back to full HTML if needed. Some basic pre-cleaning is done before Readability.
158
- 3. Uses `markdownify` to convert the processed HTML into Markdown.
159
- 4. The **raw Markdown code** is displayed in the output text box below.
160
- 5. Click the standard **copy icon** (📋) provided by Gradio in the top-right corner of the output box to copy the Markdown code.
 
 
 
161
  """
162
 
163
  # Define input components
@@ -188,23 +199,18 @@ iface = gr.Interface(
188
  description=description,
189
  article=article,
190
  allow_flagging='never',
191
- # --- UPDATED EXAMPLES ---
192
  examples=[
193
  ["https://psychedelic.com.br/profissoes-boneca-barbie/", ""],
194
  ["https://agideia.com.br/tutoriais/ai-inteligencia-artificial/integre-uma-ia-gratuita-gemma-2b-ao-seu-site-wordpress-usando-google-colab-e-cloudflare/", ""],
195
- ["", "<h1>Título Simples</h1>\n<p>Este é um parágrafo de exemplo com <strong>texto em negrito</strong> e <em>texto em itálico</em>.</p>\n<ul>\n<li>Item 1</li>\n<li>Item 2</li>\n</ul>"]
 
 
196
  ],
197
- # --- END OF UPDATED EXAMPLES ---
198
- cache_examples=False # Keep False as we fetch live URLs
199
  )
200
 
201
  # Launch the app
202
  if __name__ == "__main__":
203
- # Reminder: requirements.txt should be:
204
- # gradio
205
- # requests
206
- # markdownify
207
- # beautifulsoup4
208
- # readability-lxml
209
- # lxml[html_clean]
210
  iface.launch()
 
4
  import traceback # To help format potential errors
5
  from readability import Document
6
  from bs4 import BeautifulSoup
7
+ import re # Import regex for potentially cleaning readability titles
8
 
9
  # Configure requests with a timeout and user-agent
10
+ DEFAULT_TIMEOUT = 20
11
  HEADERS = {'User-Agent': 'GradioHTMLtoMarkdownConverter/1.0 (+https://hf.space)'}
12
 
 
13
  def html_to_markdown_converter(url: str, html_input: str) -> str:
14
  """
15
  Converts HTML (from URL or direct input) to Markdown.
16
  Attempts to extract main content using readability.
17
+ Uses readability title, falls back to first H1 if needed, and prevents duplication.
18
  Returns the resulting Markdown string or an error message.
19
  """
20
  html_content = ""
21
  source = ""
22
+ use_readability = True
23
 
 
24
  url = url.strip() if url else ""
25
  html_input = html_input.strip() if html_input else ""
26
 
 
27
  try:
28
  # --- Step 1: Get HTML Content ---
29
  if url:
 
40
  html_content = response.text
41
  print(f"Successfully fetched {len(html_content)} bytes from URL.")
42
  except requests.exceptions.MissingSchema:
43
+ return f"❌ Error: Invalid URL: `{url}`. Please include `http://` or `https://`."
 
 
44
  except requests.exceptions.Timeout:
45
+ return f"❌ Error: Request timed out after {DEFAULT_TIMEOUT} seconds trying to fetch URL: `{url}`"
 
 
46
  except requests.exceptions.RequestException as e:
47
+ return f"❌ Error: Failed to fetch content from URL: `{url}`\n```\n{e}\n```"
 
 
48
  except Exception as e:
49
+ return f"❌ Error: An unexpected error occurred while fetching the URL.\n```\n{traceback.format_exc()}\n```"
 
 
50
 
51
  elif html_input:
52
  source = "Direct HTML Input"
53
  print(f"Using direct HTML input ({len(html_input)} bytes).")
54
  html_content = html_input
55
  else:
56
+ return "❓ Please provide a URL or paste HTML content in the fields above."
 
57
 
58
+ # --- Pre-cleaning before Readability ---
59
+ if not html_content: return f"❓ No HTML content found from {source}."
60
+ print("Pre-cleaning HTML...")
61
+ soup_pre = BeautifulSoup(html_content, 'html.parser')
62
+ for tag in soup_pre(['script', 'style', 'iframe', 'svg', 'noscript', 'header', 'footer', 'nav', 'aside']): # More aggressive cleaning
63
+ tag.decompose()
64
+ cleaned_html = str(soup_pre) # Use this cleaned version going forward
65
+
66
+ # --- Step 2: Extract Main Content and Title (using Readability) ---
67
+ processed_html = cleaned_html # Default to cleaned HTML
68
+ readability_title = None
69
+ final_title = None # <<< Title to be used in the final output
70
 
 
 
71
  if use_readability:
72
  print("Attempting to extract main content using Readability...")
73
  try:
74
+ doc = Document(cleaned_html) # Use cleaned HTML
75
+ readability_title = doc.title()
 
 
 
 
 
 
76
  processed_html_summary = doc.summary()
77
+
78
+ # Check if readability summary is valid
79
+ soup_summary_check = BeautifulSoup(processed_html_summary, 'html.parser')
80
+ if soup_summary_check.text.strip():
81
+ processed_html = processed_html_summary # Use summary if valid
82
+ print(f"Readability extracted title: '{readability_title}'. Using summary.")
83
  else:
84
+ print("Readability summary was empty. Falling back to cleaned full HTML.")
85
+ # processed_html remains cleaned_html
86
+ readability_title = None # Discard title if summary failed
87
+
88
  except Exception as e:
89
+ print(f"Readability processing failed: {e}. Falling back to cleaned full HTML.")
90
+ # processed_html remains cleaned_html
91
+ readability_title = None
92
+
93
+ # --- Title Decision Logic ---
94
+ # Priority 1: Readability title (if good)
95
+ if readability_title and len(readability_title) > 3 and not readability_title.startswith('[') : # Basic check for valid title
96
+ final_title = readability_title.strip()
97
+ print(f"Using Readability title: '{final_title}'")
98
+
99
+ # Priority 2: Fallback to first H1 from CLEANED HTML if no good Readability title
100
+ if not final_title:
101
+ print("Readability title not suitable or not found. Looking for H1 fallback...")
102
+ soup_for_h1 = BeautifulSoup(cleaned_html, 'html.parser')
103
+ h1_tag = soup_for_h1.find('h1')
104
+ if h1_tag:
105
+ h1_text = h1_tag.get_text(strip=True)
106
+ if h1_text:
107
+ final_title = h1_text
108
+ print(f"Using H1 fallback title: '{final_title}'")
109
+
110
+ # --- Prevent Title Duplication in Content ---
111
+ if final_title:
112
+ print(f"Checking for title duplication in processed HTML (first H1)...")
113
+ soup_proc = BeautifulSoup(processed_html, 'html.parser')
114
+ first_h1_in_proc = soup_proc.find('h1')
115
+ if first_h1_in_proc:
116
+ h1_proc_text = first_h1_in_proc.get_text(strip=True)
117
+ # Check if the H1 text in content matches the final title we decided on
118
+ if h1_proc_text == final_title:
119
+ print(f"Found matching H1 ('{h1_proc_text}') in content. Removing it to prevent duplication.")
120
+ first_h1_in_proc.decompose() # Remove the H1 tag
121
+ processed_html = str(soup_proc) # Update the HTML string to be converted
122
 
123
  # --- Step 3: Convert the Processed HTML to Markdown ---
124
  if not processed_html.strip():
125
+ return f"❓ The HTML content (after processing) appears to be empty."
 
 
126
 
127
+ print(f"Attempting to convert final processed HTML (length: {len(processed_html)}) to Markdown...")
128
  try:
 
129
  markdown_output = markdownify(
130
  processed_html,
131
  heading_style="ATX",
132
+ bullets='*'
133
+ ).strip() # Strip whitespace from markdown output
134
+
135
+ # Assemble final output
136
+ if final_title:
137
+ # Prepend the decided title if one exists
138
+ final_markdown = f"# {final_title}\n\n{markdown_output}"
 
 
139
  else:
140
+ # Otherwise, just use the converted markdown
141
+ final_markdown = markdown_output
142
 
143
+ if not final_markdown.strip():
144
+ return f"ℹ️ The conversion resulted in empty Markdown."
 
 
145
 
146
+ return final_markdown.strip() # Return final cleaned string
 
147
 
148
  except Exception as e:
149
+ return f"❌ Error: Failed to convert HTML to Markdown.\n```\n{traceback.format_exc()}\n```"
 
 
150
 
151
  except Exception as e:
152
+ return f"❌ Error: An unexpected error occurred during processing.\n```\n{traceback.format_exc()}\n```"
 
 
153
 
154
 
155
  # --- Gradio Interface (Standard) ---
156
  title = "HTML to Markdown Converter (Smart Extraction)"
157
  description = """
158
  Enter a URL **or** paste HTML code directly into the text box below.
159
+ The tool attempts to extract the main article content, identifies a title (using page title or first H1 as fallback), and converts it to Markdown.
160
  The resulting Markdown code is displayed below. Use the **copy icon** (📋) in the output box to copy the code.
161
  """
162
  article = """
163
  **How it works:**
164
+ 1. Fetches HTML from URL or uses pasted input.
165
+ 2. Performs basic cleaning (removes scripts, styles, headers, footers, etc.).
166
+ 3. Uses `readability-lxml` to extract the main content and attempt to find a page title.
167
+ 4. **Title Logic:** Prefers the title found by `readability`. If none is found or it seems invalid, it looks for the first `<h1>` tag in the cleaned HTML as a fallback.
168
+ 5. **Deduplication:** If a title is determined, the tool checks if the *first* `<h1>` tag within the extracted main content matches this title. If so, it removes that `<h1>` tag *before* conversion to prevent the title appearing twice.
169
+ 6. Uses `markdownify` to convert the processed HTML (potentially without its first H1) into Markdown.
170
+ 7. Prepends the determined title (if any) to the final Markdown output.
171
+ 8. Displays the raw Markdown code in the output box with a copy button.
172
  """
173
 
174
  # Define input components
 
199
  description=description,
200
  article=article,
201
  allow_flagging='never',
 
202
  examples=[
203
  ["https://psychedelic.com.br/profissoes-boneca-barbie/", ""],
204
  ["https://agideia.com.br/tutoriais/ai-inteligencia-artificial/integre-uma-ia-gratuita-gemma-2b-ao-seu-site-wordpress-usando-google-colab-e-cloudflare/", ""],
205
+ ["", "<h1>Título Simples</h1>\n<p>Este é um parágrafo de exemplo com <strong>texto em negrito</strong> e <em>texto em itálico</em>.</p>\n<ul>\n<li>Item 1</li>\n<li>Item 2</li>\n</ul>"],
206
+ # Add an example without H1 to test no-title scenario
207
+ ["", "<p>Um parágrafo sem título H1.</p><div><p>Outro conteúdo.</p></div>"]
208
  ],
209
+ cache_examples=False
 
210
  )
211
 
212
  # Launch the app
213
  if __name__ == "__main__":
214
+ # Reminder: requirements.txt includes:
215
+ # gradio, requests, markdownify, beautifulsoup4, readability-lxml, lxml[html_clean]
 
 
 
 
 
216
  iface.launch()