Em4e committed on
Commit
53eca1f
·
verified ·
1 Parent(s): 875975c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -30
app.py CHANGED
@@ -19,8 +19,8 @@ class WebpageContentProcessor:
19
 
20
  def fetch_and_convert_to_markdown(self, url: str) -> str:
21
  """
22
- Fetches HTML content from a URL, isolates the main content, aggressively
23
- removes boilerplate, and converts the result to Markdown.
24
  """
25
  try:
26
  headers = {
@@ -31,35 +31,16 @@ class WebpageContentProcessor:
31
  html_content = response.text
32
  soup = BeautifulSoup(html_content, 'html.parser')
33
 
34
- # First, try to find a specific main content container.
35
- main_content = soup.find('article') or soup.find('main') or \
36
- soup.find('div', class_=re.compile(r'(post|content|entry|main-content)')) or \
37
- soup.find('div', {'role': 'main'})
 
38
 
39
- # If a main content container is found, use it. Otherwise, fall back to the whole body.
40
- content_container = main_content if main_content else soup.find('body')
41
  if not content_container:
42
- return "Error: Could not find any processable content on the webpage."
43
-
44
- # Aggressively remove common boilerplate elements by tag, class, or role.
45
- unwanted_selectors = [
46
- 'nav', 'header', 'footer', 'aside', 'form', 'figure', 'figcaption',
47
- '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
48
- '[role="search"]', '[role="complementary"]',
49
- '.nav', '.navbar', '.header', '.footer', '.sidebar', '.aside',
50
- '.menu', '.pagination', '.breadcrumbs', '.comments', '.comment-list',
51
- '.social-links', '.share-buttons', '.cookie-notice', '.banner',
52
- '#nav', '#header', '#footer', '#sidebar', '#comments',
53
- '[class*="ad"]', '[id*="ad"]', '[class*="promo"]', '[id*="promo"]'
54
- ]
55
-
56
- for selector in unwanted_selectors:
57
- for element in content_container.select(selector):
58
- element.decompose()
59
-
60
- # Also specifically remove script and style tags which are never content.
61
- for tag in content_container.find_all(['script', 'style', 'noscript']):
62
- tag.decompose()
63
 
64
  # Convert the cleaned HTML to Markdown
65
  markdown_output = convert_to_markdown(str(content_container))
@@ -337,4 +318,4 @@ with tab2:
337
  st.rerun()
338
 
339
  st.subheader("Final Compiled Document")
340
- st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")
 
19
 
20
  def fetch_and_convert_to_markdown(self, url: str) -> str:
21
  """
22
+ Fetches HTML content, removes common boilerplate tags from the entire page,
23
+ and then converts the remaining body content to Markdown.
24
  """
25
  try:
26
  headers = {
 
31
  html_content = response.text
32
  soup = BeautifulSoup(html_content, 'html.parser')
33
 
34
+ # Remove common boilerplate and non-content tags from the entire document
35
+ tags_to_remove = ['nav', 'header', 'footer', 'aside', 'script', 'style', 'noscript', 'form']
36
+ for tag_name in tags_to_remove:
37
+ for element in soup.find_all(tag_name):
38
+ element.decompose()
39
 
40
+ # Process the entire remaining body
41
+ content_container = soup.find('body')
42
  if not content_container:
43
+ return "Error: Could not find the <body> of the webpage."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  # Convert the cleaned HTML to Markdown
46
  markdown_output = convert_to_markdown(str(content_container))
 
318
  st.rerun()
319
 
320
  st.subheader("Final Compiled Document")
321
+ st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")