Em4e committed on
Commit
4c95011
·
verified ·
1 Parent(s): 5543eef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -19
app.py CHANGED
@@ -19,8 +19,8 @@ class WebpageContentProcessor:
19
 
20
  def fetch_and_convert_to_markdown(self, url: str) -> str:
21
  """
22
- Fetches HTML content from a URL, cleans it, and converts it to Markdown.
23
- It intelligently tries to find the main content block of the page.
24
  """
25
  try:
26
  headers = {
@@ -31,25 +31,21 @@ class WebpageContentProcessor:
31
  html_content = response.text
32
  soup = BeautifulSoup(html_content, 'html.parser')
33
 
34
- # Remove non-content tags like scripts and styles
35
- for tag_name in ['script', 'style', 'noscript', 'meta', 'link', 'header', 'footer', 'nav', 'aside']:
36
- for element in soup.find_all(tag_name):
37
- element.decompose()
38
-
39
- # Find the main content area of the webpage
40
- content_for_conversion = soup.find('article') or soup.find('main') or \
41
- soup.find('div', class_=re.compile(r'content|post|body')) or \
42
- soup.find('div', {'role': 'main'})
43
 
44
- # Fallback to the entire body if no main content is found
45
- if not content_for_conversion:
46
- content_for_conversion = soup.body
47
- if not content_for_conversion:
48
- return "Error: Could not find any content on the page."
 
49
 
50
- # Convert the cleaned HTML to Markdown
51
- markdown_output = convert_to_markdown(str(content_for_conversion))
52
- # Clean up excessive newlines
53
  markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output).strip()
54
  return markdown_output
55
 
 
19
 
20
  def fetch_and_convert_to_markdown(self, url: str) -> str:
21
  """
22
+ Fetches HTML content from a URL, starts from the <body>, removes common
23
+ boilerplate tags, and converts the remaining content to Markdown.
24
  """
25
  try:
26
  headers = {
 
31
  html_content = response.text
32
  soup = BeautifulSoup(html_content, 'html.parser')
33
 
34
+ # Find the body of the HTML document
35
+ body = soup.find('body')
36
+ if not body:
37
+ return "Error: Could not find the <body> of the webpage."
 
 
 
 
 
38
 
39
+ # Tags to remove from the content to reduce boilerplate
40
+ tags_to_remove = ['script', 'style', 'noscript', 'header', 'footer', 'nav', 'aside', 'form', 'figure']
41
+ for tag_name in tags_to_remove:
42
+ # Find all instances of the tag within the body and remove them
43
+ for element in body.find_all(tag_name):
44
+ element.decompose()
45
 
46
+ # Convert the cleaned body content to Markdown
47
+ markdown_output = convert_to_markdown(str(body))
48
+ # Clean up excessive newlines for better readability
49
  markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output).strip()
50
  return markdown_output
51