Spaces:

Em4e
/

chunk-based-text-editor

Sleeping

App Files Community

Em4e committed on Jun 9, 2025

Commit

4c95011

verified ·

1 Parent(s): 5543eef

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -19

app.py CHANGED Viewed

@@ -19,8 +19,8 @@ class WebpageContentProcessor:
     def fetch_and_convert_to_markdown(self, url: str) -> str:
         """
-        Fetches HTML content from a URL, cleans it, and converts it to Markdown.
-        It intelligently tries to find the main content block of the page.
         """
         try:
             headers = {
@@ -31,25 +31,21 @@ class WebpageContentProcessor:
             html_content = response.text
             soup = BeautifulSoup(html_content, 'html.parser')
-            # Remove non-content tags like scripts and styles
-            for tag_name in ['script', 'style', 'noscript', 'meta', 'link', 'header', 'footer', 'nav', 'aside']:
-                for element in soup.find_all(tag_name):
-                    element.decompose()
-            # Find the main content area of the webpage
-            content_for_conversion = soup.find('article') or soup.find('main') or \
-                                     soup.find('div', class_=re.compile(r'content|post|body')) or \
-                                     soup.find('div', {'role': 'main'})
-            # Fallback to the entire body if no main content is found
-            if not content_for_conversion:
-                content_for_conversion = soup.body
-                if not content_for_conversion:
-                    return "Error: Could not find any content on the page."
-            # Convert the cleaned HTML to Markdown
-            markdown_output = convert_to_markdown(str(content_for_conversion))
-            # Clean up excessive newlines
             markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output).strip()
             return markdown_output

     def fetch_and_convert_to_markdown(self, url: str) -> str:
         """
+        Fetches HTML content from a URL, starts from the <body>, removes common
+        boilerplate tags, and converts the remaining content to Markdown.
         """
         try:
             headers = {
             html_content = response.text
             soup = BeautifulSoup(html_content, 'html.parser')
+            # Find the body of the HTML document
+            body = soup.find('body')
+            if not body:
+                return "Error: Could not find the <body> of the webpage."
+            # Tags to remove from the content to reduce boilerplate
+            tags_to_remove = ['script', 'style', 'noscript', 'header', 'footer', 'nav', 'aside', 'form', 'figure']
+            for tag_name in tags_to_remove:
+                # Find all instances of the tag within the body and remove them
+                for element in body.find_all(tag_name):
+                    element.decompose()
+            # Convert the cleaned body content to Markdown
+            markdown_output = convert_to_markdown(str(body))
+            # Clean up excessive newlines for better readability
             markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output).strip()
             return markdown_output