Spaces:

Em4e
/

chunk-based-text-editor

Sleeping

App Files Community

Em4e committed on Jun 9, 2025

Commit

53eca1f

verified ·

1 Parent(s): 875975c

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -30

app.py CHANGED Viewed

@@ -19,8 +19,8 @@ class WebpageContentProcessor:
     def fetch_and_convert_to_markdown(self, url: str) -> str:
         """
-        Fetches HTML content from a URL, isolates the main content, aggressively
-        removes boilerplate, and converts the result to Markdown.
         """
         try:
             headers = {
@@ -31,35 +31,16 @@ class WebpageContentProcessor:
             html_content = response.text
             soup = BeautifulSoup(html_content, 'html.parser')
-            # First, try to find a specific main content container.
-            main_content = soup.find('article') or soup.find('main') or \
-                           soup.find('div', class_=re.compile(r'(post|content|entry|main-content)')) or \
-                           soup.find('div', {'role': 'main'})
-            # If a main content container is found, use it. Otherwise, fall back to the whole body.
-            content_container = main_content if main_content else soup.find('body')
             if not content_container:
-                return "Error: Could not find any processable content on the webpage."
-            # Aggressively remove common boilerplate elements by tag, class, or role.
-            unwanted_selectors = [
-                'nav', 'header', 'footer', 'aside', 'form', 'figure', 'figcaption',
-                '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
-                '[role="search"]', '[role="complementary"]',
-                '.nav', '.navbar', '.header', '.footer', '.sidebar', '.aside',
-                '.menu', '.pagination', '.breadcrumbs', '.comments', '.comment-list',
-                '.social-links', '.share-buttons', '.cookie-notice', '.banner',
-                '#nav', '#header', '#footer', '#sidebar', '#comments',
-                '[class*="ad"]', '[id*="ad"]', '[class*="promo"]', '[id*="promo"]'
-            ]
-            for selector in unwanted_selectors:
-                for element in content_container.select(selector):
-                    element.decompose()
-            # Also specifically remove script and style tags which are never content.
-            for tag in content_container.find_all(['script', 'style', 'noscript']):
-                tag.decompose()
             # Convert the cleaned HTML to Markdown
             markdown_output = convert_to_markdown(str(content_container))
@@ -337,4 +318,4 @@ with tab2:
             st.rerun()
     st.subheader("Final Compiled Document")
-    st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")

     def fetch_and_convert_to_markdown(self, url: str) -> str:
         """
+        Fetches HTML content, removes common boilerplate tags from the entire page,
+        and then converts the remaining body content to Markdown.
         """
         try:
             headers = {
             html_content = response.text
             soup = BeautifulSoup(html_content, 'html.parser')
+            # Remove common boilerplate and non-content tags from the entire document
+            tags_to_remove = ['nav', 'header', 'footer', 'aside', 'script', 'style', 'noscript', 'form']
+            for tag_name in tags_to_remove:
+                for element in soup.find_all(tag_name):
+                    element.decompose()
+            # Process the entire remaining body
+            content_container = soup.find('body')
             if not content_container:
+                return "Error: Could not find the <body> of the webpage."
             # Convert the cleaned HTML to Markdown
             markdown_output = convert_to_markdown(str(content_container))
             st.rerun()
     st.subheader("Final Compiled Document")
+    st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")