Spaces:

Em4e
/

chunk-based-text-editor

Sleeping

App Files Files Community

Em4e committed on Jun 9, 2025

Commit

875975c

verified ·

1 Parent(s): 7c8ac1e

Update app.py

Browse files

Files changed (1) hide show

app.py +2 -8

app.py CHANGED Viewed

@@ -42,7 +42,6 @@ class WebpageContentProcessor:
                 return "Error: Could not find any processable content on the webpage."
             # Aggressively remove common boilerplate elements by tag, class, or role.
-            # This list is more comprehensive to catch varied web designs.
             unwanted_selectors = [
                 'nav', 'header', 'footer', 'aside', 'form', 'figure', 'figcaption',
                 '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
@@ -51,7 +50,6 @@ class WebpageContentProcessor:
                 '.menu', '.pagination', '.breadcrumbs', '.comments', '.comment-list',
                 '.social-links', '.share-buttons', '.cookie-notice', '.banner',
                 '#nav', '#header', '#footer', '#sidebar', '#comments',
-                # Add specific selectors for common ad and promo blocks
                 '[class*="ad"]', '[id*="ad"]', '[class*="promo"]', '[id*="promo"]'
             ]
@@ -67,9 +65,7 @@ class WebpageContentProcessor:
             markdown_output = convert_to_markdown(str(content_container))
             # Post-processing to clean up the resulting Markdown
-            # Collapse extra newlines
             markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output)
-            # Remove empty list items or lines with just navigation symbols that might remain
             markdown_output = re.sub(r'(\n\s*[\*\-]\s*\n)|(^\s*[\*\-]\s*$)', '\n', markdown_output, flags=re.MULTILINE)
             return markdown_output.strip()
@@ -189,7 +185,6 @@ class ChunkManager:
         if chunk:
             chunk["content"] = new_content
             self._add_stats_to_chunk(chunk)
-            # Update title if it was a placeholder
             if chunk["title"].startswith("["):
                  first_line = new_content.split('\n')[0].strip()
                  new_title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
@@ -206,7 +201,6 @@ class ChunkManager:
             return "No content to display."
         final_doc_parts = []
         for c in self._chunks:
-            # Check if title is a real header or just derived text
             is_header = re.match(r"^(#+)\s*(.*)", c['title'])
             if not c['title'].startswith("[") and not is_header:
                  final_doc_parts.append(f"## {c['title']}\n\n{c['content']}")
@@ -241,8 +235,8 @@ init_session_state()
 processor = st.session_state.processor
 manager = st.session_state.manager
-st.title("✨ Chunk Webpage Content Editor ✨")
-st.caption("A tool to fetch, chunk, and refine web content. Inspired by Andrea Volpini's [work on content chunking](https://www.linkedin.com/pulse/understanding-chunking-google-ai-mode-practical-content-volpini-zseaf/)")
 st.info(
     "**Note:** Some websites may block content scraping. This is an early version, so you might encounter bugs.",

                 return "Error: Could not find any processable content on the webpage."
             # Aggressively remove common boilerplate elements by tag, class, or role.
             unwanted_selectors = [
                 'nav', 'header', 'footer', 'aside', 'form', 'figure', 'figcaption',
                 '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
                 '.menu', '.pagination', '.breadcrumbs', '.comments', '.comment-list',
                 '.social-links', '.share-buttons', '.cookie-notice', '.banner',
                 '#nav', '#header', '#footer', '#sidebar', '#comments',
                 '[class*="ad"]', '[id*="ad"]', '[class*="promo"]', '[id*="promo"]'
             ]
             markdown_output = convert_to_markdown(str(content_container))
             # Post-processing to clean up the resulting Markdown
             markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output)
             markdown_output = re.sub(r'(\n\s*[\*\-]\s*\n)|(^\s*[\*\-]\s*$)', '\n', markdown_output, flags=re.MULTILINE)
             return markdown_output.strip()
         if chunk:
             chunk["content"] = new_content
             self._add_stats_to_chunk(chunk)
             if chunk["title"].startswith("["):
                  first_line = new_content.split('\n')[0].strip()
                  new_title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
             return "No content to display."
         final_doc_parts = []
         for c in self._chunks:
             is_header = re.match(r"^(#+)\s*(.*)", c['title'])
             if not c['title'].startswith("[") and not is_header:
                  final_doc_parts.append(f"## {c['title']}\n\n{c['content']}")
 processor = st.session_state.processor
 manager = st.session_state.manager
+st.title("✨ Webpage Content Editor")
+st.caption("A tool to fetch, chunk, and refine web content.")
 st.info(
     "**Note:** Some websites may block content scraping. This is an early version, so you might encounter bugs.",