Em4e committed on
Commit
66603bd
·
verified ·
1 Parent(s): 4c95011

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -46
app.py CHANGED
@@ -19,8 +19,8 @@ class WebpageContentProcessor:
19
 
20
  def fetch_and_convert_to_markdown(self, url: str) -> str:
21
  """
22
- Fetches HTML content from a URL, starts from the <body>, removes common
23
- boilerplate tags, and converts the remaining content to Markdown.
24
  """
25
  try:
26
  headers = {
@@ -31,23 +31,48 @@ class WebpageContentProcessor:
31
  html_content = response.text
32
  soup = BeautifulSoup(html_content, 'html.parser')
33
 
34
- # Find the body of the HTML document
35
- body = soup.find('body')
36
- if not body:
37
- return "Error: Could not find the <body> of the webpage."
38
-
39
- # Tags to remove from the content to reduce boilerplate
40
- tags_to_remove = ['script', 'style', 'noscript', 'header', 'footer', 'nav', 'aside', 'form', 'figure']
41
- for tag_name in tags_to_remove:
42
- # Find all instances of the tag within the body and remove them
43
- for element in body.find_all(tag_name):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  element.decompose()
 
 
 
 
45
 
46
- # Convert the cleaned body content to Markdown
47
- markdown_output = convert_to_markdown(str(body))
48
- # Clean up excessive newlines for better readability
49
- markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output).strip()
50
- return markdown_output
 
 
 
 
 
51
 
52
  except requests.exceptions.Timeout:
53
  return "Error: The request timed out. The server is taking too long to respond."
@@ -64,7 +89,6 @@ class WebpageContentProcessor:
64
  if not markdown_content or "Error" in markdown_content:
65
  return []
66
 
67
- # This parser understands Markdown structure (headings, lists) and splits accordingly.
68
  parser = MarkdownNodeParser(include_metadata=True)
69
  doc = Document(text=markdown_content)
70
  nodes = parser.get_nodes_from_documents([doc])
@@ -75,19 +99,15 @@ class WebpageContentProcessor:
75
  if not content:
76
  continue
77
 
78
- # Extract title from the markdown header if it exists
79
  title_match = re.match(r"^(#+)\s*(.*)", content)
80
  if title_match:
81
  title = title_match.group(2).strip()
82
- # The content should not include the title line itself
83
  content_text = content[len(title_match.group(0)):].strip()
84
  else:
85
- # If no header, use the first line as a fallback title
86
  first_line = content.split('\n')[0].strip()
87
  title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
88
  content_text = content
89
 
90
- # Ensure there's a title even for empty sections
91
  if not title:
92
  title = f"[Chunk {i+1}]"
93
 
@@ -126,7 +146,7 @@ class ChunkManager:
126
  stats['word_count'] = textstat.lexicon_count(text, removepunct=True)
127
  stats['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
128
  stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)
129
- except (Exception, TypeError): # Catch potential errors from textstat
130
  stats.update({'word_count': 0, 'flesch_reading_ease': 0, 'flesch_kincaid_grade': 0})
131
  return stats
132
 
@@ -168,25 +188,30 @@ class ChunkManager:
168
  chunk = self.get_chunk_by_id(chunk_id)
169
  if chunk:
170
  chunk["content"] = new_content
171
- self._add_stats_to_chunk(chunk) # Recalculate stats after update
 
 
 
 
 
 
172
 
173
  def delete_chunk(self, chunk_id: int):
174
  self._chunks = [c for c in self._chunks if c["id"] != chunk_id]
175
- # Re-index remaining chunks to maintain sequential IDs
176
  for i, chunk in enumerate(self._chunks):
177
  chunk['id'] = i
178
 
179
  def get_final_markdown(self) -> str:
180
  if not self._chunks:
181
  return "No content to display."
182
- # Compile final document, adding headers back for chunks that have them
183
  final_doc_parts = []
184
  for c in self._chunks:
185
- title_is_header = re.match(r"^(#+)\s*(.*)", c['title']) is None
186
- if not c['title'].startswith("[") and not title_is_header:
187
- final_doc_parts.append(f"## {c['title']}\n\n{c['content']}")
 
188
  else:
189
- final_doc_parts.append(c['content'])
190
  return "\n\n---\n\n".join(final_doc_parts)
191
 
192
 
@@ -195,14 +220,12 @@ class ChunkManager:
195
  self.target_grade_max = grade_max
196
  self.target_min_chunk_words = min_words
197
  self.target_max_chunk_words = max_words
198
- # Recalculate stats for all chunks to reflect new targets
199
  self.set_chunks(self.get_chunks())
200
 
201
  # --- Streamlit UI Application ---
202
 
203
  st.set_page_config(layout="wide", page_title="Webpage Content Editor")
204
 
205
- # Initialize session state for managers and UI state
206
  def init_session_state():
207
  if 'processor' not in st.session_state:
208
  st.session_state.processor = WebpageContentProcessor()
@@ -218,8 +241,6 @@ init_session_state()
218
  processor = st.session_state.processor
219
  manager = st.session_state.manager
220
 
221
- # --- Page Layout ---
222
-
223
  st.title("✨ Webpage Content Editor")
224
  st.caption("A tool to fetch, chunk, and refine web content.")
225
 
@@ -228,7 +249,6 @@ st.info(
228
  icon="ℹ️"
229
  )
230
 
231
- # URL input and processing button
232
  url_input = st.text_input("Enter a webpage URL to start", key="url_input")
233
  if st.button("Process URL", use_container_width=True, type="primary"):
234
  if url_input:
@@ -247,14 +267,12 @@ if st.button("Process URL", use_container_width=True, type="primary"):
247
  else:
248
  st.session_state.status_message = "Could not extract any content chunks."
249
  st.session_state.selected_chunk_id = None
250
- st.rerun() # Rerun to update the UI with new state
251
 
252
- # Display status messages as toasts
253
  if st.session_state.status_message:
254
  st.toast(st.session_state.status_message)
255
- st.session_state.status_message = "" # Clear after displaying
256
 
257
- # Main UI with tabs
258
  tab1, tab2 = st.tabs(["Chunk Editor", "Settings & Overview"])
259
 
260
  with tab1:
@@ -263,14 +281,12 @@ with tab1:
263
  st.write("Process a URL to begin editing content chunks.")
264
  else:
265
  chunk_ids = [c['id'] for c in chunks]
266
- # Ensure the selected chunk ID is valid
267
  if st.session_state.selected_chunk_id not in chunk_ids:
268
  st.session_state.selected_chunk_id = chunk_ids[0] if chunk_ids else None
269
 
270
  if st.session_state.selected_chunk_id is not None:
271
  chunk_options = {c['id']: c['title'] for c in chunks}
272
 
273
- # Dropdown to select a chunk for editing
274
  selected_id = st.selectbox(
275
  "Select a chunk to edit",
276
  options=chunk_ids,
@@ -278,7 +294,6 @@ with tab1:
278
  index=chunk_ids.index(st.session_state.selected_chunk_id)
279
  )
280
 
281
- # Update state if the selection changes
282
  if selected_id != st.session_state.selected_chunk_id:
283
  st.session_state.selected_chunk_id = selected_id
284
  st.rerun()
@@ -289,15 +304,13 @@ with tab1:
289
  st.markdown(f"**Editing: {selected_chunk['title']}**")
290
  st.markdown(manager.format_chunk_stats(selected_chunk['stats']), unsafe_allow_html=True)
291
 
292
- # Text area for editing the selected chunk's content
293
  edited_content = st.text_area(
294
  "Chunk Content",
295
  value=selected_chunk['content'],
296
  height=350,
297
- key=f"editor_{selected_chunk['id']}" # Unique key ensures the widget updates
298
  )
299
 
300
- # Action buttons for the selected chunk
301
  col1, col2, _ = st.columns([1, 1, 5])
302
  if col1.button("Update Chunk", use_container_width=True, key=f"update_{selected_chunk['id']}"):
303
  manager.update_chunk_content(selected_chunk['id'], edited_content)
@@ -307,7 +320,6 @@ with tab1:
307
  if col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"):
308
  manager.delete_chunk(selected_chunk['id'])
309
  st.session_state.status_message = "Chunk deleted."
310
- # Select the next available chunk or reset
311
  remaining_chunks = manager.get_chunks()
312
  st.session_state.selected_chunk_id = remaining_chunks[0]['id'] if remaining_chunks else None
313
  st.rerun()
 
19
 
20
  def fetch_and_convert_to_markdown(self, url: str) -> str:
21
  """
22
+ Fetches HTML content from a URL, isolates the main content, aggressively
23
+ removes boilerplate, and converts the result to Markdown.
24
  """
25
  try:
26
  headers = {
 
31
  html_content = response.text
32
  soup = BeautifulSoup(html_content, 'html.parser')
33
 
34
+ # First, try to find a specific main content container.
35
+ main_content = soup.find('article') or soup.find('main') or \
36
+ soup.find('div', class_=re.compile(r'(post|content|entry|main-content)')) or \
37
+ soup.find('div', {'role': 'main'})
38
+
39
+ # If a main content container is found, use it. Otherwise, fall back to the whole body.
40
+ content_container = main_content if main_content else soup.find('body')
41
+ if not content_container:
42
+ return "Error: Could not find any processable content on the webpage."
43
+
44
+ # Aggressively remove common boilerplate elements by tag, class, or role.
45
+ # This list is more comprehensive to catch varied web designs.
46
+ unwanted_selectors = [
47
+ 'nav', 'header', 'footer', 'aside', 'form', 'figure', 'figcaption',
48
+ '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
49
+ '[role="search"]', '[role="complementary"]',
50
+ '.nav', '.navbar', '.header', '.footer', '.sidebar', '.aside',
51
+ '.menu', '.pagination', '.breadcrumbs', '.comments', '.comment-list',
52
+ '.social-links', '.share-buttons', '.cookie-notice', '.banner',
53
+ '#nav', '#header', '#footer', '#sidebar', '#comments',
54
+ # Add specific selectors for common ad and promo blocks
55
+ '[class*="ad"]', '[id*="ad"]', '[class*="promo"]', '[id*="promo"]'
56
+ ]
57
+
58
+ for selector in unwanted_selectors:
59
+ for element in content_container.select(selector):
60
  element.decompose()
61
+
62
+ # Also specifically remove script and style tags which are never content.
63
+ for tag in content_container.find_all(['script', 'style', 'noscript']):
64
+ tag.decompose()
65
 
66
+ # Convert the cleaned HTML to Markdown
67
+ markdown_output = convert_to_markdown(str(content_container))
68
+
69
+ # Post-processing to clean up the resulting Markdown
70
+ # Collapse extra newlines
71
+ markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output)
72
+ # Remove empty list items or lines with just navigation symbols that might remain
73
+ markdown_output = re.sub(r'(\n\s*[\*\-]\s*\n)|(^\s*[\*\-]\s*$)', '\n', markdown_output, flags=re.MULTILINE)
74
+
75
+ return markdown_output.strip()
76
 
77
  except requests.exceptions.Timeout:
78
  return "Error: The request timed out. The server is taking too long to respond."
 
89
  if not markdown_content or "Error" in markdown_content:
90
  return []
91
 
 
92
  parser = MarkdownNodeParser(include_metadata=True)
93
  doc = Document(text=markdown_content)
94
  nodes = parser.get_nodes_from_documents([doc])
 
99
  if not content:
100
  continue
101
 
 
102
  title_match = re.match(r"^(#+)\s*(.*)", content)
103
  if title_match:
104
  title = title_match.group(2).strip()
 
105
  content_text = content[len(title_match.group(0)):].strip()
106
  else:
 
107
  first_line = content.split('\n')[0].strip()
108
  title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
109
  content_text = content
110
 
 
111
  if not title:
112
  title = f"[Chunk {i+1}]"
113
 
 
146
  stats['word_count'] = textstat.lexicon_count(text, removepunct=True)
147
  stats['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
148
  stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)
149
+ except (Exception, TypeError):
150
  stats.update({'word_count': 0, 'flesch_reading_ease': 0, 'flesch_kincaid_grade': 0})
151
  return stats
152
 
 
188
  chunk = self.get_chunk_by_id(chunk_id)
189
  if chunk:
190
  chunk["content"] = new_content
191
+ self._add_stats_to_chunk(chunk)
192
+ # Update title if it was a placeholder
193
+ if chunk["title"].startswith("["):
194
+ first_line = new_content.split('\n')[0].strip()
195
+ new_title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
196
+ if new_title:
197
+ chunk["title"] = new_title
198
 
199
  def delete_chunk(self, chunk_id: int):
200
  self._chunks = [c for c in self._chunks if c["id"] != chunk_id]
 
201
  for i, chunk in enumerate(self._chunks):
202
  chunk['id'] = i
203
 
204
  def get_final_markdown(self) -> str:
205
  if not self._chunks:
206
  return "No content to display."
 
207
  final_doc_parts = []
208
  for c in self._chunks:
209
+ # Check if title is a real header or just derived text
210
+ is_header = re.match(r"^(#+)\s*(.*)", c['title'])
211
+ if not c['title'].startswith("[") and not is_header:
212
+ final_doc_parts.append(f"## {c['title']}\n\n{c['content']}")
213
  else:
214
+ final_doc_parts.append(c['content'])
215
  return "\n\n---\n\n".join(final_doc_parts)
216
 
217
 
 
220
  self.target_grade_max = grade_max
221
  self.target_min_chunk_words = min_words
222
  self.target_max_chunk_words = max_words
 
223
  self.set_chunks(self.get_chunks())
224
 
225
  # --- Streamlit UI Application ---
226
 
227
  st.set_page_config(layout="wide", page_title="Webpage Content Editor")
228
 
 
229
  def init_session_state():
230
  if 'processor' not in st.session_state:
231
  st.session_state.processor = WebpageContentProcessor()
 
241
  processor = st.session_state.processor
242
  manager = st.session_state.manager
243
 
 
 
244
  st.title("✨ Webpage Content Editor")
245
  st.caption("A tool to fetch, chunk, and refine web content.")
246
 
 
249
  icon="ℹ️"
250
  )
251
 
 
252
  url_input = st.text_input("Enter a webpage URL to start", key="url_input")
253
  if st.button("Process URL", use_container_width=True, type="primary"):
254
  if url_input:
 
267
  else:
268
  st.session_state.status_message = "Could not extract any content chunks."
269
  st.session_state.selected_chunk_id = None
270
+ st.rerun()
271
 
 
272
  if st.session_state.status_message:
273
  st.toast(st.session_state.status_message)
274
+ st.session_state.status_message = ""
275
 
 
276
  tab1, tab2 = st.tabs(["Chunk Editor", "Settings & Overview"])
277
 
278
  with tab1:
 
281
  st.write("Process a URL to begin editing content chunks.")
282
  else:
283
  chunk_ids = [c['id'] for c in chunks]
 
284
  if st.session_state.selected_chunk_id not in chunk_ids:
285
  st.session_state.selected_chunk_id = chunk_ids[0] if chunk_ids else None
286
 
287
  if st.session_state.selected_chunk_id is not None:
288
  chunk_options = {c['id']: c['title'] for c in chunks}
289
 
 
290
  selected_id = st.selectbox(
291
  "Select a chunk to edit",
292
  options=chunk_ids,
 
294
  index=chunk_ids.index(st.session_state.selected_chunk_id)
295
  )
296
 
 
297
  if selected_id != st.session_state.selected_chunk_id:
298
  st.session_state.selected_chunk_id = selected_id
299
  st.rerun()
 
304
  st.markdown(f"**Editing: {selected_chunk['title']}**")
305
  st.markdown(manager.format_chunk_stats(selected_chunk['stats']), unsafe_allow_html=True)
306
 
 
307
  edited_content = st.text_area(
308
  "Chunk Content",
309
  value=selected_chunk['content'],
310
  height=350,
311
+ key=f"editor_{selected_chunk['id']}"
312
  )
313
 
 
314
  col1, col2, _ = st.columns([1, 1, 5])
315
  if col1.button("Update Chunk", use_container_width=True, key=f"update_{selected_chunk['id']}"):
316
  manager.update_chunk_content(selected_chunk['id'], edited_content)
 
320
  if col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"):
321
  manager.delete_chunk(selected_chunk['id'])
322
  st.session_state.status_message = "Chunk deleted."
 
323
  remaining_chunks = manager.get_chunks()
324
  st.session_state.selected_chunk_id = remaining_chunks[0]['id'] if remaining_chunks else None
325
  st.rerun()