Spaces:

Em4e
/

chunk-based-text-editor

Sleeping

App Files Community

Em4e committed on Jun 9, 2025

Commit

c063934

verified ·

1 Parent(s): fe63ffc

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -27

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ import requests
 from bs4 import BeautifulSoup
 from html_to_markdown import convert_to_markdown
 import re
-from llama_index.core.node_parser import SentenceSplitter
 from llama_index.core.schema import Document, MetadataMode
 import textstat # For readability metrics
@@ -82,38 +82,44 @@ class WebpageContentProcessor:
     def parse_markdown_into_chunks(self, markdown_content: str) -> list:
         """
-        Parses markdown content into chunks using a sentence splitter for more
-        reliable chunking than header-based splitting.
         """
         if not markdown_content or "Error" in markdown_content:
             return []
-        # Use SentenceSplitter for more reliable chunking based on size,
-        # rather than relying on Markdown headers which may not exist.
-        parser = SentenceSplitter(
-            chunk_size=2000,  # Characters per chunk
-            chunk_overlap=200 # Characters to overlap between chunks
-        )
-        doc = Document(text=markdown_content)
         nodes = parser.get_nodes_from_documents([doc])
         structured_chunks = []
         for i, node in enumerate(nodes):
-            content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
-            if not content:
                 continue
-            # Generate a title from the first non-empty line of the chunk
-            first_line = next((line for line in content.split('\n') if line.strip()), "")
-            title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
-            if not title:
-                title = f"Chunk {i+1}"
             structured_chunks.append({
-                "id": i,
-                "title": title,
-                "content": content
             })
         return structured_chunks
@@ -180,6 +186,12 @@ class ChunkManager:
         if chunk:
             chunk["content"] = new_content
             self._add_stats_to_chunk(chunk)
     def delete_chunk(self, chunk_id: int):
         self._chunks = [c for c in self._chunks if c["id"] != chunk_id]
@@ -221,8 +233,8 @@ init_session_state()
 processor = st.session_state.content_processor
 manager = st.session_state.chunk_manager
-st.title("✨ Chunk Webpage Content Editor")
-st.caption("Created by [Emilija Gjorgjevska](https://www.linkedin.com/in/emilijagjorgjevska/) | Inspired by Andrea Volpini's [work on content chunking.](https://www.linkedin.com/pulse/understanding-chunking-google-ai-mode-practical-content-volpini-zseaf/)")
 st.info(
     "**Note:** Some URLs may be blocked due to server policies (like bot detection). "
@@ -272,7 +284,6 @@ with tab1:
             chunk_options = {c['id']: f"Chunk {c['id']}: {c['title']}" for c in chunks}
             # The selectbox's state is now managed directly by st.session_state.selected_chunk_id
-            # When the user selects a new option, Streamlit automatically updates this state variable and reruns the script.
             selected_id = st.selectbox(
                 "Select a chunk to edit",
                 options=chunk_ids,
@@ -294,7 +305,7 @@ with tab1:
                     "Chunk Content",
                     value=selected_chunk['content'],
                     height=300,
-                    key=f"editor_{selected_chunk['id']}" # Unique key forces widget to re-render on selection change
                 )
                 col1, col2, _ = st.columns([1, 1, 4])
@@ -328,4 +339,4 @@ with tab2:
             st.rerun()
     st.subheader("Final Document")
-    st.text_area("Compiled Markdown", manager.get_final_markdown(), height=400, disabled=False, key="final_markdown")

 from bs4 import BeautifulSoup
 from html_to_markdown import convert_to_markdown
 import re
+from llama_index.core.node_parser import MarkdownNodeParser
 from llama_index.core.schema import Document, MetadataMode
 import textstat # For readability metrics
     def parse_markdown_into_chunks(self, markdown_content: str) -> list:
         """
+        Parses Markdown content into LlamaIndex nodes (chunks) and extracts title and content.
+        This version uses MarkdownNodeParser to leverage the document's structure.
         """
         if not markdown_content or "Error" in markdown_content:
             return []
+        doc = Document(text=markdown_content, metadata={"filename": "webpage_content"})
+        parser = MarkdownNodeParser(include_metadata=True)
         nodes = parser.get_nodes_from_documents([doc])
         structured_chunks = []
         for i, node in enumerate(nodes):
+            pure_text_content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
+            if not pure_text_content:
                 continue
+            heading_title = ""
+            content_text = pure_text_content
+            # Attempt to find a title from a markdown header
+            heading_match = re.match(r"^(#+)\s*(.*)", pure_text_content)
+            if heading_match:
+                heading_title = heading_match.group(2).strip()
+                # Remove the title from the content itself
+                content_text = pure_text_content[len(heading_match.group(0)):].strip()
+                if not heading_title:
+                    heading_title = "[Untitled Section]"
+            else:
+                # Fallback to using the first line as the title
+                first_line = content_text.split('\n')[0].strip()
+                heading_title = (first_line[:75] + "...") if len(first_line) > 75 else first_line
+                if not heading_title:
+                    heading_title = "[Empty Section]"
             structured_chunks.append({
+                "id": i,
+                "title": heading_title,
+                "content": content_text
             })
         return structured_chunks
         if chunk:
             chunk["content"] = new_content
             self._add_stats_to_chunk(chunk)
+            # Optionally update title if it's derived from content
+            if chunk["title"].startswith("[") or not re.match(r"^(#+)\s*(.*)", chunk["content"]):
+                 first_line = new_content.split('\n')[0].strip()
+                 chunk["title"] = (first_line[:75] + '...') if len(first_line) > 75 else first_line
+                 if not chunk["title"]: chunk["title"] = "[Empty Section]"
     def delete_chunk(self, chunk_id: int):
         self._chunks = [c for c in self._chunks if c["id"] != chunk_id]
 processor = st.session_state.content_processor
 manager = st.session_state.chunk_manager
+st.title("✨ Webpage Content Editor")
+st.caption("Created by [Emilija Gjorgjevska](https://www.linkedin.com/in/emilijagjorgjevska/) | Inspired by Andrea Volpini's work on content chunking.")
 st.info(
     "**Note:** Some URLs may be blocked due to server policies (like bot detection). "
             chunk_options = {c['id']: f"Chunk {c['id']}: {c['title']}" for c in chunks}
             # The selectbox's state is now managed directly by st.session_state.selected_chunk_id
             selected_id = st.selectbox(
                 "Select a chunk to edit",
                 options=chunk_ids,
                     "Chunk Content",
                     value=selected_chunk['content'],
                     height=300,
+                    key=f"editor_{selected_chunk['id']}" # Unique key forces widget to re-render
                 )
                 col1, col2, _ = st.columns([1, 1, 4])
             st.rerun()
     st.subheader("Final Document")
+    st.text_area("Compiled Markdown", manager.get_final_markdown(), height=400, disabled=False, key="final_markdown")