Spaces:

Em4e
/

chunk-based-text-editor

Sleeping

App Files Files Community

Em4e committed on Jun 9, 2025

Commit

4d98418

verified ·

1 Parent(s): fc54d8b

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -6

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ import requests
 from bs4 import BeautifulSoup
 from html_to_markdown import convert_to_markdown
 import re
-from llama_index.core.node_parser import MarkdownNodeParser
 from llama_index.core.schema import Document, MetadataMode
 import textstat # For readability metrics
@@ -81,19 +81,41 @@ class WebpageContentProcessor:
             return f"An unexpected error occurred: {e}"
     def parse_markdown_into_chunks(self, markdown_content: str) -> list:
         if not markdown_content or "Error" in markdown_content:
             return []
         doc = Document(text=markdown_content)
-        parser = MarkdownNodeParser(include_metadata=True)
         nodes = parser.get_nodes_from_documents([doc])
         structured_chunks = []
         for i, node in enumerate(nodes):
             content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
             if not content:
                 continue
-            title_match = re.match(r"^(#+)\s*(.*)", content)
-            title = title_match.group(2).strip() if title_match and title_match.group(2).strip() else (content.split('\n')[0][:70] + "...")
-            structured_chunks.append({"id": i, "title": title, "content": content})
         return structured_chunks
 class ChunkManager:
@@ -306,4 +328,4 @@ with tab2:
             st.rerun()
     st.subheader("Final Document")
-    st.text_area("Compiled Markdown", manager.get_final_markdown(), height=400, disabled=False, key="final_markdown")

 from bs4 import BeautifulSoup
 from html_to_markdown import convert_to_markdown
 import re
+from llama_index.core.node_parser import SentenceSplitter
 from llama_index.core.schema import Document, MetadataMode
 import textstat # For readability metrics
             return f"An unexpected error occurred: {e}"
     def parse_markdown_into_chunks(self, markdown_content: str) -> list:
+        """
+        Parses markdown content into chunks using a sentence splitter for more
+        reliable chunking than header-based splitting.
+        """
         if not markdown_content or "Error" in markdown_content:
             return []
+        # Use SentenceSplitter for more reliable chunking based on size,
+        # rather than relying on Markdown headers which may not exist.
+        parser = SentenceSplitter(
+            chunk_size=2000,  # Maximum tokens per chunk (SentenceSplitter measures size in tokens, not characters)
+            chunk_overlap=200 # Tokens shared between consecutive chunks
+        )
         doc = Document(text=markdown_content)
         nodes = parser.get_nodes_from_documents([doc])
         structured_chunks = []
         for i, node in enumerate(nodes):
             content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
             if not content:
                 continue
+            # Generate a title from the first non-empty line of the chunk
+            first_line = next((line for line in content.split('\n') if line.strip()), "")
+            title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
+            if not title:
+                title = f"Chunk {i+1}"
+            structured_chunks.append({
+                "id": i,
+                "title": title,
+                "content": content
+            })
         return structured_chunks
 class ChunkManager:
             st.rerun()
     st.subheader("Final Document")
+    st.text_area("Compiled Markdown", manager.get_final_markdown(), height=400, disabled=False, key="final_markdown")