Spaces:

Em4e
/

chunk-based-text-editor

Sleeping

App Files Files Community

Em4e committed on Jun 10, 2025

Commit

0915f87

verified ·

1 Parent(s): 37f325d

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -23

app.py CHANGED Viewed

@@ -5,10 +5,9 @@ import re
 from llama_index.core.node_parser import MarkdownNodeParser
 from llama_index.core.schema import Document, MetadataMode
 import textstat
-from markdownify import markdownify as md
 # --- Core Logic Classes ---
 class WebpageContentProcessor:
     """
     Handles fetching, converting, and parsing webpage content into structured chunks.
@@ -30,27 +29,22 @@ class WebpageContentProcessor:
             response.raise_for_status()
             html_content = response.text
             soup = BeautifulSoup(html_content, 'html.parser')
             # Remove common boilerplate and non-content tags from the entire document
             tags_to_remove = ['nav', 'header', 'footer', 'aside', 'script', 'style', 'noscript', 'form']
             for tag_name in tags_to_remove:
                 for element in soup.find_all(tag_name):
                     element.decompose()
             # Process the entire remaining body
             content_container = soup.find('body')
             if not content_container:
                 return "Error: Could not find the <body> of the webpage."
             # --- MODIFIED: Switched to markdownify for conversion ---
             # markdownify is a simple function call.
             markdown_output = md(str(content_container))
             # -----------------------------------------------
             # Post-processing to clean up the resulting Markdown
             markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output)
             markdown_output = re.sub(r'(\n\s*[\*\-]\s*\n)|(^\s*[\*\-]\s*$)', '\n', markdown_output, flags=re.MULTILINE)
             return markdown_output.strip()
         except requests.exceptions.Timeout:
             return "Error: The request timed out. The server is taking too long to respond."
@@ -202,6 +196,9 @@ def init_session_state():
         st.session_state.selected_chunk_id = None
     if 'status_message' not in st.session_state:
         st.session_state.status_message = ""
 init_session_state()
@@ -213,25 +210,24 @@ st.caption("A tool to fetch, chunk, and refine web content.")
 st.markdown(
     "Developed by [Emilija Gjorgjevska](https://www.linkedin.com/in/emilijagjorgjevska/). "
     "Inspired by Andrea Volpini's [work on content chunking](https://www.linkedin.com/pulse/understanding-chunking-google-ai-mode-practical-content-volpini-zseaf/)")
 with st.expander("ℹ️ App Information & Chunking Details", expanded=False):
     st.info(
         """
-        • **App version:** v0.0 (alpha) — this is the very first public release, so you may run into bugs or incomplete features.
-        • **Server policy warning:** this app relies on automated requests (“bots”) under the hood.
-          If the target server enforces a restrictive bot policy (e.g., rate-limits requests, blocks unknown user-agents or IP addresses), parts of the app **may not work** as expected.
-        **What to do if you hit an issue:**
-        1. Check the server’s logs or policy settings to ensure it allows automated clients.
         2. Keep an eye out for updates — v0.x → v1.0 is coming soon!
         ---
-        **How Layout-Based Chunking is Implemented Here**
-        This app uses a sophisticated, two-step process to create meaningful chunks based on the document’s visual and semantic structure:
-        1. **Structural Preservation (HTML → Markdown):**
-           Converts the webpage’s HTML into Markdown, translating tags (`<h1>`, `<p>`, `<ul>`) into their Markdown equivalents (`#`, paragraph breaks, `*`) to preserve layout and hierarchy.
-        2. **Layout-Aware Parsing (`MarkdownNodeParser`):**
-           Uses LlamaIndex’s `MarkdownNodeParser` to read the structured Markdown and split it at logical boundaries (headers like `#`, `##`, etc.), yielding context-aware chunks that respect original sections.
-        """
     , icon="ℹ️")
 url_input = st.text_input("Enter a webpage URL to start", key="url_input")
@@ -253,6 +249,7 @@ if st.button("Process URL", use_container_width=True, type="primary"):
                 else:
                     st.session_state.status_message = "Could not extract any content chunks."
                     st.session_state.selected_chunk_id = None
             st.rerun()
 if st.session_state.status_message:
@@ -282,6 +279,7 @@ with tab1:
             if selected_id != st.session_state.selected_chunk_id:
                 st.session_state.selected_chunk_id = selected_id
                 st.rerun()
             selected_chunk = manager.get_chunk_by_id(st.session_state.selected_chunk_id)
@@ -297,11 +295,13 @@ with tab1:
                     key=f"editor_{selected_chunk['id']}"
                 )
-                col1, col2, _ = st.columns([1, 1, 5])
                 if col1.button("Update Chunk", use_container_width=True, key=f"update_{selected_chunk['id']}"):
                     manager.update_chunk_content(selected_chunk['id'], edited_content)
                     st.session_state.status_message = "Chunk updated successfully!"
                     st.rerun()
                 if col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"):
@@ -309,8 +309,26 @@ with tab1:
                     st.session_state.status_message = "Chunk deleted."
                     remaining_chunks = manager.get_chunks()
                     st.session_state.selected_chunk_id = remaining_chunks[0]['id'] if remaining_chunks else None
                     st.rerun()
 with tab2:
     st.subheader("Document Overview")
     st.markdown(manager.get_document_summary_stats(), unsafe_allow_html=True)
@@ -330,4 +348,4 @@ with tab2:
             st.rerun()
     st.subheader("Final Compiled Document")
-    st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")

 from llama_index.core.node_parser import MarkdownNodeParser
 from llama_index.core.schema import Document, MetadataMode
 import textstat
+from markdownify import markdownify as md
 # --- Core Logic Classes ---
 class WebpageContentProcessor:
     """
     Handles fetching, converting, and parsing webpage content into structured chunks.
             response.raise_for_status()
             html_content = response.text
             soup = BeautifulSoup(html_content, 'html.parser')
             # Remove common boilerplate and non-content tags from the entire document
             tags_to_remove = ['nav', 'header', 'footer', 'aside', 'script', 'style', 'noscript', 'form']
             for tag_name in tags_to_remove:
                 for element in soup.find_all(tag_name):
                     element.decompose()
             # Process the entire remaining body
             content_container = soup.find('body')
             if not content_container:
                 return "Error: Could not find the <body> of the webpage."
             # --- MODIFIED: Switched to markdownify for conversion ---
             # markdownify is a simple function call.
             markdown_output = md(str(content_container))
             # -----------------------------------------------
             # Post-processing to clean up the resulting Markdown
             markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output)
             markdown_output = re.sub(r'(\n\s*[\*\-]\s*\n)|(^\s*[\*\-]\s*$)', '\n', markdown_output, flags=re.MULTILINE)
             return markdown_output.strip()
         except requests.exceptions.Timeout:
             return "Error: The request timed out. The server is taking too long to respond."
         st.session_state.selected_chunk_id = None
     if 'status_message' not in st.session_state:
         st.session_state.status_message = ""
+    # NEW: State for toggling the content preview
+    if 'show_preview' not in st.session_state:
+        st.session_state.show_preview = False
 init_session_state()
 st.markdown(
     "Developed by [Emilija Gjorgjevska](https://www.linkedin.com/in/emilijagjorgjevska/). "
     "Inspired by Andrea Volpini's [work on content chunking](https://www.linkedin.com/pulse/understanding-chunking-google-ai-mode-practical-content-volpini-zseaf/)")
 with st.expander("ℹ️ App Information & Chunking Details", expanded=False):
     st.info(
         """
+        • **App version:** v0.0 (alpha) — this is the very first public release, so you may run into bugs or incomplete features.
+        • **Server policy warning:** this app relies on automated requests (“bots”) under the hood.
+          If the target server enforces a restrictive bot policy (e.g., rate-limits requests, blocks unknown user-agents or IP addresses), parts of the app **may not work** as expected.
+        **What to do if you hit an issue:**
+        1. Check the server’s logs or policy settings to ensure it allows automated clients.
         2. Keep an eye out for updates — v0.x → v1.0 is coming soon!
         ---
+        **How Layout-Based Chunking is Implemented Here**
+          This app uses a sophisticated, two-step process to create meaningful chunks based on the document’s visual and semantic structure:
+          1. **Structural Preservation (HTML → Markdown):**
+             Converts the webpage’s HTML into Markdown, translating tags (`<h1>`, `<p>`, `<ul>`) into their Markdown equivalents (`#`, paragraph breaks, `*`) to preserve layout and hierarchy.
+          2. **Layout-Aware Parsing (`MarkdownNodeParser`):**
+             Uses LlamaIndex’s `MarkdownNodeParser` to read the structured Markdown and split it at logical boundaries (headers like `#`, `##`, etc.), yielding context-aware chunks that respect original sections.
+          """
     , icon="ℹ️")
 url_input = st.text_input("Enter a webpage URL to start", key="url_input")
                 else:
                     st.session_state.status_message = "Could not extract any content chunks."
                     st.session_state.selected_chunk_id = None
+            st.session_state.show_preview = False # Ensure preview is off when processing new URL
             st.rerun()
 if st.session_state.status_message:
             if selected_id != st.session_state.selected_chunk_id:
                 st.session_state.selected_chunk_id = selected_id
+                st.session_state.show_preview = False # NEW: Reset preview when changing chunk
                 st.rerun()
             selected_chunk = manager.get_chunk_by_id(st.session_state.selected_chunk_id)
                     key=f"editor_{selected_chunk['id']}"
                 )
+                # MODIFIED: Added a third column for the Preview button
+                col1, col2, col3, _ = st.columns([1, 1, 1, 4])
                 if col1.button("Update Chunk", use_container_width=True, key=f"update_{selected_chunk['id']}"):
                     manager.update_chunk_content(selected_chunk['id'], edited_content)
                     st.session_state.status_message = "Chunk updated successfully!"
+                    st.session_state.show_preview = False # Hide preview after update
                     st.rerun()
                 if col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"):
                     st.session_state.status_message = "Chunk deleted."
                     remaining_chunks = manager.get_chunks()
                     st.session_state.selected_chunk_id = remaining_chunks[0]['id'] if remaining_chunks else None
+                    st.session_state.show_preview = False # Hide preview after delete
                     st.rerun()
+                # NEW: Preview button in the third column
+                with col3:
+                    if st.button("Preview Content", use_container_width=True, key=f"preview_{selected_chunk['id']}"):
+                        # Toggle the preview state
+                        st.session_state.show_preview = not st.session_state.show_preview
+                        st.rerun()
+                # NEW: Conditional container to show the rendered Markdown
+                if st.session_state.show_preview:
+                    st.markdown("---")
+                    with st.container(border=True):
+                        st.markdown("**Rendered Preview** (showing current editor content)")
+                        # Renders the content directly from the text_area above
+                        st.markdown(edited_content, unsafe_allow_html=True)
+                    st.markdown("---")
 with tab2:
     st.subheader("Document Overview")
     st.markdown(manager.get_document_summary_stats(), unsafe_allow_html=True)
             st.rerun()
     st.subheader("Final Compiled Document")
+    st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")