Spaces:

Em4e
/

chunk-based-text-editor

Sleeping

App Files Files Community

Em4e committed on Jun 9, 2025

Commit

07f83ac

verified ·

1 Parent(s): f27d6ee

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -31

app.py CHANGED Viewed

@@ -1,11 +1,11 @@
 import streamlit as st
 import requests
 from bs4 import BeautifulSoup
-from html_to_markdown import convert_to_markdown
 import re
 from llama_index.core.node_parser import MarkdownNodeParser
 from llama_index.core.schema import Document, MetadataMode
 import textstat
 # --- Core Logic Classes ---
@@ -15,12 +15,14 @@ class WebpageContentProcessor:
     This class is responsible for the entire content processing pipeline.
     """
     def __init__(self):
-        pass
     def fetch_and_convert_to_markdown(self, url: str) -> str:
         """
         Fetches HTML content, removes common boilerplate tags from the entire page,
-        and then converts the remaining body content to Markdown.
         """
         try:
             headers = {
@@ -42,15 +44,17 @@ class WebpageContentProcessor:
             if not content_container:
                 return "Error: Could not find the <body> of the webpage."
-            # Convert the cleaned HTML to Markdown
-            markdown_output = convert_to_markdown(str(content_container))
             # Post-processing to clean up the resulting Markdown
             markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output)
             markdown_output = re.sub(r'(\n\s*[\*\-]\s*\n)|(^\s*[\*\-]\s*$)', '\n', markdown_output, flags=re.MULTILINE)
-            return markdown_output.strip()
         except requests.exceptions.Timeout:
             return "Error: The request timed out. The server is taking too long to respond."
         except requests.exceptions.RequestException as e:
@@ -65,17 +69,14 @@ class WebpageContentProcessor:
         """
         if not markdown_content or "Error" in markdown_content:
             return []
         parser = MarkdownNodeParser(include_metadata=True)
         doc = Document(text=markdown_content)
         nodes = parser.get_nodes_from_documents([doc])
         structured_chunks = []
         for i, node in enumerate(nodes):
             content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
             if not content:
                 continue
             title_match = re.match(r"^(#+)\s*(.*)", content)
             if title_match:
                 title = title_match.group(2).strip()
@@ -84,10 +85,8 @@ class WebpageContentProcessor:
                 first_line = content.split('\n')[0].strip()
                 title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
                 content_text = content
             if not title:
                 title = f"[Chunk {i+1}]"
             structured_chunks.append({
                 "id": i,
                 "title": title,
@@ -132,7 +131,6 @@ class ChunkManager:
         flesch_color = "green" if stats.get('flesch_reading_ease', 0) >= self.target_flesch_min else "red"
         grade_color = "green" if stats.get('flesch_kincaid_grade', 0) <= self.target_grade_max else "red"
         word_color = "green" if self.target_min_chunk_words <= stats.get('word_count', 0) <= self.target_max_chunk_words else "red"
         return (
             f"**Word Count:** <span style='color:{word_color};'>{stats.get('word_count', 0)}</span> &nbsp;&nbsp;|&nbsp;&nbsp; "
             f"**Reading Ease:** <span style='color:{flesch_color};'>{stats.get('flesch_reading_ease', 0):.2f}</span> &nbsp;&nbsp;|&nbsp;&nbsp; "
@@ -143,14 +141,12 @@ class ChunkManager:
         """Calculates and formats stats for the entire document."""
         if not self._chunks:
             return "No document loaded."
         total_words = sum(c['stats']['word_count'] for c in self._chunks)
         if len(self._chunks) > 0:
             avg_ease = sum(c['stats']['flesch_reading_ease'] for c in self._chunks) / len(self._chunks)
             avg_grade = sum(c['stats']['flesch_kincaid_grade'] for c in self._chunks) / len(self._chunks)
         else:
             avg_ease = avg_grade = 0
         return (
             f"- **Total Chunks:** {len(self._chunks)}\n"
             f"- **Total Words:** {total_words}\n"
@@ -189,7 +185,6 @@ class ChunkManager:
                  final_doc_parts.append(c['content'])
         return "\n\n---\n\n".join(final_doc_parts)
     def set_targets(self, flesch_min: float, grade_max: float, min_words: int, max_words: int):
         self.target_flesch_min = flesch_min
         self.target_grade_max = grade_max
@@ -198,7 +193,6 @@ class ChunkManager:
         self.set_chunks(self.get_chunks())
 # --- Streamlit UI Application ---
 st.set_page_config(layout="wide", page_title="Webpage Content Editor")
 def init_session_state():
@@ -220,28 +214,22 @@ st.title("Chunk Webpage Content Editor")
 st.caption("A tool to fetch, chunk, and refine web content.")
 st.markdown(
     "Developed by [Emilija Gjorgjevska](https://www.linkedin.com/in/emilijagjorgjevska/). "
-    "Inspired by Andrea Volpini's [work on content chunking](https://www.linkedin.com/pulse/understanding-chunking-google-ai-mode-practical-content-volpini-zseaf/)"
-)
 st.info(
     """
     **How Layout-Based Chunking is Implemented Here**
     This app uses a sophisticated, two-step process to create meaningful chunks based on the document's visual and semantic structure:
     1.  **Structural Preservation (HTML → Markdown):**
         The code first converts the webpage's HTML into Markdown. This is a critical step because it translates structural tags (`<h1>`, `<p>`, `<ul>`) into their Markdown equivalents (`#`, paragraph breaks, `*`). This preserves the document's original layout and hierarchy.
     2.  **Layout-Aware Parsing (`MarkdownNodeParser`):**
         Next, it uses the `MarkdownNodeParser` from the LlamaIndex library. This specialized tool is designed to read the structured Markdown and split it at its logical boundaries—primarily the headers (`#`, `##`, etc.).
     The result is a set of context-aware chunks that respect the original document's sections, rather than being arbitrary splits.
     "**Note:** Some websites may block content scraping. This is an early version, so you might encounter bugs.",
     """,
-    icon="ℹ️"
-)
 url_input = st.text_input("Enter a webpage URL to start", key="url_input")
 if st.button("Process URL", use_container_width=True, type="primary"):
     if url_input:
         with st.spinner("Fetching and chunking content..."):
@@ -278,7 +266,7 @@ with tab1:
         if st.session_state.selected_chunk_id is not None:
             chunk_options = {c['id']: c['title'] for c in chunks}
             selected_id = st.selectbox(
                 "Select a chunk to edit",
                 options=chunk_ids,
@@ -295,7 +283,7 @@ with tab1:
             if selected_chunk:
                 st.markdown(f"**Editing: {selected_chunk['title']}**")
                 st.markdown(manager.format_chunk_stats(selected_chunk['stats']), unsafe_allow_html=True)
                 edited_content = st.text_area(
                     "Chunk Content",
                     value=selected_chunk['content'],
@@ -304,11 +292,12 @@ with tab1:
                 )
                 col1, col2, _ = st.columns([1, 1, 5])
                 if col1.button("Update Chunk", use_container_width=True, key=f"update_{selected_chunk['id']}"):
                     manager.update_chunk_content(selected_chunk['id'], edited_content)
                     st.session_state.status_message = "Chunk updated successfully!"
                     st.rerun()
                 if col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"):
                     manager.delete_chunk(selected_chunk['id'])
                     st.session_state.status_message = "Chunk deleted."
@@ -319,7 +308,7 @@ with tab1:
 with tab2:
     st.subheader("Document Overview")
     st.markdown(manager.get_document_summary_stats(), unsafe_allow_html=True)
     st.subheader("Content Targets")
     with st.form("targets_form"):
         st.write("Set readability targets to guide your editing. See color feedback in the editor.")

 import streamlit as st
 import requests
 from bs4 import BeautifulSoup
 import re
 from llama_index.core.node_parser import MarkdownNodeParser
 from llama_index.core.schema import Document, MetadataMode
 import textstat
+from markitdown import Markitdown # <-- MODIFIED: Import Markitdown
 # --- Core Logic Classes ---
     This class is responsible for the entire content processing pipeline.
     """
     def __init__(self):
+        # --- MODIFIED: Instantiate Markitdown converter ---
+        self.markdown_converter = Markitdown()
+        # -------------------------------------------------
     def fetch_and_convert_to_markdown(self, url: str) -> str:
         """
         Fetches HTML content, removes common boilerplate tags from the entire page,
+        and then converts the remaining body content to Markdown using Markitdown.
         """
         try:
             headers = {
             if not content_container:
                 return "Error: Could not find the <body> of the webpage."
+            # --- MODIFIED: Use Markitdown for conversion ---
+            # The .convert() method returns a result object; the converted Markdown is read from its .text attribute
+            conversion_result = self.markdown_converter.convert(str(content_container))
+            markdown_output = conversion_result.text
+            # -----------------------------------------------
             # Post-processing to clean up the resulting Markdown
             markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output)
             markdown_output = re.sub(r'(\n\s*[\*\-]\s*\n)|(^\s*[\*\-]\s*$)', '\n', markdown_output, flags=re.MULTILINE)
+            return markdown_output.strip()
         except requests.exceptions.Timeout:
             return "Error: The request timed out. The server is taking too long to respond."
         except requests.exceptions.RequestException as e:
         """
         if not markdown_content or "Error" in markdown_content:
             return []
         parser = MarkdownNodeParser(include_metadata=True)
         doc = Document(text=markdown_content)
         nodes = parser.get_nodes_from_documents([doc])
         structured_chunks = []
         for i, node in enumerate(nodes):
             content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
             if not content:
                 continue
             title_match = re.match(r"^(#+)\s*(.*)", content)
             if title_match:
                 title = title_match.group(2).strip()
                 first_line = content.split('\n')[0].strip()
                 title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
                 content_text = content
             if not title:
                 title = f"[Chunk {i+1}]"
             structured_chunks.append({
                 "id": i,
                 "title": title,
         flesch_color = "green" if stats.get('flesch_reading_ease', 0) >= self.target_flesch_min else "red"
         grade_color = "green" if stats.get('flesch_kincaid_grade', 0) <= self.target_grade_max else "red"
         word_color = "green" if self.target_min_chunk_words <= stats.get('word_count', 0) <= self.target_max_chunk_words else "red"
         return (
             f"**Word Count:** <span style='color:{word_color};'>{stats.get('word_count', 0)}</span> &nbsp;&nbsp;|&nbsp;&nbsp; "
             f"**Reading Ease:** <span style='color:{flesch_color};'>{stats.get('flesch_reading_ease', 0):.2f}</span> &nbsp;&nbsp;|&nbsp;&nbsp; "
         """Calculates and formats stats for the entire document."""
         if not self._chunks:
             return "No document loaded."
         total_words = sum(c['stats']['word_count'] for c in self._chunks)
         if len(self._chunks) > 0:
             avg_ease = sum(c['stats']['flesch_reading_ease'] for c in self._chunks) / len(self._chunks)
             avg_grade = sum(c['stats']['flesch_kincaid_grade'] for c in self._chunks) / len(self._chunks)
         else:
             avg_ease = avg_grade = 0
         return (
             f"- **Total Chunks:** {len(self._chunks)}\n"
             f"- **Total Words:** {total_words}\n"
                  final_doc_parts.append(c['content'])
         return "\n\n---\n\n".join(final_doc_parts)
     def set_targets(self, flesch_min: float, grade_max: float, min_words: int, max_words: int):
         self.target_flesch_min = flesch_min
         self.target_grade_max = grade_max
         self.set_chunks(self.get_chunks())
 # --- Streamlit UI Application ---
 st.set_page_config(layout="wide", page_title="Webpage Content Editor")
 def init_session_state():
 st.caption("A tool to fetch, chunk, and refine web content.")
 st.markdown(
     "Developed by [Emilija Gjorgjevska](https://www.linkedin.com/in/emilijagjorgjevska/). "
+    "Inspired by Andrea Volpini's [work on content chunking](https://www.linkedin.com/pulse/understanding-chunking-google-ai-mode-practical-content-volpini-zseaf/)")
 st.info(
     """
     **How Layout-Based Chunking is Implemented Here**
     This app uses a sophisticated, two-step process to create meaningful chunks based on the document's visual and semantic structure:
     1.  **Structural Preservation (HTML → Markdown):**
         The code first converts the webpage's HTML into Markdown. This is a critical step because it translates structural tags (`<h1>`, `<p>`, `<ul>`) into their Markdown equivalents (`#`, paragraph breaks, `*`). This preserves the document's original layout and hierarchy.
     2.  **Layout-Aware Parsing (`MarkdownNodeParser`):**
         Next, it uses the `MarkdownNodeParser` from the LlamaIndex library. This specialized tool is designed to read the structured Markdown and split it at its logical boundaries—primarily the headers (`#`, `##`, etc.).
     The result is a set of context-aware chunks that respect the original document's sections, rather than being arbitrary splits.
     "**Note:** Some websites may block content scraping. This is an early version, so you might encounter bugs.",
     """,
+    icon="ℹ️")
 url_input = st.text_input("Enter a webpage URL to start", key="url_input")
 if st.button("Process URL", use_container_width=True, type="primary"):
     if url_input:
         with st.spinner("Fetching and chunking content..."):
         if st.session_state.selected_chunk_id is not None:
             chunk_options = {c['id']: c['title'] for c in chunks}
             selected_id = st.selectbox(
                 "Select a chunk to edit",
                 options=chunk_ids,
             if selected_chunk:
                 st.markdown(f"**Editing: {selected_chunk['title']}**")
                 st.markdown(manager.format_chunk_stats(selected_chunk['stats']), unsafe_allow_html=True)
                 edited_content = st.text_area(
                     "Chunk Content",
                     value=selected_chunk['content'],
                 )
                 col1, col2, _ = st.columns([1, 1, 5])
                 if col1.button("Update Chunk", use_container_width=True, key=f"update_{selected_chunk['id']}"):
                     manager.update_chunk_content(selected_chunk['id'], edited_content)
                     st.session_state.status_message = "Chunk updated successfully!"
                     st.rerun()
                 if col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"):
                     manager.delete_chunk(selected_chunk['id'])
                     st.session_state.status_message = "Chunk deleted."
 with tab2:
     st.subheader("Document Overview")
     st.markdown(manager.get_document_summary_stats(), unsafe_allow_html=True)
     st.subheader("Content Targets")
     with st.form("targets_form"):
         st.write("Set readability targets to guide your editing. See color feedback in the editor.")