Em4e committed on
Commit
c7506fd
·
verified ·
1 Parent(s): 581abcd

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +297 -542
app.py CHANGED
@@ -1,542 +1,297 @@
1
- import streamlit as st
2
- import requests
3
- from bs4 import BeautifulSoup
4
- from html_to_markdown import convert_to_markdown
5
- import re
6
- from llama_index.core.node_parser import MarkdownNodeParser
7
- from llama_index.core.schema import Document, MetadataMode
8
- import textstat # For readability metrics
9
-
10
class WebpageContentProcessor:
    """
    Handles fetching, converting, and parsing webpage content into structured chunks.
    Adheres to the Single Responsibility Principle (SRP) for content processing.

    All failure paths of `fetch_and_convert_to_markdown` return a string that
    starts with "Error" so `parse_markdown_into_chunks` can detect them reliably.
    """

    def fetch_and_convert_to_markdown(self, url: str) -> str:
        """
        Fetches HTML content from a given URL, attempts to isolate the main content,
        removes common boilerplate, and converts to Markdown.

        Prioritizes semantic content tags over H1-based identification for robust
        extraction. Returns the cleaned Markdown, or an "Error..."-prefixed message.
        """
        try:
            # Many servers reject the default `python-requests` User-Agent with
            # 403; present a browser-like agent instead.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            response = requests.get(url, headers=headers, timeout=10)  # timeout for robustness
            response.raise_for_status()  # Raise HTTPError for 4xx/5xx responses
            soup = BeautifulSoup(response.text, 'html.parser')

            # Aggressive initial removal of tags that are never content.
            for tag_name in ['script', 'style', 'noscript', 'meta', 'link']:
                for element in soup.find_all(tag_name):
                    element.decompose()

            # Prioritize semantic main-content containers.
            content_for_conversion = soup.find('article') or soup.find('main') or \
                soup.find('div', class_='main-content') or \
                soup.find('div', {'role': 'main'})

            # Fallback logic if no main content container was found.
            if not content_for_conversion:
                first_h1 = soup.find('h1')
                if first_h1:
                    candidate_container = first_h1.parent
                    found_main_wrapper_via_h1_parent = False
                    content_classes = ['content', 'post-body', 'article-content',
                                       'entry-content', 'main-content']
                    # Check up to 5 parent levels for a suitable content wrapper.
                    for _ in range(5):
                        if candidate_container is None:
                            break
                        # BUGFIX: the original condition was `A and B or C`, which
                        # Python groups as `(A and B) or C`, accepting ANY element
                        # with role="main" regardless of its tag. The intended
                        # grouping is `A and (B or C)`.
                        if candidate_container.name in ['article', 'main', 'section', 'div'] and (
                            any(cls in candidate_container.get('class', []) for cls in content_classes)
                            or candidate_container.get('role') == 'main'
                        ):
                            content_for_conversion = candidate_container
                            found_main_wrapper_via_h1_parent = True
                            break
                        candidate_container = candidate_container.parent

                    # If no clear wrapper found, take H1 and its direct siblings.
                    if not found_main_wrapper_via_h1_parent:
                        # BUGFIX: snapshot the siblings BEFORE moving first_h1.
                        # `temp_soup.append(first_h1)` re-parents the element and
                        # severs its `next_sibling`, so the original walk over
                        # `first_h1.next_sibling` copied nothing.
                        siblings = list(first_h1.next_siblings)
                        temp_soup = BeautifulSoup('', 'html.parser')
                        temp_soup.append(first_h1)
                        for sibling in siblings:
                            temp_soup.append(sibling)
                        content_for_conversion = temp_soup
                else:
                    # Ultimate fallback: use the entire body.
                    content_for_conversion = soup.body

            if not content_for_conversion:
                return "Error: Could not identify main content for conversion."

            # Selective boilerplate removal within the identified content.
            unwanted_elements_in_content = [
                'nav', 'header', 'footer', 'aside', 'iframe', 'form', 'button', 'input',
                'textarea', 'svg', 'canvas', 'audio', 'video', 'picture', 'source', 'track',
                'map', 'area', 'embed', 'object', 'param', 'applet', 'bgsound', 'frame',
                'frameset', 'noframes', 'template', 'slot', 'portal', 'datalist', 'keygen',
                'output', 'progress', 'meter', 'details', 'summary', 'dialog', 'menu',
                'menuitem', 'command', 'hr', 'figure', 'figcaption', 'cite',
                '.social-share', '.comments', '.related-posts', '.pagination',
                '.breadcrumbs', '.pop-up', '.modal', '.overlay', '.cookie-consent',
                '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
                '[role="complementary"]', '[role="search"]', '[role="menubar"]', '[role="toolbar"]',
                '[class*="utility"]', '[class*="global-nav"]', '[class*="skip"]', '[class*="toast"]',
                '[class*="announcement"]', '[class*="fixed-bottom"]', '[class*="fixed-top"]',
                '[id*="promo"]', '[id*="ad"]', '[id*="banner"]', '[id*="popup"]', '[id*="modal"]',
                '[id*="overlay"]', '[id*="cookie"]', '[id*="skip"]', '[id*="navbar"]', '[id*="menu"]',
                '.hidden', '.visually-hidden',
                '.no-print', '.print-hide',
                '.wp-block-navigation', '.wp-block-group.is-style-stripes',
                '[class*="column"]', '[class*="grid"]'
            ]
            for selector in unwanted_elements_in_content:
                # Bare tag names go through find_all; everything else is a CSS selector.
                if re.match(r'^[a-zA-Z0-9]+$', selector):
                    for element in content_for_conversion.find_all(selector):
                        element.decompose()
                else:
                    for element in content_for_conversion.select(selector):
                        element.decompose()

            markdown_output = convert_to_markdown(str(content_for_conversion))

            # Post-processing: collapse excess blank lines, drop empty list
            # bullets and runs of asterisks left over from the conversion.
            markdown_output = re.sub(r'\n\s*\n\s*\n+', '\n\n', markdown_output)
            markdown_output = re.sub(r'^\s*[\*\-]\s*$', '', markdown_output, flags=re.MULTILINE)
            markdown_output = re.sub(r'\*{3,}', '', markdown_output)
            return markdown_output.strip()

        except requests.exceptions.Timeout:
            return "Error: Request timed out. The server took too long to respond."
        except requests.exceptions.RequestException as e:
            return f"Error fetching URL: {e}. Please check the URL or your internet connection."
        except Exception as e:
            # BUGFIX: prefixed with "Error:" so downstream error detection works.
            return f"Error: An unexpected error occurred during HTML conversion: {e}"

    def parse_markdown_into_chunks(self, markdown_content: str) -> list:
        """
        Parses Markdown content into LlamaIndex nodes (chunks) and extracts a
        title and body for each. Returns a list of dicts with keys
        `id`, `title`, `content`, `original_node`.
        """
        # BUGFIX: the original only checked two specific error substrings and
        # missed "Error: Request timed out" and the conversion-error message.
        # All fetch failures now share the "Error" prefix.
        if not markdown_content or markdown_content.startswith("Error"):
            return []

        doc = Document(text=markdown_content, metadata={"filename": "webpage_content"})
        parser = MarkdownNodeParser(include_metadata=True)
        nodes = parser.get_nodes_from_documents([doc])
        print(f" Parsed {len(nodes)} nodes from Markdown.")  # Debug print

        structured_chunks = []
        current_id = 0
        for node in nodes:
            pure_text_content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
            heading_title = ""
            content_text = pure_text_content

            # A chunk starting with "#..." supplies its own title; otherwise
            # derive one from the first line (truncated to 70 chars).
            heading_match = re.match(r"^(#+)\s*(.*)", pure_text_content)
            if heading_match:
                heading_title = heading_match.group(2).strip()
                content_text = pure_text_content[len(heading_match.group(0)):].strip()
                if not heading_title:
                    heading_title = "[Untitled Section]"
            else:
                first_line = content_text.split('\n')[0].strip()
                heading_title = first_line[:70].strip() + "..." if len(first_line) > 70 else first_line
                if not heading_title:
                    heading_title = "[Empty Section]"
                elif not content_text:
                    heading_title = "[Empty Section]"

            structured_chunks.append({
                "id": current_id,
                "title": heading_title,
                "content": content_text,
                "original_node": node  # Keep reference to the original LlamaIndex node
            })
            current_id += 1

        return structured_chunks
166
-
167
- class ChunkManager:
168
- """
169
- Manages the collection of content chunks, their statistics, and target settings.
170
- Adheres to SRP for chunk data management and OCP by allowing new statistics
171
- or formatting without changing core chunk operations.
172
- """
173
- def __init__(self):
174
- self._chunks = []
175
- self.target_flesch_min = 60
176
- self.target_grade_max = 8
177
- self.target_min_chunk_words = 50
178
- self.target_max_chunk_words = 500
179
-
180
- def set_chunks(self, chunks: list):
181
- """Sets the internal list of chunks and calculates their initial statistics."""
182
- self._chunks = []
183
- for chunk in chunks:
184
- chunk['stats'] = self._calculate_chunk_stats(chunk['content'])
185
- self._chunks.append(chunk)
186
-
187
- def get_chunks(self) -> list:
188
- """Returns the current list of processed chunks."""
189
- return self._chunks
190
-
191
- def _calculate_chunk_stats(self, text: str) -> dict:
192
- """
193
- Calculates various linguistic statistics for a given text chunk.
194
- (Private helper method, SRP for stats calculation)
195
- """
196
- stats = {}
197
- cleaned_text = re.sub(r'#+\s*', '', text)
198
- cleaned_text = re.sub(r'[\*\-]\s*', '', cleaned_text)
199
- cleaned_text = re.sub(r'\n\s*\n+', ' ', cleaned_text).strip()
200
-
201
- stats['word_count'] = textstat.lexicon_count(cleaned_text, removepunct=True)
202
- stats['char_count'] = len(cleaned_text)
203
- stats['sentence_count'] = textstat.sentence_count(cleaned_text)
204
-
205
- if stats['sentence_count'] > 0:
206
- stats['avg_sentence_length'] = stats['word_count'] / stats['sentence_count']
207
- else:
208
- stats['avg_sentence_length'] = 0
209
-
210
- stats['paragraph_count'] = cleaned_text.count('\n\n') + 1 if cleaned_text else 0
211
-
212
- try:
213
- stats['flesch_reading_ease'] = textstat.flesch_reading_ease(cleaned_text)
214
- except Exception:
215
- stats['flesch_reading_ease'] = 0
216
-
217
- try:
218
- stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(cleaned_text)
219
- except Exception:
220
- stats['flesch_kincaid_grade'] = 0
221
-
222
- try:
223
- stats['gunning_fog_score'] = textstat.gunning_fog(cleaned_text)
224
- except Exception:
225
- stats['gunning_fog_score'] = 0
226
- return stats
227
-
228
- def format_chunk_stats(self, stats: dict) -> str:
229
- """
230
- Formats chunk statistics into a readable string, including explanations for readability scores.
231
- Adheres to SRP for formatting.
232
- """
233
- flesch_ease_color = "red" if stats['flesch_reading_ease'] < self.target_flesch_min else "green"
234
- kincaid_grade_color = "red" if stats['flesch_kincaid_grade'] > self.target_grade_max else "green"
235
- word_count_color = "red" if not (self.target_min_chunk_words <= stats['word_count'] <= self.target_max_chunk_words) else "green"
236
-
237
- stats_str = "#### Chunk Statistics:\n"
238
- stats_str += f"- **Word Count:** <span style='color:{word_count_color}'>{stats['word_count']}</span> (Target: {self.target_min_chunk_words}-{self.target_max_chunk_words})\n"
239
- stats_str += f"- **Character Count:** {stats['char_count']}\n"
240
- stats_str += f"- **Sentence Count:** {stats['sentence_count']}\n"
241
- stats_str += f"- **Avg Sentence Length:** {stats['avg_sentence_length']:.2f} words\n"
242
- stats_str += f"- **Paragraph Count:** {stats['paragraph_count']}\n"
243
- stats_str += f"- **Flesch Reading Ease:** <span style='color:{flesch_ease_color}'>{stats['flesch_reading_ease']:.2f}</span> (Higher scores mean easier to read.)\n"
244
- stats_str += f"- **Flesch-Kincaid Grade:** <span style='color:{kincaid_grade_color}'>{stats['flesch_kincaid_grade']:.2f}</span> (Indicates the U.S. grade level needed to understand the text.)\n"
245
- stats_str += f"- **Gunning Fog Score:** {stats['gunning_fog_score']:.2f}\n"
246
- return stats_str
247
-
248
- def get_document_summary_stats(self) -> str:
249
- """
250
- Aggregates statistics for the entire document across all managed chunks.
251
- Adheres to SRP for document-level summary.
252
- """
253
- if not self._chunks:
254
- return "No document loaded to generate statistics."
255
-
256
- total_words = 0
257
- total_chars = 0
258
- total_sentences = 0
259
- total_paragraphs = 0
260
-
261
- all_content_text = ""
262
- for chunk in self._chunks:
263
- content_text_for_stats = chunk['content']
264
- # Re-calculate stats for each chunk content to ensure summary is up-to-date
265
- current_chunk_stats = self._calculate_chunk_stats(content_text_for_stats)
266
- total_words += current_chunk_stats['word_count']
267
- total_chars += current_chunk_stats['char_count']
268
- total_sentences += current_chunk_stats['sentence_count']
269
- total_paragraphs += current_chunk_stats['paragraph_count']
270
- all_content_text += content_text_for_stats + "\n\n"
271
-
272
- doc_stats_str = "## Overall Document Statistics:\n"
273
- doc_stats_str += f"- **Total Chunks:** {len(self._chunks)}\n"
274
- doc_stats_str += f"- **Total Words:** {total_words}\n"
275
- doc_stats_str += f"- **Total Characters:** {total_chars}\n"
276
- doc_stats_str += f"- **Total Sentences:** {total_sentences}\n"
277
- doc_stats_str += f"- **Total Paragraphs:** {total_paragraphs}\n"
278
-
279
- if len(self._chunks) > 0:
280
- doc_stats_str += f"- **Average Words per Chunk:** {total_words / len(self._chunks):.2f}\n"
281
-
282
- if all_content_text.strip():
283
- overall_stats = self._calculate_chunk_stats(all_content_text)
284
- doc_stats_str += f"- **Overall Flesch Reading Ease:** {overall_stats['flesch_reading_ease']:.2f}\n"
285
- doc_stats_str += f"- **Overall Flesch-Kincaid Grade Level:** {overall_stats['flesch_kincaid_grade']:.2f}\n"
286
- doc_stats_str += f"- **Overall Gunning Fog Score:** {overall_stats['gunning_fog_score']:.2f}\n"
287
- doc_stats_str += f"- **Overall Average Sentence Length:** {overall_stats['avg_sentence_length']:.2f} words\n"
288
- else:
289
- doc_stats_str += "- No content available for overall readability metrics.\n"
290
- return doc_stats_str
291
-
292
- def get_chunk_by_id(self, chunk_id: int) -> dict | None:
293
- """Retrieves a chunk by its ID."""
294
- return next((chunk for chunk in self._chunks if chunk["id"] == chunk_id), None)
295
-
296
- def get_chunk_titles_for_dropdown(self) -> list:
297
- """Generates dropdown choices using plain text (no HTML)."""
298
- dropdown_choices = []
299
- for chunk in self._chunks:
300
- title = chunk['title']
301
- dropdown_choices.append(f"{chunk['id']}: {title}")
302
- return dropdown_choices
303
-
304
- def update_chunk_content(self, chunk_id: int, new_content: str) -> bool:
305
- """
306
- Updates the content of a chunk, recalculates its stats, and updates its title if needed.
307
- Returns True if successful, False otherwise.
308
- """
309
- for chunk in self._chunks:
310
- if chunk["id"] == chunk_id:
311
- chunk["content"] = new_content
312
- chunk["stats"] = self._calculate_chunk_stats(new_content)
313
- # Update chunk title if it was a placeholder or empty
314
- if chunk["title"].startswith("[") and chunk["title"].endswith("]") or not chunk["title"]:
315
- first_line = new_content.split('\n')[0].strip()
316
- chunk["title"] = first_line[:70].strip() + "..." if len(first_line) > 70 else first_line
317
- if not chunk["title"]:
318
- chunk["title"] = "[Empty Section]"
319
- elif not new_content:
320
- chunk["title"] = "[Empty Section]"
321
- return True
322
- return False
323
-
324
- def delete_chunk(self, chunk_id: int) -> bool:
325
- """
326
- Deletes a chunk by ID and re-indexes remaining chunks.
327
- Returns True if successful, False otherwise.
328
- """
329
- initial_chunk_count = len(self._chunks)
330
- self._chunks = [chunk for chunk in self._chunks if chunk["id"] != chunk_id]
331
- if len(self._chunks) == initial_chunk_count:
332
- return False # Chunk not found
333
- # Re-index IDs to be sequential again
334
- for i, chunk in enumerate(self._chunks):
335
- chunk['id'] = i
336
- return True
337
-
338
- def get_final_markdown(self) -> str:
339
- """Compiles all current chunks into a single Markdown string."""
340
- final_md = ""
341
- if not self._chunks:
342
- return "No content to compile. Please process a URL first."
343
- for chunk in self._chunks:
344
- # Use H1 heading if title is meaningful
345
- if not chunk["title"].startswith("[") and chunk["title"]:
346
- final_md += f"# {chunk['title']}\n\n"
347
- final_md += f"{chunk['content']}\n\n"
348
- return final_md.strip()
349
-
350
- def set_targets(self, flesch_min: float, grade_max: float, min_words: int, max_words: int):
351
- """Sets the global readability and word count targets."""
352
- self.target_flesch_min = flesch_min
353
- self.target_grade_max = grade_max
354
- self.target_min_chunk_words = min_words
355
- self.target_max_chunk_words = max_words
356
- # Recalculate stats for all chunks to reflect new targets in color coding (if displayed)
357
- for chunk in self._chunks:
358
- chunk['stats'] = self._calculate_chunk_stats(chunk['content'])
359
-
360
# --- Streamlit UI Definition ---
st.set_page_config(layout="wide", page_title="Chunk-Powered Webpage Editor")

# Initialize session state so processors and UI selections survive reruns.
if 'chunk_manager' not in st.session_state:
    st.session_state.chunk_manager = ChunkManager()
if 'content_processor' not in st.session_state:
    st.session_state.content_processor = WebpageContentProcessor()
if 'status_message' not in st.session_state:
    st.session_state.status_message = ""
if 'chunk_selector' not in st.session_state:
    st.session_state.chunk_selector = None
if 'chunk_content_editor' not in st.session_state:
    st.session_state.chunk_content_editor = ""
if 'final_markdown' not in st.session_state:
    st.session_state.final_markdown = "Click 'Compile All Chunks' to see the final document with your edits."

# Instantiate the managers (shared, session-scoped instances).
content_processor = st.session_state.content_processor
chunk_manager = st.session_state.chunk_manager

st.markdown("# <center>✨ Chunk-Powered Webpage Editor ✨</center>", unsafe_allow_html=True)
st.info(
    "ℹ️ **Please Note:**\n\n"
    "- Some URLs may be inaccessible due to restrictive server policies (e.g., firewalls or bot detection).\n"
    "- This is an early version of the app, and you may encounter some bugs."
)
st.markdown("""Enter a URL, fetch its content, and break it into editable 'chunks'. Review statistics, set targets, edit chunks, and compile your final Markdown.<div style="font-size: 0.9em; margin-bottom: 12px;">
Inspired by <a href="https://www.linkedin.com/pulse/understanding-chunking-google-ai-mode-practical-content-volpini-zseaf/" target="_blank">Andrea Volpini</a></div><div style="display: flex; justify-content: flex-start; align-items: center; gap: 16px;">
<span>Runs best on Desktop. App created by <a href="https://www.linkedin.com/in/emilijagjorgjevska/" target="_blank">Emilija Gjorgjevska</a></span>
<a href="https://buymeacoffee.com/emiliagjorgjevska" target="_blank">
<img src="https://cdn.buymeacoffee.com/buttons/v2/default-yellow.png" alt="Buy Me A Coffee" style="height: 30px;">
</a></div><br>""", unsafe_allow_html=True)

# --- URL Input and Processing ---
col1, col2 = st.columns([4, 1])
with col1:
    url_input = st.text_input(
        label="Enter Webpage URL",
        placeholder="e.g., https://www.llamaindex.ai/blog/what-is-llamaindex",
        key="url_input"
    )
with col2:
    st.write("")  # Spacer
    st.write("")  # Spacer
    process_button = st.button("Process URL", use_container_width=True)

if st.session_state.status_message:
    st.info(st.session_state.status_message)

if process_button:
    if not url_input:
        st.session_state.status_message = "Please enter a URL to process."
    else:
        with st.spinner("Processing URL..."):
            markdown_content = content_processor.fetch_and_convert_to_markdown(url_input)

            # NOTE(review): substring test — a legitimate page containing the
            # word "Error" would be treated as a failure; confirm intent.
            if "Error" in markdown_content:
                chunk_manager.set_chunks([])
                st.session_state.status_message = markdown_content
            else:
                chunks = content_processor.parse_markdown_into_chunks(markdown_content)
                chunk_manager.set_chunks(chunks)
                st.session_state.status_message = "URL processed successfully!" if chunks else "URL processed, but no content chunks could be extracted."

                # Pre-select the first chunk so the editor opens with content.
                if chunks:
                    st.session_state.chunk_selector = chunk_manager.get_chunk_titles_for_dropdown()[0]
                else:
                    st.session_state.chunk_selector = None

# --- Tabs for Editor and Overview ---
tab1, tab2 = st.tabs(["Editor", "Document Overview & Targets"])

with tab1:
    st.markdown("## Edit Chunks Individually")

    chunk_selector_options = chunk_manager.get_chunk_titles_for_dropdown()
    if chunk_selector_options:
        try:
            # Find the index of the currently selected item to handle updates
            current_selection_index = chunk_selector_options.index(st.session_state.chunk_selector)
        except (ValueError, TypeError):
            # Selection missing (e.g. after delete/re-titling): fall back to first.
            current_selection_index = 0

        selected_chunk_title = st.selectbox(
            label="Select Chunk to Edit",
            options=chunk_selector_options,
            index=current_selection_index,
            key="chunk_selector"
        )
    else:
        # Disabled placeholder when no document has been processed yet.
        selected_chunk_title = st.selectbox(
            label="Select Chunk to Edit",
            options=["No chunks available"],
            disabled=True
        )

    # Get the currently selected chunk; options are formatted "id: title".
    selected_chunk = None
    if selected_chunk_title and "No chunks available" not in selected_chunk_title:
        current_id = int(selected_chunk_title.split(':')[0].strip())
        selected_chunk = chunk_manager.get_chunk_by_id(current_id)

    if selected_chunk:
        st.text_input(
            label="Chunk Title (Auto-detected)",
            value=selected_chunk["title"],
            disabled=True
        )

        chunk_content_editor = st.text_area(
            label="Chunk Content",
            value=selected_chunk["content"],
            height=250,
            key=f"editor_{selected_chunk['id']}"  # Unique key to prevent state loss
        )

        # Per-chunk stats with red/green target colouring (HTML spans).
        st.markdown(
            chunk_manager.format_chunk_stats(selected_chunk['stats']),
            unsafe_allow_html=True
        )

        update_col, delete_col, _ = st.columns([1, 1, 3])
        with update_col:
            if st.button("Update Selected Chunk", use_container_width=True):
                chunk_manager.update_chunk_content(selected_chunk['id'], chunk_content_editor)
                st.session_state.status_message = f"Chunk '{selected_chunk_title}' updated successfully!"
                # Force a re-render to update the dropdown with the new title.
                # NOTE(review): assigning to a widget-keyed session_state entry
                # after the widget rendered can raise in some Streamlit
                # versions — verify against the deployed Streamlit release.
                st.session_state.chunk_selector = f"{selected_chunk['id']}: {chunk_manager.get_chunk_by_id(selected_chunk['id'])['title']}"

        with delete_col:
            if st.button("Delete Selected Chunk", use_container_width=True):
                chunk_manager.delete_chunk(selected_chunk['id'])
                st.session_state.status_message = f"Chunk '{selected_chunk_title}' deleted successfully!"
                # Re-point the selector at the first remaining chunk (if any).
                if chunk_manager.get_chunks():
                    st.session_state.chunk_selector = chunk_manager.get_chunk_titles_for_dropdown()[0]
                else:
                    st.session_state.chunk_selector = None

    else:
        # Disabled placeholders shown until a chunk is selected.
        st.text_input("Chunk Title (Auto-detected)", "Title of the selected chunk", disabled=True)
        st.text_area("Chunk Content", "Content of the selected chunk will appear here for editing.", height=250, disabled=True)
        st.markdown("Chunk statistics will appear here.")

    st.markdown("---")
    st.markdown("## Final Compiled Markdown")
    if st.button("Compile All Chunks", use_container_width=True):
        st.session_state.final_markdown = chunk_manager.get_final_markdown()

    st.text_area(
        label="Compiled Markdown",
        value=st.session_state.final_markdown,
        height=400,
        key="final_markdown_output",
        disabled=False
    )

with tab2:
    st.markdown("## Document Summary Statistics")
    st.markdown(chunk_manager.get_document_summary_stats(), unsafe_allow_html=True)
    st.markdown("---")
    st.markdown("## Content Targets")
    st.markdown("Adjust these targets to guide your writing and see visual feedback in the chunk selector (green=good, red=needs attention).")

    # Form batches the four target inputs into a single submit/rerun.
    with st.form("targets_form"):
        col1, col2 = st.columns(2)
        with col1:
            target_flesch_min_input = st.number_input("Min Flesch Reading Ease", value=float(chunk_manager.target_flesch_min))
            target_min_chunk_words_input = st.number_input("Min Chunk Words", value=chunk_manager.target_min_chunk_words)
        with col2:
            target_grade_max_input = st.number_input("Max Flesch-Kincaid Grade", value=float(chunk_manager.target_grade_max))
            target_max_chunk_words_input = st.number_input("Max Chunk Words", value=chunk_manager.target_max_chunk_words)

        submitted = st.form_submit_button("Set New Targets", use_container_width=True)
        if submitted:
            chunk_manager.set_targets(
                target_flesch_min_input,
                target_grade_max_input,
                int(target_min_chunk_words_input),
                int(target_max_chunk_words_input)
            )
            st.session_state.status_message = "Target settings updated."
            st.rerun()
 
1
+ import streamlit as st
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ from html_to_markdown import convert_to_markdown
5
+ import re
6
+ from llama_index.core.node_parser import MarkdownNodeParser
7
+ from llama_index.core.schema import Document, MetadataMode
8
+ import textstat # For readability metrics
9
+
10
class WebpageContentProcessor:
    """
    Handles fetching, converting, and parsing webpage content into structured chunks.
    Adheres to the Single Responsibility Principle (SRP) for content processing.

    Every failure path of `fetch_and_convert_to_markdown` returns a string that
    starts with "Error" so `parse_markdown_into_chunks` can detect it reliably.
    """

    def fetch_and_convert_to_markdown(self, url: str) -> str:
        """
        Fetches HTML from `url`, isolates the main content, strips boilerplate,
        and converts it to Markdown. Returns the Markdown, or an
        "Error..."-prefixed message on failure.
        """
        try:
            # Browser-like User-Agent: many servers reject the default
            # `python-requests` agent with 403.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove tags that are never content before anything else.
            for tag_name in ['script', 'style', 'noscript', 'meta', 'link']:
                for element in soup.find_all(tag_name):
                    element.decompose()

            # Prefer semantic main-content containers.
            content_for_conversion = soup.find('article') or soup.find('main') or \
                soup.find('div', class_='main-content') or \
                soup.find('div', {'role': 'main'})

            # Fallback: climb from the first H1 looking for a content wrapper.
            if not content_for_conversion:
                first_h1 = soup.find('h1')
                if first_h1:
                    candidate_container = first_h1.parent
                    for _ in range(5):  # check up to 5 ancestor levels
                        if candidate_container is None:
                            break
                        if candidate_container.name in ['article', 'main', 'section', 'div']:
                            content_for_conversion = candidate_container
                            break
                        candidate_container = candidate_container.parent
                    if not content_for_conversion:
                        content_for_conversion = first_h1.find_parent()
                else:
                    # Ultimate fallback: the whole body.
                    content_for_conversion = soup.body

            if not content_for_conversion:
                return "Error: Could not identify main content for conversion."

            # Strip common boilerplate within the identified content.
            unwanted_selectors = [
                'nav', 'header', 'footer', 'aside', 'iframe', 'form', 'button', 'input',
                'textarea', 'svg', 'figure', 'figcaption',
                '.social-share', '.comments', '.related-posts', '.pagination',
                '.breadcrumbs', '.cookie-consent', '[role="navigation"]',
                '[role="banner"]', '[role="contentinfo"]', '[class*="ad"]', '[id*="ad"]'
            ]
            for selector in unwanted_selectors:
                for element in content_for_conversion.select(selector):
                    element.decompose()

            markdown_output = convert_to_markdown(str(content_for_conversion))
            # Collapse runs of 3+ newlines left over from removed elements.
            markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output)
            return markdown_output.strip()

        except requests.exceptions.Timeout:
            return "Error: Request timed out. The server took too long to respond."
        except requests.exceptions.RequestException as e:
            return f"Error fetching URL: {e}."
        except Exception as e:
            # BUGFIX: prefixed with "Error:" so the downstream check catches it.
            return f"Error: An unexpected error occurred: {e}"

    def parse_markdown_into_chunks(self, markdown_content: str) -> list:
        """
        Splits Markdown into LlamaIndex nodes and returns a list of
        {"id", "title", "content"} dicts, one per node.
        """
        # BUGFIX: the original used `"Error" in markdown_content`, which
        # discarded any legitimate page merely containing the word "Error".
        # Error returns all share the "Error" prefix, so a prefix test suffices.
        if not markdown_content or markdown_content.startswith("Error"):
            return []
        doc = Document(text=markdown_content)
        parser = MarkdownNodeParser(include_metadata=True)
        nodes = parser.get_nodes_from_documents([doc])
        structured_chunks = []
        for i, node in enumerate(nodes):
            content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
            title_match = re.match(r"^(#+)\s*(.*)", content)
            if title_match:
                title = title_match.group(2).strip() or "[Untitled Section]"
            else:
                # BUGFIX: the original always appended "..." (even to short
                # first lines) and could produce an empty title.
                first_line = content.split('\n')[0].strip()
                title = first_line[:70] + "..." if len(first_line) > 70 else first_line
                if not title:
                    title = "[Empty Section]"
            structured_chunks.append({"id": i, "title": title, "content": content})
        return structured_chunks
96
+
97
+ class ChunkManager:
98
+ def __init__(self):
99
+ self._chunks = []
100
+ self.target_flesch_min = 60
101
+ self.target_grade_max = 8
102
+ self.target_min_chunk_words = 50
103
+ self.target_max_chunk_words = 500
104
+
105
+ def set_chunks(self, chunks: list):
106
+ self._chunks = [self._add_stats_to_chunk(chunk) for chunk in chunks]
107
+
108
+ def get_chunks(self) -> list:
109
+ return self._chunks
110
+
111
+ def _add_stats_to_chunk(self, chunk: dict) -> dict:
112
+ chunk['stats'] = self._calculate_chunk_stats(chunk['content'])
113
+ return chunk
114
+
115
+ def _calculate_chunk_stats(self, text: str) -> dict:
116
+ stats = {}
117
+ try:
118
+ stats['word_count'] = textstat.lexicon_count(text, removepunct=True)
119
+ stats['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
120
+ stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)
121
+ except Exception:
122
+ stats.update({'word_count': 0, 'flesch_reading_ease': 0, 'flesch_kincaid_grade': 0})
123
+ return stats
124
+
125
+ def format_chunk_stats(self, stats: dict) -> str:
126
+ flesch_color = "green" if stats.get('flesch_reading_ease', 0) >= self.target_flesch_min else "red"
127
+ grade_color = "green" if stats.get('flesch_kincaid_grade', 0) <= self.target_grade_max else "red"
128
+ word_color = "green" if self.target_min_chunk_words <= stats.get('word_count', 0) <= self.target_max_chunk_words else "red"
129
+
130
+ return (
131
+ f"**Word Count:** <span style='color:{word_color}'>{stats.get('word_count', 0)}</span> | "
132
+ f"**Reading Ease:** <span style='color:{flesch_color}'>{stats.get('flesch_reading_ease', 0):.2f}</span> | "
133
+ f"**Grade Level:** <span style='color:{grade_color}'>{stats.get('flesch_kincaid_grade', 0):.2f}</span>"
134
+ )
135
+
136
+ def get_document_summary_stats(self) -> str:
137
+ if not self._chunks:
138
+ return "No document loaded."
139
+
140
+ total_words = sum(c['stats']['word_count'] for c in self._chunks)
141
+ avg_ease = sum(c['stats']['flesch_reading_ease'] for c in self._chunks) / len(self._chunks) if self._chunks else 0
142
+ avg_grade = sum(c['stats']['flesch_kincaid_grade'] for c in self._chunks) / len(self._chunks) if self._chunks else 0
143
+
144
+ return (
145
+ f"**Total Chunks:** {len(self._chunks)} | "
146
+ f"**Total Words:** {total_words} | "
147
+ f"**Avg. Reading Ease:** {avg_ease:.2f} | "
148
+ f"**Avg. Grade Level:** {avg_grade:.2f}"
149
+ )
150
+
151
+ def get_chunk_by_id(self, chunk_id: int) -> dict | None:
152
+ return next((c for c in self._chunks if c["id"] == chunk_id), None)
153
+
154
+ def update_chunk_content(self, chunk_id: int, new_content: str):
155
+ chunk = self.get_chunk_by_id(chunk_id)
156
+ if chunk:
157
+ chunk["content"] = new_content
158
+ self._add_stats_to_chunk(chunk)
159
+
160
+ def delete_chunk(self, chunk_id: int):
161
+ self._chunks = [c for c in self._chunks if c["id"] != chunk_id]
162
+ for i, chunk in enumerate(self._chunks):
163
+ chunk['id'] = i
164
+
165
+ def get_final_markdown(self) -> str:
166
+ if not self._chunks:
167
+ return "No content to display."
168
+ return "\n\n".join(f"# {c['title']}\n{c['content']}" for c in self._chunks)
169
+
170
+ def set_targets(self, flesch_min: float, grade_max: float, min_words: int, max_words: int):
171
+ self.target_flesch_min = flesch_min
172
+ self.target_grade_max = grade_max
173
+ self.target_min_chunk_words = min_words
174
+ self.target_max_chunk_words = max_words
175
+ self.set_chunks(self.get_chunks()) # Recalculate stats with new targets
176
+
177
# --- Streamlit entry point: page config, session state, URL processing -------

st.set_page_config(layout="wide", page_title="Webpage Content Editor")

# Initialize session state variables. Streamlit reruns the entire script on
# every interaction, so long-lived objects must persist in st.session_state.
if 'chunk_manager' not in st.session_state:
    st.session_state.chunk_manager = ChunkManager()
if 'content_processor' not in st.session_state:
    st.session_state.content_processor = WebpageContentProcessor()
if 'selected_chunk_id' not in st.session_state:
    st.session_state.selected_chunk_id = None
if 'status_message' not in st.session_state:
    st.session_state.status_message = ""

# Short aliases for the session-scoped singletons used throughout the UI.
processor = st.session_state.content_processor
manager = st.session_state.chunk_manager

st.title("✨ Webpage Content Editor")
st.caption("Created by [Emilija Gjorgjevska](https://www.linkedin.com/in/emilijagjorgjevska/) | Inspired by Andrea Volpini's work on content chunking.")

st.info(
    "**Note:** Some URLs may be blocked due to server policies (like bot detection). "
    "This is an early version, so expect a few bugs!",
    icon="ℹ️"
)

url_input = st.text_input("Enter a webpage URL to begin", key="url_input")

if st.button("Process URL", use_container_width=True):
    if url_input:
        with st.spinner("Fetching and processing content..."):
            markdown = processor.fetch_and_convert_to_markdown(url_input)
            # NOTE(review): substring match on "Error" can false-positive when
            # the fetched page content itself contains the word "Error" —
            # confirm against the processor's error-return contract.
            if "Error" in markdown:
                st.session_state.status_message = markdown
                manager.set_chunks([])
            else:
                chunks = processor.parse_markdown_into_chunks(markdown)
                manager.set_chunks(chunks)
                st.session_state.status_message = f"Successfully processed {len(chunks)} chunks." if chunks else "Could not extract content chunks."

        # Pre-select the first chunk (if any) so the editor tab has a target.
        if manager.get_chunks():
            st.session_state.selected_chunk_id = manager.get_chunks()[0]['id']
        else:
            st.session_state.selected_chunk_id = None
        st.rerun()

# Show any pending status toast exactly once, then clear it.
if st.session_state.status_message:
    st.toast(st.session_state.status_message)
    st.session_state.status_message = ""  # Clear message after showing
224
+
225
tab1, tab2 = st.tabs(["Chunk Editor", "Settings & Overview"])

# --- Tab 1: per-chunk editor --------------------------------------------------
with tab1:
    chunks = manager.get_chunks()
    if not chunks:
        st.write("Process a URL to start editing chunks.")
    else:
        # Ensure selected_chunk_id is valid — ids are renumbered after a
        # deletion, so a stale selection must fall back to the first chunk.
        if st.session_state.selected_chunk_id not in [c['id'] for c in chunks]:
            st.session_state.selected_chunk_id = chunks[0]['id'] if chunks else None

        if st.session_state.selected_chunk_id is not None:
            # Map chunk id -> human-readable label for the selectbox.
            chunk_options = {c['id']: f"Chunk {c['id']}: {c['title']}" for c in chunks}

            def on_select_change():
                # Callback to update the selected ID in session state
                st.session_state.selected_chunk_id = st.session_state.chunk_selector

            selected_id_from_widget = st.selectbox(
                "Select a chunk to edit",
                options=list(chunk_options.keys()),
                format_func=lambda x: chunk_options[x],
                key="chunk_selector",
                on_change=on_select_change,
                index=list(chunk_options.keys()).index(st.session_state.selected_chunk_id)
            )

            selected_chunk = manager.get_chunk_by_id(st.session_state.selected_chunk_id)

            if selected_chunk:
                # Colored readability stats line (HTML produced by the manager).
                st.markdown(manager.format_chunk_stats(selected_chunk['stats']), unsafe_allow_html=True)

                edited_content = st.text_area(
                    "Chunk Content",
                    value=selected_chunk['content'],
                    height=300,
                    key=f"editor_{selected_chunk['id']}"  # Unique key forces re-render
                )

                col1, col2, _ = st.columns([1, 1, 4])
                if col1.button("Update Chunk", use_container_width=True, key=f"update_{selected_chunk['id']}"):
                    manager.update_chunk_content(selected_chunk['id'], edited_content)
                    st.session_state.status_message = "Chunk updated!"
                    st.rerun()

                if col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"):
                    old_id = selected_chunk['id']
                    manager.delete_chunk(old_id)
                    st.session_state.status_message = "Chunk deleted!"
                    # Select the next available chunk or none if empty
                    remaining_chunks = manager.get_chunks()
                    st.session_state.selected_chunk_id = remaining_chunks[0]['id'] if remaining_chunks else None
                    st.rerun()
278
+
279
# --- Tab 2: document overview, target settings, compiled output ---------------
with tab2:
    st.subheader("Document Overview")
    st.markdown(manager.get_document_summary_stats(), unsafe_allow_html=True)

    st.subheader("Content Targets")
    # A form batches the four inputs into a single submit, so the app only
    # reruns once when targets are applied (not on every keystroke).
    with st.form("targets_form"):
        c1, c2 = st.columns(2)
        f_min = c1.number_input("Min Flesch Reading Ease", value=float(manager.target_flesch_min))
        g_max = c2.number_input("Max Flesch-Kincaid Grade", value=float(manager.target_grade_max))
        w_min = c1.number_input("Min Chunk Words", value=int(manager.target_min_chunk_words))
        w_max = c2.number_input("Max Chunk Words", value=int(manager.target_max_chunk_words))

        if st.form_submit_button("Set New Targets", use_container_width=True):
            manager.set_targets(f_min, g_max, w_min, w_max)
            st.session_state.status_message = "Targets updated."
            st.rerun()

    st.subheader("Final Document")
    # disabled=False leaves the compiled output editable, but edits here are
    # never read back into the manager — presumably intentional for copy/paste.
    st.text_area("Compiled Markdown", manager.get_final_markdown(), height=400, disabled=False, key="final_markdown")