File size: 16,936 Bytes
92ff4ac
 
 
 
c063934
92ff4ac
5543eef
0915f87
5543eef
 
92ff4ac
 
 
5543eef
92ff4ac
 
259dab0
92ff4ac
 
 
53eca1f
54f6925
92ff4ac
 
 
 
 
 
 
 
 
53eca1f
 
 
 
 
 
 
66603bd
53eca1f
54f6925
66603bd
 
 
07f83ac
92ff4ac
5543eef
92ff4ac
5543eef
92ff4ac
5543eef
92ff4ac
 
4d98418
5543eef
 
4d98418
92ff4ac
 
c063934
5543eef
92ff4ac
 
 
5543eef
 
fc54d8b
5543eef
 
 
 
c063934
5543eef
 
 
 
 
4d98418
c063934
5543eef
c063934
4d98418
92ff4ac
 
 
5543eef
 
 
92ff4ac
 
 
5543eef
 
 
92ff4ac
 
 
 
 
 
 
 
 
 
 
 
5543eef
92ff4ac
 
3217d2c
 
 
66603bd
92ff4ac
 
 
 
5543eef
92ff4ac
 
 
 
5543eef
 
 
92ff4ac
 
 
5543eef
92ff4ac
 
 
5543eef
 
 
 
 
92ff4ac
5543eef
 
 
 
92ff4ac
 
 
 
 
 
 
 
 
66603bd
 
 
 
 
 
92ff4ac
 
 
 
 
 
 
 
 
5543eef
 
66603bd
 
 
5543eef
66603bd
5543eef
 
92ff4ac
 
 
 
 
5543eef
 
 
92ff4ac
 
0f1fb1b
 
 
 
 
 
 
 
 
 
 
 
fc54d8b
5543eef
 
 
 
fc54d8b
 
 
 
 
 
92ff4ac
5543eef
 
 
4f72763
0f1fb1b
4f72763
 
 
 
 
 
 
 
 
 
 
 
 
 
de55a7c
4f72763
 
 
 
 
 
 
3e90953
 
4f72763
 
de55a7c
4f72763
 
 
 
 
 
 
 
 
e6e0447
28a462a
92ff4ac
5543eef
dab4d69
 
 
 
 
 
 
 
 
 
 
 
 
07f83ac
5543eef
 
 
3217d2c
 
 
92ff4ac
fc54d8b
92ff4ac
3217d2c
92ff4ac
fc54d8b
 
 
 
5543eef
fc54d8b
66603bd
92ff4ac
 
 
66603bd
92ff4ac
4f72763
 
 
a1361c0
 
 
 
 
 
 
 
 
 
 
 
4f72763
 
 
 
 
 
 
 
 
 
 
 
 
 
92ff4ac
4f72763
 
92ff4ac
 
4f72763
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
import streamlit as st
import requests
from bs4 import BeautifulSoup
import re
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.core.schema import Document, MetadataMode
import textstat
from markdownify import markdownify as md

# --- Core Logic Classes ---
class WebpageContentProcessor:
    """
    Handles fetching, converting, and parsing webpage content into structured chunks.

    Fetch failures are reported as a returned message string instead of an
    exception; ``is_error_message`` recognises exactly those strings.
    """

    # Prefixes of every failure message fetch_and_convert_to_markdown returns.
    _ERROR_PREFIXES = (
        "Error: ",
        "Error fetching the URL: ",
        "An unexpected error occurred during content processing: ",
    )

    def __init__(self):
        pass

    @classmethod
    def is_error_message(cls, text: str) -> bool:
        """
        Return True when *text* is a failure message produced by
        ``fetch_and_convert_to_markdown``.

        Only the start of the text is compared, so ordinary page content that
        merely mentions the word "Error" is not mistaken for a failure.
        """
        return text.startswith(cls._ERROR_PREFIXES)

    def fetch_and_convert_to_markdown(self, url: str) -> str:
        """
        Fetches HTML content, removes common boilerplate tags from the entire page,
        and then converts the remaining body content to Markdown using markdownify.

        Args:
            url: Address of the page to download.

        Returns:
            The cleaned Markdown text, or a human-readable failure message
            (see ``is_error_message``) when the page could not be fetched or
            converted. No exception is propagated to the caller.
        """
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()
            html_content = response.text
            soup = BeautifulSoup(html_content, 'html.parser')
            # Remove common boilerplate and non-content tags from the entire document
            tags_to_remove = ['nav', 'header', 'footer', 'aside', 'script', 'style', 'noscript', 'form']
            for tag_name in tags_to_remove:
                for element in soup.find_all(tag_name):
                    element.decompose()
            # Process the entire remaining body
            content_container = soup.find('body')
            if content_container is None:
                return "Error: Could not find the <body> of the webpage."
            # Emit every heading in "#" (ATX) form: parse_markdown_into_chunks
            # splits on "#" headings, which markdownify's default underlined
            # style for <h1>/<h2> would not produce.
            markdown_output = md(str(content_container), heading_style="ATX")
            # Post-processing to clean up the resulting Markdown:
            # collapse runs of blank lines, then drop list bullets left empty.
            markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output)
            markdown_output = re.sub(r'(\n\s*[\*\-]\s*\n)|(^\s*[\*\-]\s*$)', '\n', markdown_output, flags=re.MULTILINE)
            return markdown_output.strip()
        except requests.exceptions.Timeout:
            return "Error: The request timed out. The server is taking too long to respond."
        except requests.exceptions.RequestException as e:
            return f"Error fetching the URL: {e}. Please check the URL and your connection."
        except Exception as e:
            # Deliberate catch-all: the UI shows the message instead of crashing.
            return f"An unexpected error occurred during content processing: {e}"

    def parse_markdown_into_chunks(self, markdown_content: str) -> list:
        """
        Parses Markdown content into logically separated chunks based on its structure.
        Uses MarkdownNodeParser to respect headers and sections.

        Args:
            markdown_content: Markdown text, or a failure message returned by
                ``fetch_and_convert_to_markdown``.

        Returns:
            A list of dicts with ``id`` (index of the source node), ``title``
            and ``content`` keys. Empty when the input is empty or is a
            failure message.
        """
        if not markdown_content or self.is_error_message(markdown_content):
            return []
        parser = MarkdownNodeParser(include_metadata=True)
        doc = Document(text=markdown_content)
        nodes = parser.get_nodes_from_documents([doc])
        structured_chunks = []
        for i, node in enumerate(nodes):
            content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
            if not content:
                continue
            title_match = re.match(r"^(#+)\s*(.*)", content)
            if title_match:
                # Heading line becomes the title; the rest is the chunk body.
                title = title_match.group(2).strip()
                content_text = content[len(title_match.group(0)):].strip()
            else:
                # No heading: use a (possibly truncated) first line as the title
                # and keep the whole text as the body.
                first_line = content.split('\n')[0].strip()
                title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
                content_text = content
            if not title:
                title = f"[Chunk {i+1}]"
            structured_chunks.append({
                "id": i,
                "title": title,
                "content": content_text
            })
        return structured_chunks

class ChunkManager:
    """
    Manages the state of chunks, including their content, statistics, and targets.
    """
    def __init__(self):
        self._chunks = []
        self.target_flesch_min = 60
        self.target_grade_max = 9
        self.target_min_chunk_words = 40
        self.target_max_chunk_words = 600

    def set_chunks(self, chunks: list):
        self._chunks = [self._add_stats_to_chunk(chunk) for chunk in chunks]

    def get_chunks(self) -> list:
        return self._chunks

    def _add_stats_to_chunk(self, chunk: dict) -> dict:
        chunk['stats'] = self._calculate_chunk_stats(chunk['content'])
        return chunk

    def _calculate_chunk_stats(self, text: str) -> dict:
        """Calculates readability and other metrics for a text chunk."""
        stats = {}
        try:
            stats['word_count'] = textstat.lexicon_count(text, removepunct=True)
            stats['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
            stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)
        except (Exception, TypeError):
            stats.update({'word_count': 0, 'flesch_reading_ease': 0, 'flesch_kincaid_grade': 0})
        return stats

    def format_chunk_stats(self, stats: dict) -> str:
        """Creates a formatted string of stats with color-coding based on targets."""
        flesch_color = "green" if stats.get('flesch_reading_ease', 0) >= self.target_flesch_min else "red"
        grade_color = "green" if stats.get('flesch_kincaid_grade', 0) <= self.target_grade_max else "red"
        word_color = "green" if self.target_min_chunk_words <= stats.get('word_count', 0) <= self.target_max_chunk_words else "red"
        return (
            f"**Word Count:** <span style='color:{word_color};'>{stats.get('word_count', 0)}</span> &nbsp;&nbsp;|&nbsp;&nbsp; "
            f"**Reading Ease:** <span style='color:{flesch_color};'>{stats.get('flesch_reading_ease', 0):.2f}</span> &nbsp;&nbsp;|&nbsp;&nbsp; "
            f"**Grade Level:** <span style='color:{grade_color};'>{stats.get('flesch_kincaid_grade', 0):.2f}</span>"
        )

    def get_document_summary_stats(self) -> str:
        """Calculates and formats stats for the entire document."""
        if not self._chunks:
            return "No document loaded."
        total_words = sum(c['stats']['word_count'] for c in self._chunks)
        if len(self._chunks) > 0:
            avg_ease = sum(c['stats']['flesch_reading_ease'] for c in self._chunks) / len(self._chunks)
            avg_grade = sum(c['stats']['flesch_kincaid_grade'] for c in self._chunks) / len(self._chunks)
        else:
            avg_ease = avg_grade = 0
        return (
            f"- **Total Chunks:** {len(self._chunks)}\n"
            f"- **Total Words:** {total_words}\n"
            f"- **Avg. Reading Ease:** {avg_ease:.2f}\n"
            f"- **Avg. Grade Level:** {avg_grade:.2f}"
        )

    def get_chunk_by_id(self, chunk_id: int) -> dict | None:
        return next((c for c in self._chunks if c["id"] == chunk_id), None)

    def update_chunk_content(self, chunk_id: int, new_content: str):
        chunk = self.get_chunk_by_id(chunk_id)
        if chunk:
            chunk["content"] = new_content
            self._add_stats_to_chunk(chunk)
            if chunk["title"].startswith("["):
                 first_line = new_content.split('\n')[0].strip()
                 new_title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
                 if new_title:
                    chunk["title"] = new_title

    def delete_chunk(self, chunk_id: int):
        self._chunks = [c for c in self._chunks if c["id"] != chunk_id]
        for i, chunk in enumerate(self._chunks):
            chunk['id'] = i

    def get_final_markdown(self) -> str:
        if not self._chunks:
            return "No content to display."
        final_doc_parts = []
        for c in self._chunks:
            is_header = re.match(r"^(#+)\s*(.*)", c['title'])
            if not c['title'].startswith("[") and not is_header:
                 final_doc_parts.append(f"## {c['title']}\n\n{c['content']}")
            else:
                 final_doc_parts.append(c['content'])
        return "\n\n---\n\n".join(final_doc_parts)

    def set_targets(self, flesch_min: float, grade_max: float, min_words: int, max_words: int):
        self.target_flesch_min = flesch_min
        self.target_grade_max = grade_max
        self.target_min_chunk_words = min_words
        self.target_max_chunk_words = max_words
        self.set_chunks(self.get_chunks())

# --- Streamlit UI Application ---
# Wide layout with a custom browser-tab title.
st.set_page_config(page_title="Webpage Content Editor", layout="wide")

# Raw CSS injected into the page to force the sidebar to a fixed 450px width.
_sidebar_width_css = """
    <style>
    [data-testid="stSidebar"] {
        width: 450px !important;
    }
    </style>
    """
st.markdown(_sidebar_width_css, unsafe_allow_html=True)

def init_session_state():
    """
    Seed ``st.session_state`` with the keys this app relies on.

    Keys that already exist are left untouched; a default is only built for a
    key that is missing. The keys are ``processor``, ``manager``,
    ``selected_chunk_id`` (None) and ``status_message`` (empty string).
    """
    state = st.session_state
    default_factories = (
        ('processor', WebpageContentProcessor),
        ('manager', ChunkManager),
        ('selected_chunk_id', lambda: None),
        ('status_message', str),
    )
    for key, make_default in default_factories:
        if key not in state:
            state[key] = make_default()

init_session_state()

# Per-session objects created by init_session_state().
processor = st.session_state.processor
manager = st.session_state.manager

# Sidebar: usage guidelines, document-wide stats, target settings and the
# compiled Markdown output.
with st.sidebar:
    st.title("Settings & Overview")

    with st.expander("About this App & AI Writing Guidelines", expanded=True):
        # `icon` has to be an actual emoji character, so it is kept as a
        # UTF-8 literal.
        st.info(
            """
            This app helps you refine web content for AI synthesis by chunking it into logical, verifiable blocks.
            
            **Writing for AI Verifiability:**
            * **Structure with Headers:** Use H1, H2, H3 tags logically.
            * **Write for Clarity:** Use short, direct sentences. State facts explicitly.
            * **Create Verifiable Blocks:** Format content as definitions, Q&As, or step-by-step guides.
            * **Use the Editor's Metrics:** Aim for a **Reading Ease > 60** and a **Word Count** between 40-600 per chunk. The colors will guide you.
            """, icon="💡"
        )

    st.subheader("📊 Document Overview")
    st.markdown(manager.get_document_summary_stats(), unsafe_allow_html=True)

    st.subheader("🎯 Content Targets")
    # A form batches the four inputs so targets change only on submit.
    with st.form("targets_form"):
        st.write("Set readability targets to guide your editing. Colors in the editor will reflect these targets.")
        c1, c2 = st.columns(2)
        f_min = c1.number_input("Min Flesch Reading Ease", value=float(manager.target_flesch_min), help="Measures readability. Higher scores mean the text is easier to read. Scores of 60-70 are considered plain English.")
        g_max = c2.number_input("Max Flesch-Kincaid Grade", value=float(manager.target_grade_max), help="Estimates the U.S. school grade level needed to understand the text. A score of 8.0 means an eighth grader can read it. Lower scores are easier to read.")
        w_min = c1.number_input("Min Chunk Words", value=int(manager.target_min_chunk_words))
        w_max = c2.number_input("Max Chunk Words", value=int(manager.target_max_chunk_words))

        if st.form_submit_button("Set New Targets", use_container_width=True):
            manager.set_targets(f_min, g_max, w_min, w_max)
            st.session_state.status_message = "Content targets have been updated."
            st.rerun()

    st.subheader("📋 Final Compiled Document")
    st.text_area("Final Markdown Output", manager.get_final_markdown(), height=300, key="final_markdown")

# --- Main Page Layout ---
# Title, credits, the URL input and two standing notices. Emoji are written as
# UTF-8 literals; the `icon` arguments have to be actual emoji characters.
st.title("📝 Content Chunk Editor")
st.caption("Developed by [Emilija Gjorgjevska](https://www.linkedin.com/in/emilijagjorgjevska/) | Inspired by Andrea Volpini's [work on content chunking](https://wordlift.io/blog/en/googles-ai-mode-product-pages/).<br>A tool to fetch, chunk, and refine web content for AI synthesis. Best experienced on desktop.", unsafe_allow_html=True)

url_input = st.text_input("Enter a webpage URL to start", key="url_input")
with st.expander("⚠️ Important Information", expanded=False):
    st.warning(
        """
        **Early Draft:** This is an early version of the application. You may encounter bugs or incomplete features.
        """,
        icon="🛠️"
    )
    st.warning(
        """
        **Restrictive Bot Policy:** This tool fetches content using automated requests. If a target website blocks bots, the app may time out or fail to retrieve content.
        """,
        icon="🤖"
    )

# Every failure message returned by fetch_and_convert_to_markdown starts with
# one of these prefixes. Matching the prefix (instead of searching for the
# word "Error" anywhere) keeps pages that merely mention "Error" from being
# rejected, and also catches the "unexpected error" message.
_FETCH_ERROR_PREFIXES = (
    "Error: ",
    "Error fetching the URL: ",
    "An unexpected error occurred during content processing: ",
)

if st.button("Process URL", use_container_width=True, type="primary"):
    # Stray whitespace around a pasted URL would make the request fail.
    url = url_input.strip()
    if url:
        with st.spinner("Fetching and chunking content..."):
            markdown = processor.fetch_and_convert_to_markdown(url)
            if markdown.startswith(_FETCH_ERROR_PREFIXES):
                # Surface the failure text and clear any previous document.
                st.session_state.status_message = markdown
                manager.set_chunks([])
                st.session_state.selected_chunk_id = None
            else:
                chunks = processor.parse_markdown_into_chunks(markdown)
                manager.set_chunks(chunks)
                if chunks:
                    st.session_state.status_message = f"Successfully processed {len(chunks)} chunks."
                    st.session_state.selected_chunk_id = chunks[0]['id']
                else:
                    st.session_state.status_message = "Could not extract any content chunks."
                    st.session_state.selected_chunk_id = None
            # Rerun so the sidebar, drawn earlier in the script, shows the new state.
            st.rerun()
    else:
        st.warning("Please enter a URL before processing.")

# Show a message queued before the last rerun once, then clear it.
if st.session_state.status_message:
    st.toast(st.session_state.status_message)
    st.session_state.status_message = ""

# Editor area: a hint with example sites when nothing is loaded, otherwise a
# chunk selector with an editor on the left and a rendered preview on the right.
chunks = manager.get_chunks()
if not chunks:
    st.write("Process a URL to begin editing content chunks, or adjust settings in the sidebar.")
    with st.expander("Chunking Strategy Examples"):
        st.write("See how different websites structure their content, affecting chunking quality.")
        st.error("**Bad Chunking Example (Few Structural Headers)**")
        st.markdown("""
            * [Wikipedia: Markdown](https://en.wikipedia.org/wiki/Markdown)
            """)
        st.success("**Good Chunking Examples (Clear, Hierarchical Headers)**")
        st.markdown("""
            * [The Blog Starter](https://www.theblogstarter.com/)
            * [Google Safety Blog](https://blog.google/technology/safety-security/google-survey-digital-security-2025/)
            * [HubSpot: What is a Blog?](https://blog.hubspot.com/marketing/what-is-a-blog)
            """)
else:
    chunk_ids = [c['id'] for c in chunks]
    # Fall back to the first chunk when the stored selection no longer exists
    # (for example after a delete renumbered the ids).
    if st.session_state.selected_chunk_id not in chunk_ids:
        st.session_state.selected_chunk_id = chunk_ids[0] if chunk_ids else None

    if st.session_state.selected_chunk_id is not None:
        chunk_options = {c['id']: c['title'] for c in chunks}

        selected_id = st.selectbox(
            "Select a chunk to edit",
            options=chunk_ids,
            format_func=lambda x: f"Chunk {x}: {chunk_options.get(x, 'N/A')}",
            index=chunk_ids.index(st.session_state.selected_chunk_id)
        )

        # Persist a changed selection and redraw with the newly chosen chunk.
        if selected_id != st.session_state.selected_chunk_id:
            st.session_state.selected_chunk_id = selected_id
            st.rerun()

        selected_chunk = manager.get_chunk_by_id(st.session_state.selected_chunk_id)

        if selected_chunk:
            editor_col, preview_col = st.columns(2)

            with editor_col:
                st.markdown(f"**Editing: {selected_chunk['title']}**")
                # The stats string contains <span> colour markup, so HTML must be allowed here.
                st.markdown(manager.format_chunk_stats(selected_chunk['stats']), unsafe_allow_html=True)

                edited_content = st.text_area(
                    "Chunk Content (Markdown)",
                    value=selected_chunk['content'],
                    height=400,
                    key=f"editor_{selected_chunk['id']}"
                )

                b_col1, b_col2, _ = st.columns([1, 1, 3])

                if b_col1.button("Update Chunk", use_container_width=True, key=f"update_{selected_chunk['id']}"):
                    manager.update_chunk_content(selected_chunk['id'], edited_content)
                    st.session_state.status_message = "Chunk updated successfully!"
                    st.rerun()

                if b_col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"):
                    manager.delete_chunk(selected_chunk['id'])
                    st.session_state.status_message = "Chunk deleted."
                    remaining_chunks = manager.get_chunks()
                    st.session_state.selected_chunk_id = remaining_chunks[0]['id'] if remaining_chunks else None
                    st.rerun()

            with preview_col:
                st.markdown("**Live Preview**")
                with st.container(height=525, border=True):
                    # The text originates from an arbitrary fetched webpage, so it is
                    # rendered as plain Markdown; raw HTML in it is not interpreted.
                    st.markdown(edited_content)