Em4e committed on
Commit
5543eef
·
verified ·
1 Parent(s): c063934

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +123 -127
app.py CHANGED
@@ -5,21 +5,22 @@ from html_to_markdown import convert_to_markdown
5
  import re
6
  from llama_index.core.node_parser import MarkdownNodeParser
7
  from llama_index.core.schema import Document, MetadataMode
8
- import textstat # For readability metrics
 
 
9
 
10
  class WebpageContentProcessor:
11
  """
12
  Handles fetching, converting, and parsing webpage content into structured chunks.
13
- Adheres to the Single Responsibility Principle (SRP) for content processing.
14
  """
15
  def __init__(self):
16
  pass
17
 
18
  def fetch_and_convert_to_markdown(self, url: str) -> str:
19
  """
20
- Fetches HTML content from a given URL, attempts to isolate the main content,
21
- removes common boilerplate, and converts to Markdown.
22
- Prioritizes semantic content tags over H1-based identification for robust extraction.
23
  """
24
  try:
25
  headers = {
@@ -30,107 +31,87 @@ class WebpageContentProcessor:
30
  html_content = response.text
31
  soup = BeautifulSoup(html_content, 'html.parser')
32
 
33
- for tag_name in ['script', 'style', 'noscript', 'meta', 'link']:
 
34
  for element in soup.find_all(tag_name):
35
  element.decompose()
36
 
 
37
  content_for_conversion = soup.find('article') or soup.find('main') or \
38
- soup.find('div', class_='main-content') or \
39
  soup.find('div', {'role': 'main'})
40
 
 
41
  if not content_for_conversion:
42
- first_h1 = soup.find('h1')
43
- if first_h1:
44
- candidate_container = first_h1.parent
45
- for _ in range(5):
46
- if candidate_container is None: break
47
- if candidate_container.name in ['article', 'main', 'section', 'div']:
48
- content_for_conversion = candidate_container
49
- break
50
- candidate_container = candidate_container.parent
51
- if not content_for_conversion:
52
- content_for_conversion = first_h1.find_parent()
53
- else:
54
- content_for_conversion = soup.body
55
-
56
- if not content_for_conversion:
57
- return "Error: Could not identify main content for conversion."
58
-
59
- unwanted_selectors = [
60
- 'nav', 'header', 'footer', 'aside', 'iframe', 'form', 'button', 'input',
61
- 'textarea', 'svg', 'figure', 'figcaption',
62
- '.social-share', '.comments', '.related-posts', '.pagination',
63
- '.breadcrumbs', '.cookie-consent', '[role="navigation"]',
64
- '[role="banner"]', '[role="contentinfo"]', '[class*="ad"]', '[id*="ad"]'
65
- ]
66
- for selector in unwanted_selectors:
67
- for element in content_for_conversion.select(selector):
68
- element.decompose()
69
 
 
70
  markdown_output = convert_to_markdown(str(content_for_conversion))
71
- markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output)
72
- markdown_output = markdown_output.strip()
73
-
74
  return markdown_output
75
 
76
  except requests.exceptions.Timeout:
77
- return "Error: Request timed out. The server took too long to respond."
78
  except requests.exceptions.RequestException as e:
79
- return f"Error fetching URL: {e}."
80
  except Exception as e:
81
- return f"An unexpected error occurred: {e}"
82
 
83
  def parse_markdown_into_chunks(self, markdown_content: str) -> list:
84
  """
85
- Parses Markdown content into LlamaIndex nodes (chunks) and extracts title and content.
86
- This version uses MarkdownNodeParser to leverage the document's structure.
87
  """
88
  if not markdown_content or "Error" in markdown_content:
89
  return []
90
 
91
- doc = Document(text=markdown_content, metadata={"filename": "webpage_content"})
92
  parser = MarkdownNodeParser(include_metadata=True)
 
93
  nodes = parser.get_nodes_from_documents([doc])
94
-
95
  structured_chunks = []
96
  for i, node in enumerate(nodes):
97
- pure_text_content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
98
- if not pure_text_content:
99
  continue
100
 
101
- heading_title = ""
102
- content_text = pure_text_content
103
-
104
- # Attempt to find a title from a markdown header
105
- heading_match = re.match(r"^(#+)\s*(.*)", pure_text_content)
106
- if heading_match:
107
- heading_title = heading_match.group(2).strip()
108
- # Remove the title from the content itself
109
- content_text = pure_text_content[len(heading_match.group(0)):].strip()
110
- if not heading_title:
111
- heading_title = "[Untitled Section]"
112
  else:
113
- # Fallback to using the first line as the title
114
- first_line = content_text.split('\n')[0].strip()
115
- heading_title = (first_line[:75] + "...") if len(first_line) > 75 else first_line
116
- if not heading_title:
117
- heading_title = "[Empty Section]"
 
 
 
118
 
119
  structured_chunks.append({
120
  "id": i,
121
- "title": heading_title,
122
  "content": content_text
123
  })
124
-
125
  return structured_chunks
126
 
127
  class ChunkManager:
 
 
 
128
  def __init__(self):
129
  self._chunks = []
130
  self.target_flesch_min = 60
131
- self.target_grade_max = 8
132
- self.target_min_chunk_words = 50
133
- self.target_max_chunk_words = 500
134
 
135
  def set_chunks(self, chunks: list):
136
  self._chunks = [self._add_stats_to_chunk(chunk) for chunk in chunks]
@@ -143,39 +124,45 @@ class ChunkManager:
143
  return chunk
144
 
145
  def _calculate_chunk_stats(self, text: str) -> dict:
 
146
  stats = {}
147
  try:
148
  stats['word_count'] = textstat.lexicon_count(text, removepunct=True)
149
  stats['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
150
  stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)
151
- except Exception:
152
  stats.update({'word_count': 0, 'flesch_reading_ease': 0, 'flesch_kincaid_grade': 0})
153
  return stats
154
 
155
  def format_chunk_stats(self, stats: dict) -> str:
 
156
  flesch_color = "green" if stats.get('flesch_reading_ease', 0) >= self.target_flesch_min else "red"
157
  grade_color = "green" if stats.get('flesch_kincaid_grade', 0) <= self.target_grade_max else "red"
158
  word_color = "green" if self.target_min_chunk_words <= stats.get('word_count', 0) <= self.target_max_chunk_words else "red"
159
-
160
  return (
161
- f"**Word Count:** <span style='color:{word_color}'>{stats.get('word_count', 0)}</span> | "
162
- f"**Reading Ease:** <span style='color:{flesch_color}'>{stats.get('flesch_reading_ease', 0):.2f}</span> | "
163
- f"**Grade Level:** <span style='color:{grade_color}'>{stats.get('flesch_kincaid_grade', 0):.2f}</span>"
164
  )
165
 
166
  def get_document_summary_stats(self) -> str:
 
167
  if not self._chunks:
168
  return "No document loaded."
169
-
170
  total_words = sum(c['stats']['word_count'] for c in self._chunks)
171
- avg_ease = sum(c['stats']['flesch_reading_ease'] for c in self._chunks) / len(self._chunks) if self._chunks else 0
172
- avg_grade = sum(c['stats']['flesch_kincaid_grade'] for c in self._chunks) / len(self._chunks) if self._chunks else 0
 
 
 
173
 
174
  return (
175
- f"**Total Chunks:** {len(self._chunks)} | "
176
- f"**Total Words:** {total_words} | "
177
- f"**Avg. Reading Ease:** {avg_ease:.2f} | "
178
- f"**Avg. Grade Level:** {avg_grade:.2f}"
179
  )
180
 
181
  def get_chunk_by_id(self, chunk_id: int) -> dict | None:
@@ -185,13 +172,7 @@ class ChunkManager:
185
  chunk = self.get_chunk_by_id(chunk_id)
186
  if chunk:
187
  chunk["content"] = new_content
188
- self._add_stats_to_chunk(chunk)
189
- # Optionally update title if it's derived from content
190
- if chunk["title"].startswith("[") or not re.match(r"^(#+)\s*(.*)", chunk["content"]):
191
- first_line = new_content.split('\n')[0].strip()
192
- chunk["title"] = (first_line[:75] + '...') if len(first_line) > 75 else first_line
193
- if not chunk["title"]: chunk["title"] = "[Empty Section]"
194
-
195
 
196
  def delete_chunk(self, chunk_id: int):
197
  self._chunks = [c for c in self._chunks if c["id"] != chunk_id]
@@ -202,53 +183,61 @@ class ChunkManager:
202
  def get_final_markdown(self) -> str:
203
  if not self._chunks:
204
  return "No content to display."
205
- return "\n\n".join(f"# {c['title']}\n{c['content']}" for c in self._chunks)
 
 
 
 
 
 
 
 
 
206
 
207
  def set_targets(self, flesch_min: float, grade_max: float, min_words: int, max_words: int):
208
  self.target_flesch_min = flesch_min
209
  self.target_grade_max = grade_max
210
  self.target_min_chunk_words = min_words
211
  self.target_max_chunk_words = max_words
212
- # Recalculate stats for all chunks with new targets
213
- self.set_chunks(self.get_chunks())
 
 
214
 
215
- # --- Streamlit UI ---
216
  st.set_page_config(layout="wide", page_title="Webpage Content Editor")
217
 
218
- # Initialize session state variables if they don't exist
219
  def init_session_state():
220
- if 'chunk_manager' not in st.session_state:
221
- st.session_state.chunk_manager = ChunkManager()
222
- if 'content_processor' not in st.session_state:
223
- st.session_state.content_processor = WebpageContentProcessor()
224
  if 'selected_chunk_id' not in st.session_state:
225
  st.session_state.selected_chunk_id = None
226
  if 'status_message' not in st.session_state:
227
  st.session_state.status_message = ""
228
- if 'url_input' not in st.session_state:
229
- st.session_state.url_input = ""
230
 
231
  init_session_state()
232
 
233
- processor = st.session_state.content_processor
234
- manager = st.session_state.chunk_manager
 
 
235
 
236
  st.title("✨ Webpage Content Editor")
237
- st.caption("Created by [Emilija Gjorgjevska](https://www.linkedin.com/in/emilijagjorgjevska/) | Inspired by Andrea Volpini's work on content chunking.")
238
 
239
  st.info(
240
- "**Note:** Some URLs may be blocked due to server policies (like bot detection). "
241
- "This is an early version, so expect a few bugs!",
242
  icon="ℹ️"
243
  )
244
 
245
- url_input = st.text_input("Enter a webpage URL to begin", value=st.session_state.url_input, key="url_input_widget")
246
-
247
- if st.button("Process URL", use_container_width=True):
248
- st.session_state.url_input = st.session_state.url_input_widget
249
- if st.session_state.url_input:
250
- with st.spinner("Fetching and processing content..."):
251
- markdown = processor.fetch_and_convert_to_markdown(st.session_state.url_input)
252
  if "Error" in markdown:
253
  st.session_state.status_message = markdown
254
  manager.set_chunks([])
@@ -260,38 +249,40 @@ if st.button("Process URL", use_container_width=True):
260
  st.session_state.status_message = f"Successfully processed {len(chunks)} chunks."
261
  st.session_state.selected_chunk_id = chunks[0]['id']
262
  else:
263
- st.session_state.status_message = "Could not extract content chunks."
264
  st.session_state.selected_chunk_id = None
265
- st.rerun()
266
 
 
267
  if st.session_state.status_message:
268
  st.toast(st.session_state.status_message)
269
- st.session_state.status_message = ""
270
 
 
271
  tab1, tab2 = st.tabs(["Chunk Editor", "Settings & Overview"])
272
 
273
  with tab1:
274
  chunks = manager.get_chunks()
275
  if not chunks:
276
- st.write("Process a URL to start editing chunks.")
277
  else:
278
  chunk_ids = [c['id'] for c in chunks]
279
- # Ensure selected_chunk_id is valid
280
  if st.session_state.selected_chunk_id not in chunk_ids:
281
  st.session_state.selected_chunk_id = chunk_ids[0] if chunk_ids else None
282
 
283
  if st.session_state.selected_chunk_id is not None:
284
- chunk_options = {c['id']: f"Chunk {c['id']}: {c['title']}" for c in chunks}
285
 
286
- # The selectbox's state is now managed directly by st.session_state.selected_chunk_id
287
  selected_id = st.selectbox(
288
  "Select a chunk to edit",
289
  options=chunk_ids,
290
- format_func=lambda x: chunk_options.get(x, "Invalid Chunk"),
291
  index=chunk_ids.index(st.session_state.selected_chunk_id)
292
  )
293
-
294
- # Update the session state ONLY if the user selection has changed
295
  if selected_id != st.session_state.selected_chunk_id:
296
  st.session_state.selected_chunk_id = selected_id
297
  st.rerun()
@@ -299,24 +290,28 @@ with tab1:
299
  selected_chunk = manager.get_chunk_by_id(st.session_state.selected_chunk_id)
300
 
301
  if selected_chunk:
 
302
  st.markdown(manager.format_chunk_stats(selected_chunk['stats']), unsafe_allow_html=True)
303
 
 
304
  edited_content = st.text_area(
305
  "Chunk Content",
306
  value=selected_chunk['content'],
307
- height=300,
308
- key=f"editor_{selected_chunk['id']}" # Unique key forces widget to re-render
309
  )
310
 
311
- col1, col2, _ = st.columns([1, 1, 4])
 
312
  if col1.button("Update Chunk", use_container_width=True, key=f"update_{selected_chunk['id']}"):
313
  manager.update_chunk_content(selected_chunk['id'], edited_content)
314
- st.session_state.status_message = "Chunk updated!"
315
  st.rerun()
316
 
317
  if col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"):
318
  manager.delete_chunk(selected_chunk['id'])
319
- st.session_state.status_message = "Chunk deleted!"
 
320
  remaining_chunks = manager.get_chunks()
321
  st.session_state.selected_chunk_id = remaining_chunks[0]['id'] if remaining_chunks else None
322
  st.rerun()
@@ -327,6 +322,7 @@ with tab2:
327
 
328
  st.subheader("Content Targets")
329
  with st.form("targets_form"):
 
330
  c1, c2 = st.columns(2)
331
  f_min = c1.number_input("Min Flesch Reading Ease", value=float(manager.target_flesch_min))
332
  g_max = c2.number_input("Max Flesch-Kincaid Grade", value=float(manager.target_grade_max))
@@ -335,8 +331,8 @@ with tab2:
335
 
336
  if st.form_submit_button("Set New Targets", use_container_width=True):
337
  manager.set_targets(f_min, g_max, w_min, w_max)
338
- st.session_state.status_message = "Targets updated."
339
  st.rerun()
340
 
341
- st.subheader("Final Document")
342
- st.text_area("Compiled Markdown", manager.get_final_markdown(), height=400, disabled=False, key="final_markdown")
 
5
  import re
6
  from llama_index.core.node_parser import MarkdownNodeParser
7
  from llama_index.core.schema import Document, MetadataMode
8
+ import textstat
9
+
10
+ # --- Core Logic Classes ---
11
 
12
  class WebpageContentProcessor:
13
  """
14
  Handles fetching, converting, and parsing webpage content into structured chunks.
15
+ This class is responsible for the entire content processing pipeline.
16
  """
17
  def __init__(self):
18
  pass
19
 
20
  def fetch_and_convert_to_markdown(self, url: str) -> str:
21
  """
22
+ Fetches HTML content from a URL, cleans it, and converts it to Markdown.
23
+ It intelligently tries to find the main content block of the page.
 
24
  """
25
  try:
26
  headers = {
 
31
  html_content = response.text
32
  soup = BeautifulSoup(html_content, 'html.parser')
33
 
34
+ # Remove non-content tags like scripts and styles
35
+ for tag_name in ['script', 'style', 'noscript', 'meta', 'link', 'header', 'footer', 'nav', 'aside']:
36
  for element in soup.find_all(tag_name):
37
  element.decompose()
38
 
39
+ # Find the main content area of the webpage
40
  content_for_conversion = soup.find('article') or soup.find('main') or \
41
+ soup.find('div', class_=re.compile(r'content|post|body')) or \
42
  soup.find('div', {'role': 'main'})
43
 
44
+ # Fallback to the entire body if no main content is found
45
  if not content_for_conversion:
46
+ content_for_conversion = soup.body
47
+ if not content_for_conversion:
48
+ return "Error: Could not find any content on the page."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
+ # Convert the cleaned HTML to Markdown
51
  markdown_output = convert_to_markdown(str(content_for_conversion))
52
+ # Clean up excessive newlines
53
+ markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output).strip()
 
54
  return markdown_output
55
 
56
  except requests.exceptions.Timeout:
57
+ return "Error: The request timed out. The server is taking too long to respond."
58
  except requests.exceptions.RequestException as e:
59
+ return f"Error fetching the URL: {e}. Please check the URL and your connection."
60
  except Exception as e:
61
+ return f"An unexpected error occurred during content processing: {e}"
62
 
63
  def parse_markdown_into_chunks(self, markdown_content: str) -> list:
64
  """
65
+ Parses Markdown content into logically separated chunks based on its structure.
66
+ Uses MarkdownNodeParser to respect headers and sections.
67
  """
68
  if not markdown_content or "Error" in markdown_content:
69
  return []
70
 
71
+ # This parser understands Markdown structure (headings, lists) and splits accordingly.
72
  parser = MarkdownNodeParser(include_metadata=True)
73
+ doc = Document(text=markdown_content)
74
  nodes = parser.get_nodes_from_documents([doc])
75
+
76
  structured_chunks = []
77
  for i, node in enumerate(nodes):
78
+ content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
79
+ if not content:
80
  continue
81
 
82
+ # Extract title from the markdown header if it exists
83
+ title_match = re.match(r"^(#+)\s*(.*)", content)
84
+ if title_match:
85
+ title = title_match.group(2).strip()
86
+ # The content should not include the title line itself
87
+ content_text = content[len(title_match.group(0)):].strip()
 
 
 
 
 
88
  else:
89
+ # If no header, use the first line as a fallback title
90
+ first_line = content.split('\n')[0].strip()
91
+ title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
92
+ content_text = content
93
+
94
+ # Ensure there's a title even for empty sections
95
+ if not title:
96
+ title = f"[Chunk {i+1}]"
97
 
98
  structured_chunks.append({
99
  "id": i,
100
+ "title": title,
101
  "content": content_text
102
  })
 
103
  return structured_chunks
104
 
105
  class ChunkManager:
106
+ """
107
+ Manages the state of chunks, including their content, statistics, and targets.
108
+ """
109
  def __init__(self):
110
  self._chunks = []
111
  self.target_flesch_min = 60
112
+ self.target_grade_max = 9
113
+ self.target_min_chunk_words = 40
114
+ self.target_max_chunk_words = 600
115
 
116
  def set_chunks(self, chunks: list):
117
  self._chunks = [self._add_stats_to_chunk(chunk) for chunk in chunks]
 
124
  return chunk
125
 
126
  def _calculate_chunk_stats(self, text: str) -> dict:
127
+ """Calculates readability and other metrics for a text chunk."""
128
  stats = {}
129
  try:
130
  stats['word_count'] = textstat.lexicon_count(text, removepunct=True)
131
  stats['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
132
  stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)
133
+ except (Exception, TypeError): # Catch potential errors from textstat
134
  stats.update({'word_count': 0, 'flesch_reading_ease': 0, 'flesch_kincaid_grade': 0})
135
  return stats
136
 
137
  def format_chunk_stats(self, stats: dict) -> str:
138
+ """Creates a formatted string of stats with color-coding based on targets."""
139
  flesch_color = "green" if stats.get('flesch_reading_ease', 0) >= self.target_flesch_min else "red"
140
  grade_color = "green" if stats.get('flesch_kincaid_grade', 0) <= self.target_grade_max else "red"
141
  word_color = "green" if self.target_min_chunk_words <= stats.get('word_count', 0) <= self.target_max_chunk_words else "red"
142
+
143
  return (
144
+ f"**Word Count:** <span style='color:{word_color};'>{stats.get('word_count', 0)}</span> &nbsp;&nbsp;|&nbsp;&nbsp; "
145
+ f"**Reading Ease:** <span style='color:{flesch_color};'>{stats.get('flesch_reading_ease', 0):.2f}</span> &nbsp;&nbsp;|&nbsp;&nbsp; "
146
+ f"**Grade Level:** <span style='color:{grade_color};'>{stats.get('flesch_kincaid_grade', 0):.2f}</span>"
147
  )
148
 
149
  def get_document_summary_stats(self) -> str:
150
+ """Calculates and formats stats for the entire document."""
151
  if not self._chunks:
152
  return "No document loaded."
153
+
154
  total_words = sum(c['stats']['word_count'] for c in self._chunks)
155
+ if len(self._chunks) > 0:
156
+ avg_ease = sum(c['stats']['flesch_reading_ease'] for c in self._chunks) / len(self._chunks)
157
+ avg_grade = sum(c['stats']['flesch_kincaid_grade'] for c in self._chunks) / len(self._chunks)
158
+ else:
159
+ avg_ease = avg_grade = 0
160
 
161
  return (
162
+ f"- **Total Chunks:** {len(self._chunks)}\n"
163
+ f"- **Total Words:** {total_words}\n"
164
+ f"- **Avg. Reading Ease:** {avg_ease:.2f}\n"
165
+ f"- **Avg. Grade Level:** {avg_grade:.2f}"
166
  )
167
 
168
  def get_chunk_by_id(self, chunk_id: int) -> dict | None:
 
172
  chunk = self.get_chunk_by_id(chunk_id)
173
  if chunk:
174
  chunk["content"] = new_content
175
+ self._add_stats_to_chunk(chunk) # Recalculate stats after update
 
 
 
 
 
 
176
 
177
  def delete_chunk(self, chunk_id: int):
178
  self._chunks = [c for c in self._chunks if c["id"] != chunk_id]
 
183
  def get_final_markdown(self) -> str:
184
  if not self._chunks:
185
  return "No content to display."
186
+ # Compile final document, adding headers back for chunks that have them
187
+ final_doc_parts = []
188
+ for c in self._chunks:
189
+ title_is_header = re.match(r"^(#+)\s*(.*)", c['title']) is None
190
+ if not c['title'].startswith("[") and not title_is_header:
191
+ final_doc_parts.append(f"## {c['title']}\n\n{c['content']}")
192
+ else:
193
+ final_doc_parts.append(c['content'])
194
+ return "\n\n---\n\n".join(final_doc_parts)
195
+
196
 
197
  def set_targets(self, flesch_min: float, grade_max: float, min_words: int, max_words: int):
198
  self.target_flesch_min = flesch_min
199
  self.target_grade_max = grade_max
200
  self.target_min_chunk_words = min_words
201
  self.target_max_chunk_words = max_words
202
+ # Recalculate stats for all chunks to reflect new targets
203
+ self.set_chunks(self.get_chunks())
204
+
205
+ # --- Streamlit UI Application ---
206
 
 
207
  st.set_page_config(layout="wide", page_title="Webpage Content Editor")
208
 
209
+ # Initialize session state for managers and UI state
210
  def init_session_state():
211
+ if 'processor' not in st.session_state:
212
+ st.session_state.processor = WebpageContentProcessor()
213
+ if 'manager' not in st.session_state:
214
+ st.session_state.manager = ChunkManager()
215
  if 'selected_chunk_id' not in st.session_state:
216
  st.session_state.selected_chunk_id = None
217
  if 'status_message' not in st.session_state:
218
  st.session_state.status_message = ""
 
 
219
 
220
  init_session_state()
221
 
222
+ processor = st.session_state.processor
223
+ manager = st.session_state.manager
224
+
225
+ # --- Page Layout ---
226
 
227
  st.title("✨ Webpage Content Editor")
228
+ st.caption("A tool to fetch, chunk, and refine web content.")
229
 
230
  st.info(
231
+ "**Note:** Some websites may block content scraping. This is an early version, so you might encounter bugs.",
 
232
  icon="ℹ️"
233
  )
234
 
235
+ # URL input and processing button
236
+ url_input = st.text_input("Enter a webpage URL to start", key="url_input")
237
+ if st.button("Process URL", use_container_width=True, type="primary"):
238
+ if url_input:
239
+ with st.spinner("Fetching and chunking content..."):
240
+ markdown = processor.fetch_and_convert_to_markdown(url_input)
 
241
  if "Error" in markdown:
242
  st.session_state.status_message = markdown
243
  manager.set_chunks([])
 
249
  st.session_state.status_message = f"Successfully processed {len(chunks)} chunks."
250
  st.session_state.selected_chunk_id = chunks[0]['id']
251
  else:
252
+ st.session_state.status_message = "Could not extract any content chunks."
253
  st.session_state.selected_chunk_id = None
254
+ st.rerun() # Rerun to update the UI with new state
255
 
256
+ # Display status messages as toasts
257
  if st.session_state.status_message:
258
  st.toast(st.session_state.status_message)
259
+ st.session_state.status_message = "" # Clear after displaying
260
 
261
+ # Main UI with tabs
262
  tab1, tab2 = st.tabs(["Chunk Editor", "Settings & Overview"])
263
 
264
  with tab1:
265
  chunks = manager.get_chunks()
266
  if not chunks:
267
+ st.write("Process a URL to begin editing content chunks.")
268
  else:
269
  chunk_ids = [c['id'] for c in chunks]
270
+ # Ensure the selected chunk ID is valid
271
  if st.session_state.selected_chunk_id not in chunk_ids:
272
  st.session_state.selected_chunk_id = chunk_ids[0] if chunk_ids else None
273
 
274
  if st.session_state.selected_chunk_id is not None:
275
+ chunk_options = {c['id']: c['title'] for c in chunks}
276
 
277
+ # Dropdown to select a chunk for editing
278
  selected_id = st.selectbox(
279
  "Select a chunk to edit",
280
  options=chunk_ids,
281
+ format_func=lambda x: f"Chunk {x}: {chunk_options.get(x, 'N/A')}",
282
  index=chunk_ids.index(st.session_state.selected_chunk_id)
283
  )
284
+
285
+ # Update state if the selection changes
286
  if selected_id != st.session_state.selected_chunk_id:
287
  st.session_state.selected_chunk_id = selected_id
288
  st.rerun()
 
290
  selected_chunk = manager.get_chunk_by_id(st.session_state.selected_chunk_id)
291
 
292
  if selected_chunk:
293
+ st.markdown(f"**Editing: {selected_chunk['title']}**")
294
  st.markdown(manager.format_chunk_stats(selected_chunk['stats']), unsafe_allow_html=True)
295
 
296
+ # Text area for editing the selected chunk's content
297
  edited_content = st.text_area(
298
  "Chunk Content",
299
  value=selected_chunk['content'],
300
+ height=350,
301
+ key=f"editor_{selected_chunk['id']}" # Unique key ensures the widget updates
302
  )
303
 
304
+ # Action buttons for the selected chunk
305
+ col1, col2, _ = st.columns([1, 1, 5])
306
  if col1.button("Update Chunk", use_container_width=True, key=f"update_{selected_chunk['id']}"):
307
  manager.update_chunk_content(selected_chunk['id'], edited_content)
308
+ st.session_state.status_message = "Chunk updated successfully!"
309
  st.rerun()
310
 
311
  if col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"):
312
  manager.delete_chunk(selected_chunk['id'])
313
+ st.session_state.status_message = "Chunk deleted."
314
+ # Select the next available chunk or reset
315
  remaining_chunks = manager.get_chunks()
316
  st.session_state.selected_chunk_id = remaining_chunks[0]['id'] if remaining_chunks else None
317
  st.rerun()
 
322
 
323
  st.subheader("Content Targets")
324
  with st.form("targets_form"):
325
+ st.write("Set readability targets to guide your editing. See color feedback in the editor.")
326
  c1, c2 = st.columns(2)
327
  f_min = c1.number_input("Min Flesch Reading Ease", value=float(manager.target_flesch_min))
328
  g_max = c2.number_input("Max Flesch-Kincaid Grade", value=float(manager.target_grade_max))
 
331
 
332
  if st.form_submit_button("Set New Targets", use_container_width=True):
333
  manager.set_targets(f_min, g_max, w_min, w_max)
334
+ st.session_state.status_message = "Content targets have been updated."
335
  st.rerun()
336
 
337
+ st.subheader("Final Compiled Document")
338
+ st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")