Em4e committed on
Commit
35de2ed
·
verified ·
1 Parent(s): 5c1ddc7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -120
app.py CHANGED
@@ -25,9 +25,7 @@ class WebpageContentProcessor:
25
  response = requests.get(url, timeout=10) # Add a timeout for robustness
26
  response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
27
  html_content = response.text
28
-
29
  soup = BeautifulSoup(html_content, 'html.parser')
30
-
31
  # Aggressive initial removal of script, style, and meta tags that are never content.
32
  for tag_name in ['script', 'style', 'noscript', 'meta', 'link']:
33
  for element in soup.find_all(tag_name):
@@ -41,23 +39,23 @@ class WebpageContentProcessor:
41
  soup.find('div', {'role': 'main'})
42
 
43
  # Fallback logic if main content container wasn't found
44
- if not content_for_conversion:
45
- first_h1 = soup.find('h1')
46
- if first_h1:
47
  candidate_container = first_h1.parent
48
  found_main_wrapper_via_h1_parent = False
49
  # Check up to 5 parent levels for a suitable content wrapper
50
- for _ in range(5):
51
- if candidate_container is None:
52
  break
53
- if candidate_container.name in ['article', 'main', 'section', 'div'] and \
54
  any(cls in candidate_container.get('class', []) for cls in ['content', 'post-body', 'article-content', 'entry-content', 'main-content']) or \
55
  candidate_container.get('role') == 'main':
56
  content_for_conversion = candidate_container
57
  found_main_wrapper_via_h1_parent = True
58
  break
59
- candidate_container = candidate_container.parent
60
-
61
  # If no clear wrapper found via H1's parent, take H1 and its direct siblings as a fallback
62
  if not found_main_wrapper_via_h1_parent:
63
  temp_soup = BeautifulSoup('', 'html.parser')
@@ -67,7 +65,7 @@ class WebpageContentProcessor:
67
  temp_soup.append(current_element)
68
  current_element = current_element.next_sibling
69
  content_for_conversion = temp_soup
70
- else:
71
  # Ultimate fallback: use the entire body if no specific content tags or H1 found
72
  content_for_conversion = soup.body
73
 
@@ -81,7 +79,7 @@ class WebpageContentProcessor:
81
  'map', 'area', 'embed', 'object', 'param', 'applet', 'bgsound', 'frame',
82
  'frameset', 'noframes', 'template', 'slot', 'portal', 'datalist', 'keygen',
83
  'output', 'progress', 'meter', 'details', 'summary', 'dialog', 'menu',
84
- 'menuitem', 'command', 'hr', 'figure', 'figcaption', 'cite',
85
  '.social-share', '.comments', '.related-posts', '.pagination',
86
  '.breadcrumbs', '.pop-up', '.modal', '.overlay', '.cookie-consent',
87
  '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
@@ -90,26 +88,26 @@ class WebpageContentProcessor:
90
  '[class*="announcement"]', '[class*="fixed-bottom"]', '[class*="fixed-top"]',
91
  '[id*="promo"]', '[id*="ad"]', '[id*="banner"]', '[id*="popup"]', '[id*="modal"]',
92
  '[id*="overlay"]', '[id*="cookie"]', '[id*="skip"]', '[id*="navbar"]', '[id*="menu"]',
93
- '.hidden', '.visually-hidden',
94
- '.no-print', '.print-hide',
95
- '.wp-block-navigation', '.wp-block-group.is-style-stripes',
96
- '[class*="column"]', '[class*="grid"]'
97
- ]
98
 
99
  for selector in unwanted_elements_in_content:
100
- if re.match(r'^[a-zA-Z0-9]+$', selector):
101
- for element in content_for_conversion.find_all(selector):
102
  element.decompose()
103
- else:
104
- for element in content_for_conversion.select(selector):
105
  element.decompose()
106
-
107
  markdown_output = convert_to_markdown(str(content_for_conversion))
108
 
109
  # Post-processing: Clean up resulting Markdown
110
  markdown_output = re.sub(r'\n\s*\n\s*\n+', '\n\n', markdown_output)
111
  markdown_output = re.sub(r'^\s*[\*\-]\s*$', '', markdown_output, flags=re.MULTILINE)
112
- markdown_output = re.sub(r'\*{3,}', '', markdown_output)
113
  markdown_output = markdown_output.strip()
114
 
115
  return markdown_output
@@ -135,21 +133,19 @@ class WebpageContentProcessor:
135
  print(f"✅ Parsed {len(nodes)} nodes from Markdown.") # Debug print
136
 
137
  structured_chunks = []
138
- current_id = 0
139
 
140
  for node in nodes:
141
  pure_text_content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
142
-
143
  heading_title = ""
144
  content_text = pure_text_content
145
 
146
  heading_match = re.match(r"^(#+)\s*(.*)", pure_text_content)
147
-
148
  if heading_match:
149
  heading_title = heading_match.group(2).strip()
150
  content_text = pure_text_content[len(heading_match.group(0)):].strip()
151
- if not heading_title:
152
- heading_title = "[Untitled Section]"
153
  else:
154
  first_line = content_text.split('\n')[0].strip()
155
  heading_title = first_line[:70].strip() + "..." if len(first_line) > 70 else first_line
@@ -165,7 +161,7 @@ class WebpageContentProcessor:
165
  "original_node": node # Keep reference to the original LlamaIndex node
166
  })
167
  current_id += 1
168
-
169
  return structured_chunks
170
 
171
  class ChunkManager:
@@ -198,14 +194,14 @@ class ChunkManager:
198
  (Private helper method, SRP for stats calculation)
199
  """
200
  stats = {}
201
- cleaned_text = re.sub(r'#+\s*', '', text)
202
- cleaned_text = re.sub(r'[\*\-]\s*', '', cleaned_text)
203
- cleaned_text = re.sub(r'\n\s*\n+', ' ', cleaned_text).strip()
204
 
205
  stats['word_count'] = textstat.lexicon_count(cleaned_text, removepunct=True)
206
  stats['char_count'] = len(cleaned_text)
207
  stats['sentence_count'] = textstat.sentence_count(cleaned_text)
208
-
209
  if stats['sentence_count'] > 0:
210
  stats['avg_sentence_length'] = stats['word_count'] / stats['sentence_count']
211
  else:
@@ -217,17 +213,16 @@ class ChunkManager:
217
  stats['flesch_reading_ease'] = textstat.flesch_reading_ease(cleaned_text)
218
  except Exception:
219
  stats['flesch_reading_ease'] = 0
220
-
221
  try:
222
  stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(cleaned_text)
223
  except Exception:
224
  stats['flesch_kincaid_grade'] = 0
225
-
226
  try:
227
  stats['gunning_fog_score'] = textstat.gunning_fog(cleaned_text)
228
  except Exception:
229
  stats['gunning_fog_score'] = 0
230
-
231
  return stats
232
 
233
  def format_chunk_stats(self, stats: dict) -> str:
@@ -262,7 +257,7 @@ class ChunkManager:
262
  total_chars = 0
263
  total_sentences = 0
264
  total_paragraphs = 0
265
-
266
  all_content_text = ""
267
  for chunk in self._chunks:
268
  content_text_for_stats = chunk['content']
@@ -280,10 +275,10 @@ class ChunkManager:
280
  doc_stats_str += f"- **Total Characters:** {total_chars}\n"
281
  doc_stats_str += f"- **Total Sentences:** {total_sentences}\n"
282
  doc_stats_str += f"- **Total Paragraphs:** {total_paragraphs}\n"
283
-
284
  if len(self._chunks) > 0:
285
  doc_stats_str += f"- **Average Words per Chunk:** {total_words / len(self._chunks):.2f}\n"
286
-
287
  if all_content_text.strip():
288
  overall_stats = self._calculate_chunk_stats(all_content_text)
289
  doc_stats_str += f"- **Overall Flesch Reading Ease:** {overall_stats['flesch_reading_ease']:.2f}\n"
@@ -292,7 +287,6 @@ class ChunkManager:
292
  doc_stats_str += f"- **Overall Average Sentence Length:** {overall_stats['avg_sentence_length']:.2f} words\n"
293
  else:
294
  doc_stats_str += "- No content available for overall readability metrics.\n"
295
-
296
  return doc_stats_str
297
 
298
  def get_chunk_by_id(self, chunk_id: int) -> dict | None:
@@ -336,25 +330,21 @@ class ChunkManager:
336
  self._chunks = [chunk for chunk in self._chunks if chunk["id"] != chunk_id]
337
  if len(self._chunks) == initial_chunk_count:
338
  return False # Chunk not found
339
-
340
  # Re-index IDs to be sequential again
341
  for i, chunk in enumerate(self._chunks):
342
  chunk['id'] = i
343
-
344
  return True
345
-
346
  def get_final_markdown(self) -> str:
347
  """Compiles all current chunks into a single Markdown string."""
348
  final_md = ""
349
  if not self._chunks:
350
  return "No content to compile. Please process a URL first."
351
-
352
  for chunk in self._chunks:
353
  # Use H1 heading if title is meaningful
354
  if not chunk["title"].startswith("[") and chunk["title"]:
355
  final_md += f"# {chunk['title']}\n\n"
356
  final_md += f"{chunk['content']}\n\n"
357
-
358
  return final_md.strip()
359
 
360
  def set_targets(self, flesch_min: float, grade_max: float, min_words: int, max_words: int):
@@ -365,7 +355,7 @@ class ChunkManager:
365
  self.target_max_chunk_words = max_words
366
  # Recalculate stats for all chunks to reflect new targets in color coding (if displayed)
367
  for chunk in self._chunks:
368
- chunk['stats'] = self._calculate_chunk_stats(chunk['content'])
369
 
370
  # --- Streamlit UI Definition ---
371
  st.set_page_config(layout="wide", page_title="Chunk-Powered Webpage Editor")
@@ -384,35 +374,23 @@ if 'chunk_content_editor' not in st.session_state:
384
  if 'final_markdown' not in st.session_state:
385
  st.session_state.final_markdown = "Click 'Compile All Chunks' to see the final document with your edits."
386
 
387
-
388
  # Instantiate the managers
389
  content_processor = st.session_state.content_processor
390
  chunk_manager = st.session_state.chunk_manager
391
 
392
  st.markdown("# <center>✨ Chunk-Powered Webpage Editor ✨</center>", unsafe_allow_html=True)
393
- st.markdown("""
394
- Enter a URL, fetch its content, and break it into editable 'chunks'.
395
- Review statistics, set targets, edit chunks, and compile your final Markdown.
396
-
397
- <br>
398
-
399
- <div style="font-size: 0.9em; margin-bottom: 12px;">
400
- Inspired by <a href="https://www.linkedin.com/pulse/understanding-chunking-google-ai-mode-practical-content-volpini-zseaf/" target="_blank">Andrea Volpini</a>
401
- </div>
402
-
403
- <div style="display: flex; justify-content: flex-start; align-items: center; gap: 16px;">
404
  <span>Runs best on Desktop. App created by <a href="https://www.linkedin.com/in/emilijagjorgjevska/" target="_blank">Emilija Gjorgjevska</a></span>
405
  <a href="https://buymeacoffee.com/emiliagjorgjevska" target="_blank">
406
  <img src="https://cdn.buymeacoffee.com/buttons/v2/default-yellow.png" alt="Buy Me A Coffee" style="height: 30px;">
407
- </a>
408
- </div>
409
- <br>
410
- """, unsafe_allow_html=True)
411
  # --- URL Input and Processing ---
412
  col1, col2 = st.columns([4, 1])
413
  with col1:
414
  url_input = st.text_input(
415
- label="Enter Webpage URL",
416
  placeholder="e.g., https://www.llamaindex.ai/blog/what-is-llamaindex",
417
  key="url_input"
418
  )
@@ -430,7 +408,7 @@ if process_button:
430
  else:
431
  with st.spinner("Processing URL..."):
432
  markdown_content = content_processor.fetch_and_convert_to_markdown(url_input)
433
-
434
  if "Error" in markdown_content:
435
  chunk_manager.set_chunks([])
436
  st.session_state.status_message = markdown_content
@@ -438,62 +416,38 @@ if process_button:
438
  chunks = content_processor.parse_markdown_into_chunks(markdown_content)
439
  chunk_manager.set_chunks(chunks)
440
  st.session_state.status_message = "URL processed successfully!" if chunks else "URL processed, but no content chunks could be extracted."
441
-
442
  if chunks:
443
  st.session_state.chunk_selector = chunk_manager.get_chunk_titles_for_dropdown()[0]
444
  else:
445
  st.session_state.chunk_selector = None
446
 
447
-
448
  # --- Tabs for Editor and Overview ---
449
  tab1, tab2 = st.tabs(["Editor", "Document Overview & Targets"])
450
 
451
  with tab1:
452
  st.markdown("## Edit Chunks Individually")
453
-
454
- col1, col2 = st.columns([2, 1])
455
-
456
- with col1:
457
- chunk_selector_options = chunk_manager.get_chunk_titles_for_dropdown()
458
- if chunk_selector_options:
459
- try:
460
- # Find the index of the currently selected item to handle updates
461
- current_selection_index = chunk_selector_options.index(st.session_state.chunk_selector)
462
- except (ValueError, TypeError):
463
- current_selection_index = 0
464
-
465
- selected_chunk_title = st.selectbox(
466
- label="Select Chunk to Edit",
467
- options=chunk_selector_options,
468
- index=current_selection_index,
469
- key="chunk_selector"
470
- )
471
- else:
472
- selected_chunk_title = st.selectbox(
473
- label="Select Chunk to Edit",
474
- options=["No chunks available"],
475
- disabled=True
476
- )
477
 
478
- with col2:
479
- nav_col1, nav_col2 = st.columns(2)
480
- with nav_col1:
481
- if st.button("⬅️ Previous Chunk", use_container_width=True):
482
- if selected_chunk_title and "No chunks available" not in selected_chunk_title:
483
- current_id = int(selected_chunk_title.split(':')[0].strip())
484
- new_id = max(0, current_id - 1)
485
- new_chunk = chunk_manager.get_chunk_by_id(new_id)
486
- if new_chunk:
487
- st.session_state.chunk_selector = f"{new_chunk['id']}: {new_chunk['title']}"
488
-
489
- with nav_col2:
490
- if st.button("Next Chunk ➡️", use_container_width=True):
491
- if selected_chunk_title and "No chunks available" not in selected_chunk_title:
492
- current_id = int(selected_chunk_title.split(':')[0].strip())
493
- new_id = min(len(chunk_manager.get_chunks()) - 1, current_id + 1)
494
- new_chunk = chunk_manager.get_chunk_by_id(new_id)
495
- if new_chunk:
496
- st.session_state.chunk_selector = f"{new_chunk['id']}: {new_chunk['title']}"
 
497
 
498
  # Get the currently selected chunk
499
  selected_chunk = None
@@ -507,14 +461,14 @@ with tab1:
507
  value=selected_chunk["title"],
508
  disabled=True
509
  )
510
-
511
  chunk_content_editor = st.text_area(
512
  label="Chunk Content",
513
  value=selected_chunk["content"],
514
  height=250,
515
  key=f"editor_{selected_chunk['id']}" # Unique key to prevent state loss
516
  )
517
-
518
  st.markdown(
519
  chunk_manager.format_chunk_stats(selected_chunk['stats']),
520
  unsafe_allow_html=True
@@ -537,15 +491,13 @@ with tab1:
537
  else:
538
  st.session_state.chunk_selector = None
539
 
540
-
541
  else:
542
  st.text_input("Chunk Title (Auto-detected)", "Title of the selected chunk", disabled=True)
543
  st.text_area("Chunk Content", "Content of the selected chunk will appear here for editing.", height=250, disabled=True)
544
  st.markdown("Chunk statistics will appear here.")
545
-
546
  st.markdown("---")
547
  st.markdown("## Final Compiled Markdown")
548
-
549
  if st.button("Compile All Chunks", use_container_width=True):
550
  st.session_state.final_markdown = chunk_manager.get_final_markdown()
551
 
@@ -556,11 +508,10 @@ with tab1:
556
  key="final_markdown_output",
557
  disabled=False
558
  )
559
-
560
  with tab2:
561
  st.markdown("## Document Summary Statistics")
562
  st.markdown(chunk_manager.get_document_summary_stats(), unsafe_allow_html=True)
563
-
564
  st.markdown("---")
565
  st.markdown("## Content Targets")
566
  st.markdown("Adjust these targets to guide your writing and see visual feedback in the chunk selector (green=good, red=needs attention).")
@@ -573,13 +524,13 @@ with tab2:
573
  with col2:
574
  target_grade_max_input = st.number_input("Max Flesch-Kincaid Grade", value=float(chunk_manager.target_grade_max))
575
  target_max_chunk_words_input = st.number_input("Max Chunk Words", value=chunk_manager.target_max_chunk_words)
576
-
577
  submitted = st.form_submit_button("Set New Targets", use_container_width=True)
578
  if submitted:
579
  chunk_manager.set_targets(
580
- target_flesch_min_input,
581
- target_grade_max_input,
582
- int(target_min_chunk_words_input),
583
  int(target_max_chunk_words_input)
584
  )
585
  st.session_state.status_message = "Target settings updated."
 
25
  response = requests.get(url, timeout=10) # Add a timeout for robustness
26
  response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
27
  html_content = response.text
 
28
  soup = BeautifulSoup(html_content, 'html.parser')
 
29
  # Aggressive initial removal of script, style, and meta tags that are never content.
30
  for tag_name in ['script', 'style', 'noscript', 'meta', 'link']:
31
  for element in soup.find_all(tag_name):
 
39
  soup.find('div', {'role': 'main'})
40
 
41
  # Fallback logic if main content container wasn't found
42
+ if not content_for_conversion:
43
+ first_h1 = soup.find('h1')
44
+ if first_h1:
45
  candidate_container = first_h1.parent
46
  found_main_wrapper_via_h1_parent = False
47
  # Check up to 5 parent levels for a suitable content wrapper
48
+ for _ in range(5):
49
+ if candidate_container is None:
50
  break
51
+ if candidate_container.name in ['article', 'main', 'section', 'div'] and \
52
  any(cls in candidate_container.get('class', []) for cls in ['content', 'post-body', 'article-content', 'entry-content', 'main-content']) or \
53
  candidate_container.get('role') == 'main':
54
  content_for_conversion = candidate_container
55
  found_main_wrapper_via_h1_parent = True
56
  break
57
+ candidate_container = candidate_container.parent
58
+
59
  # If no clear wrapper found via H1's parent, take H1 and its direct siblings as a fallback
60
  if not found_main_wrapper_via_h1_parent:
61
  temp_soup = BeautifulSoup('', 'html.parser')
 
65
  temp_soup.append(current_element)
66
  current_element = current_element.next_sibling
67
  content_for_conversion = temp_soup
68
+ else:
69
  # Ultimate fallback: use the entire body if no specific content tags or H1 found
70
  content_for_conversion = soup.body
71
 
 
79
  'map', 'area', 'embed', 'object', 'param', 'applet', 'bgsound', 'frame',
80
  'frameset', 'noframes', 'template', 'slot', 'portal', 'datalist', 'keygen',
81
  'output', 'progress', 'meter', 'details', 'summary', 'dialog', 'menu',
82
+ 'menuitem', 'command', 'hr', 'figure', 'figcaption', 'cite',
83
  '.social-share', '.comments', '.related-posts', '.pagination',
84
  '.breadcrumbs', '.pop-up', '.modal', '.overlay', '.cookie-consent',
85
  '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
 
88
  '[class*="announcement"]', '[class*="fixed-bottom"]', '[class*="fixed-top"]',
89
  '[id*="promo"]', '[id*="ad"]', '[id*="banner"]', '[id*="popup"]', '[id*="modal"]',
90
  '[id*="overlay"]', '[id*="cookie"]', '[id*="skip"]', '[id*="navbar"]', '[id*="menu"]',
91
+ '.hidden', '.visually-hidden',
92
+ '.no-print', '.print-hide',
93
+ '.wp-block-navigation', '.wp-block-group.is-style-stripes',
94
+ '[class*="column"]', '[class*="grid"]'
95
+ ]
96
 
97
  for selector in unwanted_elements_in_content:
98
+ if re.match(r'^[a-zA-Z0-9]+$', selector):
99
+ for element in content_for_conversion.find_all(selector):
100
  element.decompose()
101
+ else:
102
+ for element in content_for_conversion.select(selector):
103
  element.decompose()
104
+
105
  markdown_output = convert_to_markdown(str(content_for_conversion))
106
 
107
  # Post-processing: Clean up resulting Markdown
108
  markdown_output = re.sub(r'\n\s*\n\s*\n+', '\n\n', markdown_output)
109
  markdown_output = re.sub(r'^\s*[\*\-]\s*$', '', markdown_output, flags=re.MULTILINE)
110
+ markdown_output = re.sub(r'\*{3,}', '', markdown_output)
111
  markdown_output = markdown_output.strip()
112
 
113
  return markdown_output
 
133
  print(f"✅ Parsed {len(nodes)} nodes from Markdown.") # Debug print
134
 
135
  structured_chunks = []
136
+ current_id = 0
137
 
138
  for node in nodes:
139
  pure_text_content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
 
140
  heading_title = ""
141
  content_text = pure_text_content
142
 
143
  heading_match = re.match(r"^(#+)\s*(.*)", pure_text_content)
 
144
  if heading_match:
145
  heading_title = heading_match.group(2).strip()
146
  content_text = pure_text_content[len(heading_match.group(0)):].strip()
147
+ if not heading_title:
148
+ heading_title = "[Untitled Section]"
149
  else:
150
  first_line = content_text.split('\n')[0].strip()
151
  heading_title = first_line[:70].strip() + "..." if len(first_line) > 70 else first_line
 
161
  "original_node": node # Keep reference to the original LlamaIndex node
162
  })
163
  current_id += 1
164
+
165
  return structured_chunks
166
 
167
  class ChunkManager:
 
194
  (Private helper method, SRP for stats calculation)
195
  """
196
  stats = {}
197
+ cleaned_text = re.sub(r'#+\s*', '', text)
198
+ cleaned_text = re.sub(r'[\*\-]\s*', '', cleaned_text)
199
+ cleaned_text = re.sub(r'\n\s*\n+', ' ', cleaned_text).strip()
200
 
201
  stats['word_count'] = textstat.lexicon_count(cleaned_text, removepunct=True)
202
  stats['char_count'] = len(cleaned_text)
203
  stats['sentence_count'] = textstat.sentence_count(cleaned_text)
204
+
205
  if stats['sentence_count'] > 0:
206
  stats['avg_sentence_length'] = stats['word_count'] / stats['sentence_count']
207
  else:
 
213
  stats['flesch_reading_ease'] = textstat.flesch_reading_ease(cleaned_text)
214
  except Exception:
215
  stats['flesch_reading_ease'] = 0
216
+
217
  try:
218
  stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(cleaned_text)
219
  except Exception:
220
  stats['flesch_kincaid_grade'] = 0
221
+
222
  try:
223
  stats['gunning_fog_score'] = textstat.gunning_fog(cleaned_text)
224
  except Exception:
225
  stats['gunning_fog_score'] = 0
 
226
  return stats
227
 
228
  def format_chunk_stats(self, stats: dict) -> str:
 
257
  total_chars = 0
258
  total_sentences = 0
259
  total_paragraphs = 0
260
+
261
  all_content_text = ""
262
  for chunk in self._chunks:
263
  content_text_for_stats = chunk['content']
 
275
  doc_stats_str += f"- **Total Characters:** {total_chars}\n"
276
  doc_stats_str += f"- **Total Sentences:** {total_sentences}\n"
277
  doc_stats_str += f"- **Total Paragraphs:** {total_paragraphs}\n"
278
+
279
  if len(self._chunks) > 0:
280
  doc_stats_str += f"- **Average Words per Chunk:** {total_words / len(self._chunks):.2f}\n"
281
+
282
  if all_content_text.strip():
283
  overall_stats = self._calculate_chunk_stats(all_content_text)
284
  doc_stats_str += f"- **Overall Flesch Reading Ease:** {overall_stats['flesch_reading_ease']:.2f}\n"
 
287
  doc_stats_str += f"- **Overall Average Sentence Length:** {overall_stats['avg_sentence_length']:.2f} words\n"
288
  else:
289
  doc_stats_str += "- No content available for overall readability metrics.\n"
 
290
  return doc_stats_str
291
 
292
  def get_chunk_by_id(self, chunk_id: int) -> dict | None:
 
330
  self._chunks = [chunk for chunk in self._chunks if chunk["id"] != chunk_id]
331
  if len(self._chunks) == initial_chunk_count:
332
  return False # Chunk not found
 
333
  # Re-index IDs to be sequential again
334
  for i, chunk in enumerate(self._chunks):
335
  chunk['id'] = i
 
336
  return True
337
+
338
  def get_final_markdown(self) -> str:
339
  """Compiles all current chunks into a single Markdown string."""
340
  final_md = ""
341
  if not self._chunks:
342
  return "No content to compile. Please process a URL first."
 
343
  for chunk in self._chunks:
344
  # Use H1 heading if title is meaningful
345
  if not chunk["title"].startswith("[") and chunk["title"]:
346
  final_md += f"# {chunk['title']}\n\n"
347
  final_md += f"{chunk['content']}\n\n"
 
348
  return final_md.strip()
349
 
350
  def set_targets(self, flesch_min: float, grade_max: float, min_words: int, max_words: int):
 
355
  self.target_max_chunk_words = max_words
356
  # Recalculate stats for all chunks to reflect new targets in color coding (if displayed)
357
  for chunk in self._chunks:
358
+ chunk['stats'] = self._calculate_chunk_stats(chunk['content'])
359
 
360
  # --- Streamlit UI Definition ---
361
  st.set_page_config(layout="wide", page_title="Chunk-Powered Webpage Editor")
 
374
  if 'final_markdown' not in st.session_state:
375
  st.session_state.final_markdown = "Click 'Compile All Chunks' to see the final document with your edits."
376
 
 
377
  # Instantiate the managers
378
  content_processor = st.session_state.content_processor
379
  chunk_manager = st.session_state.chunk_manager
380
 
381
  st.markdown("# <center>✨ Chunk-Powered Webpage Editor ✨</center>", unsafe_allow_html=True)
382
+ st.markdown("""Enter a URL, fetch its content, and break it into editable 'chunks'. Review statistics, set targets, edit chunks, and compile your final Markdown.<br><div style="font-size: 0.9em; margin-bottom: 12px;">
383
+ Inspired by <a href="https://www.linkedin.com/pulse/understanding-chunking-google-ai-mode-practical-content-volpini-zseaf/" target="_blank">Andrea Volpini</a></div><div style="display: flex; justify-content: flex-start; align-items: center; gap: 16px;">
 
 
 
 
 
 
 
 
 
384
  <span>Runs best on Desktop. App created by <a href="https://www.linkedin.com/in/emilijagjorgjevska/" target="_blank">Emilija Gjorgjevska</a></span>
385
  <a href="https://buymeacoffee.com/emiliagjorgjevska" target="_blank">
386
  <img src="https://cdn.buymeacoffee.com/buttons/v2/default-yellow.png" alt="Buy Me A Coffee" style="height: 30px;">
387
+ </a></div><br>""", unsafe_allow_html=True)
388
+
 
 
389
  # --- URL Input and Processing ---
390
  col1, col2 = st.columns([4, 1])
391
  with col1:
392
  url_input = st.text_input(
393
+ label="Enter Webpage URL",
394
  placeholder="e.g., https://www.llamaindex.ai/blog/what-is-llamaindex",
395
  key="url_input"
396
  )
 
408
  else:
409
  with st.spinner("Processing URL..."):
410
  markdown_content = content_processor.fetch_and_convert_to_markdown(url_input)
411
+
412
  if "Error" in markdown_content:
413
  chunk_manager.set_chunks([])
414
  st.session_state.status_message = markdown_content
 
416
  chunks = content_processor.parse_markdown_into_chunks(markdown_content)
417
  chunk_manager.set_chunks(chunks)
418
  st.session_state.status_message = "URL processed successfully!" if chunks else "URL processed, but no content chunks could be extracted."
419
+
420
  if chunks:
421
  st.session_state.chunk_selector = chunk_manager.get_chunk_titles_for_dropdown()[0]
422
  else:
423
  st.session_state.chunk_selector = None
424
 
 
425
  # --- Tabs for Editor and Overview ---
426
  tab1, tab2 = st.tabs(["Editor", "Document Overview & Targets"])
427
 
428
  with tab1:
429
  st.markdown("## Edit Chunks Individually")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
430
 
431
+ chunk_selector_options = chunk_manager.get_chunk_titles_for_dropdown()
432
+ if chunk_selector_options:
433
+ try:
434
+ # Find the index of the currently selected item to handle updates
435
+ current_selection_index = chunk_selector_options.index(st.session_state.chunk_selector)
436
+ except (ValueError, TypeError):
437
+ current_selection_index = 0
438
+
439
+ selected_chunk_title = st.selectbox(
440
+ label="Select Chunk to Edit",
441
+ options=chunk_selector_options,
442
+ index=current_selection_index,
443
+ key="chunk_selector"
444
+ )
445
+ else:
446
+ selected_chunk_title = st.selectbox(
447
+ label="Select Chunk to Edit",
448
+ options=["No chunks available"],
449
+ disabled=True
450
+ )
451
 
452
  # Get the currently selected chunk
453
  selected_chunk = None
 
461
  value=selected_chunk["title"],
462
  disabled=True
463
  )
464
+
465
  chunk_content_editor = st.text_area(
466
  label="Chunk Content",
467
  value=selected_chunk["content"],
468
  height=250,
469
  key=f"editor_{selected_chunk['id']}" # Unique key to prevent state loss
470
  )
471
+
472
  st.markdown(
473
  chunk_manager.format_chunk_stats(selected_chunk['stats']),
474
  unsafe_allow_html=True
 
491
  else:
492
  st.session_state.chunk_selector = None
493
 
 
494
  else:
495
  st.text_input("Chunk Title (Auto-detected)", "Title of the selected chunk", disabled=True)
496
  st.text_area("Chunk Content", "Content of the selected chunk will appear here for editing.", height=250, disabled=True)
497
  st.markdown("Chunk statistics will appear here.")
498
+
499
  st.markdown("---")
500
  st.markdown("## Final Compiled Markdown")
 
501
  if st.button("Compile All Chunks", use_container_width=True):
502
  st.session_state.final_markdown = chunk_manager.get_final_markdown()
503
 
 
508
  key="final_markdown_output",
509
  disabled=False
510
  )
511
+
512
  with tab2:
513
  st.markdown("## Document Summary Statistics")
514
  st.markdown(chunk_manager.get_document_summary_stats(), unsafe_allow_html=True)
 
515
  st.markdown("---")
516
  st.markdown("## Content Targets")
517
  st.markdown("Adjust these targets to guide your writing and see visual feedback in the chunk selector (green=good, red=needs attention).")
 
524
  with col2:
525
  target_grade_max_input = st.number_input("Max Flesch-Kincaid Grade", value=float(chunk_manager.target_grade_max))
526
  target_max_chunk_words_input = st.number_input("Max Chunk Words", value=chunk_manager.target_max_chunk_words)
527
+
528
  submitted = st.form_submit_button("Set New Targets", use_container_width=True)
529
  if submitted:
530
  chunk_manager.set_targets(
531
+ target_flesch_min_input,
532
+ target_grade_max_input,
533
+ int(target_min_chunk_words_input),
534
  int(target_max_chunk_words_input)
535
  )
536
  st.session_state.status_message = "Target settings updated."