Em4e committed on
Commit
dd5c454
·
verified ·
1 Parent(s): 0ec0963

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +576 -572
app.py CHANGED
@@ -1,573 +1,577 @@
1
- import streamlit as st
2
- import requests
3
- from bs4 import BeautifulSoup
4
- from html_to_markdown import convert_to_markdown
5
- import re
6
- from llama_index.core.node_parser import MarkdownNodeParser
7
- from llama_index.core.schema import Document, MetadataMode
8
- import textstat # For readability metrics
9
-
10
class WebpageContentProcessor:
    """
    Fetches a webpage, isolates its main content, converts it to Markdown,
    and parses that Markdown into structured, editable chunks.
    Adheres to the Single Responsibility Principle (SRP) for content processing.
    """

    # Tag names / CSS selectors for elements that are navigation, page chrome,
    # media or other boilerplate — never prose content we want to keep.
    _UNWANTED_SELECTORS = [
        'nav', 'header', 'footer', 'aside', 'iframe', 'form', 'button', 'input',
        'textarea', 'svg', 'canvas', 'audio', 'video', 'picture', 'source', 'track',
        'map', 'area', 'embed', 'object', 'param', 'applet', 'bgsound', 'frame',
        'frameset', 'noframes', 'template', 'slot', 'portal', 'datalist', 'keygen',
        'output', 'progress', 'meter', 'details', 'summary', 'dialog', 'menu',
        'menuitem', 'command', 'hr', 'figure', 'figcaption', 'cite',
        '.social-share', '.comments', '.related-posts', '.pagination',
        '.breadcrumbs', '.pop-up', '.modal', '.overlay', '.cookie-consent',
        '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
        '[role="complementary"]', '[role="search"]', '[role="menubar"]', '[role="toolbar"]',
        '[class*="utility"]', '[class*="global-nav"]', '[class*="skip"]', '[class*="toast"]',
        '[class*="announcement"]', '[class*="fixed-bottom"]', '[class*="fixed-top"]',
        '[id*="promo"]', '[id*="ad"]', '[id*="banner"]', '[id*="popup"]', '[id*="modal"]',
        '[id*="overlay"]', '[id*="cookie"]', '[id*="skip"]', '[id*="navbar"]', '[id*="menu"]',
        '.hidden', '.visually-hidden',
        '.no-print', '.print-hide',
        '.wp-block-navigation', '.wp-block-group.is-style-stripes',
        '[class*="column"]', '[class*="grid"]'
    ]

    def fetch_and_convert_to_markdown(self, url: str) -> str:
        """
        Fetch HTML from *url*, isolate the main content, strip boilerplate,
        and convert the result to Markdown.

        Returns the Markdown string on success. On failure returns a
        human-readable message starting with "Error" or
        "An unexpected error occurred" (callers screen on those prefixes).
        """
        try:
            response = requests.get(url, timeout=10)  # timeout guards against hung servers
            response.raise_for_status()  # surface 4xx/5xx as exceptions

            soup = BeautifulSoup(response.text, 'html.parser')

            # Aggressive initial removal of tags that never carry visible content.
            for tag_name in ['script', 'style', 'noscript', 'meta', 'link']:
                for element in soup.find_all(tag_name):
                    element.decompose()

            content_for_conversion = self._locate_main_content(soup)
            if not content_for_conversion:
                return "Error: Could not identify main content for conversion."

            self._strip_boilerplate(content_for_conversion)

            markdown_output = convert_to_markdown(str(content_for_conversion))

            # Post-processing: collapse blank-line runs, drop empty list
            # markers and stray emphasis runs left over from conversion.
            markdown_output = re.sub(r'\n\s*\n\s*\n+', '\n\n', markdown_output)
            markdown_output = re.sub(r'^\s*[\*\-]\s*$', '', markdown_output, flags=re.MULTILINE)
            markdown_output = re.sub(r'\*{3,}', '', markdown_output)
            return markdown_output.strip()

        except requests.exceptions.Timeout:
            return "Error: Request timed out. The server took too long to respond."
        except requests.exceptions.RequestException as e:
            return f"Error fetching URL: {e}. Please check the URL or your internet connection."
        except Exception as e:
            return f"An unexpected error occurred during HTML conversion: {e}"

    def _locate_main_content(self, soup):
        """Return the tag most likely to hold the page's main content, or None."""
        # Prefer semantic containers and common main-content divs.
        content = soup.find('article') or soup.find('main') or \
            soup.find('div', class_='main-content') or \
            soup.find('div', {'role': 'main'})
        if content:
            return content

        first_h1 = soup.find('h1')
        if not first_h1:
            # Ultimate fallback: the whole body (may be None for broken HTML).
            return soup.body

        # Walk up to 5 ancestor levels from the H1 looking for a wrapper.
        candidate = first_h1.parent
        for _ in range(5):
            if candidate is None:
                break
            has_content_class = candidate.name in ['article', 'main', 'section', 'div'] and \
                any(cls in candidate.get('class', []) for cls in
                    ['content', 'post-body', 'article-content', 'entry-content', 'main-content'])
            # Original precedence preserved: any tag with role="main" qualifies
            # regardless of its tag name.
            if has_content_class or candidate.get('role') == 'main':
                return candidate
            candidate = candidate.parent

        # No clear wrapper: keep the H1 plus everything that follows it.
        # BUGFIX: capture the siblings *before* append() relocates first_h1 —
        # the old code read first_h1.next_sibling after the move, which is
        # always None, so all content after the heading was silently dropped.
        trailing_siblings = list(first_h1.next_siblings)
        temp_soup = BeautifulSoup('', 'html.parser')
        temp_soup.append(first_h1)
        for sibling in trailing_siblings:
            temp_soup.append(sibling)
        return temp_soup

    def _strip_boilerplate(self, content) -> None:
        """Remove boilerplate elements from *content* in place."""
        for selector in self._UNWANTED_SELECTORS:
            if re.match(r'^[a-zA-Z0-9]+$', selector):
                # Bare tag name: find_all is cheaper than a CSS select.
                for element in content.find_all(selector):
                    element.decompose()
            else:
                for element in content.select(selector):
                    element.decompose()

    def parse_markdown_into_chunks(self, markdown_content: str) -> list:
        """
        Parse Markdown content into LlamaIndex nodes (chunks) and extract a
        title and body for each.

        Returns a list of dicts with keys: id, title, content, original_node.
        """
        # BUGFIX: the old check only matched two of the four error strings
        # fetch_and_convert_to_markdown can return; timeout and
        # "could not identify main content" messages slipped through and were
        # chunked as if they were page content. All error returns start with
        # one of these prefixes.
        if not markdown_content or markdown_content.startswith(
                ("Error", "An unexpected error occurred")):
            return []

        doc = Document(text=markdown_content, metadata={"filename": "webpage_content"})
        parser = MarkdownNodeParser(include_metadata=True)
        nodes = parser.get_nodes_from_documents([doc])
        print(f"✅ Parsed {len(nodes)} nodes from Markdown.")  # Debug print

        structured_chunks = []
        for current_id, node in enumerate(nodes):
            pure_text_content = node.get_content(metadata_mode=MetadataMode.NONE).strip()

            heading_title = ""
            content_text = pure_text_content

            heading_match = re.match(r"^(#+)\s*(.*)", pure_text_content)
            if heading_match:
                heading_title = heading_match.group(2).strip()
                content_text = pure_text_content[len(heading_match.group(0)):].strip()
                if not heading_title:
                    heading_title = "[Untitled Section]"
            else:
                # No heading: derive a title from the first line (truncated).
                first_line = content_text.split('\n')[0].strip()
                heading_title = (first_line[:70].strip() + "...") if len(first_line) > 70 else first_line
                if not heading_title:
                    heading_title = "[Empty Section]"
                elif not content_text:
                    heading_title = "[Empty Section]"

            structured_chunks.append({
                "id": current_id,
                "title": heading_title,
                "content": content_text,
                "original_node": node,  # keep a handle on the LlamaIndex node
            })

        return structured_chunks
170
-
171
- class ChunkManager:
172
- """
173
- Manages the collection of content chunks, their statistics, and target settings.
174
- Adheres to SRP for chunk data management and OCP by allowing new statistics
175
- or formatting without changing core chunk operations.
176
- """
177
- def __init__(self):
178
- self._chunks = []
179
- self.target_flesch_min = 60
180
- self.target_grade_max = 8
181
- self.target_min_chunk_words = 50
182
- self.target_max_chunk_words = 500
183
-
184
- def set_chunks(self, chunks: list):
185
- """Sets the internal list of chunks and calculates their initial statistics."""
186
- self._chunks = []
187
- for chunk in chunks:
188
- chunk['stats'] = self._calculate_chunk_stats(chunk['content'])
189
- self._chunks.append(chunk)
190
-
191
- def get_chunks(self) -> list:
192
- """Returns the current list of processed chunks."""
193
- return self._chunks
194
-
195
- def _calculate_chunk_stats(self, text: str) -> dict:
196
- """
197
- Calculates various linguistic statistics for a given text chunk.
198
- (Private helper method, SRP for stats calculation)
199
- """
200
- stats = {}
201
- cleaned_text = re.sub(r'#+\s*', '', text)
202
- cleaned_text = re.sub(r'[\*\-]\s*', '', cleaned_text)
203
- cleaned_text = re.sub(r'\n\s*\n+', ' ', cleaned_text).strip()
204
-
205
- stats['word_count'] = textstat.lexicon_count(cleaned_text, removepunct=True)
206
- stats['char_count'] = len(cleaned_text)
207
- stats['sentence_count'] = textstat.sentence_count(cleaned_text)
208
-
209
- if stats['sentence_count'] > 0:
210
- stats['avg_sentence_length'] = stats['word_count'] / stats['sentence_count']
211
- else:
212
- stats['avg_sentence_length'] = 0
213
-
214
- stats['paragraph_count'] = cleaned_text.count('\n\n') + 1 if cleaned_text else 0
215
-
216
- try:
217
- stats['flesch_reading_ease'] = textstat.flesch_reading_ease(cleaned_text)
218
- except Exception:
219
- stats['flesch_reading_ease'] = 0
220
-
221
- try:
222
- stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(cleaned_text)
223
- except Exception:
224
- stats['flesch_kincaid_grade'] = 0
225
-
226
- try:
227
- stats['gunning_fog_score'] = textstat.gunning_fog(cleaned_text)
228
- except Exception:
229
- stats['gunning_fog_score'] = 0
230
-
231
- return stats
232
-
233
- def format_chunk_stats(self, stats: dict) -> str:
234
- """
235
- Formats chunk statistics into a readable string, including explanations for readability scores.
236
- Adheres to SRP for formatting.
237
- """
238
- flesch_ease_color = "red" if stats['flesch_reading_ease'] < self.target_flesch_min else "green"
239
- kincaid_grade_color = "red" if stats['flesch_kincaid_grade'] > self.target_grade_max else "green"
240
- word_count_color = "red" if not (self.target_min_chunk_words <= stats['word_count'] <= self.target_max_chunk_words) else "green"
241
-
242
- stats_str = "#### Chunk Statistics:\n"
243
- stats_str += f"- **Word Count:** <span style='color:{word_count_color}'>{stats['word_count']}</span> (Target: {self.target_min_chunk_words}-{self.target_max_chunk_words})\n"
244
- stats_str += f"- **Character Count:** {stats['char_count']}\n"
245
- stats_str += f"- **Sentence Count:** {stats['sentence_count']}\n"
246
- stats_str += f"- **Avg Sentence Length:** {stats['avg_sentence_length']:.2f} words\n"
247
- stats_str += f"- **Paragraph Count:** {stats['paragraph_count']}\n"
248
- stats_str += f"- **Flesch Reading Ease:** <span style='color:{flesch_ease_color}'>{stats['flesch_reading_ease']:.2f}</span> (Higher scores mean easier to read.)\n"
249
- stats_str += f"- **Flesch-Kincaid Grade:** <span style='color:{kincaid_grade_color}'>{stats['flesch_kincaid_grade']:.2f}</span> (Indicates the U.S. grade level needed to understand the text.)\n"
250
- stats_str += f"- **Gunning Fog Score:** {stats['gunning_fog_score']:.2f}\n"
251
- return stats_str
252
-
253
- def get_document_summary_stats(self) -> str:
254
- """
255
- Aggregates statistics for the entire document across all managed chunks.
256
- Adheres to SRP for document-level summary.
257
- """
258
- if not self._chunks:
259
- return "No document loaded to generate statistics."
260
-
261
- total_words = 0
262
- total_chars = 0
263
- total_sentences = 0
264
- total_paragraphs = 0
265
-
266
- all_content_text = ""
267
- for chunk in self._chunks:
268
- content_text_for_stats = chunk['content']
269
- # Re-calculate stats for each chunk content to ensure summary is up-to-date
270
- current_chunk_stats = self._calculate_chunk_stats(content_text_for_stats)
271
- total_words += current_chunk_stats['word_count']
272
- total_chars += current_chunk_stats['char_count']
273
- total_sentences += current_chunk_stats['sentence_count']
274
- total_paragraphs += current_chunk_stats['paragraph_count']
275
- all_content_text += content_text_for_stats + "\n\n"
276
-
277
- doc_stats_str = "## Overall Document Statistics:\n"
278
- doc_stats_str += f"- **Total Chunks:** {len(self._chunks)}\n"
279
- doc_stats_str += f"- **Total Words:** {total_words}\n"
280
- doc_stats_str += f"- **Total Characters:** {total_chars}\n"
281
- doc_stats_str += f"- **Total Sentences:** {total_sentences}\n"
282
- doc_stats_str += f"- **Total Paragraphs:** {total_paragraphs}\n"
283
-
284
- if len(self._chunks) > 0:
285
- doc_stats_str += f"- **Average Words per Chunk:** {total_words / len(self._chunks):.2f}\n"
286
-
287
- if all_content_text.strip():
288
- overall_stats = self._calculate_chunk_stats(all_content_text)
289
- doc_stats_str += f"- **Overall Flesch Reading Ease:** {overall_stats['flesch_reading_ease']:.2f}\n"
290
- doc_stats_str += f"- **Overall Flesch-Kincaid Grade Level:** {overall_stats['flesch_kincaid_grade']:.2f}\n"
291
- doc_stats_str += f"- **Overall Gunning Fog Score:** {overall_stats['gunning_fog_score']:.2f}\n"
292
- doc_stats_str += f"- **Overall Average Sentence Length:** {overall_stats['avg_sentence_length']:.2f} words\n"
293
- else:
294
- doc_stats_str += "- No content available for overall readability metrics.\n"
295
-
296
- return doc_stats_str
297
-
298
- def get_chunk_by_id(self, chunk_id: int) -> dict | None:
299
- """Retrieves a chunk by its ID."""
300
- return next((chunk for chunk in self._chunks if chunk["id"] == chunk_id), None)
301
-
302
- def get_chunk_titles_for_dropdown(self) -> list:
303
- """Generates dropdown choices using plain text (no HTML)."""
304
- dropdown_choices = []
305
- for chunk in self._chunks:
306
- title = chunk['title']
307
- dropdown_choices.append(f"{chunk['id']}: {title}")
308
- return dropdown_choices
309
-
310
- def update_chunk_content(self, chunk_id: int, new_content: str) -> bool:
311
- """
312
- Updates the content of a chunk, recalculates its stats, and updates its title if needed.
313
- Returns True if successful, False otherwise.
314
- """
315
- for chunk in self._chunks:
316
- if chunk["id"] == chunk_id:
317
- chunk["content"] = new_content
318
- chunk["stats"] = self._calculate_chunk_stats(new_content)
319
- # Update chunk title if it was a placeholder or empty
320
- if chunk["title"].startswith("[") and chunk["title"].endswith("]") or not chunk["title"]:
321
- first_line = new_content.split('\n')[0].strip()
322
- chunk["title"] = first_line[:70].strip() + "..." if len(first_line) > 70 else first_line
323
- if not chunk["title"]:
324
- chunk["title"] = "[Empty Section]"
325
- elif not new_content:
326
- chunk["title"] = "[Empty Section]"
327
- return True
328
- return False
329
-
330
- def delete_chunk(self, chunk_id: int) -> bool:
331
- """
332
- Deletes a chunk by ID and re-indexes remaining chunks.
333
- Returns True if successful, False otherwise.
334
- """
335
- initial_chunk_count = len(self._chunks)
336
- self._chunks = [chunk for chunk in self._chunks if chunk["id"] != chunk_id]
337
- if len(self._chunks) == initial_chunk_count:
338
- return False # Chunk not found
339
-
340
- # Re-index IDs to be sequential again
341
- for i, chunk in enumerate(self._chunks):
342
- chunk['id'] = i
343
-
344
- return True
345
-
346
- def get_final_markdown(self) -> str:
347
- """Compiles all current chunks into a single Markdown string."""
348
- final_md = ""
349
- if not self._chunks:
350
- return "No content to compile. Please process a URL first."
351
-
352
- for chunk in self._chunks:
353
- # Use H1 heading if title is meaningful
354
- if not chunk["title"].startswith("[") and chunk["title"]:
355
- final_md += f"# {chunk['title']}\n\n"
356
- final_md += f"{chunk['content']}\n\n"
357
-
358
- return final_md.strip()
359
-
360
- def set_targets(self, flesch_min: float, grade_max: float, min_words: int, max_words: int):
361
- """Sets the global readability and word count targets."""
362
- self.target_flesch_min = flesch_min
363
- self.target_grade_max = grade_max
364
- self.target_min_chunk_words = min_words
365
- self.target_max_chunk_words = max_words
366
- # Recalculate stats for all chunks to reflect new targets in color coding (if displayed)
367
- for chunk in self._chunks:
368
- chunk['stats'] = self._calculate_chunk_stats(chunk['content'])
369
-
370
# --- Streamlit UI Definition ---
st.set_page_config(layout="wide", page_title="Chunk-Powered Webpage Editor")

# Seed session state on first run; subsequent reruns keep the user's data.
if 'chunk_manager' not in st.session_state:
    st.session_state.chunk_manager = ChunkManager()
if 'content_processor' not in st.session_state:
    st.session_state.content_processor = WebpageContentProcessor()
if 'status_message' not in st.session_state:
    st.session_state.status_message = ""
if 'chunk_selector' not in st.session_state:
    st.session_state.chunk_selector = None
if 'chunk_content_editor' not in st.session_state:
    st.session_state.chunk_content_editor = ""
if 'final_markdown' not in st.session_state:
    st.session_state.final_markdown = "Click 'Compile All Chunks' to see the final document with your edits."

# Short-hand handles to the session-scoped managers.
content_processor = st.session_state.content_processor
chunk_manager = st.session_state.chunk_manager

st.markdown("# <center>✨ Chunk-Powered Webpage Editor ✨</center>", unsafe_allow_html=True)
st.markdown(
    "Enter a URL, fetch its content, and break it into editable 'chunks'. "
    "Review statistics, set targets, edit chunks, and compile your final Markdown."
)

# --- URL Input and Processing ---
url_col, action_col = st.columns([4, 1])
with url_col:
    url_input = st.text_input(
        label="Enter Webpage URL",
        placeholder="e.g., https://www.llamaindex.ai/blog/what-is-llamaindex",
        key="url_input"
    )
with action_col:
    st.write("")  # vertical spacer
    st.write("")  # vertical spacer
    process_button = st.button("Process URL", use_container_width=True)

if st.session_state.status_message:
    st.info(st.session_state.status_message)

if process_button:
    if not url_input:
        st.session_state.status_message = "Please enter a URL to process."
    else:
        with st.spinner("Processing URL..."):
            markdown_content = content_processor.fetch_and_convert_to_markdown(url_input)

            if "Error" in markdown_content:
                chunk_manager.set_chunks([])
                st.session_state.status_message = markdown_content
            else:
                chunks = content_processor.parse_markdown_into_chunks(markdown_content)
                chunk_manager.set_chunks(chunks)
                st.session_state.status_message = "URL processed successfully!" if chunks else "URL processed, but no content chunks could be extracted."
                # Point the selector at the first chunk (or clear it).
                st.session_state.chunk_selector = (
                    chunk_manager.get_chunk_titles_for_dropdown()[0] if chunks else None
                )

# --- Tabs for Editor and Overview ---
tab1, tab2 = st.tabs(["Editor", "Document Overview & Targets"])

with tab1:
    st.markdown("## Edit Chunks Individually")

    select_col, nav_col = st.columns([2, 1])

    with select_col:
        chunk_selector_options = chunk_manager.get_chunk_titles_for_dropdown()
        if chunk_selector_options:
            # Re-locate the current selection so edits/deletes keep focus.
            try:
                current_selection_index = chunk_selector_options.index(st.session_state.chunk_selector)
            except (ValueError, TypeError):
                current_selection_index = 0

            selected_chunk_title = st.selectbox(
                label="Select Chunk to Edit",
                options=chunk_selector_options,
                index=current_selection_index,
                key="chunk_selector"
            )
        else:
            selected_chunk_title = st.selectbox(
                label="Select Chunk to Edit",
                options=["No chunks available"],
                disabled=True
            )

    with nav_col:
        prev_col, next_col = st.columns(2)
        with prev_col:
            if st.button("⬅️ Previous Chunk", use_container_width=True):
                if selected_chunk_title and "No chunks available" not in selected_chunk_title:
                    current_id = int(selected_chunk_title.split(':')[0].strip())
                    neighbour = chunk_manager.get_chunk_by_id(max(0, current_id - 1))
                    if neighbour:
                        st.session_state.chunk_selector = f"{neighbour['id']}: {neighbour['title']}"
        with next_col:
            if st.button("Next Chunk ➡️", use_container_width=True):
                if selected_chunk_title and "No chunks available" not in selected_chunk_title:
                    current_id = int(selected_chunk_title.split(':')[0].strip())
                    last_id = len(chunk_manager.get_chunks()) - 1
                    neighbour = chunk_manager.get_chunk_by_id(min(last_id, current_id + 1))
                    if neighbour:
                        st.session_state.chunk_selector = f"{neighbour['id']}: {neighbour['title']}"

    # Resolve the chunk the dropdown currently points at.
    selected_chunk = None
    if selected_chunk_title and "No chunks available" not in selected_chunk_title:
        current_id = int(selected_chunk_title.split(':')[0].strip())
        selected_chunk = chunk_manager.get_chunk_by_id(current_id)

    if selected_chunk:
        st.text_input(
            label="Chunk Title (Auto-detected)",
            value=selected_chunk["title"],
            disabled=True
        )

        chunk_content_editor = st.text_area(
            label="Chunk Content",
            value=selected_chunk["content"],
            height=250,
            key=f"editor_{selected_chunk['id']}"  # per-chunk key prevents state bleed
        )

        st.markdown(
            chunk_manager.format_chunk_stats(selected_chunk['stats']),
            unsafe_allow_html=True
        )

        update_col, delete_col, _ = st.columns([1, 1, 3])
        with update_col:
            if st.button("Update Selected Chunk", use_container_width=True):
                chunk_manager.update_chunk_content(selected_chunk['id'], chunk_content_editor)
                st.session_state.status_message = f"Chunk '{selected_chunk_title}' updated successfully!"
                # Refresh the dropdown label in case the title changed.
                st.session_state.chunk_selector = f"{selected_chunk['id']}: {chunk_manager.get_chunk_by_id(selected_chunk['id'])['title']}"

        with delete_col:
            if st.button("Delete Selected Chunk", use_container_width=True):
                chunk_manager.delete_chunk(selected_chunk['id'])
                st.session_state.status_message = f"Chunk '{selected_chunk_title}' deleted successfully!"
                remaining = chunk_manager.get_chunk_titles_for_dropdown()
                st.session_state.chunk_selector = remaining[0] if remaining else None
    else:
        # Disabled placeholders while no chunk is selected.
        st.text_input("Chunk Title (Auto-detected)", "Title of the selected chunk", disabled=True)
        st.text_area("Chunk Content", "Content of the selected chunk will appear here for editing.", height=250, disabled=True)
        st.markdown("Chunk statistics will appear here.")

    st.markdown("---")
    st.markdown("## Final Compiled Markdown")

    if st.button("Compile All Chunks", use_container_width=True):
        st.session_state.final_markdown = chunk_manager.get_final_markdown()

    st.text_area(
        label="Compiled Markdown",
        value=st.session_state.final_markdown,
        height=400,
        key="final_markdown_output",
        disabled=False
    )

with tab2:
    st.markdown("## Document Summary Statistics")
    st.markdown(chunk_manager.get_document_summary_stats(), unsafe_allow_html=True)

    st.markdown("---")
    st.markdown("## Content Targets")
    st.markdown("Adjust these targets to guide your writing and see visual feedback in the chunk selector (green=good, red=needs attention).")

    with st.form("targets_form"):
        left_col, right_col = st.columns(2)
        with left_col:
            target_flesch_min_input = st.number_input("Min Flesch Reading Ease", value=float(chunk_manager.target_flesch_min))
            target_min_chunk_words_input = st.number_input("Min Chunk Words", value=chunk_manager.target_min_chunk_words)
        with right_col:
            target_grade_max_input = st.number_input("Max Flesch-Kincaid Grade", value=float(chunk_manager.target_grade_max))
            target_max_chunk_words_input = st.number_input("Max Chunk Words", value=chunk_manager.target_max_chunk_words)

        submitted = st.form_submit_button("Set New Targets", use_container_width=True)
        if submitted:
            chunk_manager.set_targets(
                target_flesch_min_input,
                target_grade_max_input,
                int(target_min_chunk_words_input),
                int(target_max_chunk_words_input)
            )
            st.session_state.status_message = "Target settings updated."
            st.rerun()
 
1
+ import streamlit as st
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ from html_to_markdown import convert_to_markdown
5
+ import re
6
+ from llama_index.core.node_parser import MarkdownNodeParser
7
+ from llama_index.core.schema import Document, MetadataMode
8
+ import textstat # For readability metrics
9
+
10
class WebpageContentProcessor:
    """
    Fetches a webpage, isolates its main content, converts it to Markdown,
    and parses that Markdown into structured, editable chunks.
    Adheres to the Single Responsibility Principle (SRP) for content processing.
    """

    # Tag names / CSS selectors for elements that are navigation, page chrome,
    # media or other boilerplate — never prose content we want to keep.
    _UNWANTED_SELECTORS = [
        'nav', 'header', 'footer', 'aside', 'iframe', 'form', 'button', 'input',
        'textarea', 'svg', 'canvas', 'audio', 'video', 'picture', 'source', 'track',
        'map', 'area', 'embed', 'object', 'param', 'applet', 'bgsound', 'frame',
        'frameset', 'noframes', 'template', 'slot', 'portal', 'datalist', 'keygen',
        'output', 'progress', 'meter', 'details', 'summary', 'dialog', 'menu',
        'menuitem', 'command', 'hr', 'figure', 'figcaption', 'cite',
        '.social-share', '.comments', '.related-posts', '.pagination',
        '.breadcrumbs', '.pop-up', '.modal', '.overlay', '.cookie-consent',
        '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
        '[role="complementary"]', '[role="search"]', '[role="menubar"]', '[role="toolbar"]',
        '[class*="utility"]', '[class*="global-nav"]', '[class*="skip"]', '[class*="toast"]',
        '[class*="announcement"]', '[class*="fixed-bottom"]', '[class*="fixed-top"]',
        '[id*="promo"]', '[id*="ad"]', '[id*="banner"]', '[id*="popup"]', '[id*="modal"]',
        '[id*="overlay"]', '[id*="cookie"]', '[id*="skip"]', '[id*="navbar"]', '[id*="menu"]',
        '.hidden', '.visually-hidden',
        '.no-print', '.print-hide',
        '.wp-block-navigation', '.wp-block-group.is-style-stripes',
        '[class*="column"]', '[class*="grid"]'
    ]

    def fetch_and_convert_to_markdown(self, url: str) -> str:
        """
        Fetch HTML from *url*, isolate the main content, strip boilerplate,
        and convert the result to Markdown.

        Returns the Markdown string on success. On failure returns a
        human-readable message starting with "Error" or
        "An unexpected error occurred" (callers screen on those prefixes).
        """
        try:
            response = requests.get(url, timeout=10)  # timeout guards against hung servers
            response.raise_for_status()  # surface 4xx/5xx as exceptions

            soup = BeautifulSoup(response.text, 'html.parser')

            # Aggressive initial removal of tags that never carry visible content.
            for tag_name in ['script', 'style', 'noscript', 'meta', 'link']:
                for element in soup.find_all(tag_name):
                    element.decompose()

            content_for_conversion = self._locate_main_content(soup)
            if not content_for_conversion:
                return "Error: Could not identify main content for conversion."

            self._strip_boilerplate(content_for_conversion)

            markdown_output = convert_to_markdown(str(content_for_conversion))

            # Post-processing: collapse blank-line runs, drop empty list
            # markers and stray emphasis runs left over from conversion.
            markdown_output = re.sub(r'\n\s*\n\s*\n+', '\n\n', markdown_output)
            markdown_output = re.sub(r'^\s*[\*\-]\s*$', '', markdown_output, flags=re.MULTILINE)
            markdown_output = re.sub(r'\*{3,}', '', markdown_output)
            return markdown_output.strip()

        except requests.exceptions.Timeout:
            return "Error: Request timed out. The server took too long to respond."
        except requests.exceptions.RequestException as e:
            return f"Error fetching URL: {e}. Please check the URL or your internet connection."
        except Exception as e:
            return f"An unexpected error occurred during HTML conversion: {e}"

    def _locate_main_content(self, soup):
        """Return the tag most likely to hold the page's main content, or None."""
        # Prefer semantic containers and common main-content divs.
        content = soup.find('article') or soup.find('main') or \
            soup.find('div', class_='main-content') or \
            soup.find('div', {'role': 'main'})
        if content:
            return content

        first_h1 = soup.find('h1')
        if not first_h1:
            # Ultimate fallback: the whole body (may be None for broken HTML).
            return soup.body

        # Walk up to 5 ancestor levels from the H1 looking for a wrapper.
        candidate = first_h1.parent
        for _ in range(5):
            if candidate is None:
                break
            has_content_class = candidate.name in ['article', 'main', 'section', 'div'] and \
                any(cls in candidate.get('class', []) for cls in
                    ['content', 'post-body', 'article-content', 'entry-content', 'main-content'])
            # Original precedence preserved: any tag with role="main" qualifies
            # regardless of its tag name.
            if has_content_class or candidate.get('role') == 'main':
                return candidate
            candidate = candidate.parent

        # No clear wrapper: keep the H1 plus everything that follows it.
        # BUGFIX: capture the siblings *before* append() relocates first_h1 —
        # the old code read first_h1.next_sibling after the move, which is
        # always None, so all content after the heading was silently dropped.
        trailing_siblings = list(first_h1.next_siblings)
        temp_soup = BeautifulSoup('', 'html.parser')
        temp_soup.append(first_h1)
        for sibling in trailing_siblings:
            temp_soup.append(sibling)
        return temp_soup

    def _strip_boilerplate(self, content) -> None:
        """Remove boilerplate elements from *content* in place."""
        for selector in self._UNWANTED_SELECTORS:
            if re.match(r'^[a-zA-Z0-9]+$', selector):
                # Bare tag name: find_all is cheaper than a CSS select.
                for element in content.find_all(selector):
                    element.decompose()
            else:
                for element in content.select(selector):
                    element.decompose()

    def parse_markdown_into_chunks(self, markdown_content: str) -> list:
        """
        Parse Markdown content into LlamaIndex nodes (chunks) and extract a
        title and body for each.

        Returns a list of dicts with keys: id, title, content, original_node.
        """
        # BUGFIX: the old check only matched two of the four error strings
        # fetch_and_convert_to_markdown can return; timeout and
        # "could not identify main content" messages slipped through and were
        # chunked as if they were page content. All error returns start with
        # one of these prefixes.
        if not markdown_content or markdown_content.startswith(
                ("Error", "An unexpected error occurred")):
            return []

        doc = Document(text=markdown_content, metadata={"filename": "webpage_content"})
        parser = MarkdownNodeParser(include_metadata=True)
        nodes = parser.get_nodes_from_documents([doc])
        print(f"✅ Parsed {len(nodes)} nodes from Markdown.")  # Debug print

        structured_chunks = []
        for current_id, node in enumerate(nodes):
            pure_text_content = node.get_content(metadata_mode=MetadataMode.NONE).strip()

            heading_title = ""
            content_text = pure_text_content

            heading_match = re.match(r"^(#+)\s*(.*)", pure_text_content)
            if heading_match:
                heading_title = heading_match.group(2).strip()
                content_text = pure_text_content[len(heading_match.group(0)):].strip()
                if not heading_title:
                    heading_title = "[Untitled Section]"
            else:
                # No heading: derive a title from the first line (truncated).
                first_line = content_text.split('\n')[0].strip()
                heading_title = (first_line[:70].strip() + "...") if len(first_line) > 70 else first_line
                if not heading_title:
                    heading_title = "[Empty Section]"
                elif not content_text:
                    heading_title = "[Empty Section]"

            structured_chunks.append({
                "id": current_id,
                "title": heading_title,
                "content": content_text,
                "original_node": node,  # keep a handle on the LlamaIndex node
            })

        return structured_chunks
170
+
171
+ class ChunkManager:
172
+ """
173
+ Manages the collection of content chunks, their statistics, and target settings.
174
+ Adheres to SRP for chunk data management and OCP by allowing new statistics
175
+ or formatting without changing core chunk operations.
176
+ """
177
+ def __init__(self):
178
+ self._chunks = []
179
+ self.target_flesch_min = 60
180
+ self.target_grade_max = 8
181
+ self.target_min_chunk_words = 50
182
+ self.target_max_chunk_words = 500
183
+
184
+ def set_chunks(self, chunks: list):
185
+ """Sets the internal list of chunks and calculates their initial statistics."""
186
+ self._chunks = []
187
+ for chunk in chunks:
188
+ chunk['stats'] = self._calculate_chunk_stats(chunk['content'])
189
+ self._chunks.append(chunk)
190
+
191
+ def get_chunks(self) -> list:
192
+ """Returns the current list of processed chunks."""
193
+ return self._chunks
194
+
195
+ def _calculate_chunk_stats(self, text: str) -> dict:
196
+ """
197
+ Calculates various linguistic statistics for a given text chunk.
198
+ (Private helper method, SRP for stats calculation)
199
+ """
200
+ stats = {}
201
+ cleaned_text = re.sub(r'#+\s*', '', text)
202
+ cleaned_text = re.sub(r'[\*\-]\s*', '', cleaned_text)
203
+ cleaned_text = re.sub(r'\n\s*\n+', ' ', cleaned_text).strip()
204
+
205
+ stats['word_count'] = textstat.lexicon_count(cleaned_text, removepunct=True)
206
+ stats['char_count'] = len(cleaned_text)
207
+ stats['sentence_count'] = textstat.sentence_count(cleaned_text)
208
+
209
+ if stats['sentence_count'] > 0:
210
+ stats['avg_sentence_length'] = stats['word_count'] / stats['sentence_count']
211
+ else:
212
+ stats['avg_sentence_length'] = 0
213
+
214
+ stats['paragraph_count'] = cleaned_text.count('\n\n') + 1 if cleaned_text else 0
215
+
216
+ try:
217
+ stats['flesch_reading_ease'] = textstat.flesch_reading_ease(cleaned_text)
218
+ except Exception:
219
+ stats['flesch_reading_ease'] = 0
220
+
221
+ try:
222
+ stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(cleaned_text)
223
+ except Exception:
224
+ stats['flesch_kincaid_grade'] = 0
225
+
226
+ try:
227
+ stats['gunning_fog_score'] = textstat.gunning_fog(cleaned_text)
228
+ except Exception:
229
+ stats['gunning_fog_score'] = 0
230
+
231
+ return stats
232
+
233
+ def format_chunk_stats(self, stats: dict) -> str:
234
+ """
235
+ Formats chunk statistics into a readable string, including explanations for readability scores.
236
+ Adheres to SRP for formatting.
237
+ """
238
+ flesch_ease_color = "red" if stats['flesch_reading_ease'] < self.target_flesch_min else "green"
239
+ kincaid_grade_color = "red" if stats['flesch_kincaid_grade'] > self.target_grade_max else "green"
240
+ word_count_color = "red" if not (self.target_min_chunk_words <= stats['word_count'] <= self.target_max_chunk_words) else "green"
241
+
242
+ stats_str = "#### Chunk Statistics:\n"
243
+ stats_str += f"- **Word Count:** <span style='color:{word_count_color}'>{stats['word_count']}</span> (Target: {self.target_min_chunk_words}-{self.target_max_chunk_words})\n"
244
+ stats_str += f"- **Character Count:** {stats['char_count']}\n"
245
+ stats_str += f"- **Sentence Count:** {stats['sentence_count']}\n"
246
+ stats_str += f"- **Avg Sentence Length:** {stats['avg_sentence_length']:.2f} words\n"
247
+ stats_str += f"- **Paragraph Count:** {stats['paragraph_count']}\n"
248
+ stats_str += f"- **Flesch Reading Ease:** <span style='color:{flesch_ease_color}'>{stats['flesch_reading_ease']:.2f}</span> (Higher scores mean easier to read.)\n"
249
+ stats_str += f"- **Flesch-Kincaid Grade:** <span style='color:{kincaid_grade_color}'>{stats['flesch_kincaid_grade']:.2f}</span> (Indicates the U.S. grade level needed to understand the text.)\n"
250
+ stats_str += f"- **Gunning Fog Score:** {stats['gunning_fog_score']:.2f}\n"
251
+ return stats_str
252
+
253
+ def get_document_summary_stats(self) -> str:
254
+ """
255
+ Aggregates statistics for the entire document across all managed chunks.
256
+ Adheres to SRP for document-level summary.
257
+ """
258
+ if not self._chunks:
259
+ return "No document loaded to generate statistics."
260
+
261
+ total_words = 0
262
+ total_chars = 0
263
+ total_sentences = 0
264
+ total_paragraphs = 0
265
+
266
+ all_content_text = ""
267
+ for chunk in self._chunks:
268
+ content_text_for_stats = chunk['content']
269
+ # Re-calculate stats for each chunk content to ensure summary is up-to-date
270
+ current_chunk_stats = self._calculate_chunk_stats(content_text_for_stats)
271
+ total_words += current_chunk_stats['word_count']
272
+ total_chars += current_chunk_stats['char_count']
273
+ total_sentences += current_chunk_stats['sentence_count']
274
+ total_paragraphs += current_chunk_stats['paragraph_count']
275
+ all_content_text += content_text_for_stats + "\n\n"
276
+
277
+ doc_stats_str = "## Overall Document Statistics:\n"
278
+ doc_stats_str += f"- **Total Chunks:** {len(self._chunks)}\n"
279
+ doc_stats_str += f"- **Total Words:** {total_words}\n"
280
+ doc_stats_str += f"- **Total Characters:** {total_chars}\n"
281
+ doc_stats_str += f"- **Total Sentences:** {total_sentences}\n"
282
+ doc_stats_str += f"- **Total Paragraphs:** {total_paragraphs}\n"
283
+
284
+ if len(self._chunks) > 0:
285
+ doc_stats_str += f"- **Average Words per Chunk:** {total_words / len(self._chunks):.2f}\n"
286
+
287
+ if all_content_text.strip():
288
+ overall_stats = self._calculate_chunk_stats(all_content_text)
289
+ doc_stats_str += f"- **Overall Flesch Reading Ease:** {overall_stats['flesch_reading_ease']:.2f}\n"
290
+ doc_stats_str += f"- **Overall Flesch-Kincaid Grade Level:** {overall_stats['flesch_kincaid_grade']:.2f}\n"
291
+ doc_stats_str += f"- **Overall Gunning Fog Score:** {overall_stats['gunning_fog_score']:.2f}\n"
292
+ doc_stats_str += f"- **Overall Average Sentence Length:** {overall_stats['avg_sentence_length']:.2f} words\n"
293
+ else:
294
+ doc_stats_str += "- No content available for overall readability metrics.\n"
295
+
296
+ return doc_stats_str
297
+
298
+ def get_chunk_by_id(self, chunk_id: int) -> dict | None:
299
+ """Retrieves a chunk by its ID."""
300
+ return next((chunk for chunk in self._chunks if chunk["id"] == chunk_id), None)
301
+
302
+ def get_chunk_titles_for_dropdown(self) -> list:
303
+ """Generates dropdown choices using plain text (no HTML)."""
304
+ dropdown_choices = []
305
+ for chunk in self._chunks:
306
+ title = chunk['title']
307
+ dropdown_choices.append(f"{chunk['id']}: {title}")
308
+ return dropdown_choices
309
+
310
+ def update_chunk_content(self, chunk_id: int, new_content: str) -> bool:
311
+ """
312
+ Updates the content of a chunk, recalculates its stats, and updates its title if needed.
313
+ Returns True if successful, False otherwise.
314
+ """
315
+ for chunk in self._chunks:
316
+ if chunk["id"] == chunk_id:
317
+ chunk["content"] = new_content
318
+ chunk["stats"] = self._calculate_chunk_stats(new_content)
319
+ # Update chunk title if it was a placeholder or empty
320
+ if chunk["title"].startswith("[") and chunk["title"].endswith("]") or not chunk["title"]:
321
+ first_line = new_content.split('\n')[0].strip()
322
+ chunk["title"] = first_line[:70].strip() + "..." if len(first_line) > 70 else first_line
323
+ if not chunk["title"]:
324
+ chunk["title"] = "[Empty Section]"
325
+ elif not new_content:
326
+ chunk["title"] = "[Empty Section]"
327
+ return True
328
+ return False
329
+
330
+ def delete_chunk(self, chunk_id: int) -> bool:
331
+ """
332
+ Deletes a chunk by ID and re-indexes remaining chunks.
333
+ Returns True if successful, False otherwise.
334
+ """
335
+ initial_chunk_count = len(self._chunks)
336
+ self._chunks = [chunk for chunk in self._chunks if chunk["id"] != chunk_id]
337
+ if len(self._chunks) == initial_chunk_count:
338
+ return False # Chunk not found
339
+
340
+ # Re-index IDs to be sequential again
341
+ for i, chunk in enumerate(self._chunks):
342
+ chunk['id'] = i
343
+
344
+ return True
345
+
346
+ def get_final_markdown(self) -> str:
347
+ """Compiles all current chunks into a single Markdown string."""
348
+ final_md = ""
349
+ if not self._chunks:
350
+ return "No content to compile. Please process a URL first."
351
+
352
+ for chunk in self._chunks:
353
+ # Use H1 heading if title is meaningful
354
+ if not chunk["title"].startswith("[") and chunk["title"]:
355
+ final_md += f"# {chunk['title']}\n\n"
356
+ final_md += f"{chunk['content']}\n\n"
357
+
358
+ return final_md.strip()
359
+
360
+ def set_targets(self, flesch_min: float, grade_max: float, min_words: int, max_words: int):
361
+ """Sets the global readability and word count targets."""
362
+ self.target_flesch_min = flesch_min
363
+ self.target_grade_max = grade_max
364
+ self.target_min_chunk_words = min_words
365
+ self.target_max_chunk_words = max_words
366
+ # Recalculate stats for all chunks to reflect new targets in color coding (if displayed)
367
+ for chunk in self._chunks:
368
+ chunk['stats'] = self._calculate_chunk_stats(chunk['content'])
369
+
370
# --- Streamlit UI Definition ---
st.set_page_config(layout="wide", page_title="Chunk-Powered Webpage Editor")

# Initialize session state.
# Streamlit reruns this script top-to-bottom on every interaction, so anything
# that must survive a rerun (the two manager singletons, the status banner, the
# current dropdown selection and the compiled output) lives in session_state
# and is only created on the first run.
if 'chunk_manager' not in st.session_state:
    st.session_state.chunk_manager = ChunkManager()
if 'content_processor' not in st.session_state:
    st.session_state.content_processor = WebpageContentProcessor()
if 'status_message' not in st.session_state:
    st.session_state.status_message = ""
if 'chunk_selector' not in st.session_state:
    # Mirrors the "Select Chunk to Edit" selectbox (key="chunk_selector").
    st.session_state.chunk_selector = None
if 'chunk_content_editor' not in st.session_state:
    # NOTE(review): this key appears unused below — the content editor uses
    # per-chunk keys (f"editor_{id}"); confirm before removing.
    st.session_state.chunk_content_editor = ""
if 'final_markdown' not in st.session_state:
    st.session_state.final_markdown = "Click 'Compile All Chunks' to see the final document with your edits."


# Instantiate the managers
# (Local aliases for the session-scoped singletons used throughout the UI.)
content_processor = st.session_state.content_processor
chunk_manager = st.session_state.chunk_manager
391
+
392
# Page header and attribution banner (raw HTML, so unsafe_allow_html is on).
st.markdown("# <center>✨ Chunk-Powered Webpage Editor ✨</center>", unsafe_allow_html=True)
st.markdown(
    "Enter a URL, fetch its content, and break it into editable 'chunks'. "
    "Review statistics, set targets, edit chunks, and compile your final Markdown."
)
st.markdown("""<br><div style="display: flex; justify-content: flex-start; align-items: center; gap: 16px;">
<span>Runs best on Desktop. App created by <a href="https://www.linkedin.com/in/emilijagjorgjevska/" target="_blank">Emilija Gjorgjevska</a></span>
<a href="https://buymeacoffee.com/emiliagjorgjevska" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/v2/default-yellow.png"
alt="Buy Me A Coffee" style="height: 30px;"></a></div><br>""", unsafe_allow_html=True)

# --- URL Input and Processing ---
col1, col2 = st.columns([4, 1])
with col1:
    url_input = st.text_input(
        label="Enter Webpage URL",
        placeholder="e.g., https://www.llamaindex.ai/blog/what-is-llamaindex",
        key="url_input"
    )
with col2:
    st.write("")  # Spacer
    st.write("")  # Spacer
    process_button = st.button("Process URL", use_container_width=True)

# Show the most recent status (set by processing, update, delete, targets...).
if st.session_state.status_message:
    st.info(st.session_state.status_message)

if process_button:
    if not url_input:
        st.session_state.status_message = "Please enter a URL to process."
    else:
        with st.spinner("Processing URL..."):
            markdown_content = content_processor.fetch_and_convert_to_markdown(url_input)

            # fetch_and_convert_to_markdown signals failure by returning a
            # string containing "Error"; in that case clear any stale chunks
            # and surface the message as the status banner.
            if "Error" in markdown_content:
                chunk_manager.set_chunks([])
                st.session_state.status_message = markdown_content
            else:
                chunks = content_processor.parse_markdown_into_chunks(markdown_content)
                chunk_manager.set_chunks(chunks)
                st.session_state.status_message = "URL processed successfully!" if chunks else "URL processed, but no content chunks could be extracted."

                # Pre-select the first chunk so the editor opens on something.
                if chunks:
                    st.session_state.chunk_selector = chunk_manager.get_chunk_titles_for_dropdown()[0]
                else:
                    st.session_state.chunk_selector = None
439
# --- Tabs for Editor and Overview ---
tab1, tab2 = st.tabs(["Editor", "Document Overview & Targets"])

with tab1:
    st.markdown("## Edit Chunks Individually")

    col1, col2 = st.columns([2, 1])

    with col1:
        # Dropdown options are "id: title" strings built from current chunks.
        chunk_selector_options = chunk_manager.get_chunk_titles_for_dropdown()
        if chunk_selector_options:
            try:
                # Find the index of the currently selected item to handle updates
                # (falls back to the first chunk when the stored selection is
                # None or no longer present after an edit/delete).
                current_selection_index = chunk_selector_options.index(st.session_state.chunk_selector)
            except (ValueError, TypeError):
                current_selection_index = 0

            selected_chunk_title = st.selectbox(
                label="Select Chunk to Edit",
                options=chunk_selector_options,
                index=current_selection_index,
                key="chunk_selector"
            )
        else:
            # Disabled placeholder when nothing has been processed yet.
            selected_chunk_title = st.selectbox(
                label="Select Chunk to Edit",
                options=["No chunks available"],
                disabled=True
            )

    with col2:
        # Previous/Next navigation: both buttons parse the numeric id out of
        # the "id: title" selection, clamp it to valid range, and write the
        # neighbouring chunk back into the selectbox's session-state key.
        # NOTE(review): assigning to st.session_state.chunk_selector after the
        # selectbox with key="chunk_selector" has been instantiated raises a
        # StreamlitAPIException in recent Streamlit versions — confirm against
        # the deployed Streamlit version.
        nav_col1, nav_col2 = st.columns(2)
        with nav_col1:
            if st.button("⬅️ Previous Chunk", use_container_width=True):
                if selected_chunk_title and "No chunks available" not in selected_chunk_title:
                    current_id = int(selected_chunk_title.split(':')[0].strip())
                    new_id = max(0, current_id - 1)
                    new_chunk = chunk_manager.get_chunk_by_id(new_id)
                    if new_chunk:
                        st.session_state.chunk_selector = f"{new_chunk['id']}: {new_chunk['title']}"

        with nav_col2:
            if st.button("Next Chunk ➡️", use_container_width=True):
                if selected_chunk_title and "No chunks available" not in selected_chunk_title:
                    current_id = int(selected_chunk_title.split(':')[0].strip())
                    new_id = min(len(chunk_manager.get_chunks()) - 1, current_id + 1)
                    new_chunk = chunk_manager.get_chunk_by_id(new_id)
                    if new_chunk:
                        st.session_state.chunk_selector = f"{new_chunk['id']}: {new_chunk['title']}"

    # Get the currently selected chunk
    selected_chunk = None
    if selected_chunk_title and "No chunks available" not in selected_chunk_title:
        current_id = int(selected_chunk_title.split(':')[0].strip())
        selected_chunk = chunk_manager.get_chunk_by_id(current_id)

    if selected_chunk:
        # Title is derived automatically by the parser; read-only here.
        st.text_input(
            label="Chunk Title (Auto-detected)",
            value=selected_chunk["title"],
            disabled=True
        )

        chunk_content_editor = st.text_area(
            label="Chunk Content",
            value=selected_chunk["content"],
            height=250,
            key=f"editor_{selected_chunk['id']}"  # Unique key to prevent state loss
        )

        # Per-chunk stats with red/green target colouring (HTML spans).
        st.markdown(
            chunk_manager.format_chunk_stats(selected_chunk['stats']),
            unsafe_allow_html=True
        )

        update_col, delete_col, _ = st.columns([1, 1, 3])
        with update_col:
            if st.button("Update Selected Chunk", use_container_width=True):
                chunk_manager.update_chunk_content(selected_chunk['id'], chunk_content_editor)
                st.session_state.status_message = f"Chunk '{selected_chunk_title}' updated successfully!"
                # Force a re-render to update the dropdown with the new title
                st.session_state.chunk_selector = f"{selected_chunk['id']}: {chunk_manager.get_chunk_by_id(selected_chunk['id'])['title']}"

        with delete_col:
            if st.button("Delete Selected Chunk", use_container_width=True):
                chunk_manager.delete_chunk(selected_chunk['id'])
                st.session_state.status_message = f"Chunk '{selected_chunk_title}' deleted successfully!"
                # After deletion, fall back to the first remaining chunk (ids
                # are re-indexed by delete_chunk) or clear the selection.
                if chunk_manager.get_chunks():
                    st.session_state.chunk_selector = chunk_manager.get_chunk_titles_for_dropdown()[0]
                else:
                    st.session_state.chunk_selector = None


    else:
        # Disabled placeholders shown before any URL has been processed.
        st.text_input("Chunk Title (Auto-detected)", "Title of the selected chunk", disabled=True)
        st.text_area("Chunk Content", "Content of the selected chunk will appear here for editing.", height=250, disabled=True)
        st.markdown("Chunk statistics will appear here.")

    st.markdown("---")
    st.markdown("## Final Compiled Markdown")

    # Compilation is explicit (button-driven); the result persists in
    # session_state so it survives reruns.
    if st.button("Compile All Chunks", use_container_width=True):
        st.session_state.final_markdown = chunk_manager.get_final_markdown()

    st.text_area(
        label="Compiled Markdown",
        value=st.session_state.final_markdown,
        height=400,
        key="final_markdown_output",
        disabled=False
    )
551
with tab2:
    st.markdown("## Document Summary Statistics")
    # Aggregated stats across all chunks (HTML allowed for colour spans).
    st.markdown(chunk_manager.get_document_summary_stats(), unsafe_allow_html=True)

    st.markdown("---")
    st.markdown("## Content Targets")
    st.markdown("Adjust these targets to guide your writing and see visual feedback in the chunk selector (green=good, red=needs attention).")

    # A form batches the four inputs so targets only apply on explicit submit,
    # avoiding a rerun per keystroke.
    with st.form("targets_form"):
        col1, col2 = st.columns(2)
        with col1:
            target_flesch_min_input = st.number_input("Min Flesch Reading Ease", value=float(chunk_manager.target_flesch_min))
            target_min_chunk_words_input = st.number_input("Min Chunk Words", value=chunk_manager.target_min_chunk_words)
        with col2:
            target_grade_max_input = st.number_input("Max Flesch-Kincaid Grade", value=float(chunk_manager.target_grade_max))
            target_max_chunk_words_input = st.number_input("Max Chunk Words", value=chunk_manager.target_max_chunk_words)

        submitted = st.form_submit_button("Set New Targets", use_container_width=True)
        if submitted:
            chunk_manager.set_targets(
                target_flesch_min_input,
                target_grade_max_input,
                int(target_min_chunk_words_input),
                int(target_max_chunk_words_input)
            )
            st.session_state.status_message = "Target settings updated."
            # Rerun so the new targets are reflected immediately in tab1's
            # colour-coded stats and the status banner.
            st.rerun()