Em4e committed on
Commit
825f0b9
·
verified ·
1 Parent(s): 63b41a0

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +573 -0
app.py ADDED
@@ -0,0 +1,573 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ from html_to_markdown import convert_to_markdown
5
+ import re
6
+ from llama_index.core.node_parser import MarkdownNodeParser
7
+ from llama_index.core.schema import Document, MetadataMode
8
+ import textstat # For readability metrics
9
+
10
+ class WebpageContentProcessor:
11
+ """
12
+ Handles fetching, converting, and parsing webpage content into structured chunks.
13
+ Adheres to the Single Responsibility Principle (SRP) for content processing.
14
+ """
15
+ def __init__(self):
16
+ pass
17
+
18
+ def fetch_and_convert_to_markdown(self, url: str) -> str:
19
+ """
20
+ Fetches HTML content from a given URL, attempts to isolate the main content,
21
+ removes common boilerplate, and converts to Markdown.
22
+ Prioritizes semantic content tags over H1-based identification for robust extraction.
23
+ """
24
+ try:
25
+ response = requests.get(url, timeout=10) # Add a timeout for robustness
26
+ response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
27
+ html_content = response.text
28
+
29
+ soup = BeautifulSoup(html_content, 'html.parser')
30
+
31
+ # Aggressive initial removal of script, style, and meta tags that are never content.
32
+ for tag_name in ['script', 'style', 'noscript', 'meta', 'link']:
33
+ for element in soup.find_all(tag_name):
34
+ element.decompose()
35
+
36
+ content_for_conversion = None
37
+
38
+ # Prioritize finding main content containers first (semantic tags and common divs)
39
+ content_for_conversion = soup.find('article') or soup.find('main') or \
40
+ soup.find('div', class_='main-content') or \
41
+ soup.find('div', {'role': 'main'})
42
+
43
+ # Fallback logic if main content container wasn't found
44
+ if not content_for_conversion:
45
+ first_h1 = soup.find('h1')
46
+ if first_h1:
47
+ candidate_container = first_h1.parent
48
+ found_main_wrapper_via_h1_parent = False
49
+ # Check up to 5 parent levels for a suitable content wrapper
50
+ for _ in range(5):
51
+ if candidate_container is None:
52
+ break
53
+ if candidate_container.name in ['article', 'main', 'section', 'div'] and \
54
+ any(cls in candidate_container.get('class', []) for cls in ['content', 'post-body', 'article-content', 'entry-content', 'main-content']) or \
55
+ candidate_container.get('role') == 'main':
56
+ content_for_conversion = candidate_container
57
+ found_main_wrapper_via_h1_parent = True
58
+ break
59
+ candidate_container = candidate_container.parent
60
+
61
+ # If no clear wrapper found via H1's parent, take H1 and its direct siblings as a fallback
62
+ if not found_main_wrapper_via_h1_parent:
63
+ temp_soup = BeautifulSoup('', 'html.parser')
64
+ temp_soup.append(first_h1)
65
+ current_element = first_h1.next_sibling
66
+ while current_element:
67
+ temp_soup.append(current_element)
68
+ current_element = current_element.next_sibling
69
+ content_for_conversion = temp_soup
70
+ else:
71
+ # Ultimate fallback: use the entire body if no specific content tags or H1 found
72
+ content_for_conversion = soup.body
73
+
74
+ if not content_for_conversion:
75
+ return "Error: Could not identify main content for conversion."
76
+
77
+ # Selective boilerplate removal within the *identified* main content tag
78
+ unwanted_elements_in_content = [
79
+ 'nav', 'header', 'footer', 'aside', 'iframe', 'form', 'button', 'input',
80
+ 'textarea', 'svg', 'canvas', 'audio', 'video', 'picture', 'source', 'track',
81
+ 'map', 'area', 'embed', 'object', 'param', 'applet', 'bgsound', 'frame',
82
+ 'frameset', 'noframes', 'template', 'slot', 'portal', 'datalist', 'keygen',
83
+ 'output', 'progress', 'meter', 'details', 'summary', 'dialog', 'menu',
84
+ 'menuitem', 'command', 'hr', 'figure', 'figcaption', 'cite',
85
+ '.social-share', '.comments', '.related-posts', '.pagination',
86
+ '.breadcrumbs', '.pop-up', '.modal', '.overlay', '.cookie-consent',
87
+ '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
88
+ '[role="complementary"]', '[role="search"]', '[role="menubar"]', '[role="toolbar"]',
89
+ '[class*="utility"]', '[class*="global-nav"]', '[class*="skip"]', '[class*="toast"]',
90
+ '[class*="announcement"]', '[class*="fixed-bottom"]', '[class*="fixed-top"]',
91
+ '[id*="promo"]', '[id*="ad"]', '[id*="banner"]', '[id*="popup"]', '[id*="modal"]',
92
+ '[id*="overlay"]', '[id*="cookie"]', '[id*="skip"]', '[id*="navbar"]', '[id*="menu"]',
93
+ '.hidden', '.visually-hidden',
94
+ '.no-print', '.print-hide',
95
+ '.wp-block-navigation', '.wp-block-group.is-style-stripes',
96
+ '[class*="column"]', '[class*="grid"]'
97
+ ]
98
+
99
+ for selector in unwanted_elements_in_content:
100
+ if re.match(r'^[a-zA-Z0-9]+$', selector):
101
+ for element in content_for_conversion.find_all(selector):
102
+ element.decompose()
103
+ else:
104
+ for element in content_for_conversion.select(selector):
105
+ element.decompose()
106
+
107
+ markdown_output = convert_to_markdown(str(content_for_conversion))
108
+
109
+ # Post-processing: Clean up resulting Markdown
110
+ markdown_output = re.sub(r'\n\s*\n\s*\n+', '\n\n', markdown_output)
111
+ markdown_output = re.sub(r'^\s*[\*\-]\s*$', '', markdown_output, flags=re.MULTILINE)
112
+ markdown_output = re.sub(r'\*{3,}', '', markdown_output)
113
+ markdown_output = markdown_output.strip()
114
+
115
+ return markdown_output
116
+
117
+ except requests.exceptions.Timeout:
118
+ return "Error: Request timed out. The server took too long to respond."
119
+ except requests.exceptions.RequestException as e:
120
+ return f"Error fetching URL: {e}. Please check the URL or your internet connection."
121
+ except Exception as e:
122
+ return f"An unexpected error occurred during HTML conversion: {e}"
123
+
124
+ def parse_markdown_into_chunks(self, markdown_content: str) -> list:
125
+ """
126
+ Parses Markdown content into LlamaIndex nodes (chunks) and extracts title and content.
127
+ Adheres to SRP for parsing logic.
128
+ """
129
+ if not markdown_content or "Error fetching URL" in markdown_content or "An unexpected error occurred" in markdown_content:
130
+ return []
131
+
132
+ doc = Document(text=markdown_content, metadata={"filename": "webpage_content"})
133
+ parser = MarkdownNodeParser(include_metadata=True)
134
+ nodes = parser.get_nodes_from_documents([doc])
135
+ print(f"✅ Parsed {len(nodes)} nodes from Markdown.") # Debug print
136
+
137
+ structured_chunks = []
138
+ current_id = 0
139
+
140
+ for node in nodes:
141
+ pure_text_content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
142
+
143
+ heading_title = ""
144
+ content_text = pure_text_content
145
+
146
+ heading_match = re.match(r"^(#+)\s*(.*)", pure_text_content)
147
+
148
+ if heading_match:
149
+ heading_title = heading_match.group(2).strip()
150
+ content_text = pure_text_content[len(heading_match.group(0)):].strip()
151
+ if not heading_title:
152
+ heading_title = "[Untitled Section]"
153
+ else:
154
+ first_line = content_text.split('\n')[0].strip()
155
+ heading_title = first_line[:70].strip() + "..." if len(first_line) > 70 else first_line
156
+ if not heading_title:
157
+ heading_title = "[Empty Section]"
158
+ elif not content_text:
159
+ heading_title = "[Empty Section]"
160
+
161
+ structured_chunks.append({
162
+ "id": current_id,
163
+ "title": heading_title,
164
+ "content": content_text,
165
+ "original_node": node # Keep reference to the original LlamaIndex node
166
+ })
167
+ current_id += 1
168
+
169
+ return structured_chunks
170
+
171
+ class ChunkManager:
172
+ """
173
+ Manages the collection of content chunks, their statistics, and target settings.
174
+ Adheres to SRP for chunk data management and OCP by allowing new statistics
175
+ or formatting without changing core chunk operations.
176
+ """
177
+ def __init__(self):
178
+ self._chunks = []
179
+ self.target_flesch_min = 60
180
+ self.target_grade_max = 8
181
+ self.target_min_chunk_words = 50
182
+ self.target_max_chunk_words = 500
183
+
184
+ def set_chunks(self, chunks: list):
185
+ """Sets the internal list of chunks and calculates their initial statistics."""
186
+ self._chunks = []
187
+ for chunk in chunks:
188
+ chunk['stats'] = self._calculate_chunk_stats(chunk['content'])
189
+ self._chunks.append(chunk)
190
+
191
+ def get_chunks(self) -> list:
192
+ """Returns the current list of processed chunks."""
193
+ return self._chunks
194
+
195
+ def _calculate_chunk_stats(self, text: str) -> dict:
196
+ """
197
+ Calculates various linguistic statistics for a given text chunk.
198
+ (Private helper method, SRP for stats calculation)
199
+ """
200
+ stats = {}
201
+ cleaned_text = re.sub(r'#+\s*', '', text)
202
+ cleaned_text = re.sub(r'[\*\-]\s*', '', cleaned_text)
203
+ cleaned_text = re.sub(r'\n\s*\n+', ' ', cleaned_text).strip()
204
+
205
+ stats['word_count'] = textstat.lexicon_count(cleaned_text, removepunct=True)
206
+ stats['char_count'] = len(cleaned_text)
207
+ stats['sentence_count'] = textstat.sentence_count(cleaned_text)
208
+
209
+ if stats['sentence_count'] > 0:
210
+ stats['avg_sentence_length'] = stats['word_count'] / stats['sentence_count']
211
+ else:
212
+ stats['avg_sentence_length'] = 0
213
+
214
+ stats['paragraph_count'] = cleaned_text.count('\n\n') + 1 if cleaned_text else 0
215
+
216
+ try:
217
+ stats['flesch_reading_ease'] = textstat.flesch_reading_ease(cleaned_text)
218
+ except Exception:
219
+ stats['flesch_reading_ease'] = 0
220
+
221
+ try:
222
+ stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(cleaned_text)
223
+ except Exception:
224
+ stats['flesch_kincaid_grade'] = 0
225
+
226
+ try:
227
+ stats['gunning_fog_score'] = textstat.gunning_fog(cleaned_text)
228
+ except Exception:
229
+ stats['gunning_fog_score'] = 0
230
+
231
+ return stats
232
+
233
+ def format_chunk_stats(self, stats: dict) -> str:
234
+ """
235
+ Formats chunk statistics into a readable string, including explanations for readability scores.
236
+ Adheres to SRP for formatting.
237
+ """
238
+ flesch_ease_color = "red" if stats['flesch_reading_ease'] < self.target_flesch_min else "green"
239
+ kincaid_grade_color = "red" if stats['flesch_kincaid_grade'] > self.target_grade_max else "green"
240
+ word_count_color = "red" if not (self.target_min_chunk_words <= stats['word_count'] <= self.target_max_chunk_words) else "green"
241
+
242
+ stats_str = "#### Chunk Statistics:\n"
243
+ stats_str += f"- **Word Count:** <span style='color:{word_count_color}'>{stats['word_count']}</span> (Target: {self.target_min_chunk_words}-{self.target_max_chunk_words})\n"
244
+ stats_str += f"- **Character Count:** {stats['char_count']}\n"
245
+ stats_str += f"- **Sentence Count:** {stats['sentence_count']}\n"
246
+ stats_str += f"- **Avg Sentence Length:** {stats['avg_sentence_length']:.2f} words\n"
247
+ stats_str += f"- **Paragraph Count:** {stats['paragraph_count']}\n"
248
+ stats_str += f"- **Flesch Reading Ease:** <span style='color:{flesch_ease_color}'>{stats['flesch_reading_ease']:.2f}</span> (Higher scores mean easier to read.)\n"
249
+ stats_str += f"- **Flesch-Kincaid Grade:** <span style='color:{kincaid_grade_color}'>{stats['flesch_kincaid_grade']:.2f}</span> (Indicates the U.S. grade level needed to understand the text.)\n"
250
+ stats_str += f"- **Gunning Fog Score:** {stats['gunning_fog_score']:.2f}\n"
251
+ return stats_str
252
+
253
+ def get_document_summary_stats(self) -> str:
254
+ """
255
+ Aggregates statistics for the entire document across all managed chunks.
256
+ Adheres to SRP for document-level summary.
257
+ """
258
+ if not self._chunks:
259
+ return "No document loaded to generate statistics."
260
+
261
+ total_words = 0
262
+ total_chars = 0
263
+ total_sentences = 0
264
+ total_paragraphs = 0
265
+
266
+ all_content_text = ""
267
+ for chunk in self._chunks:
268
+ content_text_for_stats = chunk['content']
269
+ # Re-calculate stats for each chunk content to ensure summary is up-to-date
270
+ current_chunk_stats = self._calculate_chunk_stats(content_text_for_stats)
271
+ total_words += current_chunk_stats['word_count']
272
+ total_chars += current_chunk_stats['char_count']
273
+ total_sentences += current_chunk_stats['sentence_count']
274
+ total_paragraphs += current_chunk_stats['paragraph_count']
275
+ all_content_text += content_text_for_stats + "\n\n"
276
+
277
+ doc_stats_str = "## Overall Document Statistics:\n"
278
+ doc_stats_str += f"- **Total Chunks:** {len(self._chunks)}\n"
279
+ doc_stats_str += f"- **Total Words:** {total_words}\n"
280
+ doc_stats_str += f"- **Total Characters:** {total_chars}\n"
281
+ doc_stats_str += f"- **Total Sentences:** {total_sentences}\n"
282
+ doc_stats_str += f"- **Total Paragraphs:** {total_paragraphs}\n"
283
+
284
+ if len(self._chunks) > 0:
285
+ doc_stats_str += f"- **Average Words per Chunk:** {total_words / len(self._chunks):.2f}\n"
286
+
287
+ if all_content_text.strip():
288
+ overall_stats = self._calculate_chunk_stats(all_content_text)
289
+ doc_stats_str += f"- **Overall Flesch Reading Ease:** {overall_stats['flesch_reading_ease']:.2f}\n"
290
+ doc_stats_str += f"- **Overall Flesch-Kincaid Grade Level:** {overall_stats['flesch_kincaid_grade']:.2f}\n"
291
+ doc_stats_str += f"- **Overall Gunning Fog Score:** {overall_stats['gunning_fog_score']:.2f}\n"
292
+ doc_stats_str += f"- **Overall Average Sentence Length:** {overall_stats['avg_sentence_length']:.2f} words\n"
293
+ else:
294
+ doc_stats_str += "- No content available for overall readability metrics.\n"
295
+
296
+ return doc_stats_str
297
+
298
+ def get_chunk_by_id(self, chunk_id: int) -> dict | None:
299
+ """Retrieves a chunk by its ID."""
300
+ return next((chunk for chunk in self._chunks if chunk["id"] == chunk_id), None)
301
+
302
+ def get_chunk_titles_for_dropdown(self) -> list:
303
+ """Generates dropdown choices using plain text (no HTML)."""
304
+ dropdown_choices = []
305
+ for chunk in self._chunks:
306
+ title = chunk['title']
307
+ dropdown_choices.append(f"{chunk['id']}: {title}")
308
+ return dropdown_choices
309
+
310
+ def update_chunk_content(self, chunk_id: int, new_content: str) -> bool:
311
+ """
312
+ Updates the content of a chunk, recalculates its stats, and updates its title if needed.
313
+ Returns True if successful, False otherwise.
314
+ """
315
+ for chunk in self._chunks:
316
+ if chunk["id"] == chunk_id:
317
+ chunk["content"] = new_content
318
+ chunk["stats"] = self._calculate_chunk_stats(new_content)
319
+ # Update chunk title if it was a placeholder or empty
320
+ if chunk["title"].startswith("[") and chunk["title"].endswith("]") or not chunk["title"]:
321
+ first_line = new_content.split('\n')[0].strip()
322
+ chunk["title"] = first_line[:70].strip() + "..." if len(first_line) > 70 else first_line
323
+ if not chunk["title"]:
324
+ chunk["title"] = "[Empty Section]"
325
+ elif not new_content:
326
+ chunk["title"] = "[Empty Section]"
327
+ return True
328
+ return False
329
+
330
+ def delete_chunk(self, chunk_id: int) -> bool:
331
+ """
332
+ Deletes a chunk by ID and re-indexes remaining chunks.
333
+ Returns True if successful, False otherwise.
334
+ """
335
+ initial_chunk_count = len(self._chunks)
336
+ self._chunks = [chunk for chunk in self._chunks if chunk["id"] != chunk_id]
337
+ if len(self._chunks) == initial_chunk_count:
338
+ return False # Chunk not found
339
+
340
+ # Re-index IDs to be sequential again
341
+ for i, chunk in enumerate(self._chunks):
342
+ chunk['id'] = i
343
+
344
+ return True
345
+
346
+ def get_final_markdown(self) -> str:
347
+ """Compiles all current chunks into a single Markdown string."""
348
+ final_md = ""
349
+ if not self._chunks:
350
+ return "No content to compile. Please process a URL first."
351
+
352
+ for chunk in self._chunks:
353
+ # Use H1 heading if title is meaningful
354
+ if not chunk["title"].startswith("[") and chunk["title"]:
355
+ final_md += f"# {chunk['title']}\n\n"
356
+ final_md += f"{chunk['content']}\n\n"
357
+
358
+ return final_md.strip()
359
+
360
+ def set_targets(self, flesch_min: float, grade_max: float, min_words: int, max_words: int):
361
+ """Sets the global readability and word count targets."""
362
+ self.target_flesch_min = flesch_min
363
+ self.target_grade_max = grade_max
364
+ self.target_min_chunk_words = min_words
365
+ self.target_max_chunk_words = max_words
366
+ # Recalculate stats for all chunks to reflect new targets in color coding (if displayed)
367
+ for chunk in self._chunks:
368
+ chunk['stats'] = self._calculate_chunk_stats(chunk['content'])
369
+
370
# --- Streamlit UI Definition ---
st.set_page_config(layout="wide", page_title="Chunk-Powered Webpage Editor")

# Seed session state on first run only; factories defer construction so the
# managers are built once and survive Streamlit reruns.
for _key, _factory in (
    ('chunk_manager', ChunkManager),
    ('content_processor', WebpageContentProcessor),
    ('status_message', lambda: ""),
    ('chunk_selector', lambda: None),
    ('chunk_content_editor', lambda: ""),
    ('final_markdown', lambda: "Click 'Compile All Chunks' to see the final document with your edits."),
):
    if _key not in st.session_state:
        st.session_state[_key] = _factory()
386
+
387
+
388
# Bind the managers out of session state so they persist across reruns.
content_processor = st.session_state.content_processor
chunk_manager = st.session_state.chunk_manager

st.markdown("# <center>✨ Chunk-Powered Webpage Editor ✨</center>", unsafe_allow_html=True)
st.markdown(
    "Enter a URL, fetch its content, and break it into editable 'chunks'. "
    "Review statistics, set targets, edit chunks, and compile your final Markdown."
)

# --- URL Input and Processing ---
url_col, action_col = st.columns([4, 1])
with url_col:
    url_input = st.text_input(
        label="Enter Webpage URL",
        placeholder="e.g., https://www.llamaindex.ai/blog/what-is-llamaindex",
        key="url_input"
    )
with action_col:
    # Two empty writes push the button down level with the text input.
    st.write("")
    st.write("")
    process_button = st.button("Process URL", use_container_width=True)

# Surface the last status message (set on the previous run) before processing.
if st.session_state.status_message:
    st.info(st.session_state.status_message)

if process_button:
    if not url_input:
        st.session_state.status_message = "Please enter a URL to process."
    else:
        with st.spinner("Processing URL..."):
            markdown_content = content_processor.fetch_and_convert_to_markdown(url_input)

            if "Error" in markdown_content:
                # Fetch/convert failed: clear any stale chunks and show the error.
                chunk_manager.set_chunks([])
                st.session_state.status_message = markdown_content
            else:
                chunks = content_processor.parse_markdown_into_chunks(markdown_content)
                chunk_manager.set_chunks(chunks)
                if chunks:
                    st.session_state.status_message = "URL processed successfully!"
                    # Pre-select the first chunk in the editor dropdown.
                    st.session_state.chunk_selector = chunk_manager.get_chunk_titles_for_dropdown()[0]
                else:
                    st.session_state.status_message = "URL processed, but no content chunks could be extracted."
                    st.session_state.chunk_selector = None
433
+
434
+
435
# --- Tabs for Editor and Overview ---
tab1, tab2 = st.tabs(["Editor", "Document Overview & Targets"])

with tab1:
    st.markdown("## Edit Chunks Individually")

    col1, col2 = st.columns([2, 1])

    with col1:
        # Dropdown options are "id: title" strings built from current chunks.
        chunk_selector_options = chunk_manager.get_chunk_titles_for_dropdown()
        if chunk_selector_options:
            try:
                # Find the index of the currently selected item to handle updates
                # (TypeError covers chunk_selector being None on first load).
                current_selection_index = chunk_selector_options.index(st.session_state.chunk_selector)
            except (ValueError, TypeError):
                current_selection_index = 0

            selected_chunk_title = st.selectbox(
                label="Select Chunk to Edit",
                options=chunk_selector_options,
                index=current_selection_index,
                key="chunk_selector"
            )
        else:
            # Placeholder, disabled selectbox when no document has been processed.
            selected_chunk_title = st.selectbox(
                label="Select Chunk to Edit",
                options=["No chunks available"],
                disabled=True
            )

    with col2:
        nav_col1, nav_col2 = st.columns(2)
        with nav_col1:
            if st.button("⬅️ Previous Chunk", use_container_width=True):
                if selected_chunk_title and "No chunks available" not in selected_chunk_title:
                    # Option labels are "id: title"; parse the id back out.
                    # NOTE(review): assumes the title itself contains no leading
                    # "N:"-style prefix ambiguity — split(':')[0] takes the first
                    # colon, which is always the id separator here.
                    current_id = int(selected_chunk_title.split(':')[0].strip())
                    new_id = max(0, current_id - 1)  # clamp at first chunk
                    new_chunk = chunk_manager.get_chunk_by_id(new_id)
                    if new_chunk:
                        # NOTE(review): assigning to a session_state entry that is
                        # also a widget key ("chunk_selector") *after* the widget
                        # was instantiated raises StreamlitAPIException on recent
                        # Streamlit versions — confirm target version or move this
                        # into an on_click callback.
                        st.session_state.chunk_selector = f"{new_chunk['id']}: {new_chunk['title']}"

        with nav_col2:
            if st.button("Next Chunk ➡️", use_container_width=True):
                if selected_chunk_title and "No chunks available" not in selected_chunk_title:
                    current_id = int(selected_chunk_title.split(':')[0].strip())
                    new_id = min(len(chunk_manager.get_chunks()) - 1, current_id + 1)  # clamp at last chunk
                    new_chunk = chunk_manager.get_chunk_by_id(new_id)
                    if new_chunk:
                        # NOTE(review): same widget-key mutation caveat as above.
                        st.session_state.chunk_selector = f"{new_chunk['id']}: {new_chunk['title']}"

    # Get the currently selected chunk
    selected_chunk = None
    if selected_chunk_title and "No chunks available" not in selected_chunk_title:
        current_id = int(selected_chunk_title.split(':')[0].strip())
        selected_chunk = chunk_manager.get_chunk_by_id(current_id)

    if selected_chunk:
        # Title is derived automatically from headings/content; read-only here.
        st.text_input(
            label="Chunk Title (Auto-detected)",
            value=selected_chunk["title"],
            disabled=True
        )

        chunk_content_editor = st.text_area(
            label="Chunk Content",
            value=selected_chunk["content"],
            height=250,
            key=f"editor_{selected_chunk['id']}"  # Unique key to prevent state loss
        )

        # Per-chunk statistics with target-based color coding (HTML spans).
        st.markdown(
            chunk_manager.format_chunk_stats(selected_chunk['stats']),
            unsafe_allow_html=True
        )

        update_col, delete_col, _ = st.columns([1, 1, 3])
        with update_col:
            if st.button("Update Selected Chunk", use_container_width=True):
                chunk_manager.update_chunk_content(selected_chunk['id'], chunk_content_editor)
                st.session_state.status_message = f"Chunk '{selected_chunk_title}' updated successfully!"
                # Force a re-render to update the dropdown with the new title
                # NOTE(review): same widget-key mutation caveat as the nav buttons.
                st.session_state.chunk_selector = f"{selected_chunk['id']}: {chunk_manager.get_chunk_by_id(selected_chunk['id'])['title']}"

        with delete_col:
            if st.button("Delete Selected Chunk", use_container_width=True):
                chunk_manager.delete_chunk(selected_chunk['id'])
                st.session_state.status_message = f"Chunk '{selected_chunk_title}' deleted successfully!"
                # Fall back to the first remaining chunk, or clear the selection.
                if chunk_manager.get_chunks():
                    st.session_state.chunk_selector = chunk_manager.get_chunk_titles_for_dropdown()[0]
                else:
                    st.session_state.chunk_selector = None

    else:
        # No chunk selected / no document loaded: disabled placeholders.
        st.text_input("Chunk Title (Auto-detected)", "Title of the selected chunk", disabled=True)
        st.text_area("Chunk Content", "Content of the selected chunk will appear here for editing.", height=250, disabled=True)
        st.markdown("Chunk statistics will appear here.")

    st.markdown("---")
    st.markdown("## Final Compiled Markdown")

    if st.button("Compile All Chunks", use_container_width=True):
        st.session_state.final_markdown = chunk_manager.get_final_markdown()

    st.text_area(
        label="Compiled Markdown",
        value=st.session_state.final_markdown,
        height=400,
        key="final_markdown_output",
        disabled=False
    )
546
+
547
with tab2:
    # Document-wide aggregate statistics (recomputed on every rerun).
    st.markdown("## Document Summary Statistics")
    st.markdown(chunk_manager.get_document_summary_stats(), unsafe_allow_html=True)

    st.markdown("---")
    st.markdown("## Content Targets")
    st.markdown("Adjust these targets to guide your writing and see visual feedback in the chunk selector (green=good, red=needs attention).")

    # A form batches the four inputs so nothing applies until submit.
    with st.form("targets_form"):
        left_col, right_col = st.columns(2)
        with left_col:
            target_flesch_min_input = st.number_input("Min Flesch Reading Ease", value=float(chunk_manager.target_flesch_min))
            target_min_chunk_words_input = st.number_input("Min Chunk Words", value=chunk_manager.target_min_chunk_words)
        with right_col:
            target_grade_max_input = st.number_input("Max Flesch-Kincaid Grade", value=float(chunk_manager.target_grade_max))
            target_max_chunk_words_input = st.number_input("Max Chunk Words", value=chunk_manager.target_max_chunk_words)

        if st.form_submit_button("Set New Targets", use_container_width=True):
            chunk_manager.set_targets(
                target_flesch_min_input,
                target_grade_max_input,
                int(target_min_chunk_words_input),
                int(target_max_chunk_words_input)
            )
            st.session_state.status_message = "Target settings updated."
            st.rerun()