Em4e committed on
Commit
92ff4ac
·
verified ·
1 Parent(s): c7506fd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +293 -296
app.py CHANGED
@@ -1,297 +1,294 @@
1
- import streamlit as st
2
- import requests
3
- from bs4 import BeautifulSoup
4
- from html_to_markdown import convert_to_markdown
5
- import re
6
- from llama_index.core.node_parser import MarkdownNodeParser
7
- from llama_index.core.schema import Document, MetadataMode
8
- import textstat # For readability metrics
9
-
10
class WebpageContentProcessor:
    """Fetch a webpage, isolate its main content, and split it into chunks.

    Failures are reported through return values instead of exceptions:
    ``fetch_and_convert_to_markdown`` returns a message that starts with
    ``"Error"``, and ``is_error_message`` recognises such messages.
    """

    # Prefixes of the failure strings this class produces. The last entry is
    # the earlier wording of the unexpected-error message, kept so strings in
    # that older format are still recognised.
    _ERROR_PREFIXES = (
        "Error: ",
        "Error fetching URL: ",
        "An unexpected error occurred: ",
    )
    _USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )
    _REQUEST_TIMEOUT_SECONDS = 15
    # Tags that never carry readable content; removed from the whole page.
    _NON_CONTENT_TAGS = ("script", "style", "noscript", "meta", "link")
    # Ancestors of the first <h1> that are accepted as the content container.
    _CONTAINER_TAGS = ("article", "main", "section", "div")
    _MAX_ANCESTOR_STEPS = 5
    # Boilerplate removed from inside the chosen content container.
    _UNWANTED_SELECTORS = (
        "nav", "header", "footer", "aside", "iframe", "form", "button", "input",
        "textarea", "svg", "figure", "figcaption",
        ".social-share", ".comments", ".related-posts", ".pagination",
        ".breadcrumbs", ".cookie-consent", '[role="navigation"]',
        '[role="banner"]', '[role="contentinfo"]',
    )
    # Matches "ad"/"ads"/"advert..." only as a whole dash/underscore-delimited
    # token. A plain substring test on "ad" would also hit "header",
    # "heading", "loading", "shadow", ... and delete real content.
    _AD_MARKER_RE = re.compile(
        r"(?:^|[-_])(?:ads?|advert\w*|adsbygoogle)(?:$|[-_])", re.IGNORECASE
    )
    _MAX_FALLBACK_TITLE_CHARS = 70

    def __init__(self):
        """Create a processor; it keeps no per-instance state."""

    @classmethod
    def is_error_message(cls, text: str) -> bool:
        """Return True when *text* is one of this class's failure messages."""
        return bool(text) and text.startswith(cls._ERROR_PREFIXES)

    def fetch_and_convert_to_markdown(self, url: str) -> str:
        """Download *url* and return its main content as Markdown.

        Args:
            url: Address of the page to fetch.

        Returns:
            The Markdown text, or a message starting with ``"Error"`` when the
            page cannot be fetched or converted. This method does not raise.
        """
        try:
            response = requests.get(
                url.strip(),
                headers={"User-Agent": self._USER_AGENT},
                timeout=self._REQUEST_TIMEOUT_SECONDS,
            )
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            for tag_name in self._NON_CONTENT_TAGS:
                for element in soup.find_all(tag_name):
                    element.decompose()

            content = self._locate_main_content(soup)
            if content is None:
                return "Error: Could not identify main content for conversion."

            self._strip_boilerplate(content)

            markdown_output = convert_to_markdown(str(content))
            # Collapse runs of blank lines left behind by removed elements.
            return re.sub(r"\n{3,}", "\n\n", markdown_output).strip()

        except requests.exceptions.Timeout:
            return "Error: Request timed out. The server took too long to respond."
        except requests.exceptions.RequestException as e:
            return f"Error fetching URL: {e}."
        except Exception as e:
            # Starts with "Error" like every other failure so callers can
            # detect it the same way; the previous lowercase wording slipped
            # past their checks and was processed as page content.
            return f"Error: An unexpected error occurred: {e}"

    @classmethod
    def _locate_main_content(cls, soup):
        """Return the element most likely to hold the article, or None."""
        semantic = (
            soup.find("article")
            or soup.find("main")
            or soup.find("div", class_="main-content")
            or soup.find("div", {"role": "main"})
        )
        if semantic is not None:
            return semantic

        first_h1 = soup.find("h1")
        if first_h1 is None:
            return soup.body

        # Walk a few levels up from the first heading looking for a container.
        candidate = first_h1.parent
        for _ in range(cls._MAX_ANCESTOR_STEPS):
            if candidate is None:
                break
            if candidate.name in cls._CONTAINER_TAGS:
                return candidate
            candidate = candidate.parent
        return first_h1.find_parent()

    @classmethod
    def _strip_boilerplate(cls, content) -> None:
        """Remove navigation, forms, ads and similar noise from *content*."""
        for selector in cls._UNWANTED_SELECTORS:
            cls._decompose_all(content.select(selector))
        cls._decompose_all(content.find_all(cls._is_ad_element))

    @staticmethod
    def _decompose_all(elements) -> None:
        """Decompose each element, skipping ones already removed with a parent."""
        for element in elements:
            if not getattr(element, "decomposed", False):
                element.decompose()

    @classmethod
    def _is_ad_element(cls, tag) -> bool:
        """Return True when a class token or the id of *tag* marks an ad."""
        classes = tag.get("class") or []
        if isinstance(classes, str):
            classes = classes.split()
        markers = list(classes)
        element_id = tag.get("id")
        if element_id:
            markers.append(element_id)
        return any(cls._is_ad_marker(marker) for marker in markers)

    @classmethod
    def _is_ad_marker(cls, value: str) -> bool:
        """Return True when *value* contains an ad token such as ``ad-banner``."""
        return bool(cls._AD_MARKER_RE.search(value))

    @classmethod
    def _derive_title(cls, content: str) -> str:
        """Return a display title: the heading text, else the first line.

        The ellipsis is appended only when the first line was really cut.
        """
        first_line = content.strip().split("\n", 1)[0].strip()
        heading = re.match(r"#+\s*(.*)", first_line)
        if heading:
            return heading.group(1).strip() or "Untitled chunk"
        if len(first_line) > cls._MAX_FALLBACK_TITLE_CHARS:
            return first_line[: cls._MAX_FALLBACK_TITLE_CHARS] + "..."
        return first_line or "Untitled chunk"

    def parse_markdown_into_chunks(self, markdown_content: str) -> list:
        """Split Markdown into chunk dicts with ``id``, ``title``, ``content``.

        Args:
            markdown_content: Markdown text, or a failure message from
                ``fetch_and_convert_to_markdown``.

        Returns:
            A list of chunk dicts with consecutive ids starting at 0. Empty
            when the input is empty or is a failure message. Only the message
            prefix is checked, so pages that merely mention the word "Error"
            are still processed.
        """
        if not markdown_content or self.is_error_message(markdown_content):
            return []

        parser = MarkdownNodeParser(include_metadata=True)
        nodes = parser.get_nodes_from_documents([Document(text=markdown_content)])

        structured_chunks = []
        for node in nodes:
            content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
            if not content:
                continue  # nothing to edit in an empty node
            structured_chunks.append({
                "id": len(structured_chunks),
                "title": self._derive_title(content),
                "content": content,
            })
        return structured_chunks
96
-
97
- class ChunkManager:
98
- def __init__(self):
99
- self._chunks = []
100
- self.target_flesch_min = 60
101
- self.target_grade_max = 8
102
- self.target_min_chunk_words = 50
103
- self.target_max_chunk_words = 500
104
-
105
- def set_chunks(self, chunks: list):
106
- self._chunks = [self._add_stats_to_chunk(chunk) for chunk in chunks]
107
-
108
- def get_chunks(self) -> list:
109
- return self._chunks
110
-
111
- def _add_stats_to_chunk(self, chunk: dict) -> dict:
112
- chunk['stats'] = self._calculate_chunk_stats(chunk['content'])
113
- return chunk
114
-
115
- def _calculate_chunk_stats(self, text: str) -> dict:
116
- stats = {}
117
- try:
118
- stats['word_count'] = textstat.lexicon_count(text, removepunct=True)
119
- stats['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
120
- stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)
121
- except Exception:
122
- stats.update({'word_count': 0, 'flesch_reading_ease': 0, 'flesch_kincaid_grade': 0})
123
- return stats
124
-
125
- def format_chunk_stats(self, stats: dict) -> str:
126
- flesch_color = "green" if stats.get('flesch_reading_ease', 0) >= self.target_flesch_min else "red"
127
- grade_color = "green" if stats.get('flesch_kincaid_grade', 0) <= self.target_grade_max else "red"
128
- word_color = "green" if self.target_min_chunk_words <= stats.get('word_count', 0) <= self.target_max_chunk_words else "red"
129
-
130
- return (
131
- f"**Word Count:** <span style='color:{word_color}'>{stats.get('word_count', 0)}</span> | "
132
- f"**Reading Ease:** <span style='color:{flesch_color}'>{stats.get('flesch_reading_ease', 0):.2f}</span> | "
133
- f"**Grade Level:** <span style='color:{grade_color}'>{stats.get('flesch_kincaid_grade', 0):.2f}</span>"
134
- )
135
-
136
- def get_document_summary_stats(self) -> str:
137
- if not self._chunks:
138
- return "No document loaded."
139
-
140
- total_words = sum(c['stats']['word_count'] for c in self._chunks)
141
- avg_ease = sum(c['stats']['flesch_reading_ease'] for c in self._chunks) / len(self._chunks) if self._chunks else 0
142
- avg_grade = sum(c['stats']['flesch_kincaid_grade'] for c in self._chunks) / len(self._chunks) if self._chunks else 0
143
-
144
- return (
145
- f"**Total Chunks:** {len(self._chunks)} | "
146
- f"**Total Words:** {total_words} | "
147
- f"**Avg. Reading Ease:** {avg_ease:.2f} | "
148
- f"**Avg. Grade Level:** {avg_grade:.2f}"
149
- )
150
-
151
- def get_chunk_by_id(self, chunk_id: int) -> dict | None:
152
- return next((c for c in self._chunks if c["id"] == chunk_id), None)
153
-
154
- def update_chunk_content(self, chunk_id: int, new_content: str):
155
- chunk = self.get_chunk_by_id(chunk_id)
156
- if chunk:
157
- chunk["content"] = new_content
158
- self._add_stats_to_chunk(chunk)
159
-
160
- def delete_chunk(self, chunk_id: int):
161
- self._chunks = [c for c in self._chunks if c["id"] != chunk_id]
162
- for i, chunk in enumerate(self._chunks):
163
- chunk['id'] = i
164
-
165
- def get_final_markdown(self) -> str:
166
- if not self._chunks:
167
- return "No content to display."
168
- return "\n\n".join(f"# {c['title']}\n{c['content']}" for c in self._chunks)
169
-
170
- def set_targets(self, flesch_min: float, grade_max: float, min_words: int, max_words: int):
171
- self.target_flesch_min = flesch_min
172
- self.target_grade_max = grade_max
173
- self.target_min_chunk_words = min_words
174
- self.target_max_chunk_words = max_words
175
- self.set_chunks(self.get_chunks()) # Recalculate stats with new targets
176
-
177
st.set_page_config(layout="wide", page_title="Webpage Content Editor")

# Failure strings returned by fetch_and_convert_to_markdown start with one of
# these prefixes. Matching the prefix (rather than looking for the word
# "Error" anywhere in the text) keeps pages that merely mention errors from
# being rejected, and also recognises the unexpected-error message.
FETCH_ERROR_PREFIXES = (
    "Error: ",
    "Error fetching URL: ",
    "An unexpected error occurred: ",
)

# Initialize session state variables
if 'chunk_manager' not in st.session_state:
    st.session_state.chunk_manager = ChunkManager()
if 'content_processor' not in st.session_state:
    st.session_state.content_processor = WebpageContentProcessor()
if 'selected_chunk_id' not in st.session_state:
    st.session_state.selected_chunk_id = None
if 'status_message' not in st.session_state:
    st.session_state.status_message = ""

processor = st.session_state.content_processor
manager = st.session_state.chunk_manager

st.title("✨ Webpage Content Editor")
st.caption("Created by [Emilija Gjorgjevska](https://www.linkedin.com/in/emilijagjorgjevska/) | Inspired by Andrea Volpini's work on content chunking.")

st.info(
    "**Note:** Some URLs may be blocked due to server policies (like bot detection). "
    "This is an early version, so expect a few bugs!",
    icon="ℹ️"
)

url_input = st.text_input("Enter a webpage URL to begin", key="url_input")

if st.button("Process URL", use_container_width=True):
    target_url = url_input.strip()
    if not target_url:
        # Tell the user why nothing happened instead of rerunning silently.
        st.session_state.status_message = "Please enter a URL first."
    else:
        with st.spinner("Fetching and processing content..."):
            markdown = processor.fetch_and_convert_to_markdown(target_url)
            if markdown.startswith(FETCH_ERROR_PREFIXES):
                st.session_state.status_message = markdown
                manager.set_chunks([])
            else:
                chunks = processor.parse_markdown_into_chunks(markdown)
                manager.set_chunks(chunks)
                st.session_state.status_message = f"Successfully processed {len(chunks)} chunks." if chunks else "Could not extract content chunks."

        loaded_chunks = manager.get_chunks()
        st.session_state.selected_chunk_id = loaded_chunks[0]['id'] if loaded_chunks else None
    st.rerun()

if st.session_state.status_message:
    st.toast(st.session_state.status_message)
    st.session_state.status_message = ""  # Clear message after showing

tab1, tab2 = st.tabs(["Chunk Editor", "Settings & Overview"])

with tab1:
    chunks = manager.get_chunks()
    if not chunks:
        st.write("Process a URL to start editing chunks.")
    else:
        chunk_options = {c['id']: f"Chunk {c['id']}: {c['title']}" for c in chunks}

        # Ensure selected_chunk_id is valid
        if st.session_state.selected_chunk_id not in chunk_options:
            st.session_state.selected_chunk_id = chunks[0]['id']

        def on_select_change():
            # Callback to update the selected ID in session state
            st.session_state.selected_chunk_id = st.session_state.chunk_selector

        # Push the selection into the widget's own key before it is created,
        # so the selectbox always shows the chunk being edited (for example
        # after a delete or after loading a new page). This replaces the
        # `index=` argument, which only supplies an initial default.
        st.session_state.chunk_selector = st.session_state.selected_chunk_id

        st.selectbox(
            "Select a chunk to edit",
            options=list(chunk_options),
            format_func=lambda chunk_id: chunk_options.get(chunk_id, "Invalid Chunk"),
            key="chunk_selector",
            on_change=on_select_change,
        )

        selected_chunk = manager.get_chunk_by_id(st.session_state.selected_chunk_id)

        if selected_chunk:
            st.markdown(manager.format_chunk_stats(selected_chunk['stats']), unsafe_allow_html=True)

            edited_content = st.text_area(
                "Chunk Content",
                value=selected_chunk['content'],
                height=300,
                key=f"editor_{selected_chunk['id']}"  # One editor state per chunk id
            )

            col1, col2, _ = st.columns([1, 1, 4])
            if col1.button("Update Chunk", use_container_width=True, key=f"update_{selected_chunk['id']}"):
                manager.update_chunk_content(selected_chunk['id'], edited_content)
                st.session_state.status_message = "Chunk updated!"
                st.rerun()

            if col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"):
                manager.delete_chunk(selected_chunk['id'])
                st.session_state.status_message = "Chunk deleted!"
                # Fall back to the first remaining chunk, or none if empty.
                # `selected_chunk_id` is not a widget key here, so it may be
                # assigned after the selectbox has been created.
                remaining_chunks = manager.get_chunks()
                st.session_state.selected_chunk_id = remaining_chunks[0]['id'] if remaining_chunks else None
                st.rerun()

with tab2:
    st.subheader("Document Overview")
    st.markdown(manager.get_document_summary_stats(), unsafe_allow_html=True)

    st.subheader("Content Targets")
    with st.form("targets_form"):
        c1, c2 = st.columns(2)
        f_min = c1.number_input("Min Flesch Reading Ease", value=float(manager.target_flesch_min))
        g_max = c2.number_input("Max Flesch-Kincaid Grade", value=float(manager.target_grade_max))
        w_min = c1.number_input("Min Chunk Words", value=int(manager.target_min_chunk_words))
        w_max = c2.number_input("Max Chunk Words", value=int(manager.target_max_chunk_words))

        if st.form_submit_button("Set New Targets", use_container_width=True):
            if w_min > w_max:
                # An inverted range would mark every chunk as out of target.
                st.session_state.status_message = "Min Chunk Words cannot exceed Max Chunk Words."
            else:
                manager.set_targets(f_min, g_max, w_min, w_max)
                st.session_state.status_message = "Targets updated."
            st.rerun()

    st.subheader("Final Document")
    st.text_area("Compiled Markdown", manager.get_final_markdown(), height=400, disabled=False, key="final_markdown")
 
1
+ import streamlit as st
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ from html_to_markdown import convert_to_markdown
5
+ import re
6
+ from llama_index.core.node_parser import MarkdownNodeParser
7
+ from llama_index.core.schema import Document, MetadataMode
8
+ import textstat # For readability metrics
9
+
10
class WebpageContentProcessor:
    """Fetch a webpage, isolate its main content, and split it into chunks.

    Failures are reported through return values instead of exceptions:
    ``fetch_and_convert_to_markdown`` returns a message that starts with
    ``"Error"``, and ``is_error_message`` recognises such messages.
    """

    # Prefixes of the failure strings this class produces. The last entry is
    # the earlier wording of the unexpected-error message, kept so strings in
    # that older format are still recognised.
    _ERROR_PREFIXES = (
        "Error: ",
        "Error fetching URL: ",
        "An unexpected error occurred: ",
    )
    _USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )
    _REQUEST_TIMEOUT_SECONDS = 15
    # Tags that never carry readable content; removed from the whole page.
    _NON_CONTENT_TAGS = ("script", "style", "noscript", "meta", "link")
    # Ancestors of the first <h1> that are accepted as the content container.
    _CONTAINER_TAGS = ("article", "main", "section", "div")
    _MAX_ANCESTOR_STEPS = 5
    # Boilerplate removed from inside the chosen content container.
    _UNWANTED_SELECTORS = (
        "nav", "header", "footer", "aside", "iframe", "form", "button", "input",
        "textarea", "svg", "figure", "figcaption",
        ".social-share", ".comments", ".related-posts", ".pagination",
        ".breadcrumbs", ".cookie-consent", '[role="navigation"]',
        '[role="banner"]', '[role="contentinfo"]',
    )
    # Matches "ad"/"ads"/"advert..." only as a whole dash/underscore-delimited
    # token. A plain substring test on "ad" would also hit "header",
    # "heading", "loading", "shadow", ... and delete real content.
    _AD_MARKER_RE = re.compile(
        r"(?:^|[-_])(?:ads?|advert\w*|adsbygoogle)(?:$|[-_])", re.IGNORECASE
    )
    _MAX_FALLBACK_TITLE_CHARS = 70

    def __init__(self):
        """Create a processor; it keeps no per-instance state."""

    @classmethod
    def is_error_message(cls, text: str) -> bool:
        """Return True when *text* is one of this class's failure messages."""
        return bool(text) and text.startswith(cls._ERROR_PREFIXES)

    def fetch_and_convert_to_markdown(self, url: str) -> str:
        """Download *url* and return its main content as Markdown.

        Args:
            url: Address of the page to fetch.

        Returns:
            The Markdown text, or a message starting with ``"Error"`` when the
            page cannot be fetched or converted. This method does not raise.
        """
        try:
            response = requests.get(
                url.strip(),
                headers={"User-Agent": self._USER_AGENT},
                timeout=self._REQUEST_TIMEOUT_SECONDS,
            )
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            for tag_name in self._NON_CONTENT_TAGS:
                for element in soup.find_all(tag_name):
                    element.decompose()

            content = self._locate_main_content(soup)
            if content is None:
                return "Error: Could not identify main content for conversion."

            self._strip_boilerplate(content)

            markdown_output = convert_to_markdown(str(content))
            # Collapse runs of blank lines left behind by removed elements.
            return re.sub(r"\n{3,}", "\n\n", markdown_output).strip()

        except requests.exceptions.Timeout:
            return "Error: Request timed out. The server took too long to respond."
        except requests.exceptions.RequestException as e:
            return f"Error fetching URL: {e}."
        except Exception as e:
            # Starts with "Error" like every other failure so callers can
            # detect it the same way; the previous lowercase wording slipped
            # past their checks and was processed as page content.
            return f"Error: An unexpected error occurred: {e}"

    @classmethod
    def _locate_main_content(cls, soup):
        """Return the element most likely to hold the article, or None."""
        semantic = (
            soup.find("article")
            or soup.find("main")
            or soup.find("div", class_="main-content")
            or soup.find("div", {"role": "main"})
        )
        if semantic is not None:
            return semantic

        first_h1 = soup.find("h1")
        if first_h1 is None:
            return soup.body

        # Walk a few levels up from the first heading looking for a container.
        candidate = first_h1.parent
        for _ in range(cls._MAX_ANCESTOR_STEPS):
            if candidate is None:
                break
            if candidate.name in cls._CONTAINER_TAGS:
                return candidate
            candidate = candidate.parent
        return first_h1.find_parent()

    @classmethod
    def _strip_boilerplate(cls, content) -> None:
        """Remove navigation, forms, ads and similar noise from *content*."""
        for selector in cls._UNWANTED_SELECTORS:
            cls._decompose_all(content.select(selector))
        cls._decompose_all(content.find_all(cls._is_ad_element))

    @staticmethod
    def _decompose_all(elements) -> None:
        """Decompose each element, skipping ones already removed with a parent."""
        for element in elements:
            if not getattr(element, "decomposed", False):
                element.decompose()

    @classmethod
    def _is_ad_element(cls, tag) -> bool:
        """Return True when a class token or the id of *tag* marks an ad."""
        classes = tag.get("class") or []
        if isinstance(classes, str):
            classes = classes.split()
        markers = list(classes)
        element_id = tag.get("id")
        if element_id:
            markers.append(element_id)
        return any(cls._is_ad_marker(marker) for marker in markers)

    @classmethod
    def _is_ad_marker(cls, value: str) -> bool:
        """Return True when *value* contains an ad token such as ``ad-banner``."""
        return bool(cls._AD_MARKER_RE.search(value))

    @classmethod
    def _derive_title(cls, content: str) -> str:
        """Return a display title: the heading text, else the first line.

        The ellipsis is appended only when the first line was really cut.
        """
        first_line = content.strip().split("\n", 1)[0].strip()
        heading = re.match(r"#+\s*(.*)", first_line)
        if heading:
            return heading.group(1).strip() or "Untitled chunk"
        if len(first_line) > cls._MAX_FALLBACK_TITLE_CHARS:
            return first_line[: cls._MAX_FALLBACK_TITLE_CHARS] + "..."
        return first_line or "Untitled chunk"

    def parse_markdown_into_chunks(self, markdown_content: str) -> list:
        """Split Markdown into chunk dicts with ``id``, ``title``, ``content``.

        Args:
            markdown_content: Markdown text, or a failure message from
                ``fetch_and_convert_to_markdown``.

        Returns:
            A list of chunk dicts with consecutive ids starting at 0. Empty
            when the input is empty or is a failure message. Only the message
            prefix is checked, so pages that merely mention the word "Error"
            are still processed.
        """
        if not markdown_content or self.is_error_message(markdown_content):
            return []

        parser = MarkdownNodeParser(include_metadata=True)
        nodes = parser.get_nodes_from_documents([Document(text=markdown_content)])

        structured_chunks = []
        for node in nodes:
            content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
            if not content:
                continue  # nothing to edit in an empty node
            structured_chunks.append({
                "id": len(structured_chunks),
                "title": self._derive_title(content),
                "content": content,
            })
        return structured_chunks
96
+
97
+ class ChunkManager:
98
+ def __init__(self):
99
+ self._chunks = []
100
+ self.target_flesch_min = 60
101
+ self.target_grade_max = 8
102
+ self.target_min_chunk_words = 50
103
+ self.target_max_chunk_words = 500
104
+
105
+ def set_chunks(self, chunks: list):
106
+ self._chunks = [self._add_stats_to_chunk(chunk) for chunk in chunks]
107
+
108
+ def get_chunks(self) -> list:
109
+ return self._chunks
110
+
111
+ def _add_stats_to_chunk(self, chunk: dict) -> dict:
112
+ chunk['stats'] = self._calculate_chunk_stats(chunk['content'])
113
+ return chunk
114
+
115
+ def _calculate_chunk_stats(self, text: str) -> dict:
116
+ stats = {}
117
+ try:
118
+ stats['word_count'] = textstat.lexicon_count(text, removepunct=True)
119
+ stats['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
120
+ stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)
121
+ except Exception:
122
+ stats.update({'word_count': 0, 'flesch_reading_ease': 0, 'flesch_kincaid_grade': 0})
123
+ return stats
124
+
125
+ def format_chunk_stats(self, stats: dict) -> str:
126
+ flesch_color = "green" if stats.get('flesch_reading_ease', 0) >= self.target_flesch_min else "red"
127
+ grade_color = "green" if stats.get('flesch_kincaid_grade', 0) <= self.target_grade_max else "red"
128
+ word_color = "green" if self.target_min_chunk_words <= stats.get('word_count', 0) <= self.target_max_chunk_words else "red"
129
+
130
+ return (
131
+ f"**Word Count:** <span style='color:{word_color}'>{stats.get('word_count', 0)}</span> | "
132
+ f"**Reading Ease:** <span style='color:{flesch_color}'>{stats.get('flesch_reading_ease', 0):.2f}</span> | "
133
+ f"**Grade Level:** <span style='color:{grade_color}'>{stats.get('flesch_kincaid_grade', 0):.2f}</span>"
134
+ )
135
+
136
+ def get_document_summary_stats(self) -> str:
137
+ if not self._chunks:
138
+ return "No document loaded."
139
+
140
+ total_words = sum(c['stats']['word_count'] for c in self._chunks)
141
+ avg_ease = sum(c['stats']['flesch_reading_ease'] for c in self._chunks) / len(self._chunks) if self._chunks else 0
142
+ avg_grade = sum(c['stats']['flesch_kincaid_grade'] for c in self._chunks) / len(self._chunks) if self._chunks else 0
143
+
144
+ return (
145
+ f"**Total Chunks:** {len(self._chunks)} | "
146
+ f"**Total Words:** {total_words} | "
147
+ f"**Avg. Reading Ease:** {avg_ease:.2f} | "
148
+ f"**Avg. Grade Level:** {avg_grade:.2f}"
149
+ )
150
+
151
+ def get_chunk_by_id(self, chunk_id: int) -> dict | None:
152
+ return next((c for c in self._chunks if c["id"] == chunk_id), None)
153
+
154
+ def update_chunk_content(self, chunk_id: int, new_content: str):
155
+ chunk = self.get_chunk_by_id(chunk_id)
156
+ if chunk:
157
+ chunk["content"] = new_content
158
+ self._add_stats_to_chunk(chunk)
159
+
160
+ def delete_chunk(self, chunk_id: int):
161
+ self._chunks = [c for c in self._chunks if c["id"] != chunk_id]
162
+ for i, chunk in enumerate(self._chunks):
163
+ chunk['id'] = i
164
+
165
+ def get_final_markdown(self) -> str:
166
+ if not self._chunks:
167
+ return "No content to display."
168
+ return "\n\n".join(f"# {c['title']}\n{c['content']}" for c in self._chunks)
169
+
170
+ def set_targets(self, flesch_min: float, grade_max: float, min_words: int, max_words: int):
171
+ self.target_flesch_min = flesch_min
172
+ self.target_grade_max = grade_max
173
+ self.target_min_chunk_words = min_words
174
+ self.target_max_chunk_words = max_words
175
+ self.set_chunks(self.get_chunks()) # Recalculate stats with new targets
176
+
177
st.set_page_config(layout="wide", page_title="Webpage Content Editor")

# Failure strings returned by fetch_and_convert_to_markdown start with one of
# these prefixes. Matching the prefix (rather than looking for the word
# "Error" anywhere in the text) keeps pages that merely mention errors from
# being rejected, and also recognises the unexpected-error message.
FETCH_ERROR_PREFIXES = (
    "Error: ",
    "Error fetching URL: ",
    "An unexpected error occurred: ",
)

# Initialize session state variables
if 'chunk_manager' not in st.session_state:
    st.session_state.chunk_manager = ChunkManager()
if 'content_processor' not in st.session_state:
    st.session_state.content_processor = WebpageContentProcessor()
if 'selected_chunk_id' not in st.session_state:
    st.session_state.selected_chunk_id = None
if 'status_message' not in st.session_state:
    st.session_state.status_message = ""

processor = st.session_state.content_processor
manager = st.session_state.chunk_manager


def clear_editor_state() -> None:
    """Drop saved text-area state for every chunk editor.

    Editors are keyed by chunk id and ids are reused after a delete or a new
    page load. Defensive: whether stale text would actually reappear depends
    on how the installed Streamlit version derives widget identity.
    """
    for key in [k for k in st.session_state if str(k).startswith("editor_")]:
        del st.session_state[key]


def delete_chunk_and_reselect(chunk_id: int) -> None:
    """Button callback: delete *chunk_id* and select its successor.

    `selected_chunk_id` is the selectbox's widget key, and Streamlit rejects
    assignments to a widget key once that widget exists in the current run.
    Callbacks run before the script body, so the assignment is legal here.
    """
    chunk_manager = st.session_state.chunk_manager
    chunk_manager.delete_chunk(chunk_id)
    remaining = chunk_manager.get_chunks()
    # delete_chunk renumbers ids to list positions, so the chunk that followed
    # the deleted one now sits at the deleted id (or the list got shorter).
    st.session_state.selected_chunk_id = (
        remaining[min(chunk_id, len(remaining) - 1)]['id'] if remaining else None
    )
    clear_editor_state()
    st.session_state.status_message = "Chunk deleted!"


st.title("✨ Webpage Content Editor")
st.caption("Created by [Emilija Gjorgjevska](https://www.linkedin.com/in/emilijagjorgjevska/) | Inspired by Andrea Volpini's work on content chunking.")

st.info(
    "**Note:** Some URLs may be blocked due to server policies (like bot detection). "
    "This is an early version, so expect a few bugs!",
    icon="ℹ️"
)

url_input = st.text_input("Enter a webpage URL to begin", key="url_input")

if st.button("Process URL", use_container_width=True):
    target_url = url_input.strip()
    if not target_url:
        # Tell the user why nothing happened instead of rerunning silently.
        st.session_state.status_message = "Please enter a URL first."
    else:
        with st.spinner("Fetching and processing content..."):
            markdown = processor.fetch_and_convert_to_markdown(target_url)
            if markdown.startswith(FETCH_ERROR_PREFIXES):
                st.session_state.status_message = markdown
                manager.set_chunks([])
            else:
                chunks = processor.parse_markdown_into_chunks(markdown)
                manager.set_chunks(chunks)
                st.session_state.status_message = f"Successfully processed {len(chunks)} chunks." if chunks else "Could not extract content chunks."

        # The selectbox has not been created yet in this run, so its key may
        # still be assigned directly.
        loaded_chunks = manager.get_chunks()
        st.session_state.selected_chunk_id = loaded_chunks[0]['id'] if loaded_chunks else None
        clear_editor_state()
    st.rerun()

if st.session_state.status_message:
    st.toast(st.session_state.status_message)
    st.session_state.status_message = ""  # Clear message after showing

tab1, tab2 = st.tabs(["Chunk Editor", "Settings & Overview"])

with tab1:
    chunks = manager.get_chunks()
    if not chunks:
        st.write("Process a URL to start editing chunks.")
    else:
        chunk_options = {c['id']: f"Chunk {c['id']}: {c['title']}" for c in chunks}

        # Ensure selected_chunk_id is valid before the widget reads it.
        if st.session_state.selected_chunk_id not in chunk_options:
            st.session_state.selected_chunk_id = chunks[0]['id']

        # The selectbox owns `selected_chunk_id` through its key, so its value
        # comes from session state; no `index=` is passed because the default
        # and the session-state value would compete.
        st.selectbox(
            "Select a chunk to edit",
            options=list(chunk_options),
            format_func=lambda chunk_id: chunk_options.get(chunk_id, "Invalid Chunk"),
            key="selected_chunk_id",
        )

        selected_chunk = manager.get_chunk_by_id(st.session_state.selected_chunk_id)

        if selected_chunk:
            st.markdown(manager.format_chunk_stats(selected_chunk['stats']), unsafe_allow_html=True)

            edited_content = st.text_area(
                "Chunk Content",
                value=selected_chunk['content'],
                height=300,
                key=f"editor_{selected_chunk['id']}"  # One editor state per chunk id
            )

            col1, col2, _ = st.columns([1, 1, 4])
            if col1.button("Update Chunk", use_container_width=True, key=f"update_{selected_chunk['id']}"):
                manager.update_chunk_content(selected_chunk['id'], edited_content)
                st.session_state.status_message = "Chunk updated!"
                st.rerun()

            # Deletion changes the selection, so it runs as a callback (see
            # delete_chunk_and_reselect); the rerun that follows a callback
            # redraws the page.
            col2.button(
                "Delete Chunk",
                use_container_width=True,
                key=f"delete_{selected_chunk['id']}",
                on_click=delete_chunk_and_reselect,
                args=(selected_chunk['id'],),
            )

with tab2:
    st.subheader("Document Overview")
    st.markdown(manager.get_document_summary_stats(), unsafe_allow_html=True)

    st.subheader("Content Targets")
    with st.form("targets_form"):
        c1, c2 = st.columns(2)
        f_min = c1.number_input("Min Flesch Reading Ease", value=float(manager.target_flesch_min))
        g_max = c2.number_input("Max Flesch-Kincaid Grade", value=float(manager.target_grade_max))
        w_min = c1.number_input("Min Chunk Words", value=int(manager.target_min_chunk_words))
        w_max = c2.number_input("Max Chunk Words", value=int(manager.target_max_chunk_words))

        if st.form_submit_button("Set New Targets", use_container_width=True):
            if w_min > w_max:
                # An inverted range would mark every chunk as out of target.
                st.session_state.status_message = "Min Chunk Words cannot exceed Max Chunk Words."
            else:
                manager.set_targets(f_min, g_max, w_min, w_max)
                st.session_state.status_message = "Targets updated."
            st.rerun()

    st.subheader("Final Document")
    st.text_area("Compiled Markdown", manager.get_final_markdown(), height=400, disabled=False, key="final_markdown")