Em4e committed on
Commit
4f72763
·
verified ·
1 Parent(s): de55a7c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -110
app.py CHANGED
@@ -198,39 +198,49 @@ init_session_state()
198
  processor = st.session_state.processor
199
  manager = st.session_state.manager
200
 
201
- st.title("Chunk Webpage Content Editor")
202
- st.caption("A tool to fetch, chunk, and refine web content for AI synthesis.")
203
- st.markdown(
204
- "Developed by [Emilija Gjorgjevska](https://www.linkedin.com/in/emilijagjorgjevska/). "
205
- "Inspired by Andrea Volpini's [work on content chunking](https://www.linkedin.com/pulse/understanding-chunking-google-ai-mode-practical-content-volpini-zseaf/)")
206
-
207
- # --- MODIFIED: Added concise guidelines to the expander ---
208
- with st.expander("ℹ️ App Information & AI Writing Guidelines", expanded=False):
209
- st.info(
210
- """
211
- ### How Layout-Based Chunking is Implemented Here
212
- This app uses a two-step process to create meaningful chunks based on a document’s structure:
213
- 1. **Structural Preservation (HTML → Markdown):** It converts a webpage’s HTML into Markdown, preserving the original layout and hierarchy (e.g., `<h1>` becomes `#`).
214
- 2. **Layout-Aware Parsing (`MarkdownNodeParser`):** It then uses LlamaIndex’s `MarkdownNodeParser` to split the Markdown at logical boundaries (like headers), yielding context-aware chunks that respect the original sections.
 
 
 
 
215
 
216
- ---
217
-
218
- ### Writing for AI Verifiability: A Quick Guide
219
- To ensure your content is selected and cited by AI, focus on making each chunk clear, coherent, and verifiable.
220
-
221
- * **Structure with Headers:** Use a logical hierarchy of headings (H1, H2, H3) in your source content. The app uses these to create chunks.
222
- * **Write for Clarity:**
223
- * Use short, direct sentences.
224
- * State facts explicitly—don't make the AI guess.
225
- * Follow the "one idea per paragraph" rule to create self-contained, meaningful chunks.
226
- * **Create Verifiable Blocks:** Format content as direct definitions, Q&A sections, or step-by-step guides. These are ideal formats for AI to extract and use as answers.
227
- * **Use the Editor's Metrics:** In the "Chunk Editor" tab, use the real-time stats to guide your writing.
228
- * **Reading Ease:** Aim for a score **above 60**.
229
- * **Word Count:** Keep chunks within the target range (e.g., 40-600 words).
230
- * The colors (red/green) will show if you are meeting the targets set in the "Settings" tab.
231
- """
232
- , icon="💡")
233
 
 
 
 
 
 
 
 
 
 
 
 
 
234
 
235
  url_input = st.text_input("Enter a webpage URL to start", key="url_input")
236
 
@@ -257,85 +267,60 @@ if st.session_state.status_message:
257
  st.toast(st.session_state.status_message)
258
  st.session_state.status_message = ""
259
 
260
- tab1, tab2 = st.tabs(["Chunk Editor", "Settings & Overview"])
261
-
262
- with tab1:
263
- chunks = manager.get_chunks()
264
- if not chunks:
265
- st.write("Process a URL to begin editing content chunks.")
266
- else:
267
- chunk_ids = [c['id'] for c in chunks]
268
- if st.session_state.selected_chunk_id not in chunk_ids:
269
- st.session_state.selected_chunk_id = chunk_ids[0] if chunk_ids else None
270
-
271
- if st.session_state.selected_chunk_id is not None:
272
- chunk_options = {c['id']: c['title'] for c in chunks}
273
-
274
- selected_id = st.selectbox(
275
- "Select a chunk to edit",
276
- options=chunk_ids,
277
- format_func=lambda x: f"Chunk {x}: {chunk_options.get(x, 'N/A')}",
278
- index=chunk_ids.index(st.session_state.selected_chunk_id)
279
- )
280
-
281
- if selected_id != st.session_state.selected_chunk_id:
282
- st.session_state.selected_chunk_id = selected_id
283
- st.rerun()
284
-
285
- selected_chunk = manager.get_chunk_by_id(st.session_state.selected_chunk_id)
286
-
287
- if selected_chunk:
288
- # --- Side-by-side layout for editor and live preview ---
289
- editor_col, preview_col = st.columns(2)
290
-
291
- with editor_col:
292
- st.markdown(f"**Editing: {selected_chunk['title']}**")
293
- st.markdown(manager.format_chunk_stats(selected_chunk['stats']), unsafe_allow_html=True)
294
-
295
- edited_content = st.text_area(
296
- "Chunk Content (Markdown)",
297
- value=selected_chunk['content'],
298
- height=400,
299
- key=f"editor_{selected_chunk['id']}"
300
- )
301
-
302
- b_col1, b_col2, _ = st.columns([1, 1, 3])
303
-
304
- if b_col1.button("Update Chunk", use_container_width=True, key=f"update_{selected_chunk['id']}"):
305
- manager.update_chunk_content(selected_chunk['id'], edited_content)
306
- st.session_state.status_message = "Chunk updated successfully!"
307
- st.rerun()
308
-
309
- if b_col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"):
310
- manager.delete_chunk(selected_chunk['id'])
311
- st.session_state.status_message = "Chunk deleted."
312
- remaining_chunks = manager.get_chunks()
313
- st.session_state.selected_chunk_id = remaining_chunks[0]['id'] if remaining_chunks else None
314
- st.rerun()
315
-
316
- with preview_col:
317
- st.markdown("**Live Preview**")
318
- with st.container(height=525, border=True):
319
- st.markdown(edited_content, unsafe_allow_html=True)
320
-
321
-
322
- with tab2:
323
- st.subheader("Document Overview")
324
- st.markdown(manager.get_document_summary_stats(), unsafe_allow_html=True)
325
-
326
- st.subheader("Content Targets")
327
- with st.form("targets_form"):
328
- st.write("Set readability targets to guide your editing. See color feedback in the editor.")
329
- c1, c2 = st.columns(2)
330
- f_min = c1.number_input("Min Flesch Reading Ease", value=float(manager.target_flesch_min))
331
- g_max = c2.number_input("Max Flesch-Kincaid Grade", value=float(manager.target_grade_max))
332
- w_min = c1.number_input("Min Chunk Words", value=int(manager.target_min_chunk_words))
333
- w_max = c2.number_input("Max Chunk Words", value=int(manager.target_max_chunk_words))
334
 
335
- if st.form_submit_button("Set New Targets", use_container_width=True):
336
- manager.set_targets(f_min, g_max, w_min, w_max)
337
- st.session_state.status_message = "Content targets have been updated."
338
  st.rerun()
339
 
340
- st.subheader("Final Compiled Document")
341
- st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  processor = st.session_state.processor
199
  manager = st.session_state.manager
200
 
201
+
202
+ # --- MODIFIED: Settings and Overview moved to a persistent sidebar ---
203
+ with st.sidebar:
204
+ st.image("https://www.wordlift.io/wp-content/uploads/2024/05/wl-logo-wordlift-ai-white.png", width=150)
205
+ st.title("Settings & Overview")
206
+
207
+ with st.expander("About this App & AI Writing Guidelines", expanded=True):
208
+ st.info(
209
+ """
210
+ This app helps you refine web content for AI synthesis by chunking it into logical, verifiable blocks.
211
+
212
+ **Writing for AI Verifiability:**
213
+ * **Structure with Headers:** Use H1, H2, H3 tags logically.
214
+ * **Write for Clarity:** Use short, direct sentences. State facts explicitly.
215
+ * **Create Verifiable Blocks:** Format content as definitions, Q&As, or step-by-step guides.
216
+ * **Use the Editor's Metrics:** Aim for a **Reading Ease > 60** and a **Word Count** between 40-600 per chunk. The colors will guide you.
217
+ """, icon="💡"
218
+ )
219
 
220
+ st.subheader("📊 Document Overview")
221
+ st.markdown(manager.get_document_summary_stats(), unsafe_allow_html=True)
222
+
223
+ st.subheader("🎯 Content Targets")
224
+ with st.form("targets_form"):
225
+ st.write("Set readability targets to guide your editing. Colors in the editor will reflect these targets.")
226
+ c1, c2 = st.columns(2)
227
+ f_min = c1.number_input("Min Flesch Reading Ease", value=float(manager.target_flesch_min))
228
+ g_max = c2.number_input("Max Flesch-Kincaid Grade", value=float(manager.target_grade_max))
229
+ w_min = c1.number_input("Min Chunk Words", value=int(manager.target_min_chunk_words))
230
+ w_max = c2.number_input("Max Chunk Words", value=int(manager.target_max_chunk_words))
 
 
 
 
 
 
231
 
232
+ if st.form_submit_button("Set New Targets", use_container_width=True):
233
+ manager.set_targets(f_min, g_max, w_min, w_max)
234
+ st.session_state.status_message = "Content targets have been updated."
235
+ st.rerun()
236
+
237
+ st.subheader("📋 Final Compiled Document")
238
+ st.text_area("Final Markdown Output", manager.get_final_markdown(), height=300, key="final_markdown")
239
+
240
+
241
+ # --- Main Page Layout ---
242
+ st.title("📝 Chunk Webpage Content Editor")
243
+ st.caption("A tool to fetch, chunk, and refine web content for AI synthesis. Inspired by Andrea Volpini's work.")
244
 
245
  url_input = st.text_input("Enter a webpage URL to start", key="url_input")
246
 
 
267
  st.toast(st.session_state.status_message)
268
  st.session_state.status_message = ""
269
 
270
+ # --- MODIFIED: Removed tabs, editor is now the main view ---
271
+ chunks = manager.get_chunks()
272
+ if not chunks:
273
+ st.write("Process a URL to begin editing content chunks, or adjust settings in the sidebar.")
274
+ else:
275
+ chunk_ids = [c['id'] for c in chunks]
276
+ if st.session_state.selected_chunk_id not in chunk_ids:
277
+ st.session_state.selected_chunk_id = chunk_ids[0] if chunk_ids else None
278
+
279
+ if st.session_state.selected_chunk_id is not None:
280
+ chunk_options = {c['id']: c['title'] for c in chunks}
281
+
282
+ selected_id = st.selectbox(
283
+ "Select a chunk to edit",
284
+ options=chunk_ids,
285
+ format_func=lambda x: f"Chunk {x}: {chunk_options.get(x, 'N/A')}",
286
+ index=chunk_ids.index(st.session_state.selected_chunk_id)
287
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
 
289
+ if selected_id != st.session_state.selected_chunk_id:
290
+ st.session_state.selected_chunk_id = selected_id
 
291
  st.rerun()
292
 
293
+ selected_chunk = manager.get_chunk_by_id(st.session_state.selected_chunk_id)
294
+
295
+ if selected_chunk:
296
+ editor_col, preview_col = st.columns(2)
297
+
298
+ with editor_col:
299
+ st.markdown(f"**Editing: {selected_chunk['title']}**")
300
+ st.markdown(manager.format_chunk_stats(selected_chunk['stats']), unsafe_allow_html=True)
301
+
302
+ edited_content = st.text_area(
303
+ "Chunk Content (Markdown)",
304
+ value=selected_chunk['content'],
305
+ height=400,
306
+ key=f"editor_{selected_chunk['id']}"
307
+ )
308
+
309
+ b_col1, b_col2, _ = st.columns([1, 1, 3])
310
+
311
+ if b_col1.button("Update Chunk", use_container_width=True, key=f"update_{selected_chunk['id']}"):
312
+ manager.update_chunk_content(selected_chunk['id'], edited_content)
313
+ st.session_state.status_message = "Chunk updated successfully!"
314
+ st.rerun()
315
+
316
+ if b_col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"):
317
+ manager.delete_chunk(selected_chunk['id'])
318
+ st.session_state.status_message = "Chunk deleted."
319
+ remaining_chunks = manager.get_chunks()
320
+ st.session_state.selected_chunk_id = remaining_chunks[0]['id'] if remaining_chunks else None
321
+ st.rerun()
322
+
323
+ with preview_col:
324
+ st.markdown("**Live Preview**")
325
+ with st.container(height=525, border=True):
326
+ st.markdown(edited_content, unsafe_allow_html=True)