Em4e committed on
Commit
0915f87
·
verified ·
1 Parent(s): 37f325d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -23
app.py CHANGED
@@ -5,10 +5,9 @@ import re
5
  from llama_index.core.node_parser import MarkdownNodeParser
6
  from llama_index.core.schema import Document, MetadataMode
7
  import textstat
8
- from markdownify import markdownify as md
9
 
10
  # --- Core Logic Classes ---
11
-
12
  class WebpageContentProcessor:
13
  """
14
  Handles fetching, converting, and parsing webpage content into structured chunks.
@@ -30,27 +29,22 @@ class WebpageContentProcessor:
30
  response.raise_for_status()
31
  html_content = response.text
32
  soup = BeautifulSoup(html_content, 'html.parser')
33
-
34
  # Remove common boilerplate and non-content tags from the entire document
35
  tags_to_remove = ['nav', 'header', 'footer', 'aside', 'script', 'style', 'noscript', 'form']
36
  for tag_name in tags_to_remove:
37
  for element in soup.find_all(tag_name):
38
  element.decompose()
39
-
40
  # Process the entire remaining body
41
  content_container = soup.find('body')
42
  if not content_container:
43
  return "Error: Could not find the <body> of the webpage."
44
-
45
  # --- MODIFIED: Switched to markdownify for conversion ---
46
  # markdownify is a simple function call.
47
  markdown_output = md(str(content_container))
48
  # -----------------------------------------------
49
-
50
  # Post-processing to clean up the resulting Markdown
51
  markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output)
52
  markdown_output = re.sub(r'(\n\s*[\*\-]\s*\n)|(^\s*[\*\-]\s*$)', '\n', markdown_output, flags=re.MULTILINE)
53
-
54
  return markdown_output.strip()
55
  except requests.exceptions.Timeout:
56
  return "Error: The request timed out. The server is taking too long to respond."
@@ -202,6 +196,9 @@ def init_session_state():
202
  st.session_state.selected_chunk_id = None
203
  if 'status_message' not in st.session_state:
204
  st.session_state.status_message = ""
 
 
 
205
 
206
  init_session_state()
207
 
@@ -213,25 +210,24 @@ st.caption("A tool to fetch, chunk, and refine web content.")
213
  st.markdown(
214
  "Developed by [Emilija Gjorgjevska](https://www.linkedin.com/in/emilijagjorgjevska/). "
215
  "Inspired by Andrea Volpini's [work on content chunking](https://www.linkedin.com/pulse/understanding-chunking-google-ai-mode-practical-content-volpini-zseaf/)")
 
216
  with st.expander("ℹ️ App Information & Chunking Details", expanded=False):
217
  st.info(
218
  """
219
- • **App version:** v0.0 (alpha) — this is the very first public release, so you may run into bugs or incomplete features.
220
- • **Server policy warning:** this app relies on automated requests (“bots”) under the hood.
221
- If the target server enforces a restrictive bot policy (e.g., rate-limits requests, blocks unknown user-agents or IP addresses), parts of the app **may not work** as expected.
222
-
223
- **What to do if you hit an issue:**
224
- 1. Check the server’s logs or policy settings to ensure it allows automated clients.
225
  2. Keep an eye out for updates — v0.x → v1.0 is coming soon!
226
-
227
  ---
228
- **How Layout-Based Chunking is Implemented Here**
229
- This app uses a sophisticated, two-step process to create meaningful chunks based on the document’s visual and semantic structure:
230
- 1. **Structural Preservation (HTML → Markdown):**
231
- Converts the webpage’s HTML into Markdown, translating tags (`<h1>`, `<p>`, `<ul>`) into their Markdown equivalents (`#`, paragraph breaks, `*`) to preserve layout and hierarchy.
232
- 2. **Layout-Aware Parsing (`MarkdownNodeParser`):**
233
- Uses LlamaIndex’s `MarkdownNodeParser` to read the structured Markdown and split it at logical boundaries (headers like `#`, `##`, etc.), yielding context-aware chunks that respect original sections.
234
- """
235
  , icon="ℹ️")
236
 
237
  url_input = st.text_input("Enter a webpage URL to start", key="url_input")
@@ -253,6 +249,7 @@ if st.button("Process URL", use_container_width=True, type="primary"):
253
  else:
254
  st.session_state.status_message = "Could not extract any content chunks."
255
  st.session_state.selected_chunk_id = None
 
256
  st.rerun()
257
 
258
  if st.session_state.status_message:
@@ -282,6 +279,7 @@ with tab1:
282
 
283
  if selected_id != st.session_state.selected_chunk_id:
284
  st.session_state.selected_chunk_id = selected_id
 
285
  st.rerun()
286
 
287
  selected_chunk = manager.get_chunk_by_id(st.session_state.selected_chunk_id)
@@ -297,11 +295,13 @@ with tab1:
297
  key=f"editor_{selected_chunk['id']}"
298
  )
299
 
300
- col1, col2, _ = st.columns([1, 1, 5])
 
301
 
302
  if col1.button("Update Chunk", use_container_width=True, key=f"update_{selected_chunk['id']}"):
303
  manager.update_chunk_content(selected_chunk['id'], edited_content)
304
  st.session_state.status_message = "Chunk updated successfully!"
 
305
  st.rerun()
306
 
307
  if col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"):
@@ -309,8 +309,26 @@ with tab1:
309
  st.session_state.status_message = "Chunk deleted."
310
  remaining_chunks = manager.get_chunks()
311
  st.session_state.selected_chunk_id = remaining_chunks[0]['id'] if remaining_chunks else None
 
312
  st.rerun()
313
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  with tab2:
315
  st.subheader("Document Overview")
316
  st.markdown(manager.get_document_summary_stats(), unsafe_allow_html=True)
@@ -330,4 +348,4 @@ with tab2:
330
  st.rerun()
331
 
332
  st.subheader("Final Compiled Document")
333
- st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")
 
5
  from llama_index.core.node_parser import MarkdownNodeParser
6
  from llama_index.core.schema import Document, MetadataMode
7
  import textstat
8
+ from markdownify import markdownify as md
9
 
10
  # --- Core Logic Classes ---
 
11
  class WebpageContentProcessor:
12
  """
13
  Handles fetching, converting, and parsing webpage content into structured chunks.
 
29
  response.raise_for_status()
30
  html_content = response.text
31
  soup = BeautifulSoup(html_content, 'html.parser')
 
32
  # Remove common boilerplate and non-content tags from the entire document
33
  tags_to_remove = ['nav', 'header', 'footer', 'aside', 'script', 'style', 'noscript', 'form']
34
  for tag_name in tags_to_remove:
35
  for element in soup.find_all(tag_name):
36
  element.decompose()
 
37
  # Process the entire remaining body
38
  content_container = soup.find('body')
39
  if not content_container:
40
  return "Error: Could not find the <body> of the webpage."
 
41
  # --- MODIFIED: Switched to markdownify for conversion ---
42
  # markdownify is a simple function call.
43
  markdown_output = md(str(content_container))
44
  # -----------------------------------------------
 
45
  # Post-processing to clean up the resulting Markdown
46
  markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output)
47
  markdown_output = re.sub(r'(\n\s*[\*\-]\s*\n)|(^\s*[\*\-]\s*$)', '\n', markdown_output, flags=re.MULTILINE)
 
48
  return markdown_output.strip()
49
  except requests.exceptions.Timeout:
50
  return "Error: The request timed out. The server is taking too long to respond."
 
196
  st.session_state.selected_chunk_id = None
197
  if 'status_message' not in st.session_state:
198
  st.session_state.status_message = ""
199
+ # NEW: State for toggling the content preview
200
+ if 'show_preview' not in st.session_state:
201
+ st.session_state.show_preview = False
202
 
203
  init_session_state()
204
 
 
210
  st.markdown(
211
  "Developed by [Emilija Gjorgjevska](https://www.linkedin.com/in/emilijagjorgjevska/). "
212
  "Inspired by Andrea Volpini's [work on content chunking](https://www.linkedin.com/pulse/understanding-chunking-google-ai-mode-practical-content-volpini-zseaf/)")
213
+
214
  with st.expander("ℹ️ App Information & Chunking Details", expanded=False):
215
  st.info(
216
  """
217
+ • **App version:** v0.0 (alpha) — this is the very first public release, so you may run into bugs or incomplete features.
218
+ • **Server policy warning:** this app relies on automated requests (“bots”) under the hood.
219
+ If the target server enforces a restrictive bot policy (e.g., rate-limits requests, blocks unknown user-agents or IP addresses), parts of the app **may not work** as expected.
220
+ **What to do if you hit an issue:**
221
+ 1. Check the server’s logs or policy settings to ensure it allows automated clients.
 
222
  2. Keep an eye out for updates — v0.x → v1.0 is coming soon!
 
223
  ---
224
+ **How Layout-Based Chunking is Implemented Here**
225
+ This app uses a sophisticated, two-step process to create meaningful chunks based on the document’s visual and semantic structure:
226
+ 1. **Structural Preservation (HTML → Markdown):**
227
+ Converts the webpage’s HTML into Markdown, translating tags (`<h1>`, `<p>`, `<ul>`) into their Markdown equivalents (`#`, paragraph breaks, `*`) to preserve layout and hierarchy.
228
+ 2. **Layout-Aware Parsing (`MarkdownNodeParser`):**
229
+ Uses LlamaIndex’s `MarkdownNodeParser` to read the structured Markdown and split it at logical boundaries (headers like `#`, `##`, etc.), yielding context-aware chunks that respect original sections.
230
+ """
231
  , icon="ℹ️")
232
 
233
  url_input = st.text_input("Enter a webpage URL to start", key="url_input")
 
249
  else:
250
  st.session_state.status_message = "Could not extract any content chunks."
251
  st.session_state.selected_chunk_id = None
252
+ st.session_state.show_preview = False # Ensure preview is off when processing new URL
253
  st.rerun()
254
 
255
  if st.session_state.status_message:
 
279
 
280
  if selected_id != st.session_state.selected_chunk_id:
281
  st.session_state.selected_chunk_id = selected_id
282
+ st.session_state.show_preview = False # NEW: Reset preview when changing chunk
283
  st.rerun()
284
 
285
  selected_chunk = manager.get_chunk_by_id(st.session_state.selected_chunk_id)
 
295
  key=f"editor_{selected_chunk['id']}"
296
  )
297
 
298
+ # MODIFIED: Added a third column for the Preview button
299
+ col1, col2, col3, _ = st.columns([1, 1, 1, 4])
300
 
301
  if col1.button("Update Chunk", use_container_width=True, key=f"update_{selected_chunk['id']}"):
302
  manager.update_chunk_content(selected_chunk['id'], edited_content)
303
  st.session_state.status_message = "Chunk updated successfully!"
304
+ st.session_state.show_preview = False # Hide preview after update
305
  st.rerun()
306
 
307
  if col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"):
 
309
  st.session_state.status_message = "Chunk deleted."
310
  remaining_chunks = manager.get_chunks()
311
  st.session_state.selected_chunk_id = remaining_chunks[0]['id'] if remaining_chunks else None
312
+ st.session_state.show_preview = False # Hide preview after delete
313
  st.rerun()
314
 
315
+ # NEW: Preview button in the third column
316
+ with col3:
317
+ if st.button("Preview Content", use_container_width=True, key=f"preview_{selected_chunk['id']}"):
318
+ # Toggle the preview state
319
+ st.session_state.show_preview = not st.session_state.show_preview
320
+ st.rerun()
321
+
322
+ # NEW: Conditional container to show the rendered Markdown
323
+ if st.session_state.show_preview:
324
+ st.markdown("---")
325
+ with st.container(border=True):
326
+ st.markdown("**Rendered Preview** (showing current editor content)")
327
+ # Renders the content directly from the text_area above
328
+ st.markdown(edited_content, unsafe_allow_html=True)
329
+ st.markdown("---")
330
+
331
+
332
  with tab2:
333
  st.subheader("Document Overview")
334
  st.markdown(manager.get_document_summary_stats(), unsafe_allow_html=True)
 
348
  st.rerun()
349
 
350
  st.subheader("Final Compiled Document")
351
+ st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")