Em4e committed on
Commit
c063934
·
verified ·
1 Parent(s): fe63ffc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -27
app.py CHANGED
@@ -3,7 +3,7 @@ import requests
3
  from bs4 import BeautifulSoup
4
  from html_to_markdown import convert_to_markdown
5
  import re
6
- from llama_index.core.node_parser import SentenceSplitter
7
  from llama_index.core.schema import Document, MetadataMode
8
  import textstat # For readability metrics
9
 
@@ -82,38 +82,44 @@ class WebpageContentProcessor:
82
 
83
  def parse_markdown_into_chunks(self, markdown_content: str) -> list:
84
  """
85
- Parses markdown content into chunks using a sentence splitter for more
86
- reliable chunking than header-based splitting.
87
  """
88
  if not markdown_content or "Error" in markdown_content:
89
  return []
90
 
91
- # Use SentenceSplitter for more reliable chunking based on size,
92
- # rather than relying on Markdown headers which may not exist.
93
- parser = SentenceSplitter(
94
- chunk_size=2000, # Characters per chunk
95
- chunk_overlap=200 # Characters to overlap between chunks
96
- )
97
-
98
- doc = Document(text=markdown_content)
99
  nodes = parser.get_nodes_from_documents([doc])
100
 
101
  structured_chunks = []
102
  for i, node in enumerate(nodes):
103
- content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
104
- if not content:
105
  continue
 
 
 
106
 
107
- # Generate a title from the first non-empty line of the chunk
108
- first_line = next((line for line in content.split('\n') if line.strip()), "")
109
- title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
110
- if not title:
111
- title = f"Chunk {i+1}"
112
-
 
 
 
 
 
 
 
 
 
113
  structured_chunks.append({
114
- "id": i,
115
- "title": title,
116
- "content": content
117
  })
118
 
119
  return structured_chunks
@@ -180,6 +186,12 @@ class ChunkManager:
180
  if chunk:
181
  chunk["content"] = new_content
182
  self._add_stats_to_chunk(chunk)
 
 
 
 
 
 
183
 
184
  def delete_chunk(self, chunk_id: int):
185
  self._chunks = [c for c in self._chunks if c["id"] != chunk_id]
@@ -221,8 +233,8 @@ init_session_state()
221
  processor = st.session_state.content_processor
222
  manager = st.session_state.chunk_manager
223
 
224
- st.title("✨ Chunk Webpage Content Editor")
225
- st.caption("Created by [Emilija Gjorgjevska](https://www.linkedin.com/in/emilijagjorgjevska/) | Inspired by Andrea Volpini's [work on content chunking.](https://www.linkedin.com/pulse/understanding-chunking-google-ai-mode-practical-content-volpini-zseaf/)")
226
 
227
  st.info(
228
  "**Note:** Some URLs may be blocked due to server policies (like bot detection). "
@@ -272,7 +284,6 @@ with tab1:
272
  chunk_options = {c['id']: f"Chunk {c['id']}: {c['title']}" for c in chunks}
273
 
274
  # The selectbox's state is now managed directly by st.session_state.selected_chunk_id
275
- # When the user selects a new option, Streamlit automatically updates this state variable and reruns the script.
276
  selected_id = st.selectbox(
277
  "Select a chunk to edit",
278
  options=chunk_ids,
@@ -294,7 +305,7 @@ with tab1:
294
  "Chunk Content",
295
  value=selected_chunk['content'],
296
  height=300,
297
- key=f"editor_{selected_chunk['id']}" # Unique key forces widget to re-render on selection change
298
  )
299
 
300
  col1, col2, _ = st.columns([1, 1, 4])
@@ -328,4 +339,4 @@ with tab2:
328
  st.rerun()
329
 
330
  st.subheader("Final Document")
331
- st.text_area("Compiled Markdown", manager.get_final_markdown(), height=400, disabled=False, key="final_markdown")
 
3
  from bs4 import BeautifulSoup
4
  from html_to_markdown import convert_to_markdown
5
  import re
6
+ from llama_index.core.node_parser import MarkdownNodeParser
7
  from llama_index.core.schema import Document, MetadataMode
8
  import textstat # For readability metrics
9
 
 
82
 
83
  def parse_markdown_into_chunks(self, markdown_content: str) -> list:
84
  """
85
+ Parses Markdown content into LlamaIndex nodes (chunks) and extracts title and content.
86
+ This version uses MarkdownNodeParser to leverage the document's structure.
87
  """
88
  if not markdown_content or "Error" in markdown_content:
89
  return []
90
 
91
+ doc = Document(text=markdown_content, metadata={"filename": "webpage_content"})
92
+ parser = MarkdownNodeParser(include_metadata=True)
 
 
 
 
 
 
93
  nodes = parser.get_nodes_from_documents([doc])
94
 
95
  structured_chunks = []
96
  for i, node in enumerate(nodes):
97
+ pure_text_content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
98
+ if not pure_text_content:
99
  continue
100
+
101
+ heading_title = ""
102
+ content_text = pure_text_content
103
 
104
+ # Attempt to find a title from a markdown header
105
+ heading_match = re.match(r"^(#+)\s*(.*)", pure_text_content)
106
+ if heading_match:
107
+ heading_title = heading_match.group(2).strip()
108
+ # Remove the title from the content itself
109
+ content_text = pure_text_content[len(heading_match.group(0)):].strip()
110
+ if not heading_title:
111
+ heading_title = "[Untitled Section]"
112
+ else:
113
+ # Fallback to using the first line as the title
114
+ first_line = content_text.split('\n')[0].strip()
115
+ heading_title = (first_line[:75] + "...") if len(first_line) > 75 else first_line
116
+ if not heading_title:
117
+ heading_title = "[Empty Section]"
118
+
119
  structured_chunks.append({
120
+ "id": i,
121
+ "title": heading_title,
122
+ "content": content_text
123
  })
124
 
125
  return structured_chunks
 
186
  if chunk:
187
  chunk["content"] = new_content
188
  self._add_stats_to_chunk(chunk)
189
+ # Optionally update title if it's derived from content
190
+ if chunk["title"].startswith("[") or not re.match(r"^(#+)\s*(.*)", chunk["content"]):
191
+ first_line = new_content.split('\n')[0].strip()
192
+ chunk["title"] = (first_line[:75] + '...') if len(first_line) > 75 else first_line
193
+ if not chunk["title"]: chunk["title"] = "[Empty Section]"
194
+
195
 
196
  def delete_chunk(self, chunk_id: int):
197
  self._chunks = [c for c in self._chunks if c["id"] != chunk_id]
 
233
  processor = st.session_state.content_processor
234
  manager = st.session_state.chunk_manager
235
 
236
+ st.title("✨ Webpage Content Editor")
237
+ st.caption("Created by [Emilija Gjorgjevska](https://www.linkedin.com/in/emilijagjorgjevska/) | Inspired by Andrea Volpini's work on content chunking.")
238
 
239
  st.info(
240
  "**Note:** Some URLs may be blocked due to server policies (like bot detection). "
 
284
  chunk_options = {c['id']: f"Chunk {c['id']}: {c['title']}" for c in chunks}
285
 
286
  # The selectbox's state is now managed directly by st.session_state.selected_chunk_id
 
287
  selected_id = st.selectbox(
288
  "Select a chunk to edit",
289
  options=chunk_ids,
 
305
  "Chunk Content",
306
  value=selected_chunk['content'],
307
  height=300,
308
+ key=f"editor_{selected_chunk['id']}" # Unique key forces widget to re-render
309
  )
310
 
311
  col1, col2, _ = st.columns([1, 1, 4])
 
339
  st.rerun()
340
 
341
  st.subheader("Final Document")
342
+ st.text_area("Compiled Markdown", manager.get_final_markdown(), height=400, disabled=False, key="final_markdown")