Em4e committed on
Commit
07f83ac
·
verified ·
1 Parent(s): f27d6ee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -31
app.py CHANGED
@@ -1,11 +1,11 @@
1
  import streamlit as st
2
  import requests
3
  from bs4 import BeautifulSoup
4
- from html_to_markdown import convert_to_markdown
5
  import re
6
  from llama_index.core.node_parser import MarkdownNodeParser
7
  from llama_index.core.schema import Document, MetadataMode
8
  import textstat
 
9
 
10
  # --- Core Logic Classes ---
11
 
@@ -15,12 +15,14 @@ class WebpageContentProcessor:
15
  This class is responsible for the entire content processing pipeline.
16
  """
17
  def __init__(self):
18
- pass
 
 
19
 
20
  def fetch_and_convert_to_markdown(self, url: str) -> str:
21
  """
22
  Fetches HTML content, removes common boilerplate tags from the entire page,
23
- and then converts the remaining body content to Markdown.
24
  """
25
  try:
26
  headers = {
@@ -42,15 +44,17 @@ class WebpageContentProcessor:
42
  if not content_container:
43
  return "Error: Could not find the <body> of the webpage."
44
 
45
- # Convert the cleaned HTML to Markdown
46
- markdown_output = convert_to_markdown(str(content_container))
47
-
 
 
 
48
  # Post-processing to clean up the resulting Markdown
49
  markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output)
50
  markdown_output = re.sub(r'(\n\s*[\*\-]\s*\n)|(^\s*[\*\-]\s*$)', '\n', markdown_output, flags=re.MULTILINE)
51
-
52
- return markdown_output.strip()
53
 
 
54
  except requests.exceptions.Timeout:
55
  return "Error: The request timed out. The server is taking too long to respond."
56
  except requests.exceptions.RequestException as e:
@@ -65,17 +69,14 @@ class WebpageContentProcessor:
65
  """
66
  if not markdown_content or "Error" in markdown_content:
67
  return []
68
-
69
  parser = MarkdownNodeParser(include_metadata=True)
70
  doc = Document(text=markdown_content)
71
  nodes = parser.get_nodes_from_documents([doc])
72
-
73
  structured_chunks = []
74
  for i, node in enumerate(nodes):
75
  content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
76
  if not content:
77
  continue
78
-
79
  title_match = re.match(r"^(#+)\s*(.*)", content)
80
  if title_match:
81
  title = title_match.group(2).strip()
@@ -84,10 +85,8 @@ class WebpageContentProcessor:
84
  first_line = content.split('\n')[0].strip()
85
  title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
86
  content_text = content
87
-
88
  if not title:
89
  title = f"[Chunk {i+1}]"
90
-
91
  structured_chunks.append({
92
  "id": i,
93
  "title": title,
@@ -132,7 +131,6 @@ class ChunkManager:
132
  flesch_color = "green" if stats.get('flesch_reading_ease', 0) >= self.target_flesch_min else "red"
133
  grade_color = "green" if stats.get('flesch_kincaid_grade', 0) <= self.target_grade_max else "red"
134
  word_color = "green" if self.target_min_chunk_words <= stats.get('word_count', 0) <= self.target_max_chunk_words else "red"
135
-
136
  return (
137
  f"**Word Count:** <span style='color:{word_color};'>{stats.get('word_count', 0)}</span> &nbsp;&nbsp;|&nbsp;&nbsp; "
138
  f"**Reading Ease:** <span style='color:{flesch_color};'>{stats.get('flesch_reading_ease', 0):.2f}</span> &nbsp;&nbsp;|&nbsp;&nbsp; "
@@ -143,14 +141,12 @@ class ChunkManager:
143
  """Calculates and formats stats for the entire document."""
144
  if not self._chunks:
145
  return "No document loaded."
146
-
147
  total_words = sum(c['stats']['word_count'] for c in self._chunks)
148
  if len(self._chunks) > 0:
149
  avg_ease = sum(c['stats']['flesch_reading_ease'] for c in self._chunks) / len(self._chunks)
150
  avg_grade = sum(c['stats']['flesch_kincaid_grade'] for c in self._chunks) / len(self._chunks)
151
  else:
152
  avg_ease = avg_grade = 0
153
-
154
  return (
155
  f"- **Total Chunks:** {len(self._chunks)}\n"
156
  f"- **Total Words:** {total_words}\n"
@@ -189,7 +185,6 @@ class ChunkManager:
189
  final_doc_parts.append(c['content'])
190
  return "\n\n---\n\n".join(final_doc_parts)
191
 
192
-
193
  def set_targets(self, flesch_min: float, grade_max: float, min_words: int, max_words: int):
194
  self.target_flesch_min = flesch_min
195
  self.target_grade_max = grade_max
@@ -198,7 +193,6 @@ class ChunkManager:
198
  self.set_chunks(self.get_chunks())
199
 
200
  # --- Streamlit UI Application ---
201
-
202
  st.set_page_config(layout="wide", page_title="Webpage Content Editor")
203
 
204
  def init_session_state():
@@ -220,28 +214,22 @@ st.title("Chunk Webpage Content Editor")
220
  st.caption("A tool to fetch, chunk, and refine web content.")
221
  st.markdown(
222
  "Developed by [Emilija Gjorgjevska](https://www.linkedin.com/in/emilijagjorgjevska/). "
223
- "Inspired by Andrea Volpini's [work on content chunking](https://www.linkedin.com/pulse/understanding-chunking-google-ai-mode-practical-content-volpini-zseaf/)"
224
- )
225
  st.info(
226
  """
227
  **How Layout-Based Chunking is Implemented Here**
228
-
229
  This app uses a sophisticated, two-step process to create meaningful chunks based on the document's visual and semantic structure:
230
-
231
  1. **Structural Preservation (HTML → Markdown):**
232
  The code first converts the webpage's HTML into Markdown. This is a critical step because it translates structural tags (`<h1>`, `<p>`, `<ul>`) into their Markdown equivalents (`#`, paragraph breaks, `*`). This preserves the document's original layout and hierarchy.
233
-
234
  2. **Layout-Aware Parsing (`MarkdownNodeParser`):**
235
  Next, it uses the `MarkdownNodeParser` from the LlamaIndex library. This specialized tool is designed to read the structured Markdown and split it at its logical boundaries—primarily the headers (`#`, `##`, etc.).
236
-
237
  The result is a set of context-aware chunks that respect the original document's sections, rather than being arbitrary splits.
238
-
239
  "**Note:** Some websites may block content scraping. This is an early version, so you might encounter bugs.",
240
  """,
241
- icon="ℹ️"
242
- )
243
 
244
  url_input = st.text_input("Enter a webpage URL to start", key="url_input")
 
245
  if st.button("Process URL", use_container_width=True, type="primary"):
246
  if url_input:
247
  with st.spinner("Fetching and chunking content..."):
@@ -278,7 +266,7 @@ with tab1:
278
 
279
  if st.session_state.selected_chunk_id is not None:
280
  chunk_options = {c['id']: c['title'] for c in chunks}
281
-
282
  selected_id = st.selectbox(
283
  "Select a chunk to edit",
284
  options=chunk_ids,
@@ -295,7 +283,7 @@ with tab1:
295
  if selected_chunk:
296
  st.markdown(f"**Editing: {selected_chunk['title']}**")
297
  st.markdown(manager.format_chunk_stats(selected_chunk['stats']), unsafe_allow_html=True)
298
-
299
  edited_content = st.text_area(
300
  "Chunk Content",
301
  value=selected_chunk['content'],
@@ -304,11 +292,12 @@ with tab1:
304
  )
305
 
306
  col1, col2, _ = st.columns([1, 1, 5])
 
307
  if col1.button("Update Chunk", use_container_width=True, key=f"update_{selected_chunk['id']}"):
308
  manager.update_chunk_content(selected_chunk['id'], edited_content)
309
  st.session_state.status_message = "Chunk updated successfully!"
310
  st.rerun()
311
-
312
  if col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"):
313
  manager.delete_chunk(selected_chunk['id'])
314
  st.session_state.status_message = "Chunk deleted."
@@ -319,7 +308,7 @@ with tab1:
319
  with tab2:
320
  st.subheader("Document Overview")
321
  st.markdown(manager.get_document_summary_stats(), unsafe_allow_html=True)
322
-
323
  st.subheader("Content Targets")
324
  with st.form("targets_form"):
325
  st.write("Set readability targets to guide your editing. See color feedback in the editor.")
 
1
  import streamlit as st
2
  import requests
3
  from bs4 import BeautifulSoup
 
4
  import re
5
  from llama_index.core.node_parser import MarkdownNodeParser
6
  from llama_index.core.schema import Document, MetadataMode
7
  import textstat
8
+ from markitdown import Markitdown # <-- MODIFIED: Import Markitdown
9
 
10
  # --- Core Logic Classes ---
11
 
 
15
  This class is responsible for the entire content processing pipeline.
16
  """
17
  def __init__(self):
18
+ # --- MODIFIED: Instantiate Markitdown converter ---
19
+ self.markdown_converter = Markitdown()
20
+ # -------------------------------------------------
21
 
22
  def fetch_and_convert_to_markdown(self, url: str) -> str:
23
  """
24
  Fetches HTML content, removes common boilerplate tags from the entire page,
25
+ and then converts the remaining body content to Markdown using Markitdown.
26
  """
27
  try:
28
  headers = {
 
44
  if not content_container:
45
  return "Error: Could not find the <body> of the webpage."
46
 
47
+ # --- MODIFIED: Use Markitdown for conversion ---
48
+ # The .convert() method returns a result object; the converted Markdown (not HTML) is read from its .text attribute
49
+ conversion_result = self.markdown_converter.convert(str(content_container))
50
+ markdown_output = conversion_result.text
51
+ # -----------------------------------------------
52
+
53
  # Post-processing to clean up the resulting Markdown
54
  markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output)
55
  markdown_output = re.sub(r'(\n\s*[\*\-]\s*\n)|(^\s*[\*\-]\s*$)', '\n', markdown_output, flags=re.MULTILINE)
 
 
56
 
57
+ return markdown_output.strip()
58
  except requests.exceptions.Timeout:
59
  return "Error: The request timed out. The server is taking too long to respond."
60
  except requests.exceptions.RequestException as e:
 
69
  """
70
  if not markdown_content or "Error" in markdown_content:
71
  return []
 
72
  parser = MarkdownNodeParser(include_metadata=True)
73
  doc = Document(text=markdown_content)
74
  nodes = parser.get_nodes_from_documents([doc])
 
75
  structured_chunks = []
76
  for i, node in enumerate(nodes):
77
  content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
78
  if not content:
79
  continue
 
80
  title_match = re.match(r"^(#+)\s*(.*)", content)
81
  if title_match:
82
  title = title_match.group(2).strip()
 
85
  first_line = content.split('\n')[0].strip()
86
  title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
87
  content_text = content
 
88
  if not title:
89
  title = f"[Chunk {i+1}]"
 
90
  structured_chunks.append({
91
  "id": i,
92
  "title": title,
 
131
  flesch_color = "green" if stats.get('flesch_reading_ease', 0) >= self.target_flesch_min else "red"
132
  grade_color = "green" if stats.get('flesch_kincaid_grade', 0) <= self.target_grade_max else "red"
133
  word_color = "green" if self.target_min_chunk_words <= stats.get('word_count', 0) <= self.target_max_chunk_words else "red"
 
134
  return (
135
  f"**Word Count:** <span style='color:{word_color};'>{stats.get('word_count', 0)}</span> &nbsp;&nbsp;|&nbsp;&nbsp; "
136
  f"**Reading Ease:** <span style='color:{flesch_color};'>{stats.get('flesch_reading_ease', 0):.2f}</span> &nbsp;&nbsp;|&nbsp;&nbsp; "
 
141
  """Calculates and formats stats for the entire document."""
142
  if not self._chunks:
143
  return "No document loaded."
 
144
  total_words = sum(c['stats']['word_count'] for c in self._chunks)
145
  if len(self._chunks) > 0:
146
  avg_ease = sum(c['stats']['flesch_reading_ease'] for c in self._chunks) / len(self._chunks)
147
  avg_grade = sum(c['stats']['flesch_kincaid_grade'] for c in self._chunks) / len(self._chunks)
148
  else:
149
  avg_ease = avg_grade = 0
 
150
  return (
151
  f"- **Total Chunks:** {len(self._chunks)}\n"
152
  f"- **Total Words:** {total_words}\n"
 
185
  final_doc_parts.append(c['content'])
186
  return "\n\n---\n\n".join(final_doc_parts)
187
 
 
188
  def set_targets(self, flesch_min: float, grade_max: float, min_words: int, max_words: int):
189
  self.target_flesch_min = flesch_min
190
  self.target_grade_max = grade_max
 
193
  self.set_chunks(self.get_chunks())
194
 
195
  # --- Streamlit UI Application ---
 
196
  st.set_page_config(layout="wide", page_title="Webpage Content Editor")
197
 
198
  def init_session_state():
 
214
  st.caption("A tool to fetch, chunk, and refine web content.")
215
  st.markdown(
216
  "Developed by [Emilija Gjorgjevska](https://www.linkedin.com/in/emilijagjorgjevska/). "
217
+ "Inspired by Andrea Volpini's [work on content chunking](https://www.linkedin.com/pulse/understanding-chunking-google-ai-mode-practical-content-volpini-zseaf/)")
 
218
  st.info(
219
  """
220
  **How Layout-Based Chunking is Implemented Here**
 
221
  This app uses a sophisticated, two-step process to create meaningful chunks based on the document's visual and semantic structure:
 
222
  1. **Structural Preservation (HTML → Markdown):**
223
  The code first converts the webpage's HTML into Markdown. This is a critical step because it translates structural tags (`<h1>`, `<p>`, `<ul>`) into their Markdown equivalents (`#`, paragraph breaks, `*`). This preserves the document's original layout and hierarchy.
 
224
  2. **Layout-Aware Parsing (`MarkdownNodeParser`):**
225
  Next, it uses the `MarkdownNodeParser` from the LlamaIndex library. This specialized tool is designed to read the structured Markdown and split it at its logical boundaries—primarily the headers (`#`, `##`, etc.).
 
226
  The result is a set of context-aware chunks that respect the original document's sections, rather than being arbitrary splits.
 
227
  "**Note:** Some websites may block content scraping. This is an early version, so you might encounter bugs.",
228
  """,
229
+ icon="ℹ️")
 
230
 
231
  url_input = st.text_input("Enter a webpage URL to start", key="url_input")
232
+
233
  if st.button("Process URL", use_container_width=True, type="primary"):
234
  if url_input:
235
  with st.spinner("Fetching and chunking content..."):
 
266
 
267
  if st.session_state.selected_chunk_id is not None:
268
  chunk_options = {c['id']: c['title'] for c in chunks}
269
+
270
  selected_id = st.selectbox(
271
  "Select a chunk to edit",
272
  options=chunk_ids,
 
283
  if selected_chunk:
284
  st.markdown(f"**Editing: {selected_chunk['title']}**")
285
  st.markdown(manager.format_chunk_stats(selected_chunk['stats']), unsafe_allow_html=True)
286
+
287
  edited_content = st.text_area(
288
  "Chunk Content",
289
  value=selected_chunk['content'],
 
292
  )
293
 
294
  col1, col2, _ = st.columns([1, 1, 5])
295
+
296
  if col1.button("Update Chunk", use_container_width=True, key=f"update_{selected_chunk['id']}"):
297
  manager.update_chunk_content(selected_chunk['id'], edited_content)
298
  st.session_state.status_message = "Chunk updated successfully!"
299
  st.rerun()
300
+
301
  if col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"):
302
  manager.delete_chunk(selected_chunk['id'])
303
  st.session_state.status_message = "Chunk deleted."
 
308
  with tab2:
309
  st.subheader("Document Overview")
310
  st.markdown(manager.get_document_summary_stats(), unsafe_allow_html=True)
311
+
312
  st.subheader("Content Targets")
313
  with st.form("targets_form"):
314
  st.write("Set readability targets to guide your editing. See color feedback in the editor.")