Em4e committed on
Commit
66603bd
·
verified ·
1 Parent(s): 4c95011

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -46
app.py CHANGED
@@ -19,8 +19,8 @@ class WebpageContentProcessor:
19
 
20
  def fetch_and_convert_to_markdown(self, url: str) -> str:
21
  """
22
- Fetches HTML content from a URL, starts from the <body>, removes common
23
- boilerplate tags, and converts the remaining content to Markdown.
24
  """
25
  try:
26
  headers = {
@@ -31,23 +31,48 @@ class WebpageContentProcessor:
31
  html_content = response.text
32
  soup = BeautifulSoup(html_content, 'html.parser')
33
 
34
- # Find the body of the HTML document
35
- body = soup.find('body')
36
- if not body:
37
- return "Error: Could not find the <body> of the webpage."
38
-
39
- # Tags to remove from the content to reduce boilerplate
40
- tags_to_remove = ['script', 'style', 'noscript', 'header', 'footer', 'nav', 'aside', 'form', 'figure']
41
- for tag_name in tags_to_remove:
42
- # Find all instances of the tag within the body and remove them
43
- for element in body.find_all(tag_name):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  element.decompose()
 
 
 
 
45
 
46
- # Convert the cleaned body content to Markdown
47
- markdown_output = convert_to_markdown(str(body))
48
- # Clean up excessive newlines for better readability
49
- markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output).strip()
50
- return markdown_output
 
 
 
 
 
51
 
52
  except requests.exceptions.Timeout:
53
  return "Error: The request timed out. The server is taking too long to respond."
@@ -64,7 +89,6 @@ class WebpageContentProcessor:
64
  if not markdown_content or "Error" in markdown_content:
65
  return []
66
 
67
- # This parser understands Markdown structure (headings, lists) and splits accordingly.
68
  parser = MarkdownNodeParser(include_metadata=True)
69
  doc = Document(text=markdown_content)
70
  nodes = parser.get_nodes_from_documents([doc])
@@ -75,19 +99,15 @@ class WebpageContentProcessor:
75
  if not content:
76
  continue
77
 
78
- # Extract title from the markdown header if it exists
79
  title_match = re.match(r"^(#+)\s*(.*)", content)
80
  if title_match:
81
  title = title_match.group(2).strip()
82
- # The content should not include the title line itself
83
  content_text = content[len(title_match.group(0)):].strip()
84
  else:
85
- # If no header, use the first line as a fallback title
86
  first_line = content.split('\n')[0].strip()
87
  title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
88
  content_text = content
89
 
90
- # Ensure there's a title even for empty sections
91
  if not title:
92
  title = f"[Chunk {i+1}]"
93
 
@@ -126,7 +146,7 @@ class ChunkManager:
126
  stats['word_count'] = textstat.lexicon_count(text, removepunct=True)
127
  stats['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
128
  stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)
129
- except (Exception, TypeError): # Catch potential errors from textstat
130
  stats.update({'word_count': 0, 'flesch_reading_ease': 0, 'flesch_kincaid_grade': 0})
131
  return stats
132
 
@@ -168,25 +188,30 @@ class ChunkManager:
168
  chunk = self.get_chunk_by_id(chunk_id)
169
  if chunk:
170
  chunk["content"] = new_content
171
- self._add_stats_to_chunk(chunk) # Recalculate stats after update
 
 
 
 
 
 
172
 
173
  def delete_chunk(self, chunk_id: int):
174
  self._chunks = [c for c in self._chunks if c["id"] != chunk_id]
175
- # Re-index remaining chunks to maintain sequential IDs
176
  for i, chunk in enumerate(self._chunks):
177
  chunk['id'] = i
178
 
179
  def get_final_markdown(self) -> str:
180
  if not self._chunks:
181
  return "No content to display."
182
- # Compile final document, adding headers back for chunks that have them
183
  final_doc_parts = []
184
  for c in self._chunks:
185
- title_is_header = re.match(r"^(#+)\s*(.*)", c['title']) is None
186
- if not c['title'].startswith("[") and not title_is_header:
187
- final_doc_parts.append(f"## {c['title']}\n\n{c['content']}")
 
188
  else:
189
- final_doc_parts.append(c['content'])
190
  return "\n\n---\n\n".join(final_doc_parts)
191
 
192
 
@@ -195,14 +220,12 @@ class ChunkManager:
195
  self.target_grade_max = grade_max
196
  self.target_min_chunk_words = min_words
197
  self.target_max_chunk_words = max_words
198
- # Recalculate stats for all chunks to reflect new targets
199
  self.set_chunks(self.get_chunks())
200
 
201
  # --- Streamlit UI Application ---
202
 
203
  st.set_page_config(layout="wide", page_title="Webpage Content Editor")
204
 
205
- # Initialize session state for managers and UI state
206
  def init_session_state():
207
  if 'processor' not in st.session_state:
208
  st.session_state.processor = WebpageContentProcessor()
@@ -218,8 +241,6 @@ init_session_state()
218
  processor = st.session_state.processor
219
  manager = st.session_state.manager
220
 
221
- # --- Page Layout ---
222
-
223
  st.title("✨ Webpage Content Editor")
224
  st.caption("A tool to fetch, chunk, and refine web content.")
225
 
@@ -228,7 +249,6 @@ st.info(
228
  icon="ℹ️"
229
  )
230
 
231
- # URL input and processing button
232
  url_input = st.text_input("Enter a webpage URL to start", key="url_input")
233
  if st.button("Process URL", use_container_width=True, type="primary"):
234
  if url_input:
@@ -247,14 +267,12 @@ if st.button("Process URL", use_container_width=True, type="primary"):
247
  else:
248
  st.session_state.status_message = "Could not extract any content chunks."
249
  st.session_state.selected_chunk_id = None
250
- st.rerun() # Rerun to update the UI with new state
251
 
252
- # Display status messages as toasts
253
  if st.session_state.status_message:
254
  st.toast(st.session_state.status_message)
255
- st.session_state.status_message = "" # Clear after displaying
256
 
257
- # Main UI with tabs
258
  tab1, tab2 = st.tabs(["Chunk Editor", "Settings & Overview"])
259
 
260
  with tab1:
@@ -263,14 +281,12 @@ with tab1:
263
  st.write("Process a URL to begin editing content chunks.")
264
  else:
265
  chunk_ids = [c['id'] for c in chunks]
266
- # Ensure the selected chunk ID is valid
267
  if st.session_state.selected_chunk_id not in chunk_ids:
268
  st.session_state.selected_chunk_id = chunk_ids[0] if chunk_ids else None
269
 
270
  if st.session_state.selected_chunk_id is not None:
271
  chunk_options = {c['id']: c['title'] for c in chunks}
272
 
273
- # Dropdown to select a chunk for editing
274
  selected_id = st.selectbox(
275
  "Select a chunk to edit",
276
  options=chunk_ids,
@@ -278,7 +294,6 @@ with tab1:
278
  index=chunk_ids.index(st.session_state.selected_chunk_id)
279
  )
280
 
281
- # Update state if the selection changes
282
  if selected_id != st.session_state.selected_chunk_id:
283
  st.session_state.selected_chunk_id = selected_id
284
  st.rerun()
@@ -289,15 +304,13 @@ with tab1:
289
  st.markdown(f"**Editing: {selected_chunk['title']}**")
290
  st.markdown(manager.format_chunk_stats(selected_chunk['stats']), unsafe_allow_html=True)
291
 
292
- # Text area for editing the selected chunk's content
293
  edited_content = st.text_area(
294
  "Chunk Content",
295
  value=selected_chunk['content'],
296
  height=350,
297
- key=f"editor_{selected_chunk['id']}" # Unique key ensures the widget updates
298
  )
299
 
300
- # Action buttons for the selected chunk
301
  col1, col2, _ = st.columns([1, 1, 5])
302
  if col1.button("Update Chunk", use_container_width=True, key=f"update_{selected_chunk['id']}"):
303
  manager.update_chunk_content(selected_chunk['id'], edited_content)
@@ -307,7 +320,6 @@ with tab1:
307
  if col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"):
308
  manager.delete_chunk(selected_chunk['id'])
309
  st.session_state.status_message = "Chunk deleted."
310
- # Select the next available chunk or reset
311
  remaining_chunks = manager.get_chunks()
312
  st.session_state.selected_chunk_id = remaining_chunks[0]['id'] if remaining_chunks else None
313
  st.rerun()
 
19
 
20
  def fetch_and_convert_to_markdown(self, url: str) -> str:
21
  """
22
+ Fetches HTML content from a URL, isolates the main content, aggressively
23
+ removes boilerplate, and converts the result to Markdown.
24
  """
25
  try:
26
  headers = {
 
31
  html_content = response.text
32
  soup = BeautifulSoup(html_content, 'html.parser')
33
 
34
+ # First, try to find a specific main content container.
35
+ main_content = soup.find('article') or soup.find('main') or \
36
+ soup.find('div', class_=re.compile(r'(post|content|entry|main-content)')) or \
37
+ soup.find('div', {'role': 'main'})
38
+
39
+ # If a main content container is found, use it. Otherwise, fall back to the whole body.
40
+ content_container = main_content if main_content else soup.find('body')
41
+ if not content_container:
42
+ return "Error: Could not find any processable content on the webpage."
43
+
44
+ # Aggressively remove common boilerplate elements by tag, class, or role.
45
+ # This list is more comprehensive to catch varied web designs.
46
+ unwanted_selectors = [
47
+ 'nav', 'header', 'footer', 'aside', 'form', 'figure', 'figcaption',
48
+ '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
49
+ '[role="search"]', '[role="complementary"]',
50
+ '.nav', '.navbar', '.header', '.footer', '.sidebar', '.aside',
51
+ '.menu', '.pagination', '.breadcrumbs', '.comments', '.comment-list',
52
+ '.social-links', '.share-buttons', '.cookie-notice', '.banner',
53
+ '#nav', '#header', '#footer', '#sidebar', '#comments',
54
+ # Add specific selectors for common ad and promo blocks
55
+ '[class*="ad"]', '[id*="ad"]', '[class*="promo"]', '[id*="promo"]'
56
+ ]
57
+
58
+ for selector in unwanted_selectors:
59
+ for element in content_container.select(selector):
60
  element.decompose()
61
+
62
+ # Also specifically remove script and style tags which are never content.
63
+ for tag in content_container.find_all(['script', 'style', 'noscript']):
64
+ tag.decompose()
65
 
66
+ # Convert the cleaned HTML to Markdown
67
+ markdown_output = convert_to_markdown(str(content_container))
68
+
69
+ # Post-processing to clean up the resulting Markdown
70
+ # Collapse extra newlines
71
+ markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output)
72
+ # Remove empty list items or lines with just navigation symbols that might remain
73
+ markdown_output = re.sub(r'(\n\s*[\*\-]\s*\n)|(^\s*[\*\-]\s*$)', '\n', markdown_output, flags=re.MULTILINE)
74
+
75
+ return markdown_output.strip()
76
 
77
  except requests.exceptions.Timeout:
78
  return "Error: The request timed out. The server is taking too long to respond."
 
89
  if not markdown_content or "Error" in markdown_content:
90
  return []
91
 
 
92
  parser = MarkdownNodeParser(include_metadata=True)
93
  doc = Document(text=markdown_content)
94
  nodes = parser.get_nodes_from_documents([doc])
 
99
  if not content:
100
  continue
101
 
 
102
  title_match = re.match(r"^(#+)\s*(.*)", content)
103
  if title_match:
104
  title = title_match.group(2).strip()
 
105
  content_text = content[len(title_match.group(0)):].strip()
106
  else:
 
107
  first_line = content.split('\n')[0].strip()
108
  title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
109
  content_text = content
110
 
 
111
  if not title:
112
  title = f"[Chunk {i+1}]"
113
 
 
146
  stats['word_count'] = textstat.lexicon_count(text, removepunct=True)
147
  stats['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
148
  stats['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)
149
+ except (Exception, TypeError):
150
  stats.update({'word_count': 0, 'flesch_reading_ease': 0, 'flesch_kincaid_grade': 0})
151
  return stats
152
 
 
188
  chunk = self.get_chunk_by_id(chunk_id)
189
  if chunk:
190
  chunk["content"] = new_content
191
+ self._add_stats_to_chunk(chunk)
192
+ # Update title if it was a placeholder
193
+ if chunk["title"].startswith("["):
194
+ first_line = new_content.split('\n')[0].strip()
195
+ new_title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
196
+ if new_title:
197
+ chunk["title"] = new_title
198
 
199
  def delete_chunk(self, chunk_id: int):
200
  self._chunks = [c for c in self._chunks if c["id"] != chunk_id]
 
201
  for i, chunk in enumerate(self._chunks):
202
  chunk['id'] = i
203
 
204
  def get_final_markdown(self) -> str:
205
  if not self._chunks:
206
  return "No content to display."
 
207
  final_doc_parts = []
208
  for c in self._chunks:
209
+ # Check if title is a real header or just derived text
210
+ is_header = re.match(r"^(#+)\s*(.*)", c['title'])
211
+ if not c['title'].startswith("[") and not is_header:
212
+ final_doc_parts.append(f"## {c['title']}\n\n{c['content']}")
213
  else:
214
+ final_doc_parts.append(c['content'])
215
  return "\n\n---\n\n".join(final_doc_parts)
216
 
217
 
 
220
  self.target_grade_max = grade_max
221
  self.target_min_chunk_words = min_words
222
  self.target_max_chunk_words = max_words
 
223
  self.set_chunks(self.get_chunks())
224
 
225
  # --- Streamlit UI Application ---
226
 
227
  st.set_page_config(layout="wide", page_title="Webpage Content Editor")
228
 
 
229
  def init_session_state():
230
  if 'processor' not in st.session_state:
231
  st.session_state.processor = WebpageContentProcessor()
 
241
  processor = st.session_state.processor
242
  manager = st.session_state.manager
243
 
 
 
244
  st.title("✨ Webpage Content Editor")
245
  st.caption("A tool to fetch, chunk, and refine web content.")
246
 
 
249
  icon="ℹ️"
250
  )
251
 
 
252
  url_input = st.text_input("Enter a webpage URL to start", key="url_input")
253
  if st.button("Process URL", use_container_width=True, type="primary"):
254
  if url_input:
 
267
  else:
268
  st.session_state.status_message = "Could not extract any content chunks."
269
  st.session_state.selected_chunk_id = None
270
+ st.rerun()
271
 
 
272
  if st.session_state.status_message:
273
  st.toast(st.session_state.status_message)
274
+ st.session_state.status_message = ""
275
 
 
276
  tab1, tab2 = st.tabs(["Chunk Editor", "Settings & Overview"])
277
 
278
  with tab1:
 
281
  st.write("Process a URL to begin editing content chunks.")
282
  else:
283
  chunk_ids = [c['id'] for c in chunks]
 
284
  if st.session_state.selected_chunk_id not in chunk_ids:
285
  st.session_state.selected_chunk_id = chunk_ids[0] if chunk_ids else None
286
 
287
  if st.session_state.selected_chunk_id is not None:
288
  chunk_options = {c['id']: c['title'] for c in chunks}
289
 
 
290
  selected_id = st.selectbox(
291
  "Select a chunk to edit",
292
  options=chunk_ids,
 
294
  index=chunk_ids.index(st.session_state.selected_chunk_id)
295
  )
296
 
 
297
  if selected_id != st.session_state.selected_chunk_id:
298
  st.session_state.selected_chunk_id = selected_id
299
  st.rerun()
 
304
  st.markdown(f"**Editing: {selected_chunk['title']}**")
305
  st.markdown(manager.format_chunk_stats(selected_chunk['stats']), unsafe_allow_html=True)
306
 
 
307
  edited_content = st.text_area(
308
  "Chunk Content",
309
  value=selected_chunk['content'],
310
  height=350,
311
+ key=f"editor_{selected_chunk['id']}"
312
  )
313
 
 
314
  col1, col2, _ = st.columns([1, 1, 5])
315
  if col1.button("Update Chunk", use_container_width=True, key=f"update_{selected_chunk['id']}"):
316
  manager.update_chunk_content(selected_chunk['id'], edited_content)
 
320
  if col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"):
321
  manager.delete_chunk(selected_chunk['id'])
322
  st.session_state.status_message = "Chunk deleted."
 
323
  remaining_chunks = manager.get_chunks()
324
  st.session_state.selected_chunk_id = remaining_chunks[0]['id'] if remaining_chunks else None
325
  st.rerun()