Em4e committed on
Commit
875975c
·
verified ·
1 Parent(s): 7c8ac1e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -8
app.py CHANGED
@@ -42,7 +42,6 @@ class WebpageContentProcessor:
42
  return "Error: Could not find any processable content on the webpage."
43
 
44
  # Aggressively remove common boilerplate elements by tag, class, or role.
45
- # This list is more comprehensive to catch varied web designs.
46
  unwanted_selectors = [
47
  'nav', 'header', 'footer', 'aside', 'form', 'figure', 'figcaption',
48
  '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
@@ -51,7 +50,6 @@ class WebpageContentProcessor:
51
  '.menu', '.pagination', '.breadcrumbs', '.comments', '.comment-list',
52
  '.social-links', '.share-buttons', '.cookie-notice', '.banner',
53
  '#nav', '#header', '#footer', '#sidebar', '#comments',
54
- # Add specific selectors for common ad and promo blocks
55
  '[class*="ad"]', '[id*="ad"]', '[class*="promo"]', '[id*="promo"]'
56
  ]
57
 
@@ -67,9 +65,7 @@ class WebpageContentProcessor:
67
  markdown_output = convert_to_markdown(str(content_container))
68
 
69
  # Post-processing to clean up the resulting Markdown
70
- # Collapse extra newlines
71
  markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output)
72
- # Remove empty list items or lines with just navigation symbols that might remain
73
  markdown_output = re.sub(r'(\n\s*[\*\-]\s*\n)|(^\s*[\*\-]\s*$)', '\n', markdown_output, flags=re.MULTILINE)
74
 
75
  return markdown_output.strip()
@@ -189,7 +185,6 @@ class ChunkManager:
189
  if chunk:
190
  chunk["content"] = new_content
191
  self._add_stats_to_chunk(chunk)
192
- # Update title if it was a placeholder
193
  if chunk["title"].startswith("["):
194
  first_line = new_content.split('\n')[0].strip()
195
  new_title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
@@ -206,7 +201,6 @@ class ChunkManager:
206
  return "No content to display."
207
  final_doc_parts = []
208
  for c in self._chunks:
209
- # Check if title is a real header or just derived text
210
  is_header = re.match(r"^(#+)\s*(.*)", c['title'])
211
  if not c['title'].startswith("[") and not is_header:
212
  final_doc_parts.append(f"## {c['title']}\n\n{c['content']}")
@@ -241,8 +235,8 @@ init_session_state()
241
  processor = st.session_state.processor
242
  manager = st.session_state.manager
243
 
244
- st.title("✨ Chunk Webpage Content Editor")
245
- st.caption("A tool to fetch, chunk, and refine web content. Inspired by Andrea Volpini's [work on content chunking](https://www.linkedin.com/pulse/understanding-chunking-google-ai-mode-practical-content-volpini-zseaf/)")
246
 
247
  st.info(
248
  "**Note:** Some websites may block content scraping. This is an early version, so you might encounter bugs.",
 
42
  return "Error: Could not find any processable content on the webpage."
43
 
44
  # Aggressively remove common boilerplate elements by tag, class, or role.
 
45
  unwanted_selectors = [
46
  'nav', 'header', 'footer', 'aside', 'form', 'figure', 'figcaption',
47
  '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
 
50
  '.menu', '.pagination', '.breadcrumbs', '.comments', '.comment-list',
51
  '.social-links', '.share-buttons', '.cookie-notice', '.banner',
52
  '#nav', '#header', '#footer', '#sidebar', '#comments',
 
53
  '[class*="ad"]', '[id*="ad"]', '[class*="promo"]', '[id*="promo"]'
54
  ]
55
 
 
65
  markdown_output = convert_to_markdown(str(content_container))
66
 
67
  # Post-processing to clean up the resulting Markdown
 
68
  markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output)
 
69
  markdown_output = re.sub(r'(\n\s*[\*\-]\s*\n)|(^\s*[\*\-]\s*$)', '\n', markdown_output, flags=re.MULTILINE)
70
 
71
  return markdown_output.strip()
 
185
  if chunk:
186
  chunk["content"] = new_content
187
  self._add_stats_to_chunk(chunk)
 
188
  if chunk["title"].startswith("["):
189
  first_line = new_content.split('\n')[0].strip()
190
  new_title = (first_line[:75] + '...') if len(first_line) > 75 else first_line
 
201
  return "No content to display."
202
  final_doc_parts = []
203
  for c in self._chunks:
 
204
  is_header = re.match(r"^(#+)\s*(.*)", c['title'])
205
  if not c['title'].startswith("[") and not is_header:
206
  final_doc_parts.append(f"## {c['title']}\n\n{c['content']}")
 
235
  processor = st.session_state.processor
236
  manager = st.session_state.manager
237
 
238
+ st.title("✨ Webpage Content Editor")
239
+ st.caption("A tool to fetch, chunk, and refine web content.")
240
 
241
  st.info(
242
  "**Note:** Some websites may block content scraping. This is an early version, so you might encounter bugs.",