Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -19,8 +19,8 @@ class WebpageContentProcessor:
|
|
| 19 |
|
| 20 |
def fetch_and_convert_to_markdown(self, url: str) -> str:
|
| 21 |
"""
|
| 22 |
-
Fetches HTML content
|
| 23 |
-
|
| 24 |
"""
|
| 25 |
try:
|
| 26 |
headers = {
|
|
@@ -31,35 +31,16 @@ class WebpageContentProcessor:
|
|
| 31 |
html_content = response.text
|
| 32 |
soup = BeautifulSoup(html_content, 'html.parser')
|
| 33 |
|
| 34 |
-
#
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
|
|
|
| 38 |
|
| 39 |
-
#
|
| 40 |
-
content_container =
|
| 41 |
if not content_container:
|
| 42 |
-
return "Error: Could not find
|
| 43 |
-
|
| 44 |
-
# Aggressively remove common boilerplate elements by tag, class, or role.
|
| 45 |
-
unwanted_selectors = [
|
| 46 |
-
'nav', 'header', 'footer', 'aside', 'form', 'figure', 'figcaption',
|
| 47 |
-
'[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
|
| 48 |
-
'[role="search"]', '[role="complementary"]',
|
| 49 |
-
'.nav', '.navbar', '.header', '.footer', '.sidebar', '.aside',
|
| 50 |
-
'.menu', '.pagination', '.breadcrumbs', '.comments', '.comment-list',
|
| 51 |
-
'.social-links', '.share-buttons', '.cookie-notice', '.banner',
|
| 52 |
-
'#nav', '#header', '#footer', '#sidebar', '#comments',
|
| 53 |
-
'[class*="ad"]', '[id*="ad"]', '[class*="promo"]', '[id*="promo"]'
|
| 54 |
-
]
|
| 55 |
-
|
| 56 |
-
for selector in unwanted_selectors:
|
| 57 |
-
for element in content_container.select(selector):
|
| 58 |
-
element.decompose()
|
| 59 |
-
|
| 60 |
-
# Also specifically remove script and style tags which are never content.
|
| 61 |
-
for tag in content_container.find_all(['script', 'style', 'noscript']):
|
| 62 |
-
tag.decompose()
|
| 63 |
|
| 64 |
# Convert the cleaned HTML to Markdown
|
| 65 |
markdown_output = convert_to_markdown(str(content_container))
|
|
@@ -337,4 +318,4 @@ with tab2:
|
|
| 337 |
st.rerun()
|
| 338 |
|
| 339 |
st.subheader("Final Compiled Document")
|
| 340 |
-
st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")
|
|
|
|
| 19 |
|
| 20 |
def fetch_and_convert_to_markdown(self, url: str) -> str:
|
| 21 |
"""
|
| 22 |
+
Fetches HTML content, removes common boilerplate tags from the entire page,
|
| 23 |
+
and then converts the remaining body content to Markdown.
|
| 24 |
"""
|
| 25 |
try:
|
| 26 |
headers = {
|
|
|
|
| 31 |
html_content = response.text
|
| 32 |
soup = BeautifulSoup(html_content, 'html.parser')
|
| 33 |
|
| 34 |
+
# Remove common boilerplate and non-content tags from the entire document
|
| 35 |
+
tags_to_remove = ['nav', 'header', 'footer', 'aside', 'script', 'style', 'noscript', 'form']
|
| 36 |
+
for tag_name in tags_to_remove:
|
| 37 |
+
for element in soup.find_all(tag_name):
|
| 38 |
+
element.decompose()
|
| 39 |
|
| 40 |
+
# Process the entire remaining body
|
| 41 |
+
content_container = soup.find('body')
|
| 42 |
if not content_container:
|
| 43 |
+
return "Error: Could not find the <body> of the webpage."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
# Convert the cleaned HTML to Markdown
|
| 46 |
markdown_output = convert_to_markdown(str(content_container))
|
|
|
|
| 318 |
st.rerun()
|
| 319 |
|
| 320 |
st.subheader("Final Compiled Document")
|
| 321 |
+
st.text_area("Final Markdown Output", manager.get_final_markdown(), height=500, key="final_markdown")
|