Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -19,8 +19,8 @@ class WebpageContentProcessor:
|
|
| 19 |
|
| 20 |
def fetch_and_convert_to_markdown(self, url: str) -> str:
|
| 21 |
"""
|
| 22 |
-
Fetches HTML content from a URL,
|
| 23 |
-
|
| 24 |
"""
|
| 25 |
try:
|
| 26 |
headers = {
|
|
@@ -31,25 +31,21 @@ class WebpageContentProcessor:
|
|
| 31 |
html_content = response.text
|
| 32 |
soup = BeautifulSoup(html_content, 'html.parser')
|
| 33 |
|
| 34 |
-
#
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
# Find the main content area of the webpage
|
| 40 |
-
content_for_conversion = soup.find('article') or soup.find('main') or \
|
| 41 |
-
soup.find('div', class_=re.compile(r'content|post|body')) or \
|
| 42 |
-
soup.find('div', {'role': 'main'})
|
| 43 |
|
| 44 |
-
#
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
| 49 |
|
| 50 |
-
# Convert the cleaned content to Markdown
|
| 51 |
-
markdown_output = convert_to_markdown(str(
|
| 52 |
-
# Clean up excessive newlines
|
| 53 |
markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output).strip()
|
| 54 |
return markdown_output
|
| 55 |
|
|
|
|
| 19 |
|
| 20 |
def fetch_and_convert_to_markdown(self, url: str) -> str:
|
| 21 |
"""
|
| 22 |
+
Fetches HTML content from a URL, starts from the <body>, removes common
|
| 23 |
+
boilerplate tags, and converts the remaining content to Markdown.
|
| 24 |
"""
|
| 25 |
try:
|
| 26 |
headers = {
|
|
|
|
| 31 |
html_content = response.text
|
| 32 |
soup = BeautifulSoup(html_content, 'html.parser')
|
| 33 |
|
| 34 |
+
# Find the body of the HTML document
|
| 35 |
+
body = soup.find('body')
|
| 36 |
+
if not body:
|
| 37 |
+
return "Error: Could not find the <body> of the webpage."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
+
# Tags to remove from the content to reduce boilerplate
|
| 40 |
+
tags_to_remove = ['script', 'style', 'noscript', 'header', 'footer', 'nav', 'aside', 'form', 'figure']
|
| 41 |
+
for tag_name in tags_to_remove:
|
| 42 |
+
# Find all instances of the tag within the body and remove them
|
| 43 |
+
for element in body.find_all(tag_name):
|
| 44 |
+
element.decompose()
|
| 45 |
|
| 46 |
+
# Convert the cleaned body content to Markdown
|
| 47 |
+
markdown_output = convert_to_markdown(str(body))
|
| 48 |
+
# Clean up excessive newlines for better readability
|
| 49 |
markdown_output = re.sub(r'\n{3,}', '\n\n', markdown_output).strip()
|
| 50 |
return markdown_output
|
| 51 |
|