Spaces:
Sleeping
Sleeping
Added recursion
Browse files
app.py
CHANGED
|
@@ -23,7 +23,8 @@ def scrape_and_convert(url, depth):
|
|
| 23 |
return f"Error fetching {url}: {str(e)}\n"
|
| 24 |
|
| 25 |
# Convert to Markdown
|
| 26 |
-
markdown_content =
|
|
|
|
| 27 |
html=html_content,
|
| 28 |
base_url=url,
|
| 29 |
parser_features='html.parser',
|
|
@@ -32,10 +33,12 @@ def scrape_and_convert(url, depth):
|
|
| 32 |
|
| 33 |
# If depth > 0, extract links and process them
|
| 34 |
if current_depth > 0:
|
| 35 |
-
links = LinkExtractor.scrape_url(url, link_type=LinkType.INTERNAL
|
|
|
|
| 36 |
for link in links:
|
| 37 |
-
|
| 38 |
-
|
|
|
|
| 39 |
|
| 40 |
return markdown_content
|
| 41 |
|
|
|
|
| 23 |
return f"Error fetching {url}: {str(e)}\n"
|
| 24 |
|
| 25 |
# Convert to Markdown
|
| 26 |
+
markdown_content = f"## Extracted from: {url}\n\n"
|
| 27 |
+
markdown_content += Converter.html_to_markdown(
|
| 28 |
html=html_content,
|
| 29 |
base_url=url,
|
| 30 |
parser_features='html.parser',
|
|
|
|
| 33 |
|
| 34 |
# If depth > 0, extract links and process them
|
| 35 |
if current_depth > 0:
|
| 36 |
+
links = LinkExtractor.scrape_url(url, link_type=LinkType.INTERNAL)
|
| 37 |
+
|
| 38 |
for link in links:
|
| 39 |
+
if link not in visited_urls:
|
| 40 |
+
markdown_content += f"\n\n### Extracted from: {link}\n"
|
| 41 |
+
markdown_content += recursive_scrape(link, current_depth - 1)
|
| 42 |
|
| 43 |
return markdown_content
|
| 44 |
|