Spaces:

CultriX
/

RAG-Scraper

Sleeping

CultriX committed on Mar 6, 2025

Commit

ad147d8

1 Parent(s): c071d8b

Added recursion

Files changed (1) hide show

app.py CHANGED Viewed

@@ -23,7 +23,8 @@ def scrape_and_convert(url, depth):
                 return f"Error fetching {url}: {str(e)}\n"
             # Convert to Markdown
-            markdown_content = Converter.html_to_markdown(
                 html=html_content,
                 base_url=url,
                 parser_features='html.parser',
@@ -32,10 +33,12 @@ def scrape_and_convert(url, depth):
             # If depth > 0, extract links and process them
             if current_depth > 0:
-                links = LinkExtractor.scrape_url(url, link_type=LinkType.INTERNAL, depth=current_depth, visited_urls=visited_urls)
                 for link in links:
-                    markdown_content += f"\n\n## Extracted from: {link}\n"
-                    markdown_content += recursive_scrape(link, current_depth - 1)
             return markdown_content

                 return f"Error fetching {url}: {str(e)}\n"
             # Convert to Markdown
+            markdown_content = f"## Extracted from: {url}\n\n"
+            markdown_content += Converter.html_to_markdown(
                 html=html_content,
                 base_url=url,
                 parser_features='html.parser',
             # If depth > 0, extract links and process them
             if current_depth > 0:
+                links = LinkExtractor.scrape_url(url, link_type=LinkType.INTERNAL)
                 for link in links:
+                    if link not in visited_urls:
+                        markdown_content += f"\n\n### Extracted from: {link}\n"
+                        markdown_content += recursive_scrape(link, current_depth - 1)
             return markdown_content