Spaces:

Rsr2425
/

SimpliFi

Sleeping

App Files Files Community

Rsr2425 committed on Mar 28, 2025

Commit

e344fab

1 Parent(s): 9fc8e5c

Fixed crawler code and added tests

Browse files

Files changed (4) hide show

backend/app/crawler.py +59 -7
backend/app/main.py +19 -3
backend/tests/test_api.py +1 -2
backend/tests/test_crawler.py +115 -119

backend/app/crawler.py CHANGED Viewed

@@ -8,18 +8,42 @@ from scrapy.utils.project import get_project_settings
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
 logger = logging.getLogger(__name__)
 class WebsiteSpider(CrawlSpider):
     name = "website_spider"
     def __init__(self, start_url, output_dir, *args, **kwargs):
         self.start_urls = [start_url]
         self.allowed_domains = [urlparse(start_url).netloc]
         self.output_dir = output_dir
-        # Define rules for link extraction
         self.rules = (
             Rule(
                 LinkExtractor(allow_domains=self.allowed_domains),
@@ -28,9 +52,17 @@ class WebsiteSpider(CrawlSpider):
             ),
         )
-        super().__init__(*args, **kwargs)
     def parse_item(self, response):
         try:
             # Parse the HTML with BeautifulSoup
             soup = BeautifulSoup(response.body, "html.parser")
@@ -76,15 +108,30 @@ class WebsiteSpider(CrawlSpider):
 class DomainCrawler:
     def __init__(self, start_url, output_dir="crawled_content"):
         self.start_url = start_url
         self.domain = urlparse(start_url).netloc
         self.output_dir = output_dir
-        # Create output directory if it doesn't exist
-        if not os.path.exists(output_dir):
-            os.makedirs(output_dir)
-            logger.info(f"Created output directory: {output_dir}")
         # Configure Scrapy settings
         self.settings = get_project_settings()
@@ -100,7 +147,12 @@ class DomainCrawler:
         )
     def start(self):
-        """Start the crawling process"""
         logger.info(f"Starting crawl from {self.start_url}")
         process = CrawlerProcess(self.settings)

 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
+"""
+A web crawler module for extracting content from documentation websites.
+This module provides classes for crawling a domain and extracting main content
+from web pages, saving the results as text files.
+"""
 logger = logging.getLogger(__name__)
 class WebsiteSpider(CrawlSpider):
+    """
+    A Scrapy spider for crawling documentation websites and extracting content.
+    This spider follows links within the allowed domain and extracts the main content
+    from each page, saving it to a text file. It attempts to find content within <main>,
+    <article> or content div tags, falling back to the full body if none are found.
+    Args:
+        start_url (str): The URL where crawling should begin
+        output_dir (str): Directory where extracted content should be saved
+        *args: Additional positional arguments passed to CrawlSpider
+        **kwargs: Additional keyword arguments passed to CrawlSpider
+    """
     name = "website_spider"
     def __init__(self, start_url, output_dir, *args, **kwargs):
+        super().__init__(*args, **kwargs)
         self.start_urls = [start_url]
         self.allowed_domains = [urlparse(start_url).netloc]
         self.output_dir = output_dir
+        os.makedirs(output_dir, exist_ok=True)
+        logger.info(f"Created output directory: {output_dir}")
         self.rules = (
             Rule(
                 LinkExtractor(allow_domains=self.allowed_domains),
             ),
         )
     def parse_item(self, response):
+        """
+        Parse a webpage and extract its content.
+        Args:
+            response: The Scrapy response object containing the webpage
+        The extracted content is saved to a text file in the output directory,
+        including the page URL, title and main content.
+        """
         try:
             # Parse the HTML with BeautifulSoup
             soup = BeautifulSoup(response.body, "html.parser")
 class DomainCrawler:
+    """
+    High-level crawler class for extracting content from a documentation website.
+    This class provides a simple interface for crawling a website and extracting its
+    content. It configures and runs a Scrapy crawler with sensible defaults for
+    crawling documentation sites.
+    Example:
+        crawler = DomainCrawler("https://docs.example.com")
+        crawler.start()  # Crawls the site and saves content to ./crawled_content/
+    Args:
+        start_url (str): The URL where crawling should begin
+        output_dir (str, optional): Directory where extracted content should be saved.
+            Defaults to "crawled_content"
+    """
     def __init__(self, start_url, output_dir="crawled_content"):
         self.start_url = start_url
         self.domain = urlparse(start_url).netloc
         self.output_dir = output_dir
+        os.makedirs(output_dir, exist_ok=True)
+        logger.info(f"Created output directory: {output_dir}")
         # Configure Scrapy settings
         self.settings = get_project_settings()
         )
     def start(self):
+        """
+        Start the crawling process.
+        This method initiates the crawler and blocks until crawling is complete.
+        The extracted content will be saved to the configured output directory.
+        """
         logger.info(f"Starting crawl from {self.start_url}")
         process = CrawlerProcess(self.settings)

backend/app/main.py CHANGED Viewed

@@ -11,6 +11,7 @@ import logging
 import os
 from backend.app.crawler import DomainCrawler
 from backend.app.vectorstore import get_all_unique_source_of_docs_in_collection_DUMB
 app = FastAPI()
@@ -23,6 +24,20 @@ app.add_middleware(
 )
 class UrlInput(BaseModel):
     url: str
@@ -46,10 +61,11 @@ class TopicsResponse(BaseModel):
     sources: List[str]
-@app.post("/api/ingest/")
-async def ingest_documentation(input_data: UrlInput):
     print(f"Received url {input_data.url}")
-    return {"status": "received"}
 @app.post("/api/problems/")

 import os
 from backend.app.crawler import DomainCrawler
 from backend.app.vectorstore import get_all_unique_source_of_docs_in_collection_DUMB
+from enum import Enum
 app = FastAPI()
 )
+# Status values carried in IngestResponse.status. Each member's value is its
+# own upper-case name. Only RECEIVED is returned by the endpoint code visible
+# in this diff; FAILURE is declared but not used here.
+class IngestStatus(Enum):
+    RECEIVED = "RECEIVED"
+    FAILURE = "FAILURE"
+# Request body for POST /api/ingest/. Both fields are declared without
+# defaults, so a payload must supply `topic` as well as `url`.
+class IngestRequest(BaseModel):
+    topic: str
+    url: str
+# Response body for POST /api/ingest/: a single IngestStatus value.
+class IngestResponse(BaseModel):
+    status: IngestStatus
 class UrlInput(BaseModel):
     url: str
     sources: List[str]
+# Ingest endpoint: accepts an IngestRequest and acknowledges receipt with
+# IngestStatus.RECEIVED. As written it only prints the submitted URL; the
+# `topic` field is not read and no crawl is started from here.
+# TODO: maybe rename this route to /api/scan/ so it is consistent with, and
+# matches, the frontend (FE).
+@app.post("/api/ingest/", response_model=IngestResponse)
+async def ingest_documentation(input_data: IngestRequest):
     print(f"Received url {input_data.url}")
+    return IngestResponse(status=IngestStatus.RECEIVED)
 @app.post("/api/problems/")

backend/tests/test_api.py CHANGED Viewed

@@ -1,6 +1,5 @@
 from fastapi.testclient import TestClient
 from backend.app.main import app
-import pytest
 client = TestClient(app)
@@ -8,7 +7,7 @@ client = TestClient(app)
 def test_crawl_endpoint():
     response = client.post("/api/ingest/", json={"url": "https://example.com"})
     assert response.status_code == 200
-    assert response.json() == {"status": "received"}
 def test_problems_endpoint():

 from fastapi.testclient import TestClient
 from backend.app.main import app
 client = TestClient(app)
 def test_crawl_endpoint():
+    # The ingest endpoint takes an IngestRequest, which declares both `topic`
+    # and `url` as required fields. A payload with only `url` fails request
+    # validation (HTTP 422) before the handler runs, so send both.
+    response = client.post(
+        "/api/ingest/",
+        json={"topic": "example", "url": "https://example.com"},
+    )
     assert response.status_code == 200
+    assert response.json() == {"status": "RECEIVED"}
 def test_problems_endpoint():

backend/tests/test_crawler.py CHANGED Viewed

@@ -1,10 +1,9 @@
-import pytest
 import os
 from unittest.mock import Mock, patch
 from bs4 import BeautifulSoup
-from scrapy.http import Response, Request, TextResponse
-from backend.app.crawler import DomainCrawler, WebsiteSpider
 @pytest.fixture
 def sample_html():
@@ -14,124 +13,121 @@ def sample_html():
         <body>
             <main>
                 <h1>Main Content</h1>
-                <p>This is the main content of the page.</p>
             </main>
         </body>
     </html>
     """
-@pytest.fixture
-def crawler():
-    return DomainCrawler("https://example.com", output_dir="test_output")
 @pytest.fixture
-def spider():
-    return WebsiteSpider(start_url="https://example.com", output_dir="test_output")
-# def test_crawler_initialization(crawler):
-#     assert crawler.start_url == "https://example.com"
-#     assert crawler.domain == "example.com"
-#     assert crawler.output_dir == "test_output"
-#     assert os.path.exists("test_output")
-#     # Test Scrapy settings
-#     assert crawler.settings.get('BOT_NAME') == 'website_crawler'
-#     assert crawler.settings.get('ROBOTSTXT_OBEY') is True
-#     assert crawler.settings.get('DOWNLOAD_DELAY') == 1
-def create_response(url, body):
-    request = Request(url=url)
-    return TextResponse(
-        url=url, body=body.encode("utf-8"), encoding="utf-8", request=request
-    )
-# def test_spider_parse_with_main_content(spider, sample_html):
-#     url = "https://example.com/test"
-#     response = create_response(url, sample_html)
-#     # Process the page
-#     list(spider.parse_item(response))
-#     # Check if file was created
-#     files = os.listdir(spider.output_dir)
-#     assert len(files) == 1
-#     # Read the saved file
-#     with open(os.path.join(spider.output_dir, files[0]), 'r', encoding='utf-8') as f:
-#         content = f.read()
-#     # Verify content
-#     assert "URL: https://example.com/test" in content
-#     assert "Title: Test Page" in content
-#     assert "Main Content" in content
-#     assert "This is the main content of the page." in content
-# def test_spider_parse_without_main_content(spider):
-#     html_without_main = """
-#     <html>
-#         <head><title>No Main</title></head>
-#         <body>
-#             <div>Some body content</div>
-#         </body>
-#     </html>
-#     """
-#     url = "https://example.com/no-main"
-#     response = create_response(url, html_without_main)
-#     # Process the page
-#     list(spider.parse_item(response))
-#     files = os.listdir(spider.output_dir)
-#     assert len(files) == 1
-#     with open(os.path.join(spider.output_dir, files[0]), 'r', encoding='utf-8') as f:
-#         content = f.read()
-#     assert "URL: https://example.com/no-main" in content
-#     assert "Title: No Main" in content
-#     assert "Some body content" in content
-# def test_spider_parse_with_invalid_html(spider):
-#     invalid_html = "<invalid><<html>"
-#     url = "https://example.com/invalid"
-#     response = create_response(url, invalid_html)
-#     # Process should not raise an exception
-#     list(spider.parse_item(response))
-#     # Should still create a file
-#     files = os.listdir(spider.output_dir)
-#     assert len(files) == 1
-# @patch('scrapy.crawler.CrawlerProcess')
-# def test_start_crawling(mock_crawler_process_class, crawler):
-#     # Configure the mock
-#     mock_process = Mock()
-#     mock_crawler_process_class.return_value = mock_process
-#     # Start crawling
-#     crawler.start()
-#     # Verify process was created with correct settings
-#     mock_crawler_process_class.assert_called_once_with(crawler.settings)
-#     # Verify crawl method was called
-#     mock_process.crawl.assert_called_once()
-#     mock_process.start.assert_called_once()
-@pytest.fixture(autouse=True)
-def cleanup():
-    # Setup - nothing needed
-    yield
-    # Cleanup after each test
-    if os.path.exists("test_output"):
-        for file in os.listdir("test_output"):
-            os.remove(os.path.join("test_output", file))
-        os.rmdir("test_output")

 import os
+import pytest
 from unittest.mock import Mock, patch
 from bs4 import BeautifulSoup
+from scrapy.http import Response, Request
+from backend.app.crawler import WebsiteSpider, DomainCrawler
 @pytest.fixture
 def sample_html():
         <body>
             <main>
                 <h1>Main Content</h1>
+                <p>This is the main content.</p>
             </main>
         </body>
     </html>
     """
 @pytest.fixture
+def output_dir(tmp_path):
+    """Return a path string for a per-test output directory under ``tmp_path``.
+
+    Only the path is built here; the directory itself is not created by this
+    fixture.
+    """
+    return str(tmp_path / "test_crawled_content")
+def test_website_spider_initialization(output_dir):
+    """WebsiteSpider stores its start URL, allowed domain, output dir and rules.
+
+    The ``output_dir`` fixture (a path under pytest's ``tmp_path``) is used
+    because ``WebsiteSpider.__init__`` creates the output directory; a
+    relative literal such as "test_output" would be left behind in the
+    current working directory after the test run.
+    """
+    start_url = "https://example.com"
+    spider = WebsiteSpider(start_url=start_url, output_dir=output_dir)
+    assert spider.start_urls == [start_url]
+    assert spider.allowed_domains == ["example.com"]
+    assert spider.output_dir == output_dir
+    assert len(spider.rules) == 1
+def test_parse_item_with_main_content(sample_html, output_dir):
+    """Parsing a page that has a <main> section writes one file with its text."""
+    spider = WebsiteSpider(start_url="https://example.com", output_dir=output_dir)
+    # Stand-in for a Scrapy response: only `url` and `body` are populated.
+    page = Mock(spec=Response)
+    page.url = "https://example.com/test"
+    page.body = sample_html.encode("utf-8")
+    spider.parse_item(page)
+    # Exactly one file should have been written for the single page.
+    written = os.listdir(output_dir)
+    assert len(written) == 1
+    saved_path = os.path.join(output_dir, written[0])
+    with open(saved_path, "r", encoding="utf-8") as fh:
+        saved = fh.read()
+    # Title, heading, paragraph text and the source URL must all be present.
+    assert "Test Page" in saved
+    assert "Main Content" in saved
+    assert "This is the main content" in saved
+    assert "URL: https://example.com/test" in saved
+def test_parse_item_without_main_content(output_dir):
+    """Parsing a page with no <main> section still writes one file with its text."""
+    html_without_main = """
+    <html>
+        <head><title>No Main Page</title></head>
+        <body>
+            <div>Some body content</div>
+        </body>
+    </html>
+    """
+    spider = WebsiteSpider(start_url="https://example.com", output_dir=output_dir)
+    # Stand-in for a Scrapy response: only `url` and `body` are populated.
+    page = Mock(spec=Response)
+    page.url = "https://example.com/no-main"
+    page.body = html_without_main.encode("utf-8")
+    spider.parse_item(page)
+    written = os.listdir(output_dir)
+    assert len(written) == 1
+    saved_path = os.path.join(output_dir, written[0])
+    with open(saved_path, "r", encoding="utf-8") as fh:
+        saved = fh.read()
+    # The title and the body text must both be present in the saved file.
+    assert "No Main Page" in saved
+    assert "Some body content" in saved
+def test_domain_crawler_initialization(output_dir):
+    """DomainCrawler records its inputs and configures the expected settings.
+
+    The ``output_dir`` fixture (a path under pytest's ``tmp_path``) is used
+    because ``DomainCrawler.__init__`` creates the output directory; a
+    relative literal such as "test_output" would be left behind in the
+    current working directory after the test run.
+    """
+    start_url = "https://example.com"
+    crawler = DomainCrawler(start_url=start_url, output_dir=output_dir)
+    assert crawler.start_url == start_url
+    assert crawler.domain == "example.com"
+    assert crawler.output_dir == output_dir
+    assert crawler.settings.get('BOT_NAME') == "website_crawler"
+    assert crawler.settings.get('ROBOTSTXT_OBEY') is True
+    assert crawler.settings.get('CONCURRENT_REQUESTS') == 16
+    assert crawler.settings.get('DOWNLOAD_DELAY') == 1
+@patch('backend.app.crawler.CrawlerProcess')
+def test_domain_crawler_start(mock_crawler_process, output_dir):
+    """start() builds a CrawlerProcess from the settings, then crawls and starts.
+
+    ``CrawlerProcess`` is patched where the crawler module looks it up, so no
+    real crawl runs. The ``output_dir`` fixture keeps the directory that
+    ``DomainCrawler.__init__`` creates under pytest's ``tmp_path`` instead of
+    the current working directory.
+    """
+    start_url = "https://example.com"
+    crawler = DomainCrawler(start_url=start_url, output_dir=output_dir)
+    crawler.start()
+    # Verify that CrawlerProcess was instantiated and crawl was started
+    mock_crawler_process.assert_called_once_with(crawler.settings)
+    mock_crawler_process.return_value.crawl.assert_called_once()
+    mock_crawler_process.return_value.start.assert_called_once()
+def test_output_directory_creation(tmp_path):
+    """DomainCrawler creates its output directory when it does not exist yet.
+
+    The target lives under pytest's ``tmp_path``, so it is guaranteed to be
+    absent beforehand and needs no manual cleanup. The previous version used
+    a directory in the current working directory with ``os.rmdir``, which
+    raises on a non-empty leftover directory and skips cleanup whenever the
+    assertion fails.
+    """
+    start_url = "https://example.com"
+    output_dir = tmp_path / "test_output_dir"
+    # Precondition: a fresh tmp_path never contains the target directory.
+    assert not output_dir.exists()
+    DomainCrawler(start_url=start_url, output_dir=str(output_dir))
+    assert output_dir.is_dir()