"""
Test script to validate the crawler module and system components.
"""
import importlib
import sys
import traceback

from src.crawler import DocumentationCrawler
def test_crawler():
    """Smoke-test the crawler against a few small, stable websites.

    For every URL a ``DocumentationCrawler`` is created with conservative
    limits and ``crawl()`` is run. A summary is printed: the number of
    documents, the number of failed URLs and, when at least one document
    came back, the URL, content length and leading characters of the first
    document.

    Any exception raised while handling a URL is printed together with its
    traceback, and the loop continues with the next URL. Nothing is returned.
    """
    print("=" * 60)
    print("Testing Crawler Module")
    print("=" * 60)

    # The limits are named once so the values handed to the crawler and the
    # values echoed to the console cannot drift apart.
    max_depth = 1
    max_pages = 5
    delay = 0.5
    preview_chars = 200

    # Test with simple, stable websites
    test_urls = [
        "https://example.com",
        "https://httpbin.org/html",
    ]

    for test_url in test_urls:
        print(f"\nTesting crawler on: {test_url}")
        try:
            crawler = DocumentationCrawler(
                base_url=test_url,
                max_depth=max_depth,
                delay=delay,
                max_pages=max_pages,
            )
            print(f" Max depth: {max_depth}")
            print(f" Max pages: {max_pages}")
            print(f" Request delay: {delay}s")

            documents = crawler.crawl()

            print("\n ✓ Successfully crawled!")
            print(f" - Documents: {len(documents)}")
            print(f" - Failed URLs: {len(crawler.failed_urls)}")

            if documents:
                print("\n Sample document:")
                doc = documents[0]
                print(f" URL: {doc['url']}")
                print(f" Content length: {len(doc['content'])} chars")
                print(f" Preview: {doc['content'][:preview_chars]}...")
        except Exception as e:
            # Diagnostic script: report the failure and move on to the next
            # URL instead of aborting the whole run.
            print(f" ✗ Error: {e}")
            traceback.print_exc()
def test_crawler_normalization():
    """Check the crawler's private URL normalization and crawl-filter helpers."""
    crawler = DocumentationCrawler(
        base_url="https://example.com",
        respect_robots_txt=False,
    )

    # (raw URL, expected result of _normalize_url). The first case expects a
    # lower-cased host with the trailing slash, the utm_source parameter and
    # the fragment removed; the second expects query parameters in sorted order.
    normalization_cases = (
        (
            "https://Example.com/path/?utm_source=test#section",
            "https://example.com/path",
        ),
        (
            "https://example.com/path/?b=2&a=1",
            "https://example.com/path?a=1&b=2",
        ),
    )
    for raw_url, expected in normalization_cases:
        assert crawler._normalize_url(raw_url) == expected

    # URLs the crawler is expected to refuse: a javascript: link, a URL on a
    # different host than base_url, and a .jpg resource.
    rejected_urls = (
        "javascript:alert(1)",
        "https://example.org/other",
        "https://example.com/image.jpg",
    )
    for url in rejected_urls:
        assert not crawler._should_crawl_url(url)

    # A regular page on the base host must be accepted.
    assert crawler._should_crawl_url("https://example.com/docs")
def test_imports():
    """Check that every required third-party package can be imported.

    Each package is imported by module name and a pass/fail line is printed
    under its display name. A failing import does not stop the loop, so all
    missing or broken packages are listed in a single run.

    Returns:
        bool: True if every import succeeded, False otherwise.
    """
    print("\n" + "=" * 60)
    print("Testing Imports")
    print("=" * 60)

    # (importable module name, name shown to the user)
    modules_to_test = [
        ("langchain", "LangChain"),
        ("langchain_community", "LangChain Community"),
        ("langchain_ollama", "LangChain Ollama"),
        ("chromadb", "ChromaDB"),
        ("sentence_transformers", "Sentence Transformers"),
        ("gradio", "Gradio"),
        ("fastapi", "FastAPI"),
        ("uvicorn", "Uvicorn"),
        ("bs4", "BeautifulSoup4"),
        ("requests", "Requests"),
    ]

    all_ok = True
    for module_name, display_name in modules_to_test:
        try:
            importlib.import_module(module_name)
        except Exception as e:
            # A broken install can fail with something other than ImportError
            # while the package initializes; report it like a missing package
            # instead of aborting the remaining checks.
            print(f" ✗ {display_name}: {e}")
            all_ok = False
        else:
            print(f" ✓ {display_name}")

    return all_ok
def test_app_imports():
    """Check that the project's own modules import cleanly.

    Two independent checks are run and a pass/fail line is printed for each:
    importing ``answer_question``, ``index_crawler_results`` and
    ``load_documents_from_crawler`` from ``src.app_enhanced``, and importing
    ``DocumentationCrawler`` from ``src.crawler``.

    Any exception raised by an import is caught and printed, so a failure in
    the first check does not prevent the second. Nothing is returned.
    """
    print("\n" + "=" * 60)
    print("Testing App Module Imports")
    print("=" * 60)

    try:
        # The names are imported only to verify they exist; they are not used.
        from src.app_enhanced import (
            answer_question,
            index_crawler_results,
            load_documents_from_crawler,
        )
    except Exception as e:
        print(f" ✗ Enhanced app_enhanced.py: {e}")
    else:
        print(" ✓ Enhanced app_enhanced.py imports")

    try:
        from src.crawler import DocumentationCrawler
    except Exception as e:
        print(f" ✗ Crawler: {e}")
    else:
        print(" ✓ Crawler module imports")
def main():
    """Run the test suite as a script.

    Runs the dependency import check first and prints an install hint when
    it reports a failure, then runs the app-module import check. The crawler
    test makes network calls, so it only runs when the first command-line
    argument is ``--crawl``; otherwise a note explaining how to enable it is
    printed.
    """
    print("\n🚀 RAG System Test Suite\n")

    # Test imports first
    if not test_imports():
        print("\n⚠️ Some dependencies are missing. Run: pip install -r requirements.txt")

    test_app_imports()

    # Optionally test crawler (disabled by default as it makes network calls)
    if len(sys.argv) > 1 and sys.argv[1] == "--crawl":
        test_crawler()
    else:
        print("\n" + "=" * 60)
        print("Crawler Test (Skipped)")
        print("=" * 60)
        print("To test crawler connectivity, run: python test_system.py --crawl")

    print("\n✓ All tests completed!\n")


if __name__ == "__main__":
    main()