""" Test script to validate the crawler module and system components. """ import sys from src.crawler import DocumentationCrawler def test_crawler(): """Test the crawler with a simple website.""" print("=" * 60) print("Testing Crawler Module") print("=" * 60) # Test with a simple, stable website test_urls = [ "https://example.com", "https://httpbin.org/html" ] for test_url in test_urls: print(f"\nTesting crawler on: {test_url}") try: crawler = DocumentationCrawler( base_url=test_url, max_depth=1, delay=0.5, max_pages=5 ) print(f" Max depth: 1") print(f" Max pages: 5") print(f" Request delay: 0.5s") documents = crawler.crawl() print(f"\n ✓ Successfully crawled!") print(f" - Documents: {len(documents)}") print(f" - Failed URLs: {len(crawler.failed_urls)}") if documents: print(f"\n Sample document:") doc = documents[0] print(f" URL: {doc['url']}") print(f" Content length: {len(doc['content'])} chars") print(f" Preview: {doc['content'][:200]}...") except Exception as e: print(f" ✗ Error: {e}") import traceback traceback.print_exc() def test_crawler_normalization(): """Test crawler URL normalization and filtering logic.""" crawler = DocumentationCrawler(base_url="https://example.com", respect_robots_txt=False) normalized = crawler._normalize_url("https://Example.com/path/?utm_source=test#section") assert normalized == "https://example.com/path" sorted_query = crawler._normalize_url("https://example.com/path/?b=2&a=1") assert sorted_query == "https://example.com/path?a=1&b=2" assert not crawler._should_crawl_url("javascript:alert(1)") assert not crawler._should_crawl_url("https://example.org/other") assert not crawler._should_crawl_url("https://example.com/image.jpg") assert crawler._should_crawl_url("https://example.com/docs") def test_imports(): """Test that all required modules can be imported.""" print("\n" + "=" * 60) print("Testing Imports") print("=" * 60) modules_to_test = [ ("langchain", "LangChain"), ("langchain_community", "LangChain Community"), ("langchain_ollama", "LangChain Ollama"), ("chromadb", "ChromaDB"), ("sentence_transformers", "Sentence Transformers"), ("gradio", "Gradio"), ("fastapi", "FastAPI"), ("uvicorn", "Uvicorn"), ("bs4", "BeautifulSoup4"), ("requests", "Requests"), ] all_ok = True for module_name, display_name in modules_to_test: try: __import__(module_name) print(f" ✓ {display_name}") except ImportError as e: print(f" ✗ {display_name}: {e}") all_ok = False return all_ok def test_app_imports(): """Test app module imports.""" print("\n" + "=" * 60) print("Testing App Module Imports") print("=" * 60) try: from src.app_enhanced import ( answer_question, index_crawler_results, load_documents_from_crawler ) print(" ✓ Enhanced app_enhanced.py imports") except Exception as e: print(f" ✗ Enhanced app_enhanced.py: {e}") try: from src.crawler import DocumentationCrawler print(" ✓ Crawler module imports") except Exception as e: print(f" ✗ Crawler: {e}") if __name__ == "__main__": print("\n🔍 RAG System Test Suite\n") # Test imports first if not test_imports(): print("\n⚠️ Some dependencies are missing. Run: pip install -r requiements.txt") test_app_imports() # Optionally test crawler (disabled by default as it makes network calls) if len(sys.argv) > 1 and sys.argv[1] == "--crawl": test_crawler() else: print("\n" + "=" * 60) print("Crawler Test (Skipped)") print("=" * 60) print("To test crawler connectivity, run: python test_system.py --crawl") print("\n✓ All tests completed!\n")