documentation-crawler-rag / tests /test_system.py
chintu4's picture
ok
e3b17f1
Raw
History Blame Contribute Delete
4.43 kB
"""
Test script to validate the crawler module and system components.
"""
import sys
from src.crawler import DocumentationCrawler
def test_crawler():
    """Smoke-test ``DocumentationCrawler`` against two small public pages.

    For each URL a crawler is built with conservative limits, ``crawl()`` is
    run, and a short summary is printed: number of documents, number of
    failed URLs, and a preview of the first document (its ``url`` and the
    first 200 characters of its ``content``).

    Any exception raised while building or running a crawler is reported
    with a traceback and the loop continues with the next URL, so a single
    unreachable site does not abort the whole run.

    This performs real network requests; the script entry point only calls
    it when ``--crawl`` is given on the command line.
    """
    import traceback

    # Single source of truth for the limits, so the values printed below
    # cannot drift from what is actually passed to the crawler.
    max_depth = 1
    max_pages = 5
    delay = 0.5

    print("=" * 60)
    print("Testing Crawler Module")
    print("=" * 60)

    # Test with simple, stable websites
    test_urls = [
        "https://example.com",
        "https://httpbin.org/html",
    ]

    for test_url in test_urls:
        print(f"\nTesting crawler on: {test_url}")
        try:
            crawler = DocumentationCrawler(
                base_url=test_url,
                max_depth=max_depth,
                delay=delay,
                max_pages=max_pages,
            )
            print(f" Max depth: {max_depth}")
            print(f" Max pages: {max_pages}")
            print(f" Request delay: {delay}s")

            documents = crawler.crawl()

            print("\n ✓ Successfully crawled!")
            print(f" - Documents: {len(documents)}")
            print(f" - Failed URLs: {len(crawler.failed_urls)}")

            if documents:
                print("\n Sample document:")
                doc = documents[0]
                print(f" URL: {doc['url']}")
                print(f" Content length: {len(doc['content'])} chars")
                print(f" Preview: {doc['content'][:200]}...")
        except Exception as e:
            # Deliberately broad: this is a best-effort connectivity probe,
            # so report the failure in full and move on to the next URL.
            print(f" ✗ Error: {e}")
            traceback.print_exc()
def test_crawler_normalization():
    """Verify URL canonicalisation and the crawl-eligibility filter.

    The assertions below expect that:

    * ``_normalize_url`` lower-cases the host, drops the trailing slash, the
      ``utm_source`` parameter and the fragment, and sorts the remaining
      query parameters;
    * ``_should_crawl_url`` rejects a ``javascript:`` link, a URL on another
      host and a ``.jpg`` URL, and accepts a same-host ``/docs`` path.
    """
    subject = DocumentationCrawler(
        base_url="https://example.com",
        respect_robots_txt=False,
    )

    # (raw input, expected canonical form)
    expected_normal_forms = (
        (
            "https://Example.com/path/?utm_source=test#section",
            "https://example.com/path",
        ),
        (
            "https://example.com/path/?b=2&a=1",
            "https://example.com/path?a=1&b=2",
        ),
    )
    for raw_url, canonical in expected_normal_forms:
        assert subject._normalize_url(raw_url) == canonical

    # URLs the crawler is expected to refuse.
    rejected_urls = (
        "javascript:alert(1)",
        "https://example.org/other",
        "https://example.com/image.jpg",
    )
    for candidate in rejected_urls:
        assert not subject._should_crawl_url(candidate)

    assert subject._should_crawl_url("https://example.com/docs")
def test_imports():
    """Check that every third-party dependency of the system is importable.

    Each module in the list is imported by name and one status line is
    printed for it: ``✓ <name>`` on success, ``✗ <name>: <error>`` when the
    import raises ``ImportError``.

    Returns:
        bool: ``True`` if every listed module imported, ``False`` if at
        least one of them raised ``ImportError``.

    NOTE(review): pytest flags test functions that return a value; the
    return is kept because the script entry point branches on it.
    """
    from importlib import import_module

    print("\n" + "=" * 60)
    print("Testing Imports")
    print("=" * 60)

    # (importable module name, human-readable name shown in the report)
    modules_to_test = [
        ("langchain", "LangChain"),
        ("langchain_community", "LangChain Community"),
        ("langchain_ollama", "LangChain Ollama"),
        ("chromadb", "ChromaDB"),
        ("sentence_transformers", "Sentence Transformers"),
        ("gradio", "Gradio"),
        ("fastapi", "FastAPI"),
        ("uvicorn", "Uvicorn"),
        ("bs4", "BeautifulSoup4"),
        ("requests", "Requests"),
    ]

    all_ok = True
    for module_name, display_name in modules_to_test:
        try:
            import_module(module_name)
        except ImportError as e:
            print(f" ✗ {display_name}: {e}")
            all_ok = False
        else:
            print(f" ✓ {display_name}")
    return all_ok
def test_app_imports():
    """Check that the project's own modules import cleanly.

    Tries to import three entry points from ``src.app_enhanced`` and the
    ``DocumentationCrawler`` class from ``src.crawler``, printing a ``✓``
    line for each import that succeeds and a ``✗`` line with the error
    message for each one that fails. Nothing is returned and no exception
    is propagated.
    """
    print("\n" + "=" * 60)
    print("Testing App Module Imports")
    print("=" * 60)

    # The handlers stay broad (``Exception``) as in the original; presumably
    # importing the app can fail in ways other than ImportError.
    try:
        # Imported only to prove the names exist; they are not used here.
        from src.app_enhanced import (  # noqa: F401
            answer_question,
            index_crawler_results,
            load_documents_from_crawler,
        )
    except Exception as e:
        print(f" ✗ Enhanced app_enhanced.py: {e}")
    else:
        print(" ✓ Enhanced app_enhanced.py imports")

    try:
        from src.crawler import DocumentationCrawler  # noqa: F401
    except Exception as e:
        print(f" ✗ Crawler: {e}")
    else:
        print(" ✓ Crawler module imports")
if __name__ == "__main__":
    # Script entry point: run the dependency and app-import checks, and the
    # network-dependent crawler check only when explicitly requested.
    print("\n🔍 RAG System Test Suite\n")

    # Test imports first, and point the user at the fix if any are missing.
    if not test_imports():
        print("\n⚠️ Some dependencies are missing. Run: pip install -r requirements.txt")

    test_app_imports()

    # The crawler test makes real network calls, so it is opt-in: it runs
    # only when the first command-line argument is exactly "--crawl".
    if len(sys.argv) > 1 and sys.argv[1] == "--crawl":
        test_crawler()
    else:
        print("\n" + "=" * 60)
        print("Crawler Test (Skipped)")
        print("=" * 60)
        print("To test crawler connectivity, run: python test_system.py --crawl")

    print("\n✓ All tests completed!\n")