Spaces:

chintu4
/

documentation-crawler-rag

Sleeping

documentation-crawler-rag/tests/test_system.py

e3b17f1 about 1 month ago

4.43 kB

	"""
	Test script to validate the crawler module and system components.
	"""

	import sys
	from src.crawler import DocumentationCrawler


	def test_crawler():
	    """Crawl a couple of small public pages and print a summary of each run.

	    For every URL a ``DocumentationCrawler`` is built with a shallow depth, a
	    short request delay and a small page cap, then ``crawl()`` is called. The
	    number of returned documents, the number of failed URLs and a preview of
	    the first document (its ``url`` and ``content`` entries) are printed.

	    This is a best-effort connectivity check: any exception raised for one
	    URL is printed together with its traceback and the loop moves on to the
	    next URL, so the function itself never raises because of a failed crawl.
	    It performs real network requests.
	    """
	    import traceback

	    rule = "=" * 60
	    print(rule)
	    print("Testing Crawler Module")
	    print(rule)

	    # Small, stable websites keep the check quick and repeatable.
	    test_urls = [
	        "https://example.com",
	        "https://httpbin.org/html",
	    ]

	    # Single source of truth for the crawl settings: the same values are
	    # passed to the crawler and echoed in the report, so they cannot drift.
	    max_depth = 1
	    delay = 0.5
	    max_pages = 5

	    for test_url in test_urls:
	        print(f"\nTesting crawler on: {test_url}")
	        try:
	            crawler = DocumentationCrawler(
	                base_url=test_url,
	                max_depth=max_depth,
	                delay=delay,
	                max_pages=max_pages,
	            )

	            print(f" Max depth: {max_depth}")
	            print(f" Max pages: {max_pages}")
	            print(f" Request delay: {delay}s")

	            documents = crawler.crawl()

	            print("\n ✓ Successfully crawled!")
	            print(f" - Documents: {len(documents)}")
	            print(f" - Failed URLs: {len(crawler.failed_urls)}")

	            if documents:
	                print("\n Sample document:")
	                doc = documents[0]
	                print(f" URL: {doc['url']}")
	                print(f" Content length: {len(doc['content'])} chars")
	                print(f" Preview: {doc['content'][:200]}...")

	        except Exception as e:
	            # Deliberately broad: report the failure for this URL and keep
	            # going so the remaining URLs are still exercised.
	            print(f" ✗ Error: {e}")
	            traceback.print_exc()


	def test_crawler_normalization():
	    """Check the crawler's URL clean-up and its crawl/skip decisions.

	    Normalization is expected to lower-case the host, drop the trailing
	    slash, the ``utm_source`` parameter and the fragment, and to sort the
	    remaining query parameters. Crawl filtering is expected to reject
	    ``javascript:`` links, other hosts and image files, and to accept a
	    same-host documentation path.
	    """
	    subject = DocumentationCrawler(
	        base_url="https://example.com",
	        respect_robots_txt=False,
	    )

	    # (raw URL, expected normalized form)
	    normalization_cases = (
	        (
	            "https://Example.com/path/?utm_source=test#section",
	            "https://example.com/path",
	        ),
	        (
	            "https://example.com/path/?b=2&a=1",
	            "https://example.com/path?a=1&b=2",
	        ),
	    )
	    for raw_url, expected in normalization_cases:
	        assert subject._normalize_url(raw_url) == expected

	    # (candidate URL, whether the crawler should follow it)
	    crawl_decisions = (
	        ("javascript:alert(1)", False),
	        ("https://example.org/other", False),
	        ("https://example.com/image.jpg", False),
	        ("https://example.com/docs", True),
	    )
	    for candidate, should_follow in crawl_decisions:
	        verdict = subject._should_crawl_url(candidate)
	        assert bool(verdict) is should_follow


	def test_imports():
	    """Try to import every required third-party package and report the result.

	    Prints a banner followed by one line per package: a check mark when the
	    import succeeds, or a cross with the error when it fails. All packages
	    are attempted even after a failure.

	    Returns:
	        bool: ``True`` when every package imported, ``False`` otherwise.

	    NOTE(review): pytest flags test functions that return a non-None value
	    (a warning, or a failure on newer releases - confirm for the pytest
	    version in use). The return value is kept because the ``__main__`` block
	    of this script relies on it.
	    """
	    import importlib

	    print("\n" + "=" * 60)
	    print("Testing Imports")
	    print("=" * 60)

	    # (importable module name, human-readable name used in the report)
	    modules_to_test = [
	        ("langchain", "LangChain"),
	        ("langchain_community", "LangChain Community"),
	        ("langchain_ollama", "LangChain Ollama"),
	        ("chromadb", "ChromaDB"),
	        ("sentence_transformers", "Sentence Transformers"),
	        ("gradio", "Gradio"),
	        ("fastapi", "FastAPI"),
	        ("uvicorn", "Uvicorn"),
	        ("bs4", "BeautifulSoup4"),
	        ("requests", "Requests"),
	    ]

	    all_ok = True
	    for module_name, display_name in modules_to_test:
	        try:
	            importlib.import_module(module_name)
	            print(f" ✓ {display_name}")
	        except ImportError as e:
	            print(f" ✗ {display_name}: {e}")
	            all_ok = False
	        except Exception as e:
	            # A package can be installed yet still fail while importing
	            # (for example an environment check inside the package). Report
	            # it like a missing dependency instead of aborting the whole
	            # check, matching how test_app_imports handles failures.
	            print(f" ✗ {display_name}: {type(e).__name__}: {e}")
	            all_ok = False

	    return all_ok


	def test_app_imports():
	    """Report whether the project's own modules can be imported.

	    Two independent probes are run: the three entry points pulled from
	    ``src.app_enhanced`` and the crawler class from ``src.crawler``. Each
	    probe prints a check mark on success or a cross with the error on
	    failure; a failure in one probe does not prevent the other from running.
	    Nothing is returned and no exception is propagated.
	    """
	    rule = "=" * 60
	    print(f"\n{rule}\nTesting App Module Imports\n{rule}")

	    # Probe 1: the enhanced app. The imported names are unused on purpose;
	    # importing them is the check.
	    try:
	        from src.app_enhanced import (  # noqa: F401
	            answer_question,
	            index_crawler_results,
	            load_documents_from_crawler,
	        )
	        print(" ✓ Enhanced app_enhanced.py imports")
	    except Exception as app_error:
	        print(f" ✗ Enhanced app_enhanced.py: {app_error}")

	    # Probe 2: the crawler module.
	    try:
	        from src.crawler import DocumentationCrawler  # noqa: F401
	        print(" ✓ Crawler module imports")
	    except Exception as crawler_error:
	        print(f" ✗ Crawler: {crawler_error}")


	if __name__ == "__main__":
	    # Script entry point: run the dependency and app-import checks, and the
	    # network crawl only when explicitly requested on the command line.
	    print("\n🔍 RAG System Test Suite\n")

	    # Test imports first; a missing dependency is reported but does not stop
	    # the remaining checks.
	    if not test_imports():
	        print("\n⚠️ Some dependencies are missing. Run: pip install -r requirements.txt")

	    test_app_imports()

	    # Optionally test crawler (disabled by default as it makes network calls).
	    # Only the first command-line argument is inspected.
	    if sys.argv[1:2] == ["--crawl"]:
	        test_crawler()
	    else:
	        print("\n" + "=" * 60)
	        print("Crawler Test (Skipped)")
	        print("=" * 60)
	        print("To test crawler connectivity, run: python test_system.py --crawl")

	    print("\n✓ All tests completed!\n")