# NOTE(review): the lines "Spaces:" / "Running" here are page chrome from the
# hosted-UI export, not part of the script; kept only as this comment so the
# file remains valid Python.
import os
import logging

from rag_engine import process_file

# Root-level logging config at INFO so the internal output of the chunkers
# inside rag_engine is visible while the integration test runs.
logging.basicConfig(level=logging.INFO)
# Named logger for this test script (not referenced below in this file;
# presumably kept for ad-hoc debugging — confirm before removing).
logger = logging.getLogger("IntegrationTest")
def create_dummy_files():
    """Create the temporary Markdown and text documents the tests read.

    Writes ``test_doc.md`` (header-structured Markdown) and ``test_doc.txt``
    (three paragraphs separated by blank lines) into the current working
    directory. Remove them afterwards with ``cleanup_dummy_files()``.
    """
    # 1. Markdown sample: two "##" sections so the markdown splitter can
    # attach header metadata to each chunk.
    md_content = """# Navy RAG Test
## Section 1: Introduction
This is a test of the markdown splitting capability.
It should respect headers.
## Section 2: Technical Specs
The system must handle:
* Paragraphs
* Headers
* Metadata
"""
    with open("test_doc.md", "w", encoding="utf-8") as f:
        f.write(md_content)

    # 2. Plain-text sample. The paragraphs MUST be separated by blank lines
    # (double "\n"): the text itself says so, and the paragraph-strategy test
    # expects exactly 3 chunks — without the separators it cannot pass.
    txt_content = """This is a standard text file.
It uses double line breaks to indicate paragraphs.

The custom ParagraphChunker should detect this separation.

This is the third paragraph. It should be treated as a distinct chunk."""
    with open("test_doc.txt", "w", encoding="utf-8") as f:
        f.write(txt_content)
def cleanup_dummy_files():
    """Delete the temporary test documents if they are present."""
    for path in ("test_doc.md", "test_doc.txt"):
        if os.path.exists(path):
            os.remove(path)
def run_tests():
    """Run the three integration checks against ``rag_engine.process_file``.

    Covers markdown header splitting, paragraph-based text chunking, and
    token-based text chunking. Results are printed to stdout; the temporary
    files are always removed, even when a test raises.
    """
    print("\n--- STARTING INTEGRATION TEST ---\n")
    create_dummy_files()
    try:
        # TEST 1: Markdown processing should yield at least one chunk that
        # carries the originating header in its metadata.
        print(">> Testing Markdown Handler...")
        md_docs = process_file("test_doc.md")
        if len(md_docs) > 0 and 'Header 1' in md_docs[0].metadata:
            print(f"✅ PASS: Markdown processed {len(md_docs)} chunks with header metadata.")
        else:
            print(f"❌ FAIL: Markdown processing failed or missing metadata. Docs found: {len(md_docs)}")

        # TEST 2: Paragraph strategy — the fixture contains exactly 3
        # blank-line-separated paragraphs.
        print("\n>> Testing Text Handler (Paragraph Strategy)...")
        p_docs = process_file("test_doc.txt", chunking_strategy="paragraph")
        if len(p_docs) == 3:
            print(f"✅ PASS: Paragraph strategy identified {len(p_docs)} distinct paragraphs.")
        else:
            print(f"⚠️ WARNING: Paragraph strategy found {len(p_docs)} chunks (Expected 3). Check min-length threshold settings in ParagraphChunker.")

        # TEST 3: Token strategy with a small chunk size only needs to
        # produce at least one chunk.
        print("\n>> Testing Text Handler (Token Strategy)...")
        t_docs = process_file("test_doc.txt", chunking_strategy="token", chunk_size=50)
        if len(t_docs) > 0:
            print(f"✅ PASS: Token strategy successfully chunked text into {len(t_docs)} segments.")
        else:
            print("❌ FAIL: Token strategy returned 0 chunks.")
    except ImportError as e:
        print(f"\n❌ CRITICAL ERROR: Import failed. Check folder structure.\nDetails: {e}")
    except Exception as e:
        # Top-level boundary for the test run: report and fall through to
        # cleanup rather than crash with files left behind.
        print(f"\n❌ CRITICAL ERROR: {e}")
    finally:
        cleanup_dummy_files()
        print("\n--- TEST COMPLETE ---")
# Script entry point: run the integration suite when executed directly,
# but not when this module is imported.
if __name__ == "__main__":
    run_tests()