Spaces:
Sleeping
Sleeping
| import os | |
| import logging | |
| import shutil | |
| from rag_engine import process_file | |
# Configure logging: root level INFO so engine-side log messages surface
# during the test run; a dedicated named logger for this script's output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("IntegrationTest")
def create_dummy_files():
    """Create the temporary fixture files used by the integration tests.

    Writes two files into the current working directory:

    * ``test_doc.md``  -- a Markdown document with two ``##`` sections, used
      to verify that the Markdown handler attaches header metadata to chunks.
    * ``test_doc.txt`` -- three paragraphs separated by blank lines (double
      newlines), each long enough to survive the chunker's ~50-char
      minimum-length filter, used to verify the "paragraph" strategy.
    """
    # 1. Dummy Markdown file: the headers must be present so the Markdown
    #    splitter can attach them as chunk metadata.
    md_content = (
        "# Navy RAG Test\n"
        "\n"
        "## Section 1: Introduction\n"
        "This is a test of the markdown splitting capability.\n"
        "It should respect headers and keep them attached.\n"
        "\n"
        "## Section 2: Technical Specs\n"
        "The system must handle:\n"
        "* Paragraphs\n"
        "* Headers\n"
        "* Metadata\n"
    )
    with open("test_doc.md", "w", encoding="utf-8") as f:
        f.write(md_content)

    # 2. Dummy text file with LONG paragraphs to survive the 50-char filter.
    #    The paragraphs MUST be separated by blank lines -- the double-newline
    #    delimiter the paragraph strategy splits on. Without them the chunker
    #    sees a single block and the "expect 3 paragraphs" test cannot pass.
    txt_content = (
        "This is the first paragraph. It needs to be reasonably long to "
        "ensure the ParagraphChunker does not filter it out as noise. This "
        "should now be long enough to pass the threshold.\n"
        "\n"
        "This is the second paragraph. It is separated by double line "
        "breaks, which is the standard delimiter for the paragraph chunking "
        "strategy. This ensures semantic integrity is maintained.\n"
        "\n"
        "This is the third paragraph. By keeping these chunks distinct, we "
        "ensure that the RAG retrieval process grabs complete thoughts "
        "rather than fragmented sentences.\n"
    )
    with open("test_doc.txt", "w", encoding="utf-8") as f:
        f.write(txt_content)
def cleanup_dummy_files():
    """Delete the temporary fixture files, ignoring any that are absent."""
    for filename in ("test_doc.md", "test_doc.txt"):
        if os.path.exists(filename):
            os.remove(filename)
def run_tests():
    """Run the end-to-end integration tests against ``process_file``.

    Exercises three paths of the RAG engine:
      1. Markdown handler -- chunks should carry ``'Header 1'`` metadata.
      2. Text handler, "paragraph" strategy -- the fixture holds 3 paragraphs.
      3. Text handler, "token" strategy -- overlap (10) below chunk_size (50).

    Fixture files are created first and always removed afterwards, even if
    a test raises.
    """
    print("\n--- STARTING INTEGRATION TEST ---\n")
    create_dummy_files()
    try:
        # TEST 1: Markdown -- expect at least one chunk with header metadata.
        print(">> Testing Markdown Handler...")
        md_docs = process_file("test_doc.md")
        if md_docs and 'Header 1' in md_docs[0].metadata:
            print(f"[PASS] Markdown processed {len(md_docs)} chunks with header metadata.")
        else:
            print(f"[FAIL] Markdown processing issues. Docs found: {len(md_docs)}")

        # TEST 2: Paragraph strategy -- the fixture contains exactly 3 paragraphs.
        print("\n>> Testing Text Handler (Paragraph Strategy)...")
        p_docs = process_file("test_doc.txt", chunking_strategy="paragraph")
        if len(p_docs) == 3:
            print(f"[PASS] Paragraph strategy identified {len(p_docs)}/3 paragraphs.")
        else:
            print(f"[WARN] Found {len(p_docs)} chunks (Expected 3). (Check PARAGRAPH_MIN_LENGTH settings)")

        # TEST 3: Token strategy -- overlap of 10 kept below chunk_size of 50.
        print("\n>> Testing Text Handler (Token Strategy)...")
        t_docs = process_file("test_doc.txt", chunking_strategy="token", chunk_size=50, chunk_overlap=10)
        if t_docs:
            print(f"[PASS] Token strategy created {len(t_docs)} segments.")
        else:
            print("[FAIL] Token strategy returned 0 chunks.")
    except AttributeError as e:
        # NOTE(review): the original message only hinted at a missing
        # 'list_documents'; include the real exception so genuine
        # AttributeErrors raised inside process_file are not masked.
        print(f"\n[ERROR] Attribute missing ({e}). Did you add 'list_documents' back to rag_engine.py?")
    except Exception as e:
        print(f"\n[ERROR] CRITICAL: {e}")
    finally:
        cleanup_dummy_files()
        print("\n--- TEST COMPLETE ---")
# Entry point: run the integration suite only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    run_tests()