NavyDevilDoc committed on
Commit
b868763
·
verified ·
1 Parent(s): 6695d4a

Create test_integration.py

Browse files
Files changed (1) hide show
  1. src/test_integration.py +84 -0
src/test_integration.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ from rag_engine import process_file
4
+
5
+ # Configure logger to see the internal output of your chunkers
6
+ logging.basicConfig(level=logging.INFO)
7
+ logger = logging.getLogger("IntegrationTest")
8
+
9
+ def create_dummy_files():
10
+ """Creates temporary test files for validation."""
11
+
12
+ # 1. Create a dummy Markdown file
13
+ md_content = """# Navy RAG Test
14
+
15
+ ## Section 1: Introduction
16
+ This is a test of the markdown splitting capability.
17
+ It should respect headers.
18
+
19
+ ## Section 2: Technical Specs
20
+ The system must handle:
21
+ * Paragraphs
22
+ * Headers
23
+ * Metadata
24
+ """
25
+ with open("test_doc.md", "w", encoding="utf-8") as f:
26
+ f.write(md_content)
27
+
28
+ # 2. Create a dummy Text file
29
+ txt_content = """This is a standard text file.
30
+
31
+ It uses double line breaks to indicate paragraphs.
32
+ The custom ParagraphChunker should detect this separation.
33
+
34
+ This is the third paragraph. It should be treated as a distinct chunk."""
35
+ with open("test_doc.txt", "w", encoding="utf-8") as f:
36
+ f.write(txt_content)
37
+
38
+ def cleanup_dummy_files():
39
+ """Removes temporary files."""
40
+ if os.path.exists("test_doc.md"):
41
+ os.remove("test_doc.md")
42
+ if os.path.exists("test_doc.txt"):
43
+ os.remove("test_doc.txt")
44
+
45
+ def run_tests():
46
+ print("\n--- STARTING INTEGRATION TEST ---\n")
47
+ create_dummy_files()
48
+
49
+ try:
50
+ # TEST 1: Markdown Processing
51
+ print(">> Testing Markdown Handler...")
52
+ md_docs = process_file("test_doc.md")
53
+ if len(md_docs) > 0 and 'Header 1' in md_docs[0].metadata:
54
+ print(f"✅ PASS: Markdown processed {len(md_docs)} chunks with header metadata.")
55
+ else:
56
+ print(f"❌ FAIL: Markdown processing failed or missing metadata. Docs found: {len(md_docs)}")
57
+
58
+ # TEST 2: Text Processing (Paragraph Strategy)
59
+ print("\n>> Testing Text Handler (Paragraph Strategy)...")
60
+ # We expect 3 paragraphs based on the input above
61
+ p_docs = process_file("test_doc.txt", chunking_strategy="paragraph")
62
+ if len(p_docs) == 3:
63
+ print(f"✅ PASS: Paragraph strategy identified {len(p_docs)} distinct paragraphs.")
64
+ else:
65
+ print(f"⚠️ WARNING: Paragraph strategy found {len(p_docs)} chunks (Expected 3). Check min-length threshold settings in ParagraphChunker.")
66
+
67
+ # TEST 3: Text Processing (Token Strategy)
68
+ print("\n>> Testing Text Handler (Token Strategy)...")
69
+ t_docs = process_file("test_doc.txt", chunking_strategy="token", chunk_size=50)
70
+ if len(t_docs) > 0:
71
+ print(f"✅ PASS: Token strategy successfully chunked text into {len(t_docs)} segments.")
72
+ else:
73
+ print("❌ FAIL: Token strategy returned 0 chunks.")
74
+
75
+ except ImportError as e:
76
+ print(f"\n❌ CRITICAL ERROR: Import failed. Check folder structure.\nDetails: {e}")
77
+ except Exception as e:
78
+ print(f"\n❌ CRITICAL ERROR: {e}")
79
+ finally:
80
+ cleanup_dummy_files()
81
+ print("\n--- TEST COMPLETE ---")
82
+
83
+ if __name__ == "__main__":
84
+ run_tests()