NavyDevilDoc committed on
Commit
6087545
·
verified ·
1 Parent(s): 99043ee

Update src/test_integration.py

Browse files
Files changed (1) hide show
  1. src/test_integration.py +25 -24
src/test_integration.py CHANGED
@@ -1,20 +1,21 @@
1
  import os
2
  import logging
 
3
  from rag_engine import process_file
4
 
5
- # Configure logger to see the internal output of your chunkers
6
  logging.basicConfig(level=logging.INFO)
7
  logger = logging.getLogger("IntegrationTest")
8
 
9
  def create_dummy_files():
10
- """Creates temporary test files for validation."""
11
 
12
  # 1. Create a dummy Markdown file
13
  md_content = """# Navy RAG Test
14
 
15
  ## Section 1: Introduction
16
  This is a test of the markdown splitting capability.
17
- It should respect headers.
18
 
19
  ## Section 2: Technical Specs
20
  The system must handle:
@@ -25,55 +26,55 @@ The system must handle:
25
  with open("test_doc.md", "w", encoding="utf-8") as f:
26
  f.write(md_content)
27
 
28
- # 2. Create a dummy Text file
29
- txt_content = """This is a standard text file.
30
 
31
- It uses double line breaks to indicate paragraphs.
32
- The custom ParagraphChunker should detect this separation.
33
 
34
- This is the third paragraph. It should be treated as a distinct chunk."""
 
35
  with open("test_doc.txt", "w", encoding="utf-8") as f:
36
  f.write(txt_content)
37
 
38
  def cleanup_dummy_files():
39
- """Removes temporary files."""
40
- if os.path.exists("test_doc.md"):
41
- os.remove("test_doc.md")
42
- if os.path.exists("test_doc.txt"):
43
- os.remove("test_doc.txt")
44
 
45
  def run_tests():
46
  print("\n--- STARTING INTEGRATION TEST ---\n")
47
  create_dummy_files()
48
 
49
  try:
50
- # TEST 1: Markdown Processing
51
  print(">> Testing Markdown Handler...")
52
  md_docs = process_file("test_doc.md")
53
  if len(md_docs) > 0 and 'Header 1' in md_docs[0].metadata:
54
  print(f"βœ… PASS: Markdown processed {len(md_docs)} chunks with header metadata.")
55
  else:
56
- print(f"❌ FAIL: Markdown processing failed or missing metadata. Docs found: {len(md_docs)}")
57
 
58
- # TEST 2: Text Processing (Paragraph Strategy)
59
  print("\n>> Testing Text Handler (Paragraph Strategy)...")
60
- # We expect 3 paragraphs based on the input above
61
  p_docs = process_file("test_doc.txt", chunking_strategy="paragraph")
 
 
62
  if len(p_docs) == 3:
63
- print(f"βœ… PASS: Paragraph strategy identified {len(p_docs)} distinct paragraphs.")
64
  else:
65
- print(f"⚠️ WARNING: Paragraph strategy found {len(p_docs)} chunks (Expected 3). Check min-length threshold settings in ParagraphChunker.")
66
 
67
- # TEST 3: Text Processing (Token Strategy)
68
  print("\n>> Testing Text Handler (Token Strategy)...")
69
- t_docs = process_file("test_doc.txt", chunking_strategy="token", chunk_size=50)
 
 
70
  if len(t_docs) > 0:
71
- print(f"βœ… PASS: Token strategy successfully chunked text into {len(t_docs)} segments.")
72
  else:
73
  print("❌ FAIL: Token strategy returned 0 chunks.")
74
 
75
- except ImportError as e:
76
- print(f"\n❌ CRITICAL ERROR: Import failed. Check folder structure.\nDetails: {e}")
77
  except Exception as e:
78
  print(f"\n❌ CRITICAL ERROR: {e}")
79
  finally:
 
1
  import os
2
  import logging
3
+ import shutil
4
  from rag_engine import process_file
5
 
6
+ # Configure logger
7
  logging.basicConfig(level=logging.INFO)
8
  logger = logging.getLogger("IntegrationTest")
9
 
10
  def create_dummy_files():
11
+ """Creates temporary test files."""
12
 
13
  # 1. Create a dummy Markdown file
14
  md_content = """# Navy RAG Test
15
 
16
  ## Section 1: Introduction
17
  This is a test of the markdown splitting capability.
18
+ It should respect headers and keep them attached.
19
 
20
  ## Section 2: Technical Specs
21
  The system must handle:
 
26
  with open("test_doc.md", "w", encoding="utf-8") as f:
27
  f.write(md_content)
28
 
29
+ # 2. Create a dummy Text file with LONG paragraphs to survive the 50-char filter
30
+ txt_content = """This is the first paragraph. It needs to be reasonably long to ensure the ParagraphChunker does not filter it out as noise. This should now be long enough to pass the threshold.
31
 
32
+ This is the second paragraph. It is separated by double line breaks, which is the standard delimiter for the paragraph chunking strategy. This ensures semantic integrity is maintained.
 
33
 
34
+ This is the third paragraph. By keeping these chunks distinct, we ensure that the RAG retrieval process grabs complete thoughts rather than fragmented sentences.
35
+ """
36
  with open("test_doc.txt", "w", encoding="utf-8") as f:
37
  f.write(txt_content)
38
 
39
  def cleanup_dummy_files():
40
+ if os.path.exists("test_doc.md"): os.remove("test_doc.md")
41
+ if os.path.exists("test_doc.txt"): os.remove("test_doc.txt")
 
 
 
42
 
43
  def run_tests():
44
  print("\n--- STARTING INTEGRATION TEST ---\n")
45
  create_dummy_files()
46
 
47
  try:
48
+ # TEST 1: Markdown
49
  print(">> Testing Markdown Handler...")
50
  md_docs = process_file("test_doc.md")
51
  if len(md_docs) > 0 and 'Header 1' in md_docs[0].metadata:
52
  print(f"βœ… PASS: Markdown processed {len(md_docs)} chunks with header metadata.")
53
  else:
54
+ print(f"❌ FAIL: Markdown processing issues. Docs found: {len(md_docs)}")
55
 
56
+ # TEST 2: Paragraph Strategy
57
  print("\n>> Testing Text Handler (Paragraph Strategy)...")
 
58
  p_docs = process_file("test_doc.txt", chunking_strategy="paragraph")
59
+
60
+ # We expect 3 paragraphs.
61
  if len(p_docs) == 3:
62
+ print(f"βœ… PASS: Paragraph strategy identified {len(p_docs)}/3 paragraphs.")
63
  else:
64
+ print(f"⚠️ WARNING: Found {len(p_docs)} chunks (Expected 3). (Check PARAGRAPH_MIN_LENGTH settings)")
65
 
66
+ # TEST 3: Token Strategy (Fixed parameters)
67
  print("\n>> Testing Text Handler (Token Strategy)...")
68
+ # FIXED: Set overlap to 10 so it is less than chunk_size 50
69
+ t_docs = process_file("test_doc.txt", chunking_strategy="token", chunk_size=50, chunk_overlap=10)
70
+
71
  if len(t_docs) > 0:
72
+ print(f"βœ… PASS: Token strategy created {len(t_docs)} segments.")
73
  else:
74
  print("❌ FAIL: Token strategy returned 0 chunks.")
75
 
76
+ except AttributeError:
77
+ print("\n❌ ERROR: Function missing. Did you add 'list_documents' back to rag_engine.py?")
78
  except Exception as e:
79
  print(f"\n❌ CRITICAL ERROR: {e}")
80
  finally: