File size: 3,036 Bytes
b868763
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# Integration-test driver for the rag_engine chunking pipeline.
import os
import logging
from rag_engine import process_file

# Configure root logging at INFO so the chunkers' internal log output is visible
# while the tests run.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("IntegrationTest")

def create_dummy_files():
    """Write the two throwaway fixture files the integration tests read.

    Produces ``test_doc.md`` (header-structured Markdown) and
    ``test_doc.txt`` (plain text with blank-line paragraph breaks) in the
    current working directory.
    """
    markdown_body = """# Navy RAG Test
    
## Section 1: Introduction
This is a test of the markdown splitting capability.
It should respect headers.

## Section 2: Technical Specs
The system must handle:
* Paragraphs
* Headers
* Metadata
"""
    text_body = """This is a standard text file.

It uses double line breaks to indicate paragraphs. 
The custom ParagraphChunker should detect this separation.

This is the third paragraph. It should be treated as a distinct chunk."""

    # Single write loop keeps the file-creation mechanics in one place.
    for filename, body in (("test_doc.md", markdown_body), ("test_doc.txt", text_body)):
        with open(filename, "w", encoding="utf-8") as handle:
            handle.write(body)

def cleanup_dummy_files():
    """Remove the temporary fixture files, tolerating ones already absent.

    Uses EAFP (`try`/`except FileNotFoundError`) instead of the racy
    check-then-remove pattern, and loops over the paths so adding a new
    fixture file is a one-line change.
    """
    for path in ("test_doc.md", "test_doc.txt"):
        try:
            os.remove(path)
        except FileNotFoundError:
            pass  # already gone — cleanup is idempotent

def run_tests():
    """Run the three integration checks against rag_engine.process_file.

    Creates the fixture files, exercises the Markdown handler plus the
    paragraph and token text-chunking strategies, prints a PASS/FAIL line
    for each, and always cleans the fixtures up afterwards.
    """
    print("\n--- STARTING INTEGRATION TEST ---\n")
    create_dummy_files()

    try:
        # TEST 1: Markdown handler must attach header metadata to its chunks.
        print(">> Testing Markdown Handler...")
        markdown_chunks = process_file("test_doc.md")
        header_metadata_present = bool(markdown_chunks) and 'Header 1' in markdown_chunks[0].metadata
        if header_metadata_present:
            print(f"✅ PASS: Markdown processed {len(markdown_chunks)} chunks with header metadata.")
        else:
            print(f"❌ FAIL: Markdown processing failed or missing metadata. Docs found: {len(markdown_chunks)}")

        # TEST 2: Paragraph strategy — fixture contains exactly 3 paragraphs.
        print("\n>> Testing Text Handler (Paragraph Strategy)...")
        paragraph_chunks = process_file("test_doc.txt", chunking_strategy="paragraph")
        if len(paragraph_chunks) == 3:
            print(f"✅ PASS: Paragraph strategy identified {len(paragraph_chunks)} distinct paragraphs.")
        else:
            print(f"⚠️ WARNING: Paragraph strategy found {len(paragraph_chunks)} chunks (Expected 3). Check min-length threshold settings in ParagraphChunker.")

        # TEST 3: Token strategy — any non-empty result counts as success.
        print("\n>> Testing Text Handler (Token Strategy)...")
        token_chunks = process_file("test_doc.txt", chunking_strategy="token", chunk_size=50)
        if token_chunks:
            print(f"✅ PASS: Token strategy successfully chunked text into {len(token_chunks)} segments.")
        else:
            print("❌ FAIL: Token strategy returned 0 chunks.")

    except ImportError as e:
        print(f"\n❌ CRITICAL ERROR: Import failed. Check folder structure.\nDetails: {e}")
    except Exception as e:
        # Top-level boundary: surface any failure without crashing cleanup.
        print(f"\n❌ CRITICAL ERROR: {e}")
    finally:
        cleanup_dummy_files()
        print("\n--- TEST COMPLETE ---")

# Script entry point: run the integration suite only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    run_tests()