Spaces:
Sleeping
Sleeping
File size: 3,298 Bytes
b868763 6087545 b868763 6087545 b868763 6087545 b868763 6087545 b868763 6087545 b868763 6087545 b868763 6087545 b868763 6087545 b868763 6087545 b868763 6087545 b868763 6087545 b868763 6087545 b868763 6087545 b868763 6087545 b868763 6087545 b868763 6087545 b868763 6087545 b868763 6087545 b868763 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 | import os
import logging
import shutil
from rag_engine import process_file
# Configure logger: INFO-level root config plus a named logger for this test module.
logging.basicConfig(level=logging.INFO)
# NOTE(review): `logger` is never used in this file — all output goes through
# print(). Presumably kept for future use; confirm before removing.
logger = logging.getLogger("IntegrationTest")
def create_dummy_files():
    """Create the temporary fixture files used by the integration tests.

    Writes two files into the current working directory:

    * ``test_doc.md``  — header-structured Markdown, so the Markdown handler
      can attach header metadata to each chunk.
    * ``test_doc.txt`` — three paragraphs, each well over 50 characters so the
      ParagraphChunker's minimum-length filter does not drop them.
    """
    # 1. Dummy Markdown file: headers must survive splitting with metadata attached.
    md_content = (
        "# Navy RAG Test\n"
        "\n"
        "## Section 1: Introduction\n"
        "This is a test of the markdown splitting capability.\n"
        "It should respect headers and keep them attached.\n"
        "\n"
        "## Section 2: Technical Specs\n"
        "The system must handle:\n"
        "* Paragraphs\n"
        "* Headers\n"
        "* Metadata\n"
    )
    with open("test_doc.md", "w", encoding="utf-8") as f:
        f.write(md_content)

    # 2. Dummy text file with LONG paragraphs to survive the 50-char filter.
    # FIX: paragraphs are joined with an explicit "\n\n" — the double line
    # break is the delimiter the paragraph chunking strategy splits on, so the
    # paragraph test can find exactly 3 chunks.
    paragraphs = [
        "This is the first paragraph. It needs to be reasonably long to ensure the ParagraphChunker does not filter it out as noise. This should now be long enough to pass the threshold.",
        "This is the second paragraph. It is separated by double line breaks, which is the standard delimiter for the paragraph chunking strategy. This ensures semantic integrity is maintained.",
        "This is the third paragraph. By keeping these chunks distinct, we ensure that the RAG retrieval process grabs complete thoughts rather than fragmented sentences.",
    ]
    txt_content = "\n\n".join(paragraphs) + "\n"
    with open("test_doc.txt", "w", encoding="utf-8") as f:
        f.write(txt_content)
def cleanup_dummy_files():
    """Delete the temporary fixture files, ignoring any that were never created."""
    for filename in ("test_doc.md", "test_doc.txt"):
        if os.path.exists(filename):
            os.remove(filename)
def run_tests():
    """Run the three integration checks against ``rag_engine.process_file``.

    Exercises the Markdown handler, the paragraph chunking strategy, and the
    token chunking strategy, printing a PASS/FAIL line for each. Fixture files
    are always removed, even when a check raises.
    """
    print("\n--- STARTING INTEGRATION TEST ---\n")
    create_dummy_files()
    try:
        # TEST 1: Markdown handler must attach header metadata to its chunks.
        print(">> Testing Markdown Handler...")
        markdown_chunks = process_file("test_doc.md")
        md_count = len(markdown_chunks)
        if md_count > 0 and 'Header 1' in markdown_chunks[0].metadata:
            print(f"✅ PASS: Markdown processed {md_count} chunks with header metadata.")
        else:
            print(f"❌ FAIL: Markdown processing issues. Docs found: {md_count}")

        # TEST 2: Paragraph strategy should yield one chunk per paragraph.
        print("\n>> Testing Text Handler (Paragraph Strategy)...")
        paragraph_chunks = process_file("test_doc.txt", chunking_strategy="paragraph")
        p_count = len(paragraph_chunks)
        # We expect 3 paragraphs.
        if p_count == 3:
            print(f"✅ PASS: Paragraph strategy identified {p_count}/3 paragraphs.")
        else:
            print(f"⚠️ WARNING: Found {p_count} chunks (Expected 3). (Check PARAGRAPH_MIN_LENGTH settings)")

        # TEST 3: Token strategy — overlap (10) deliberately kept below chunk_size (50).
        print("\n>> Testing Text Handler (Token Strategy)...")
        token_chunks = process_file(
            "test_doc.txt", chunking_strategy="token", chunk_size=50, chunk_overlap=10
        )
        if len(token_chunks) == 0:
            print("❌ FAIL: Token strategy returned 0 chunks.")
        else:
            print(f"✅ PASS: Token strategy created {len(token_chunks)} segments.")
    except AttributeError:
        print("\n❌ ERROR: Function missing. Did you add 'list_documents' back to rag_engine.py?")
    except Exception as e:
        print(f"\n❌ CRITICAL ERROR: {e}")
    finally:
        cleanup_dummy_files()
        print("\n--- TEST COMPLETE ---")
# Script entry point: run the integration suite when executed directly.
if __name__ == "__main__":
    run_tests()