NavyDevilDoc committed on
Commit
6087545
·
verified ·
1 Parent(s): 99043ee

Update src/test_integration.py

Browse files
Files changed (1) hide show
  1. src/test_integration.py +25 -24
src/test_integration.py CHANGED
@@ -1,20 +1,21 @@
1
  import os
2
  import logging
 
3
  from rag_engine import process_file
4
 
5
- # Configure logger to see the internal output of your chunkers
6
  logging.basicConfig(level=logging.INFO)
7
  logger = logging.getLogger("IntegrationTest")
8
 
9
  def create_dummy_files():
10
- """Creates temporary test files for validation."""
11
 
12
  # 1. Create a dummy Markdown file
13
  md_content = """# Navy RAG Test
14
 
15
  ## Section 1: Introduction
16
  This is a test of the markdown splitting capability.
17
- It should respect headers.
18
 
19
  ## Section 2: Technical Specs
20
  The system must handle:
@@ -25,55 +26,55 @@ The system must handle:
25
  with open("test_doc.md", "w", encoding="utf-8") as f:
26
  f.write(md_content)
27
 
28
- # 2. Create a dummy Text file
29
- txt_content = """This is a standard text file.
30
 
31
- It uses double line breaks to indicate paragraphs.
32
- The custom ParagraphChunker should detect this separation.
33
 
34
- This is the third paragraph. It should be treated as a distinct chunk."""
 
35
  with open("test_doc.txt", "w", encoding="utf-8") as f:
36
  f.write(txt_content)
37
 
38
  def cleanup_dummy_files():
39
- """Removes temporary files."""
40
- if os.path.exists("test_doc.md"):
41
- os.remove("test_doc.md")
42
- if os.path.exists("test_doc.txt"):
43
- os.remove("test_doc.txt")
44
 
45
  def run_tests():
46
  print("\n--- STARTING INTEGRATION TEST ---\n")
47
  create_dummy_files()
48
 
49
  try:
50
- # TEST 1: Markdown Processing
51
  print(">> Testing Markdown Handler...")
52
  md_docs = process_file("test_doc.md")
53
  if len(md_docs) > 0 and 'Header 1' in md_docs[0].metadata:
54
  print(f"βœ… PASS: Markdown processed {len(md_docs)} chunks with header metadata.")
55
  else:
56
- print(f"❌ FAIL: Markdown processing failed or missing metadata. Docs found: {len(md_docs)}")
57
 
58
- # TEST 2: Text Processing (Paragraph Strategy)
59
  print("\n>> Testing Text Handler (Paragraph Strategy)...")
60
- # We expect 3 paragraphs based on the input above
61
  p_docs = process_file("test_doc.txt", chunking_strategy="paragraph")
 
 
62
  if len(p_docs) == 3:
63
- print(f"βœ… PASS: Paragraph strategy identified {len(p_docs)} distinct paragraphs.")
64
  else:
65
- print(f"⚠️ WARNING: Paragraph strategy found {len(p_docs)} chunks (Expected 3). Check min-length threshold settings in ParagraphChunker.")
66
 
67
- # TEST 3: Text Processing (Token Strategy)
68
  print("\n>> Testing Text Handler (Token Strategy)...")
69
- t_docs = process_file("test_doc.txt", chunking_strategy="token", chunk_size=50)
 
 
70
  if len(t_docs) > 0:
71
- print(f"βœ… PASS: Token strategy successfully chunked text into {len(t_docs)} segments.")
72
  else:
73
  print("❌ FAIL: Token strategy returned 0 chunks.")
74
 
75
- except ImportError as e:
76
- print(f"\n❌ CRITICAL ERROR: Import failed. Check folder structure.\nDetails: {e}")
77
  except Exception as e:
78
  print(f"\n❌ CRITICAL ERROR: {e}")
79
  finally:
 
1
  import os
2
  import logging
3
+ import shutil
4
  from rag_engine import process_file
5
 
6
+ # Configure logger
7
  logging.basicConfig(level=logging.INFO)
8
  logger = logging.getLogger("IntegrationTest")
9
 
10
  def create_dummy_files():
11
+ """Creates temporary test files."""
12
 
13
  # 1. Create a dummy Markdown file
14
  md_content = """# Navy RAG Test
15
 
16
  ## Section 1: Introduction
17
  This is a test of the markdown splitting capability.
18
+ It should respect headers and keep them attached.
19
 
20
  ## Section 2: Technical Specs
21
  The system must handle:
 
26
  with open("test_doc.md", "w", encoding="utf-8") as f:
27
  f.write(md_content)
28
 
29
+ # 2. Create a dummy Text file with LONG paragraphs to survive the 50-char filter
30
+ txt_content = """This is the first paragraph. It needs to be reasonably long to ensure the ParagraphChunker does not filter it out as noise. This should now be long enough to pass the threshold.
31
 
32
+ This is the second paragraph. It is separated by double line breaks, which is the standard delimiter for the paragraph chunking strategy. This ensures semantic integrity is maintained.
 
33
 
34
+ This is the third paragraph. By keeping these chunks distinct, we ensure that the RAG retrieval process grabs complete thoughts rather than fragmented sentences.
35
+ """
36
  with open("test_doc.txt", "w", encoding="utf-8") as f:
37
  f.write(txt_content)
38
 
39
  def cleanup_dummy_files():
40
+ if os.path.exists("test_doc.md"): os.remove("test_doc.md")
41
+ if os.path.exists("test_doc.txt"): os.remove("test_doc.txt")
 
 
 
42
 
43
  def run_tests():
44
  print("\n--- STARTING INTEGRATION TEST ---\n")
45
  create_dummy_files()
46
 
47
  try:
48
+ # TEST 1: Markdown
49
  print(">> Testing Markdown Handler...")
50
  md_docs = process_file("test_doc.md")
51
  if len(md_docs) > 0 and 'Header 1' in md_docs[0].metadata:
52
  print(f"βœ… PASS: Markdown processed {len(md_docs)} chunks with header metadata.")
53
  else:
54
+ print(f"❌ FAIL: Markdown processing issues. Docs found: {len(md_docs)}")
55
 
56
+ # TEST 2: Paragraph Strategy
57
  print("\n>> Testing Text Handler (Paragraph Strategy)...")
 
58
  p_docs = process_file("test_doc.txt", chunking_strategy="paragraph")
59
+
60
+ # We expect 3 paragraphs.
61
  if len(p_docs) == 3:
62
+ print(f"βœ… PASS: Paragraph strategy identified {len(p_docs)}/3 paragraphs.")
63
  else:
64
+ print(f"⚠️ WARNING: Found {len(p_docs)} chunks (Expected 3). (Check PARAGRAPH_MIN_LENGTH settings)")
65
 
66
+ # TEST 3: Token Strategy (Fixed parameters)
67
  print("\n>> Testing Text Handler (Token Strategy)...")
68
+ # FIXED: Set overlap to 10 so it is less than chunk_size 50
69
+ t_docs = process_file("test_doc.txt", chunking_strategy="token", chunk_size=50, chunk_overlap=10)
70
+
71
  if len(t_docs) > 0:
72
+ print(f"βœ… PASS: Token strategy created {len(t_docs)} segments.")
73
  else:
74
  print("❌ FAIL: Token strategy returned 0 chunks.")
75
 
76
+ except AttributeError:
77
+ print("\n❌ ERROR: Function missing. Did you add 'list_documents' back to rag_engine.py?")
78
  except Exception as e:
79
  print(f"\n❌ CRITICAL ERROR: {e}")
80
  finally: