Spaces:
Sleeping
Sleeping
| import os | |
| import logging | |
| import shutil | |
| from rag_engine import process_file | |
# Configure logging: root level INFO so engine-side log messages surface
# during the test run; a dedicated named logger for this script's output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("IntegrationTest")
def create_dummy_files():
    """Create the temporary fixture files used by the integration tests.

    Writes two files into the current working directory:

    * ``test_doc.md``  -- a Markdown document with two ``##`` sections, used
      to verify that the Markdown handler attaches header metadata to chunks.
    * ``test_doc.txt`` -- three paragraphs separated by blank lines (double
      newlines), each long enough to survive the chunker's ~50-char
      minimum-length filter, used to verify the "paragraph" strategy.
    """
    # 1. Dummy Markdown file: the headers must be present so the Markdown
    #    splitter can attach them as chunk metadata.
    md_content = (
        "# Navy RAG Test\n"
        "\n"
        "## Section 1: Introduction\n"
        "This is a test of the markdown splitting capability.\n"
        "It should respect headers and keep them attached.\n"
        "\n"
        "## Section 2: Technical Specs\n"
        "The system must handle:\n"
        "* Paragraphs\n"
        "* Headers\n"
        "* Metadata\n"
    )
    with open("test_doc.md", "w", encoding="utf-8") as f:
        f.write(md_content)

    # 2. Dummy text file with LONG paragraphs to survive the 50-char filter.
    #    The paragraphs MUST be separated by blank lines -- the double-newline
    #    delimiter the paragraph strategy splits on. Without them the chunker
    #    sees a single block and the "expect 3 paragraphs" test cannot pass.
    txt_content = (
        "This is the first paragraph. It needs to be reasonably long to "
        "ensure the ParagraphChunker does not filter it out as noise. This "
        "should now be long enough to pass the threshold.\n"
        "\n"
        "This is the second paragraph. It is separated by double line "
        "breaks, which is the standard delimiter for the paragraph chunking "
        "strategy. This ensures semantic integrity is maintained.\n"
        "\n"
        "This is the third paragraph. By keeping these chunks distinct, we "
        "ensure that the RAG retrieval process grabs complete thoughts "
        "rather than fragmented sentences.\n"
    )
    with open("test_doc.txt", "w", encoding="utf-8") as f:
        f.write(txt_content)
def cleanup_dummy_files():
    """Delete the temporary fixture files, ignoring any that are absent."""
    for filename in ("test_doc.md", "test_doc.txt"):
        if os.path.exists(filename):
            os.remove(filename)
def run_tests():
    """Run the end-to-end integration tests against ``process_file``.

    Exercises three paths of the RAG engine:
      1. Markdown handler -- chunks should carry ``'Header 1'`` metadata.
      2. Text handler, "paragraph" strategy -- the fixture holds 3 paragraphs.
      3. Text handler, "token" strategy -- overlap (10) below chunk_size (50).

    Fixture files are created first and always removed afterwards, even if
    a test raises.
    """
    print("\n--- STARTING INTEGRATION TEST ---\n")
    create_dummy_files()
    try:
        # TEST 1: Markdown -- expect at least one chunk with header metadata.
        print(">> Testing Markdown Handler...")
        md_docs = process_file("test_doc.md")
        if md_docs and 'Header 1' in md_docs[0].metadata:
            print(f"[PASS] Markdown processed {len(md_docs)} chunks with header metadata.")
        else:
            print(f"[FAIL] Markdown processing issues. Docs found: {len(md_docs)}")

        # TEST 2: Paragraph strategy -- the fixture contains exactly 3 paragraphs.
        print("\n>> Testing Text Handler (Paragraph Strategy)...")
        p_docs = process_file("test_doc.txt", chunking_strategy="paragraph")
        if len(p_docs) == 3:
            print(f"[PASS] Paragraph strategy identified {len(p_docs)}/3 paragraphs.")
        else:
            print(f"[WARN] Found {len(p_docs)} chunks (Expected 3). (Check PARAGRAPH_MIN_LENGTH settings)")

        # TEST 3: Token strategy -- overlap of 10 kept below chunk_size of 50.
        print("\n>> Testing Text Handler (Token Strategy)...")
        t_docs = process_file("test_doc.txt", chunking_strategy="token", chunk_size=50, chunk_overlap=10)
        if t_docs:
            print(f"[PASS] Token strategy created {len(t_docs)} segments.")
        else:
            print("[FAIL] Token strategy returned 0 chunks.")
    except AttributeError as e:
        # NOTE(review): the original message only hinted at a missing
        # 'list_documents'; include the real exception so genuine
        # AttributeErrors raised inside process_file are not masked.
        print(f"\n[ERROR] Attribute missing ({e}). Did you add 'list_documents' back to rag_engine.py?")
    except Exception as e:
        print(f"\n[ERROR] CRITICAL: {e}")
    finally:
        cleanup_dummy_files()
        print("\n--- TEST COMPLETE ---")
# Entry point: run the integration suite only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    run_tests()