# NOTE(review): the lines "Spaces:" / "Running" here are page chrome from the
# hosted-UI export, not part of the script; kept only as this comment so the
# file remains valid Python.
import os
import logging

from rag_engine import process_file

# Root-level logging config at INFO so the internal output of the chunkers
# inside rag_engine is visible while the integration test runs.
logging.basicConfig(level=logging.INFO)
# Named logger for this test script (not referenced below in this file;
# presumably kept for ad-hoc debugging — confirm before removing).
logger = logging.getLogger("IntegrationTest")
def create_dummy_files():
    """Create the temporary Markdown and text documents the tests read.

    Writes ``test_doc.md`` (header-structured Markdown) and ``test_doc.txt``
    (three paragraphs separated by blank lines) into the current working
    directory. Remove them afterwards with ``cleanup_dummy_files()``.
    """
    # 1. Markdown sample: two "##" sections so the markdown splitter can
    # attach header metadata to each chunk.
    md_content = """# Navy RAG Test
## Section 1: Introduction
This is a test of the markdown splitting capability.
It should respect headers.
## Section 2: Technical Specs
The system must handle:
* Paragraphs
* Headers
* Metadata
"""
    with open("test_doc.md", "w", encoding="utf-8") as f:
        f.write(md_content)

    # 2. Plain-text sample. The paragraphs MUST be separated by blank lines
    # (double "\n"): the text itself says so, and the paragraph-strategy test
    # expects exactly 3 chunks — without the separators it cannot pass.
    txt_content = """This is a standard text file.
It uses double line breaks to indicate paragraphs.

The custom ParagraphChunker should detect this separation.

This is the third paragraph. It should be treated as a distinct chunk."""
    with open("test_doc.txt", "w", encoding="utf-8") as f:
        f.write(txt_content)
def cleanup_dummy_files():
    """Delete the temporary test documents if they are present."""
    for path in ("test_doc.md", "test_doc.txt"):
        if os.path.exists(path):
            os.remove(path)
def run_tests():
    """Run the three integration checks against ``rag_engine.process_file``.

    Covers markdown header splitting, paragraph-based text chunking, and
    token-based text chunking. Results are printed to stdout; the temporary
    files are always removed, even when a test raises.
    """
    print("\n--- STARTING INTEGRATION TEST ---\n")
    create_dummy_files()
    try:
        # TEST 1: Markdown processing should yield at least one chunk that
        # carries the originating header in its metadata.
        print(">> Testing Markdown Handler...")
        md_docs = process_file("test_doc.md")
        if len(md_docs) > 0 and 'Header 1' in md_docs[0].metadata:
            print(f"✅ PASS: Markdown processed {len(md_docs)} chunks with header metadata.")
        else:
            print(f"❌ FAIL: Markdown processing failed or missing metadata. Docs found: {len(md_docs)}")

        # TEST 2: Paragraph strategy — the fixture contains exactly 3
        # blank-line-separated paragraphs.
        print("\n>> Testing Text Handler (Paragraph Strategy)...")
        p_docs = process_file("test_doc.txt", chunking_strategy="paragraph")
        if len(p_docs) == 3:
            print(f"✅ PASS: Paragraph strategy identified {len(p_docs)} distinct paragraphs.")
        else:
            print(f"⚠️ WARNING: Paragraph strategy found {len(p_docs)} chunks (Expected 3). Check min-length threshold settings in ParagraphChunker.")

        # TEST 3: Token strategy with a small chunk size only needs to
        # produce at least one chunk.
        print("\n>> Testing Text Handler (Token Strategy)...")
        t_docs = process_file("test_doc.txt", chunking_strategy="token", chunk_size=50)
        if len(t_docs) > 0:
            print(f"✅ PASS: Token strategy successfully chunked text into {len(t_docs)} segments.")
        else:
            print("❌ FAIL: Token strategy returned 0 chunks.")
    except ImportError as e:
        print(f"\n❌ CRITICAL ERROR: Import failed. Check folder structure.\nDetails: {e}")
    except Exception as e:
        # Top-level boundary for the test run: report and fall through to
        # cleanup rather than crash with files left behind.
        print(f"\n❌ CRITICAL ERROR: {e}")
    finally:
        cleanup_dummy_files()
        print("\n--- TEST COMPLETE ---")
# Script entry point: run the integration suite when executed directly,
# but not when this module is imported.
if __name__ == "__main__":
    run_tests()