""" Test script to demonstrate the difference between default markdown splitter and section-aware markdown splitter. """ from app.services.text_splitter import TextSplitter # Sample markdown content with sections (like your faculty data) sample_markdown = """--- title: Information & Communication Technology Department source_url: https://www.vgecg.ac.in/department.php?dept=15 --- # Information & Communication Technology Department Overview Description: The Information & Communication Technology Department at VGEC started in 2023. - **Programme:** Information & Communication Technology - **Year of starting:** 2023 - **Intake:** 60 --- # Vision of the Information & Communication Technology Department Description: The department aims to create an ecosystem that fosters the growth of socially responsible engineers. --- # Mission of the Information & Communication Technology Department Description: - To establish cutting-edge laboratories. - To inspire faculty to upgrade their expertise. - To foster a culture of research. --- # Faculty of the Information & Communication Technology Department Description: - **Dr. Arun B. Nandurbarkar:** Professor, Ph.D. - **Prof. Manish P. Patel:** Associate Professor, M.E. - **Prof. Sanjaykumar Dahyalal Joshi:** Associate Professor, M. E. """ def test_splitters(): print("=" * 80) print("TESTING MARKDOWN SPLITTERS") print("=" * 80) # Test 1: Original markdown splitter (splits on headers) print("\n1️⃣ ORIGINAL for_markdown() SPLITTER:") print("-" * 80) original_splitter = TextSplitter.for_markdown(chunk_size=500, chunk_overlap=50) original_chunks = original_splitter.split_text(sample_markdown) print(f"Number of chunks: {len(original_chunks)}") print(f"Chunk sizes: {[len(chunk) for chunk in original_chunks]}") print("\nFirst 3 chunks preview:") for i, chunk in enumerate(original_chunks[:3]): print(f"\n--- Chunk {i+1} ({len(chunk)} chars) ---") print(chunk[:200] + "..." if len(chunk) > 200 else chunk) # Test 2: New section-aware splitter (respects --- boundaries) print("\n\n2️⃣ NEW for_markdown_with_sections() SPLITTER:") print("-" * 80) section_splitter = TextSplitter.for_markdown_with_sections(chunk_size=500, chunk_overlap=50) section_chunks = section_splitter.split_text(sample_markdown) print(f"Number of chunks: {len(section_chunks)}") print(f"Chunk sizes: {[len(chunk) for chunk in section_chunks]}") print("\nFirst 3 chunks preview:") for i, chunk in enumerate(section_chunks[:3]): print(f"\n--- Chunk {i+1} ({len(chunk)} chars) ---") print(chunk[:200] + "..." if len(chunk) > 200 else chunk) # Summary print("\n" + "=" * 80) print("SUMMARY:") print("=" * 80) print(f"Original splitter: {len(original_chunks)} chunks (may split headers separately)") print(f"Section-aware splitter: {len(section_chunks)} chunks (keeps sections together)") print(f"Reduction: {len(original_chunks) - len(section_chunks)} fewer chunks") print(f"Improvement: {((len(original_chunks) - len(section_chunks)) / len(original_chunks) * 100):.1f}% reduction") if __name__ == "__main__": test_splitters()