Spaces:
Sleeping
Sleeping
| """ | |
| Test script to demonstrate the difference between default markdown splitter | |
| and section-aware markdown splitter. | |
| """ | |
| from app.services.text_splitter import TextSplitter | |
# Fixture document: YAML-style front matter followed by four "# " headed
# sections, each separated from the next by a "---" rule line. Both splitters
# in this script are run against this same text.
sample_markdown = """---
title: Information & Communication Technology Department
source_url: https://www.vgecg.ac.in/department.php?dept=15
---
# Information & Communication Technology Department Overview
Description:
The Information & Communication Technology Department at VGEC started in 2023.
- **Programme:** Information & Communication Technology
- **Year of starting:** 2023
- **Intake:** 60
---
# Vision of the Information & Communication Technology Department
Description:
The department aims to create an ecosystem that fosters the growth of socially responsible engineers.
---
# Mission of the Information & Communication Technology Department
Description:
- To establish cutting-edge laboratories.
- To inspire faculty to upgrade their expertise.
- To foster a culture of research.
---
# Faculty of the Information & Communication Technology Department
Description:
- **Dr. Arun B. Nandurbarkar:** Professor, Ph.D.
- **Prof. Manish P. Patel:** Associate Professor, M.E.
- **Prof. Sanjaykumar Dahyalal Joshi:** Associate Professor, M. E.
"""
def _print_chunk_report(chunks, preview_count=3, preview_chars=200):
    """Print the chunk count, the chunk sizes and a short preview of the first chunks.

    Args:
        chunks: Sequence of text chunks returned by a splitter's ``split_text``.
        preview_count: How many leading chunks to preview.
        preview_chars: Maximum characters shown per chunk before truncating
            with ``"..."``.
    """
    print(f"Number of chunks: {len(chunks)}")
    print(f"Chunk sizes: {[len(chunk) for chunk in chunks]}")
    print(f"\nFirst {preview_count} chunks preview:")
    for number, chunk in enumerate(chunks[:preview_count], start=1):
        print(f"\n--- Chunk {number} ({len(chunk)} chars) ---")
        # Parenthesized so it is obvious the "..." suffix belongs only to the
        # truncated branch.
        print((chunk[:preview_chars] + "...") if len(chunk) > preview_chars else chunk)


def test_splitters():
    """Run both markdown splitters over ``sample_markdown`` and print a comparison.

    For each splitter this prints the number of chunks, the size of every
    chunk and a preview of the first three chunks, then a summary comparing
    the two chunk counts. Nothing is returned and nothing is asserted; the
    output is meant to be read by a person.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print(heavy_rule)
    print("TESTING MARKDOWN SPLITTERS")
    print(heavy_rule)

    # Test 1: the original markdown splitter (described by the script's author
    # as splitting on headers).
    print("\n1️⃣ ORIGINAL for_markdown() SPLITTER:")
    print(light_rule)
    original_splitter = TextSplitter.for_markdown(chunk_size=500, chunk_overlap=50)
    original_chunks = original_splitter.split_text(sample_markdown)
    _print_chunk_report(original_chunks)

    # Test 2: the section-aware splitter (described by the script's author as
    # respecting the "---" section boundaries).
    print("\n\n2️⃣ NEW for_markdown_with_sections() SPLITTER:")
    print(light_rule)
    section_splitter = TextSplitter.for_markdown_with_sections(chunk_size=500, chunk_overlap=50)
    section_chunks = section_splitter.split_text(sample_markdown)
    _print_chunk_report(section_chunks)

    # Summary
    reduction = len(original_chunks) - len(section_chunks)
    print("\n" + heavy_rule)
    print("SUMMARY:")
    print(heavy_rule)
    print(f"Original splitter: {len(original_chunks)} chunks (may split headers separately)")
    print(f"Section-aware splitter: {len(section_chunks)} chunks (keeps sections together)")
    print(f"Reduction: {reduction} fewer chunks")
    if original_chunks:
        print(f"Improvement: {(reduction / len(original_chunks) * 100):.1f}% reduction")
    else:
        # A percentage is undefined without a baseline; report that instead of
        # crashing with ZeroDivisionError.
        print("Improvement: n/a (original splitter produced no chunks)")
# Script entry point: run the splitter comparison only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    test_splitters()