# vgecbot / test_markdown_splitter.py
# harsh-dev's picture
# docker deployment
# 4225666
"""
Test script to demonstrate the difference between default markdown splitter
and section-aware markdown splitter.
"""
from app.services.text_splitter import TextSplitter
# Sample markdown fixture fed to both splitters in test_splitters().
# Layout: a front-matter header (title, source_url) fenced by "---" lines,
# followed by four sections (Overview, Vision, Mission, Faculty). Each section
# starts with a level-1 "#" heading and sections are separated by "---" lines.
# Per the original note, this is modeled on the faculty data.
sample_markdown = """---
title: Information & Communication Technology Department
source_url: https://www.vgecg.ac.in/department.php?dept=15
---
# Information & Communication Technology Department Overview
Description:
The Information & Communication Technology Department at VGEC started in 2023.
- **Programme:** Information & Communication Technology
- **Year of starting:** 2023
- **Intake:** 60
---
# Vision of the Information & Communication Technology Department
Description:
The department aims to create an ecosystem that fosters the growth of socially responsible engineers.
---
# Mission of the Information & Communication Technology Department
Description:
- To establish cutting-edge laboratories.
- To inspire faculty to upgrade their expertise.
- To foster a culture of research.
---
# Faculty of the Information & Communication Technology Department
Description:
- **Dr. Arun B. Nandurbarkar:** Professor, Ph.D.
- **Prof. Manish P. Patel:** Associate Professor, M.E.
- **Prof. Sanjaykumar Dahyalal Joshi:** Associate Professor, M. E.
"""
def _print_chunk_report(chunks, preview_count=3, preview_chars=200):
    """Print the chunk count, the chunk sizes, and a preview of the first chunks.

    Args:
        chunks: Sliceable sequence of chunks as returned by ``split_text``
            (assumed to be a list of strings -- confirm against TextSplitter).
        preview_count: Number of leading chunks to preview.
        preview_chars: Maximum number of characters shown per chunk; longer
            chunks are cut at this length and suffixed with "...".
    """
    print(f"Number of chunks: {len(chunks)}")
    print(f"Chunk sizes: {[len(chunk) for chunk in chunks]}")
    print(f"\nFirst {preview_count} chunks preview:")
    for position, chunk in enumerate(chunks[:preview_count], start=1):
        print(f"\n--- Chunk {position} ({len(chunk)} chars) ---")
        if len(chunk) > preview_chars:
            print(chunk[:preview_chars] + "...")
        else:
            print(chunk)


def test_splitters():
    """Print a side-by-side comparison of the two markdown splitters.

    Splits the module-level ``sample_markdown`` with
    ``TextSplitter.for_markdown`` and with
    ``TextSplitter.for_markdown_with_sections`` (both built with
    ``chunk_size=500`` and ``chunk_overlap=50``), prints a report for each
    result, and finishes with a summary of the difference in chunk counts.

    The function only prints; it returns ``None`` and makes no assertions.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print(heavy_rule)
    print("TESTING MARKDOWN SPLITTERS")
    print(heavy_rule)

    # Test 1: Original markdown splitter (splits on headers)
    print("\n1️⃣ ORIGINAL for_markdown() SPLITTER:")
    print(light_rule)
    original_splitter = TextSplitter.for_markdown(chunk_size=500, chunk_overlap=50)
    original_chunks = original_splitter.split_text(sample_markdown)
    _print_chunk_report(original_chunks)

    # Test 2: New section-aware splitter (respects --- boundaries)
    print("\n\n2️⃣ NEW for_markdown_with_sections() SPLITTER:")
    print(light_rule)
    section_splitter = TextSplitter.for_markdown_with_sections(chunk_size=500, chunk_overlap=50)
    section_chunks = section_splitter.split_text(sample_markdown)
    _print_chunk_report(section_chunks)

    # Summary
    original_count = len(original_chunks)
    section_count = len(section_chunks)
    reduction = original_count - section_count

    print("\n" + heavy_rule)
    print("SUMMARY:")
    print(heavy_rule)
    print(f"Original splitter: {original_count} chunks (may split headers separately)")
    print(f"Section-aware splitter: {section_count} chunks (keeps sections together)")
    print(f"Reduction: {reduction} fewer chunks")
    if original_count:
        print(f"Improvement: {(reduction / original_count * 100):.1f}% reduction")
    else:
        # A percentage relative to zero chunks is undefined; report that
        # instead of crashing with ZeroDivisionError at the end of the run.
        print("Improvement: n/a (original splitter produced no chunks)")
# Run the comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test_splitters()