File size: 3,239 Bytes
4225666
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
"""
Test script to demonstrate the difference between default markdown splitter
and section-aware markdown splitter.
"""

from app.services.text_splitter import TextSplitter

# Sample markdown document fed to both splitters in test_splitters().
# Layout: a front-matter header (title, source_url) followed by four
# top-level "#" sections, each separated from the next by a "---" rule.
sample_markdown = """---
title: Information & Communication Technology Department
source_url: https://www.vgecg.ac.in/department.php?dept=15
---

# Information & Communication Technology Department Overview

Description:
The Information & Communication Technology Department at VGEC started in 2023.

- **Programme:** Information & Communication Technology
- **Year of starting:** 2023
- **Intake:** 60

---

# Vision of the Information & Communication Technology Department

Description:
The department aims to create an ecosystem that fosters the growth of socially responsible engineers.

---

# Mission of the Information & Communication Technology Department

Description:
- To establish cutting-edge laboratories.
- To inspire faculty to upgrade their expertise.
- To foster a culture of research.

---

# Faculty of the Information & Communication Technology Department

Description:
- **Dr. Arun B. Nandurbarkar:** Professor, Ph.D.
- **Prof. Manish P. Patel:** Associate Professor, M.E.
- **Prof. Sanjaykumar Dahyalal Joshi:** Associate Professor, M. E.
"""

def _print_chunk_report(chunks):
    """Print the chunk count, per-chunk sizes, and a preview of the first three.

    Args:
        chunks: The value returned by a splitter's ``split_text`` call
            (presumably a list of strings -- confirm against ``TextSplitter``).
    """
    print(f"Number of chunks: {len(chunks)}")
    print(f"Chunk sizes: {[len(chunk) for chunk in chunks]}")
    print("\nFirst 3 chunks preview:")
    for number, chunk in enumerate(chunks[:3], start=1):
        print(f"\n--- Chunk {number} ({len(chunk)} chars) ---")
        # Previews are capped at 200 characters; longer chunks get an ellipsis.
        print(chunk[:200] + "..." if len(chunk) > 200 else chunk)


def test_splitters():
    """Run both markdown splitters on ``sample_markdown`` and print a comparison.

    Splits the sample with ``TextSplitter.for_markdown`` and with
    ``TextSplitter.for_markdown_with_sections`` (both using chunk_size=500 and
    chunk_overlap=50), prints a report for each, then prints a summary of how
    the chunk counts differ. Returns nothing; all results go to stdout.
    """
    print("=" * 80)
    print("TESTING MARKDOWN SPLITTERS")
    print("=" * 80)

    # Test 1: Original markdown splitter (splits on headers)
    print("\n1️⃣ ORIGINAL for_markdown() SPLITTER:")
    print("-" * 80)
    original_splitter = TextSplitter.for_markdown(chunk_size=500, chunk_overlap=50)
    original_chunks = original_splitter.split_text(sample_markdown)
    _print_chunk_report(original_chunks)

    # Test 2: New section-aware splitter (respects --- boundaries)
    print("\n\n2️⃣ NEW for_markdown_with_sections() SPLITTER:")
    print("-" * 80)
    section_splitter = TextSplitter.for_markdown_with_sections(chunk_size=500, chunk_overlap=50)
    section_chunks = section_splitter.split_text(sample_markdown)
    _print_chunk_report(section_chunks)

    # Summary
    original_count = len(original_chunks)
    removed = original_count - len(section_chunks)
    print("\n" + "=" * 80)
    print("SUMMARY:")
    print("=" * 80)
    print(f"Original splitter: {original_count} chunks (may split headers separately)")
    print(f"Section-aware splitter: {len(section_chunks)} chunks (keeps sections together)")
    print(f"Reduction: {removed} fewer chunks")
    if original_count:
        print(f"Improvement: {(removed / original_count * 100):.1f}% reduction")
    else:
        # A percentage is undefined when the baseline produced no chunks;
        # report that instead of raising ZeroDivisionError.
        print("Improvement: n/a (original splitter produced no chunks)")

# Run the comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test_splitters()