File size: 570 Bytes
2a8faae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    MarkdownHeaderTextSplitter
)

# Module-level character-based splitter shared by importers of this module.
# Lengths are computed with the built-in len(), so chunk_size and
# chunk_overlap are expressed in characters, not tokens.
recursive_splitter = RecursiveCharacterTextSplitter(
    # Separators listed from coarsest to finest: paragraph break, line break,
    # sentence end, single space, and finally the empty string.
    separators=["\n\n", "\n", ". ", " ", ""],
    length_function=len,
    chunk_size=2000,
    chunk_overlap=200,
)


# Module-level Markdown splitter configured for level-2 and level-3 headings.
markdown_splitter = MarkdownHeaderTextSplitter(
    # NOTE(review): presumably keeps the header lines inside the chunk text
    # instead of removing them — confirm against the LangChain docs.
    strip_headers=False,
    headers_to_split_on=[
        # Each entry pairs a Markdown heading prefix with the name given to
        # the splitter for that heading level.
        ("##", "Header 2"),  # Main guideline source (NCCN, ESMO, WHO…)
        ("###", "Header 3"),  # Subsections (Features, Recommendations, Statistics…)
    ],
)