File size: 535 Bytes
c5e1945
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    MarkdownHeaderTextSplitter
)

recursive_500 = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    separators=[
        "\n\n",  # Paragraph breaks
        "\n",    # Line breaks
        ".",     # Sentences
        ",",     # Clauses
        " ",     # Words
]
)


markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[
        ("#", "company_title"),
        ("##", "section"),
    ]
)