File size: 1,516 Bytes
dc8b7be
ee00031
dc8b7be
ee00031
dc8b7be
 
ee00031
 
 
 
 
dc8b7be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain.schema import Document
import tiktoken

def split_text_by_markdown(input_md: str, max_tokens: int = 2048, model: str = "cl100k_base") -> list:
    """Split Markdown text into header-scoped ``Document`` chunks within a token budget.

    The text is first split on ``#``, ``##`` and ``###`` headers. Any header
    section whose token count exceeds ``max_tokens`` is re-split into pieces of
    about 1000 characters (100 characters of overlap). A piece that is still
    over budget is split again with a token-measuring splitter, so the budget
    holds for small ``max_tokens`` values and token-dense text as well.

    Args:
        input_md: Markdown source text.
        max_tokens: Token budget per returned chunk. Values below 1 disable the
            token-based second pass (every non-empty section is then simply
            split by characters).
        model: Name passed to ``tiktoken.get_encoding``; despite the parameter
            name this is an *encoding* name (e.g. ``"cl100k_base"``), not a
            model name.

    Returns:
        A list of ``Document`` objects in document order. Each one carries its
        own copy of the metadata of the header section it came from.
    """
    # Step 1: split by headers.
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]
    md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    header_chunks = md_splitter.split_text(input_md)

    # Step 2: tokenizer used to measure chunks.
    encoding = tiktoken.get_encoding(model)

    def count_tokens(text: str) -> int:
        # disallowed_special=() makes literal special-token strings that appear
        # in the document (e.g. "<|endoftext|>") count as ordinary text instead
        # of raising ValueError.
        return len(encoding.encode(text, disallowed_special=()))

    # Step 3: splitters for sections that are too long.
    # First pass: character based, same sizes as before.
    char_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
    )
    # Second pass: measured in tokens, only for pieces the first pass left over
    # budget. Uses the same 10% overlap ratio as the first pass. Not built for
    # max_tokens < 1 because a non-positive chunk size is not meaningful.
    token_splitter = None
    if max_tokens > 0:
        token_splitter = RecursiveCharacterTextSplitter(
            chunk_size=max_tokens,
            chunk_overlap=max_tokens // 10,
            length_function=count_tokens,
        )

    final_docs = []
    for chunk in header_chunks:
        text = chunk.page_content

        if count_tokens(text) <= max_tokens:
            # Fits the budget: keep the section as is.
            pieces = [text]
        else:
            pieces = []
            for piece in char_splitter.split_text(text):
                if token_splitter is not None and count_tokens(piece) > max_tokens:
                    pieces.extend(token_splitter.split_text(piece))
                else:
                    pieces.append(piece)

        # Copy the metadata so documents from the same section do not share
        # (and accidentally co-mutate) one dict.
        final_docs.extend(
            Document(page_content=piece, metadata=dict(chunk.metadata))
            for piece in pieces
        )

    return final_docs