# NOTE(review): removed pasted code-viewer artifacts (file-size line, commit
# hashes, and a line-number gutter) that were not Python and broke parsing.
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain.schema import Document
import tiktoken
def split_text_by_markdown(
    input_md: str,
    max_tokens: int = 2048,
    model: str = "cl100k_base",
    chunk_size: int = 1000,
    chunk_overlap: int = 100,
) -> list:
    """Split markdown text into token-bounded ``Document`` chunks.

    The text is first split on markdown headers (H1-H3) so each chunk keeps
    its section headers in ``metadata``; any section whose token count
    exceeds ``max_tokens`` is then re-split into smaller character-based
    sub-chunks that inherit the same metadata.

    Args:
        input_md: Raw markdown text to split.
        max_tokens: Token limit above which a header section is re-split.
        model: tiktoken *encoding* name (e.g. ``"cl100k_base"``), not a model
            name — it is passed straight to ``tiktoken.get_encoding``.
        chunk_size: Character size of sub-chunks (~500 tokens, safe buffer
            below ``max_tokens``).
        chunk_overlap: Character overlap between sub-chunks, preserving
            context across the split boundary.

    Returns:
        A list of ``Document`` objects, each carrying the header metadata of
        the section it came from.
    """
    # Step 1: split by headers so every chunk inherits its section metadata.
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]
    md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    header_chunks = md_splitter.split_text(input_md)

    # Step 2: tokenizer used only to *measure* each section (OpenAI/Groq style).
    encoding = tiktoken.get_encoding(model)

    # Step 3: re-split oversized sections; small ones pass through unchanged.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    final_docs = []
    for chunk in header_chunks:
        if len(encoding.encode(chunk.page_content)) > max_tokens:
            texts = text_splitter.split_text(chunk.page_content)
        else:
            texts = [chunk.page_content]
        # Every sub-chunk keeps the header metadata of its parent section.
        final_docs.extend(
            Document(page_content=text, metadata=chunk.metadata) for text in texts
        )
    return final_docs