Lung-Cancer-AI-Advisor / core /text_processors.py
moazx's picture
Initial commit
2a8faae
raw
history blame
570 Bytes
from langchain.text_splitter import (
RecursiveCharacterTextSplitter,
MarkdownHeaderTextSplitter
)
# Shared character-based splitter for plain text.
# Chunk length is measured with the built-in ``len`` (i.e. in characters):
# at most 2000 per chunk, with 200 carried over between neighbouring chunks.
recursive_splitter = RecursiveCharacterTextSplitter(
    # Listed from coarsest to finest boundary: paragraph break, line break,
    # sentence end, single space, and finally the empty string.
    separators=["\n\n", "\n", ". ", " ", ""],
    length_function=len,
    chunk_size=2000,
    chunk_overlap=200,
)
# Shared header-aware splitter for Markdown documents.
# Splits on level-2 and level-3 headings and records them under the
# metadata keys "Header 2" and "Header 3" respectively.
markdown_splitter = MarkdownHeaderTextSplitter(
    # Do not strip the heading lines out of the resulting chunk text.
    strip_headers=False,
    headers_to_split_on=[
        # Main guideline source (NCCN, ESMO, WHO…)
        ("##", "Header 2"),
        # Subsections (Features, Recommendations, Statistics…)
        ("###", "Header 3"),
    ],
)