# Spaces: Sleeping | File size: 1,397 Bytes | ced61cd
from langchain_text_splitters import CharacterTextSplitter
class DataSplitting:
    """Configure a ``CharacterTextSplitter`` once and expose its split methods.

    The splitter is created in ``__init__`` from the supplied settings and kept
    on ``self.text_splitter``; ``split_text`` and ``split_documents`` forward
    their argument to it and return its result unchanged.
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        separator: str = "\n\n",
    ) -> None:
        """
        Initialize the DataSplitting class.

        Args:
            chunk_size (int): Maximum size of each chunk
            chunk_overlap (int): Number of characters to overlap between chunks
            separator (str): Character(s) to split on
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.separator = separator
        # The splitter is built exactly once, here. Reassigning the three
        # attributes above afterwards does not reconfigure it.
        self.text_splitter = CharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            separator=self.separator,
        )

    def split_text(self, text: str) -> list:
        """
        Split the input text into chunks.

        Args:
            text (str): The text to be split

        Returns:
            list: List of text chunks, exactly as returned by the underlying
            splitter's ``split_text``
        """
        return self.text_splitter.split_text(text)

    def split_documents(self, documents: list) -> list:
        """
        Split documents into chunks.

        Args:
            documents (list): List of documents to be split

        Returns:
            list: List of document chunks, exactly as returned by the
            underlying splitter's ``split_documents``
        """
        return self.text_splitter.split_documents(documents)