File size: 2,075 Bytes
e68d535
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from typing import List, Optional

from langchain_text_splitters import RecursiveCharacterTextSplitter
from pydantic import BaseModel, model_validator

class RecursiveCharacterTextChunkerConfig(BaseModel):
    """Configuration for RecursiveCharacterTextChunker.

    Both sizes are measured in characters (the chunker uses ``len`` as its
    length function).
    """

    # Maximum number of characters per chunk.
    chunk_size: int = 500
    # Number of characters shared between consecutive chunks.
    chunk_overlap: int = 100

    @model_validator(mode="after")
    def _check_overlap(self) -> "RecursiveCharacterTextChunkerConfig":
        # Langchain's TextSplitter raises ValueError for overlap > size only
        # at split time; fail fast here at config construction instead.
        # Uses the same `>` comparison as langchain so no previously-valid
        # config is rejected.
        if self.chunk_overlap > self.chunk_size:
            raise ValueError(
                f"chunk_overlap ({self.chunk_overlap}) must not exceed "
                f"chunk_size ({self.chunk_size})"
            )
        return self


class RecursiveCharacterTextChunker:
    """Splits text into overlapping character chunks using Langchain's
    RecursiveCharacterTextSplitter, sized according to the supplied config.
    """

    # Markdown-friendly split points, tried from coarsest to finest.
    DEFAULT_SEPARATORS = ["\n\n", "\n", " ", ""]

    def __init__(self, config: RecursiveCharacterTextChunkerConfig):
        """
        Args:
            config: Chunk size/overlap settings, in characters.
        """
        self.config = config
        # Building a splitter per call is wasteful for bulk use such as
        # df.apply(); cache one for the default separators and reuse it.
        self._default_splitter = self._build_splitter(self.DEFAULT_SEPARATORS)

    def _build_splitter(self, separators: List[str]) -> RecursiveCharacterTextSplitter:
        """Create a splitter configured with this chunker's size/overlap."""
        return RecursiveCharacterTextSplitter(
            chunk_size=self.config.chunk_size,
            chunk_overlap=self.config.chunk_overlap,
            separators=separators,
            length_function=len,  # measure chunks in characters
            is_separator_regex=False,
        )

    def chunk_text(self, text: str, separators: Optional[List[str]] = None) -> List[str]:
        """
        Chunks a single text string using Langchain's RecursiveCharacterTextSplitter.

        This function is designed to be easily used with pandas DataFrame.apply().

        Args:
            text (str): The input text string to be chunked.
            separators (Optional[List[str]]): A list of characters/strings to use
                as split points. Falls back to DEFAULT_SEPARATORS when None or
                empty.

        Returns:
            List[str]: A list of chunked text strings.
                       If the input text is empty or None, returns an empty list.
        """
        if not text:
            return []

        # Reuse the cached default splitter unless the caller supplied custom
        # separators. A falsy value (None or []) falls back to the defaults,
        # preserving the original `separators or [...]` behavior.
        splitter = self._build_splitter(separators) if separators else self._default_splitter

        return splitter.split_text(text)