File size: 1,257 Bytes
f085180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# agent/chunker.py

from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List, Dict

class TextChunker:
    """Split raw text into chunks and pair every chunk with its metadata.

    The actual splitting is delegated to a ``RecursiveCharacterTextSplitter``
    configured with a fixed separator priority list: paragraph break, line
    break, sentence punctuation (``.``, ``!``, ``?``), space, and finally
    the empty string.
    """

    def __init__(self, chunk_size: int = 2048, chunk_overlap: int = 512):
        """Initialize text chunker with recursive splitting strategy.

        Args:
            chunk_size (int): Forwarded to the splitter as ``chunk_size``.
            chunk_overlap (int): Forwarded to the splitter as
                ``chunk_overlap``.
        """
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", ".", "!", "?", " ", ""],
        )

    def chunk_text(self, content: str, metadata: Dict[str, str]) -> List[Dict[str, str]]:
        """Split the text into chunks and attach metadata to each one.

        Args:
            content (str): Raw text to split. Empty or ``None`` content
                yields an empty list without invoking the splitter.
            metadata (Dict[str, str]): Metadata like URL, title, etc.
                ``None`` is treated as an empty mapping.

        Returns:
            List of records shaped ``{"content": <chunk>, "metadata": <dict>}``.
            Every record owns an independent shallow copy of ``metadata``,
            so mutating one record's metadata affects neither the other
            records nor the caller's dict. An empty list is returned when
            there is nothing to split or when splitting fails.
        """
        # Nothing to split: skip the splitter instead of relying on how it
        # treats empty input.
        if not content:
            return []

        try:
            pieces = self.splitter.split_text(content)
            # Copy the metadata per chunk; sharing one dict object across all
            # records would make an edit to one chunk leak into every other.
            return [
                {"content": piece, "metadata": dict(metadata or {})}
                for piece in pieces
            ]
        except Exception as e:
            # Deliberate best-effort: a failing document is logged and
            # reported as "no chunks" rather than aborting the caller.
            import logging
            logging.error(f"[Chunker] Failed to split content: {e}")
            return []