Spaces:
Runtime error
Runtime error
| # agent/chunker.py | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from typing import List, Dict | |
| class TextChunker: | |
| def __init__(self, chunk_size: int = 2048, chunk_overlap: int = 512): | |
| """ | |
| Initialize text chunker with recursive splitting strategy. | |
| """ | |
| self.splitter = RecursiveCharacterTextSplitter( | |
| chunk_size=chunk_size, | |
| chunk_overlap=chunk_overlap, | |
| separators=["\n\n", "\n", ".", "!", "?", " ", ""] | |
| ) | |
| def chunk_text(self, content: str, metadata: Dict[str, str]) -> List[Dict[str, str]]: | |
| """ | |
| Split the text into clean chunks and attach metadata for vector storage. | |
| Args: | |
| content (str): Raw web page text. | |
| metadata (Dict[str, str]): Metadata like URL, title, etc. | |
| Returns: | |
| List[Dict[str, str]]: List of chunks with content + metadata | |
| """ | |
| try: | |
| chunks = self.splitter.split_text(content) | |
| return [{"content": chunk, "metadata": metadata} for chunk in chunks] | |
| except Exception as e: | |
| import logging | |
| logging.error(f"[Chunker] Failed to split content: {e}") | |
| return [] | |