File size: 2,405 Bytes
5374a2d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from typing import Dict, Any

from llama_index.core.embeddings import BaseEmbedding

from .base import BaseChunker, ChunkingStrategy
from .simple_chunker import SimpleChunker
from .semantic_chunker import SemanticChunker
from .hierachical_chunker import HierarchicalChunker
from evoagentx.core.logging import logger

# Public names exported by a star-import of this package.
# NOTE(review): ChunkingStrategy is imported above but is not listed here —
# confirm whether leaving it out of the public export list is intentional.
__all__ = ['SimpleChunker', 'SemanticChunker', 'HierarchicalChunker', 'ChunkFactory', 'BaseChunker']


class ChunkFactory:
    """Builds the chunker instance that matches a requested chunking strategy."""

    def create(
        self,
        strategy: ChunkingStrategy,
        embed_model: BaseEmbedding = None,
        chunker_config: Dict[str, Any] = None
    ) -> BaseChunker:
        """Instantiate the chunker selected by ``strategy``.

        Options are read from ``chunker_config``; any key that is absent falls
        back to the default shown below.

        * ``ChunkingStrategy.SIMPLE`` -> ``SimpleChunker`` using
          ``chunk_size`` (1024), ``chunk_overlap`` (20), ``max_workers`` (2).
        * ``ChunkingStrategy.SEMANTIC`` -> ``SemanticChunker`` using
          ``embed_model`` plus ``similarity_threshold`` (0.7) and
          ``max_workers`` (2).
        * ``ChunkingStrategy.HIERARCHICAL`` -> ``HierarchicalChunker`` using
          ``chunk_sizes`` ([2048, 512, 128]) and ``chunk_overlap`` (20).

        Args:
            strategy (ChunkingStrategy): Which kind of chunker to build.
            embed_model (BaseEmbedding, optional): Embedding model; mandatory
                for the semantic strategy and unused by the other branches.
            chunker_config (Dict[str, Any], optional): Per-chunker options.
                ``None`` or an empty mapping means "use every default".

        Returns:
            BaseChunker: The newly constructed chunker.

        Raises:
            ValueError: If the semantic strategy is requested without an
                embedding model, or if ``strategy`` is not one of the
                strategies handled here.
        """
        # A missing (or empty) config is treated as "no overrides".
        settings = chunker_config if chunker_config else {}

        if strategy == ChunkingStrategy.SIMPLE:
            simple_kwargs = {
                "chunk_size": settings.get("chunk_size", 1024),
                "chunk_overlap": settings.get("chunk_overlap", 20),
                "max_workers": settings.get("max_workers", 2),
            }
            built = SimpleChunker(**simple_kwargs)
        elif strategy == ChunkingStrategy.SEMANTIC:
            # Reject the request before reading any option: the semantic
            # chunker is always constructed with an embedding model.
            if not embed_model:
                raise ValueError("Embed model required for semantic chunking")
            threshold = settings.get("similarity_threshold", 0.7)
            workers = settings.get("max_workers", 2)
            built = SemanticChunker(
                embed_model=embed_model,
                similarity_threshold=threshold,
                max_workers=workers,
            )
        elif strategy == ChunkingStrategy.HIERARCHICAL:
            level_sizes = settings.get("chunk_sizes", [2048, 512, 128])
            overlap = settings.get("chunk_overlap", 20)
            built = HierarchicalChunker(
                chunk_sizes=level_sizes,
                chunk_overlap=overlap,
            )
        else:
            raise ValueError(f"Unsupported chunking strategy: {strategy}")

        logger.info(f"Created chunker for strategy: {strategy}")
        return built