File size: 4,824 Bytes
5374a2d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from typing import Optional, Callable, Dict, Any, List

from pydantic import Field

from evoagentx.core.base_config import BaseConfig, BaseModule
from .indexings.base import IndexType
from .chunkers.base import ChunkingStrategy
from .embeddings.base import EmbeddingProvider
from .retrievers.base import RetrieverType
from .postprocessors.base import RerankerType


class ReaderConfig(BaseModule):
    """Options that control how source documents are read from disk."""

    # Directory traversal and file selection.
    recursive: bool = Field(
        default=False,
        description="Whether to recursively read directories.",
    )
    exclude_hidden: bool = Field(
        default=True,
        description="Exclude hidden files and directories.",
    )
    num_files_limit: Optional[int] = Field(
        default=None,
        description="Maximum number of files to read.",
    )

    # Optional hooks; both default to None (no hook configured).
    custom_metadata_function: Optional[Callable] = Field(
        default=None,
        description="Custom function to extract metadata from files.",
    )
    extern_file_extractor: Optional[Dict[str, Any]] = Field(
        default=None,
        description="External file extractors for specific file types.",
    )

    # Text decoding. `errors` is a free-form string: the description lists
    # 'ignore' and 'strict', but no validation of the value is declared here.
    errors: str = Field(
        default="ignore",
        description="Error handling strategy ('ignore', 'strict').",
    )
    encoding: str = Field(
        default="utf-8",
        description="File encoding for reading.",
    )


class ChunkerConfig(BaseModule):
    """Options that control how documents are split into chunks."""

    # Annotated as a plain str although the default is a ChunkingStrategy member.
    strategy: str = Field(
        default=ChunkingStrategy.SIMPLE,
        description="Chunking strategy (SIMPLE, SEMANTIC, HIERARCHICAL).",
    )

    # Size and overlap are both expressed in characters (per the descriptions).
    chunk_size: int = Field(
        default=1024,
        description="Maximum size of each chunk in characters.",
    )
    chunk_overlap: int = Field(
        default=20,
        description="Overlap between chunks in characters.",
    )

    # None means no per-document cap is configured.
    max_chunks: Optional[int] = Field(
        default=None,
        description="Maximum number of chunks per document.",
    )


class EmbeddingConfig(BaseModule):
    """Options that select and parameterize the embedding model."""

    # --- Provider and model selection -------------------------------------
    # Annotated as a plain str although the default is an EmbeddingProvider member.
    provider: str = Field(
        default=EmbeddingProvider.OPENAI,
        description="Embedding provider (OPENAI, HUGGINGFACE, AZURE_OPENAI).",
    )
    # NOTE(review): pydantic v2 reserves the `model_` prefix by default and may
    # emit a warning for this field unless the base class relaxes
    # `protected_namespaces` -- confirm against BaseModule.
    model_name: str = Field(
        default="text-embedding-ada-002",
        description="Name of the embedding model.",
    )

    # --- Remote API access -------------------------------------------------
    api_key: Optional[str] = Field(
        default=None,
        description="API key for the embedding provider (if required).",
    )
    api_url: str = Field(
        default="https://api.openai.com/v1",
        description="api url for embedding model.",
    )

    # --- Model behaviour ---------------------------------------------------
    dimensions: Optional[int] = Field(
        default=None,
        description="Dimensions of the embedding model.",
    )
    # The description ties this flag to the HuggingFace provider.
    normalize: Optional[bool] = Field(
        default=True,
        description="Whether to normalize the embedding model(huggingface).",
    )
    device: Optional[str] = Field(
        default=None,
        description="The device was used for embedding model.",
    )

    # --- Azure OpenAI specific settings ------------------------------------
    deployment_name: Optional[str] = Field(
        default=None,
        description="Deployment name for Azure OpenAI embeddings.",
    )
    azure_endpoint: Optional[str] = Field(
        default=None,
        description="Azure OpenAI endpoint.",
    )
    api_version: Optional[str] = Field(
        default=None,
        description="Azure OpenAI API version.",
    )

class IndexConfig(BaseModule):
    """Options that select which kind of index is built."""

    # Annotated as a plain str although the default is an IndexType member.
    index_type: str = Field(
        default=IndexType.VECTOR,
        description="Index type (VECTOR, GRAPH, SUMMARY, TREE).",
    )


class RetrievalConfig(BaseModule):
    """Options for the retrieval stage (pre-retrieve, retrieve, post-retrieve)."""

    # A pre-processing selector is not configurable yet; the original
    # placeholder was: prepross_type: str = Field(default="")

    # NOTE: the field name is spelled `retrivel_type` (sic). It is part of the
    # public interface of this config, so the spelling is kept unchanged.
    retrivel_type: str = Field(
        default=RetrieverType.VECTOR,
        description="The type of retriver for retrieve.",
    )
    postprocessor_type: str = Field(
        default=RerankerType.SIMPLE,
        description="The type of postprocessor for retrieve.",
    )

    # Result-set size and score threshold.
    top_k: int = Field(
        default=5,
        description="Number of top results to retrieve.",
    )
    similarity_cutoff: Optional[float] = Field(
        default=0.7,
        description="Minimum similarity score for retrieved chunks.",
    )

    # Optional filters; None means no filter is configured.
    keyword_filters: Optional[List[str]] = Field(
        default=None,
        description="Keywords to filter retrieved chunks.",
    )
    metadata_filters: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Metadata filters for retrieval.",
    )


class RAGConfig(BaseConfig):
    """Top-level RAG pipeline configuration grouping the per-stage configs."""

    # Pipeline-wide settings.
    modality: str = Field(
        default="text",
        description="RAG modality: 'text' for text-only, 'multimodal' for images.",
    )
    num_workers: Optional[int] = Field(
        default=None,
        description="Number of workers for parallel processing (e.g., reading, retrieval).",
    )

    # Per-stage settings. Each default is produced by `default_factory`, i.e.
    # by calling the stage's config class with no arguments.
    reader: ReaderConfig = Field(
        default_factory=ReaderConfig,
        description="Configuration for document reading.",
    )
    chunker: ChunkerConfig = Field(
        default_factory=ChunkerConfig,
        description="Configuration for document chunking.",
    )
    embedding: EmbeddingConfig = Field(
        default_factory=EmbeddingConfig,
        description="Configuration for embeddings.",
    )
    index: IndexConfig = Field(
        default_factory=IndexConfig,
        description="Configuration for indexing.",
    )
    retrieval: RetrievalConfig = Field(
        default_factory=RetrievalConfig,
        description="Configuration for retrieval.",
    )