# SPARKNET RAG Configuration
# ===========================
#
# Sections: vector_store, embeddings, indexer, retriever, generator,
# query, filters, performance, logging.
#
# NOTE(review): this file was restored from a copy whose line breaks and
# indentation had been lost. Nesting was rebuilt from the section comments
# and key order -- confirm against the loader's schema.
---

# Vector Store Configuration
vector_store:
  # Store type: "chromadb" (default)
  type: chromadb

  # ChromaDB settings
  chromadb:
    persist_directory: ./data/vectorstore
    collection_name: sparknet_documents
    anonymized_telemetry: false

  # Search settings
  # NOTE(review): the same two keys also appear under `retriever`;
  # confirm which one the consumer reads.
  default_top_k: 5
  similarity_threshold: 0.7

# Embedding Configuration
embeddings:
  # Adapter type: "ollama" (default) or "openai"
  adapter_type: ollama

  # Ollama settings (local, default)
  ollama:
    base_url: http://localhost:11434
    # Options: nomic-embed-text, mxbai-embed-large, all-minilm
    model: nomic-embed-text

  # OpenAI settings (optional, feature-flagged)
  openai:
    enabled: false
    # Options: text-embedding-3-small, text-embedding-3-large
    model: text-embedding-3-small
    # api_key: ${OPENAI_API_KEY}  # Use env var

  # Common settings
  batch_size: 32
  timeout: 60

  # Caching
  enable_cache: true
  cache_directory: ./data/embedding_cache

# Indexer Configuration
indexer:
  # Batch processing
  batch_size: 32

  # Metadata to index
  include_bbox: true
  include_page: true
  include_chunk_type: true

  # Filtering
  skip_empty_chunks: true
  min_chunk_length: 10

# Retriever Configuration
retriever:
  # Search parameters
  default_top_k: 5
  similarity_threshold: 0.7
  max_results: 20

  # Reranking (future)
  enable_reranking: false
  rerank_top_k: 10

  # Evidence settings
  include_evidence: true
  evidence_snippet_length: 200

# Generator Configuration
generator:
  # LLM provider: "ollama" (default) or "openai"
  llm_provider: ollama

  # Ollama settings
  ollama:
    base_url: http://localhost:11434
    # Options: llama3.2:3b, llama3.1:8b, mistral
    # Quoted because the tag contains ':'; the value is still the same
    # string.
    model: 'llama3.2:3b'

  # OpenAI settings (optional)
  openai:
    # Options: gpt-4o-mini, gpt-4o
    model: gpt-4o-mini
    # api_key: ${OPENAI_API_KEY}  # Use env var

  # Generation settings
  temperature: 0.1
  max_tokens: 1024
  timeout: 120

  # Citation settings
  require_citations: true
  # Must stay quoted: a plain scalar starting with '[' would be parsed as
  # a flow sequence.
  citation_format: "[{index}]"

  # Abstention settings
  abstain_on_low_confidence: true
  confidence_threshold: 0.6

# Query Processing
query:
  # Query expansion
  expand_queries: false
  max_expansions: 3

  # Hybrid search (future)
  enable_hybrid: false
  keyword_weight: 0.3
  semantic_weight: 0.7

# Metadata Filtering
filters:
  # Supported filter types
  supported:
    - document_id
    - chunk_type
    - page
    - confidence_min

  # Default filters (applied to all queries)
  defaults: {}

# Performance Settings
performance:
  # Connection pooling
  max_connections: 10

  # Timeouts
  # NOTE(review): embeddings.timeout (60) and generator.timeout (120) hold
  # the same values as the embedding/generation timeouts below; confirm
  # which the consumer honours.
  embedding_timeout: 60
  search_timeout: 30
  generation_timeout: 120

  # Caching
  query_cache_enabled: true
  query_cache_ttl: 3600  # Seconds

# Logging
logging:
  level: INFO
  include_queries: false  # Log user queries (privacy consideration)
  include_latency: true