# config/vector_config.py
"""
Configuration for multiple vector database options.

Defines three module-level registries and small accessor helpers:

* ``VECTOR_CONFIG`` - per-backend settings, keyed by backend name.
* ``EMBEDDING_MODELS`` - embedding model metadata, keyed by short model name.
* ``CHUNKING_STRATEGIES`` - chunking presets, keyed by strategy name.
"""

# Per-backend settings. The "embedding_model" values are fully qualified
# identifiers (with the "sentence-transformers/" prefix), which correspond to
# the "model_name" field of the entries in EMBEDDING_MODELS below.
VECTOR_CONFIG = {
    "chromadb": {
        "type": "local",
        "description": "Local ChromaDB - Easy setup, good for development",
        "persist_directory": "./data/vector_db/chromadb",
        "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
        "chunk_size": 512,
        "chunk_overlap": 50,
    },
    "faiss_sqlite": {
        "type": "local",
        "description": "FAISS + SQLite - High performance, more control",
        "faiss_index_path": "./data/vector_db/faiss/index.faiss",
        "sqlite_db_path": "./data/vector_db/faiss/metadata.db",
        "embedding_model": "sentence-transformers/all-mpnet-base-v2",
        "chunk_size": 512,
        "chunk_overlap": 50,
        "index_type": "IVFFlat",  # Options: Flat, IVFFlat, IVFPQ
    },
    "pinecone": {
        "type": "cloud",
        "description": "Pinecone - Scalable cloud solution",
        # Set your Pinecone API key. Avoid committing a real key to version
        # control; inject it at runtime instead.
        "api_key": None,
        "environment": "us-west1-gcp",
        "index_name": "medical-research-papers",
        "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
        "chunk_size": 512,
        "chunk_overlap": 50,
        "dimension": 384,  # all-MiniLM-L6-v2 produces 384-dim embeddings
    },
}

# Available embedding models, keyed by short name. Each entry also carries the
# fully qualified identifier under "model_name".
EMBEDDING_MODELS = {
    "all-MiniLM-L6-v2": {
        "dimensions": 384,
        "description": "Fast, good quality general-purpose model",
        "model_name": "sentence-transformers/all-MiniLM-L6-v2",
    },
    "all-mpnet-base-v2": {
        "dimensions": 768,
        "description": "Higher quality, slower general-purpose model",
        "model_name": "sentence-transformers/all-mpnet-base-v2",
    },
    "multi-qa-MiniLM-L6-cos-v1": {
        "dimensions": 384,
        "description": "Optimized for question-answering tasks",
        "model_name": "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
    },
}

# Chunking strategies. Note that "paragraph" uses "max_size" and has no
# "overlap" key, unlike the other two presets.
CHUNKING_STRATEGIES = {
    "semantic": {
        "method": "semantic",
        "size": 512,
        "overlap": 50,
    },
    "fixed": {
        "method": "fixed",
        "size": 500,
        "overlap": 50,
    },
    "paragraph": {
        "method": "paragraph",
        "max_size": 512,
    },
}


def get_vector_config(vector_type: str) -> dict:
    """Get configuration for a specific vector database type.

    Args:
        vector_type: A key of ``VECTOR_CONFIG`` (e.g. ``"chromadb"``).

    Returns:
        The configuration dict stored in ``VECTOR_CONFIG`` for that key, or a
        new empty dict when the key is unknown. A found dict is the shared
        module-level object, not a copy, so mutating it changes
        ``VECTOR_CONFIG`` itself.
    """
    return VECTOR_CONFIG.get(vector_type, {})


def get_available_vector_types() -> list:
    """Get list of available vector database types.

    Returns:
        The keys of ``VECTOR_CONFIG`` as a new list, in definition order.
    """
    return list(VECTOR_CONFIG)


def get_embedding_model_config(model_name: str) -> dict:
    """Get configuration for a specific embedding model.

    The model may be identified either by its short ``EMBEDDING_MODELS`` key
    (e.g. ``"all-MiniLM-L6-v2"``) or by its fully qualified ``"model_name"``
    (e.g. ``"sentence-transformers/all-MiniLM-L6-v2"``). The latter is the
    form stored under ``"embedding_model"`` in ``VECTOR_CONFIG``, so values
    taken from there can be passed in directly.

    Args:
        model_name: Short key or fully qualified model identifier.

    Returns:
        The matching entry of ``EMBEDDING_MODELS`` (the shared object, not a
        copy), or a new empty dict when nothing matches.
    """
    by_short_name = EMBEDDING_MODELS.get(model_name)
    if by_short_name is not None:
        return by_short_name

    # Fall back to the qualified identifier: VECTOR_CONFIG stores models as
    # "sentence-transformers/<name>", which never matches a short key.
    for model_config in EMBEDDING_MODELS.values():
        if model_config["model_name"] == model_name:
            return model_config

    return {}