MedSearchPro / config /vector_config.py
paulhemb's picture
Initial Backend Deployment
1367957
# config/vector_config.py
"""
Configuration for multiple vector database options
"""
# Settings for every supported vector store, keyed by backend name.
# Each entry carries its own embedding model and chunking parameters.
VECTOR_CONFIG = {
    # ChromaDB persisted to a directory on the local filesystem.
    "chromadb": {
        "type": "local",
        "description": "Local ChromaDB - Easy setup, good for development",
        "persist_directory": "./data/vector_db/chromadb",
        "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
        "chunk_size": 512,
        "chunk_overlap": 50,
    },
    # FAISS index file paired with a SQLite file for metadata.
    "faiss_sqlite": {
        "type": "local",
        "description": "FAISS + SQLite - High performance, more control",
        "faiss_index_path": "./data/vector_db/faiss/index.faiss",
        "sqlite_db_path": "./data/vector_db/faiss/metadata.db",
        "embedding_model": "sentence-transformers/all-mpnet-base-v2",
        "chunk_size": 512,
        "chunk_overlap": 50,
        # Supported values: Flat, IVFFlat, IVFPQ
        "index_type": "IVFFlat",
    },
    # Pinecone hosted index.
    "pinecone": {
        "type": "cloud",
        "description": "Pinecone - Scalable cloud solution",
        # Placeholder: must be replaced with a real Pinecone API key.
        "api_key": None,
        "environment": "us-west1-gcp",
        "index_name": "medical-research-papers",
        "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
        "chunk_size": 512,
        "chunk_overlap": 50,
        # Matches the 384-dim output of all-MiniLM-L6-v2.
        "dimension": 384,
    },
}
# Catalogue of selectable embedding models, keyed by short model name.
# "model_name" holds the full identifier; "dimensions" is the vector size.
EMBEDDING_MODELS = {
    "all-MiniLM-L6-v2": {
        "dimensions": 384,
        "description": "Fast, good quality general-purpose model",
        "model_name": "sentence-transformers/all-MiniLM-L6-v2",
    },
    "all-mpnet-base-v2": {
        "dimensions": 768,
        "description": "Higher quality, slower general-purpose model",
        "model_name": "sentence-transformers/all-mpnet-base-v2",
    },
    "multi-qa-MiniLM-L6-cos-v1": {
        "dimensions": 384,
        "description": "Optimized for question-answering tasks",
        "model_name": "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
    },
}
# Named chunking presets, keyed by strategy name.
# Note: the "paragraph" preset uses "max_size" and defines no "overlap".
CHUNKING_STRATEGIES = {
    "semantic": {
        "method": "semantic",
        "size": 512,
        "overlap": 50,
    },
    "fixed": {
        "method": "fixed",
        "size": 500,
        "overlap": 50,
    },
    "paragraph": {
        "method": "paragraph",
        "max_size": 512,
    },
}
def get_vector_config(vector_type: str) -> dict:
    """Return the settings registered for ``vector_type``.

    The dict stored in ``VECTOR_CONFIG`` is returned as-is (not a copy).
    An unrecognised name yields a new empty dict instead of raising.
    """
    if vector_type in VECTOR_CONFIG:
        return VECTOR_CONFIG[vector_type]
    return {}
def get_available_vector_types() -> list:
    """Return the backend names defined in ``VECTOR_CONFIG``, in definition order."""
    # Unpacking a dict iterates its keys, giving a fresh list each call.
    return [*VECTOR_CONFIG]
def get_embedding_model_config(model_name: str) -> dict:
    """Get configuration for a specific embedding model.

    Args:
        model_name: Either a short key of ``EMBEDDING_MODELS`` (for example
            ``"all-MiniLM-L6-v2"``) or the full identifier stored in an
            entry's ``"model_name"`` field (for example
            ``"sentence-transformers/all-MiniLM-L6-v2"``).

    Returns:
        The matching entry of ``EMBEDDING_MODELS`` (the stored dict, not a
        copy), or an empty dict when no entry matches.
    """
    if model_name in EMBEDDING_MODELS:
        return EMBEDDING_MODELS[model_name]
    # The "embedding_model" values in VECTOR_CONFIG use the full identifier,
    # not the short key, so also match on each entry's "model_name" field;
    # otherwise a lookup with a configured model always comes back empty.
    for model_config in EMBEDDING_MODELS.values():
        if model_config.get("model_name") == model_name:
            return model_config
    return {}