Spaces:

paulhemb
/

MedSearchPro

Sleeping

File size: 2,819 Bytes
# config/vector_config.py
"""

Configuration for multiple vector database options

"""

# Values repeated across backends, named once so they stay in sync.
_MINILM_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
_MPNET_MODEL = "sentence-transformers/all-mpnet-base-v2"
_CHUNK_SIZE = 512
_CHUNK_OVERLAP = 50

# Per-backend settings, keyed by the backend identifier that
# get_vector_config() and get_available_vector_types() expose.
VECTOR_CONFIG = {
    # Local ChromaDB store persisted under ./data/vector_db/chromadb.
    "chromadb": {
        "type": "local",
        "description": "Local ChromaDB - Easy setup, good for development",
        "persist_directory": "./data/vector_db/chromadb",
        "embedding_model": _MINILM_MODEL,
        "chunk_size": _CHUNK_SIZE,
        "chunk_overlap": _CHUNK_OVERLAP,
    },
    # Local FAISS index with a SQLite file alongside it for metadata.
    "faiss_sqlite": {
        "type": "local",
        "description": "FAISS + SQLite - High performance, more control",
        "faiss_index_path": "./data/vector_db/faiss/index.faiss",
        "sqlite_db_path": "./data/vector_db/faiss/metadata.db",
        "embedding_model": _MPNET_MODEL,
        "chunk_size": _CHUNK_SIZE,
        "chunk_overlap": _CHUNK_OVERLAP,
        # Options: Flat, IVFFlat, IVFPQ
        "index_type": "IVFFlat",
    },
    # Hosted Pinecone index.
    "pinecone": {
        "type": "cloud",
        "description": "Pinecone - Scalable cloud solution",
        # Stays None until a Pinecone API key is supplied.
        "api_key": None,
        "environment": "us-west1-gcp",
        "index_name": "medical-research-papers",
        "embedding_model": _MINILM_MODEL,
        "chunk_size": _CHUNK_SIZE,
        "chunk_overlap": _CHUNK_OVERLAP,
        # all-MiniLM-L6-v2 produces 384-dim embeddings
        "dimension": 384,
    },
}

# Available embedding models
# Catalogue of supported embedding models, keyed by short model name.
# Each row is (short name, embedding dimensions, description); the
# "model_name" field is the short name under the "sentence-transformers/"
# namespace.
EMBEDDING_MODELS = {
    short_name: {
        "dimensions": dimensions,
        "description": description,
        "model_name": f"sentence-transformers/{short_name}",
    }
    for short_name, dimensions, description in (
        ("all-MiniLM-L6-v2", 384, "Fast, good quality general-purpose model"),
        ("all-mpnet-base-v2", 768, "Higher quality, slower general-purpose model"),
        ("multi-qa-MiniLM-L6-cos-v1", 384, "Optimized for question-answering tasks"),
    )
}

# Chunking strategies
# Named chunking presets. "semantic" and "fixed" carry a size and an
# overlap; "paragraph" carries only an upper bound ("max_size") and
# defines no overlap.
CHUNKING_STRATEGIES = {
    "semantic": {"method": "semantic", "size": 512, "overlap": 50},
    "fixed": {"method": "fixed", "size": 500, "overlap": 50},
    "paragraph": {"method": "paragraph", "max_size": 512},
}

def get_vector_config(vector_type: str) -> dict:
    """Return the settings registered for ``vector_type``.

    Args:
        vector_type: Key into ``VECTOR_CONFIG``.

    Returns:
        The entry stored in ``VECTOR_CONFIG`` (the same dict object, not a
        copy), or a new empty dict when the key is not registered.
    """
    try:
        return VECTOR_CONFIG[vector_type]
    except KeyError:
        return {}

def get_available_vector_types() -> list:
    """Return the keys of ``VECTOR_CONFIG`` as a new list, in definition order."""
    # Iterating a dict yields its keys, so no explicit .keys() call is needed.
    return list(VECTOR_CONFIG)

def get_embedding_model_config(model_name: str) -> dict:
    """Return the configuration for an embedding model.

    ``EMBEDDING_MODELS`` is keyed by short name (e.g. ``"all-MiniLM-L6-v2"``),
    while ``VECTOR_CONFIG`` stores its ``embedding_model`` values in the
    qualified form (e.g. ``"sentence-transformers/all-MiniLM-L6-v2"``).
    Both spellings are accepted so a backend's configured model can be
    passed in directly.

    Args:
        model_name: Either a key of ``EMBEDDING_MODELS`` or the value of an
            entry's ``"model_name"`` field.

    Returns:
        The matching entry from ``EMBEDDING_MODELS`` (the same dict object,
        not a copy), or a new empty dict when nothing matches.
    """
    config = EMBEDDING_MODELS.get(model_name)
    if config is not None:
        return config

    # Fall back to the qualified identifier stored inside each entry.
    for candidate in EMBEDDING_MODELS.values():
        if candidate.get("model_name") == model_name:
            return candidate

    return {}