Spaces:

paulhemb
/

MedSearchPro

Running

App Files Files Community

MedSearchPro / config /vector_config.py

paulhemb

Initial Backend Deployment

1367957 about 1 month ago

raw

history blame contribute delete

2.82 kB

	# config/vector_config.py
	"""
	Configuration for the supported vector database backends, the available
	embedding models, and the text chunking strategies.
	"""

	# Settings for each vector database backend, keyed by backend name.
	# Entries are added one at a time, in the order callers will iterate them.
	VECTOR_CONFIG: dict = {}

	# Local backend: ChromaDB persisted under ./data/vector_db/chromadb.
	VECTOR_CONFIG["chromadb"] = {
	    "type": "local",
	    "description": "Local ChromaDB - Easy setup, good for development",
	    "persist_directory": "./data/vector_db/chromadb",
	    "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
	    "chunk_size": 512,
	    "chunk_overlap": 50,
	}

	# Local backend: FAISS index file plus a SQLite metadata database.
	VECTOR_CONFIG["faiss_sqlite"] = {
	    "type": "local",
	    "description": "FAISS + SQLite - High performance, more control",
	    "faiss_index_path": "./data/vector_db/faiss/index.faiss",
	    "sqlite_db_path": "./data/vector_db/faiss/metadata.db",
	    "embedding_model": "sentence-transformers/all-mpnet-base-v2",
	    "chunk_size": 512,
	    "chunk_overlap": 50,
	    "index_type": "IVFFlat",  # Options: Flat, IVFFlat, IVFPQ
	}

	# Cloud backend: Pinecone.
	VECTOR_CONFIG["pinecone"] = {
	    "type": "cloud",
	    "description": "Pinecone - Scalable cloud solution",
	    "api_key": None,  # Set your Pinecone API key
	    "environment": "us-west1-gcp",
	    "index_name": "medical-research-papers",
	    "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
	    "chunk_size": 512,
	    "chunk_overlap": 50,
	    "dimension": 384,  # all-MiniLM-L6-v2 produces 384-dim embeddings
	}

	# Available embedding models, keyed by short model name. Each row below is
	# (short name, embedding dimensions, description, full model name).
	EMBEDDING_MODELS = {
	    short_name: {
	        "dimensions": dimensions,
	        "description": description,
	        "model_name": full_name,
	    }
	    for short_name, dimensions, description, full_name in (
	        (
	            "all-MiniLM-L6-v2",
	            384,
	            "Fast, good quality general-purpose model",
	            "sentence-transformers/all-MiniLM-L6-v2",
	        ),
	        (
	            "all-mpnet-base-v2",
	            768,
	            "Higher quality, slower general-purpose model",
	            "sentence-transformers/all-mpnet-base-v2",
	        ),
	        (
	            "multi-qa-MiniLM-L6-cos-v1",
	            384,
	            "Optimized for question-answering tasks",
	            "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
	        ),
	    )
	}

	# Chunking strategies, keyed by strategy name. Every entry repeats its own
	# name under "method", followed by that strategy's size settings.
	CHUNKING_STRATEGIES = {
	    strategy: {"method": strategy, **settings}
	    for strategy, settings in (
	        ("semantic", {"size": 512, "overlap": 50}),
	        ("fixed", {"size": 500, "overlap": 50}),
	        ("paragraph", {"max_size": 512}),
	    )
	}

	def get_vector_config(vector_type: str) -> dict:
	    """Return the settings stored for one vector database backend.

	    Args:
	        vector_type: Backend key in ``VECTOR_CONFIG``, e.g. ``"chromadb"``.

	    Returns:
	        The entry stored under ``vector_type`` (the stored object itself,
	        not a copy), or a new empty dict when the key is absent.
	    """
	    if vector_type in VECTOR_CONFIG:
	        return VECTOR_CONFIG[vector_type]
	    return {}

	def get_available_vector_types() -> list:
	    """Return the backend names defined in ``VECTOR_CONFIG``.

	    Returns:
	        A new list of the top-level ``VECTOR_CONFIG`` keys, in the order
	        the dict iterates them.
	    """
	    return [*VECTOR_CONFIG]

	def get_embedding_model_config(model_name: str) -> dict:
	    """Return the configuration for an embedding model.

	    ``EMBEDDING_MODELS`` is keyed by short name (``"all-MiniLM-L6-v2"``),
	    while the ``"embedding_model"`` values in ``VECTOR_CONFIG`` use the
	    full ``"sentence-transformers/..."`` form. Both spellings are accepted
	    so a value read from ``VECTOR_CONFIG`` can be passed in directly
	    instead of silently resolving to an empty dict.

	    Args:
	        model_name: Either a key of ``EMBEDDING_MODELS`` or the
	            ``"model_name"`` value of one of its entries.

	    Returns:
	        The matching entry of ``EMBEDDING_MODELS`` (the stored object, not
	        a copy), or a new empty dict when nothing matches.
	    """
	    # Exact key match first: this is the original lookup and takes priority.
	    if model_name in EMBEDDING_MODELS:
	        return EMBEDDING_MODELS[model_name]

	    # Otherwise match against each entry's full "model_name" field.
	    for entry in EMBEDDING_MODELS.values():
	        if entry.get("model_name") == model_name:
	            return entry
	    return {}