Spaces:
Running
Running
# config/vector_config.py
"""
Configuration for multiple vector database options
"""
# Settings for every supported vector store, keyed by backend name.
# All entries share "type", "description", "embedding_model", "chunk_size"
# and "chunk_overlap"; the remaining keys are specific to the backend.
VECTOR_CONFIG = {
    "chromadb": {
        "type": "local",
        "description": "Local ChromaDB - Easy setup, good for development",
        "persist_directory": "./data/vector_db/chromadb",
        "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
        "chunk_size": 512,
        "chunk_overlap": 50,
    },
    "faiss_sqlite": {
        "type": "local",
        "description": "FAISS + SQLite - High performance, more control",
        "faiss_index_path": "./data/vector_db/faiss/index.faiss",
        "sqlite_db_path": "./data/vector_db/faiss/metadata.db",
        "embedding_model": "sentence-transformers/all-mpnet-base-v2",
        "chunk_size": 512,
        "chunk_overlap": 50,
        "index_type": "IVFFlat",  # Options: Flat, IVFFlat, IVFPQ
    },
    "pinecone": {
        "type": "cloud",
        "description": "Pinecone - Scalable cloud solution",
        "api_key": None,  # Set your Pinecone API key
        "environment": "us-west1-gcp",
        "index_name": "medical-research-papers",
        "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
        "chunk_size": 512,
        "chunk_overlap": 50,
        "dimension": 384,  # all-MiniLM-L6-v2 produces 384-dim embeddings
    },
}
# Available embedding models
# Keyed by short model name; each entry records the embedding width
# ("dimensions"), a human-readable description, and the full identifier
# ("model_name") under the sentence-transformers/ prefix.
EMBEDDING_MODELS = {
    "all-MiniLM-L6-v2": {
        "dimensions": 384,
        "description": "Fast, good quality general-purpose model",
        "model_name": "sentence-transformers/all-MiniLM-L6-v2",
    },
    "all-mpnet-base-v2": {
        "dimensions": 768,
        "description": "Higher quality, slower general-purpose model",
        "model_name": "sentence-transformers/all-mpnet-base-v2",
    },
    "multi-qa-MiniLM-L6-cos-v1": {
        "dimensions": 384,
        "description": "Optimized for question-answering tasks",
        "model_name": "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
    },
}
# Chunking strategies
# Keyed by strategy name. "semantic" and "fixed" carry a "size" and an
# "overlap"; "paragraph" only carries a "max_size" cap.
CHUNKING_STRATEGIES = {
    "semantic": {
        "method": "semantic",
        "size": 512,
        "overlap": 50,
    },
    "fixed": {
        "method": "fixed",
        "size": 500,
        "overlap": 50,
    },
    "paragraph": {
        "method": "paragraph",
        "max_size": 512,
    },
}
def get_vector_config(vector_type: str) -> dict:
    """Get configuration for a specific vector database type.

    Args:
        vector_type: A key of ``VECTOR_CONFIG``, e.g. ``"chromadb"``.

    Returns:
        The entry stored in ``VECTOR_CONFIG`` under ``vector_type`` (the
        stored dict itself, not a copy), or a new empty dict when the type
        is not configured.
    """
    return VECTOR_CONFIG.get(vector_type, {})
def get_available_vector_types() -> list:
    """Get list of available vector database types.

    Returns:
        The keys of ``VECTOR_CONFIG`` as a new list, in definition order.
    """
    # Iterating a dict yields its keys, so no explicit .keys() call is needed.
    return list(VECTOR_CONFIG)
def get_embedding_model_config(model_name: str) -> dict:
    """Get configuration for a specific embedding model.

    Args:
        model_name: Either a short key of ``EMBEDDING_MODELS`` (e.g.
            ``"all-MiniLM-L6-v2"``) or the full identifier stored in an
            entry's ``"model_name"`` field (e.g.
            ``"sentence-transformers/all-MiniLM-L6-v2"``), which is the
            form used by the ``"embedding_model"`` values in
            ``VECTOR_CONFIG``.

    Returns:
        The matching entry of ``EMBEDDING_MODELS`` (the stored dict itself,
        not a copy), or a new empty dict when nothing matches.
    """
    entry = EMBEDDING_MODELS.get(model_name)
    if entry is not None:
        return entry
    # The backend configs store full identifiers rather than the short keys,
    # so fall back to matching on each entry's "model_name" field.
    for candidate in EMBEDDING_MODELS.values():
        if candidate.get("model_name") == model_name:
            return candidate
    return {}