"""Configuration settings for the ingestion pipeline""" import os # Default ingestion settings DEFAULT_CHUNK_SIZE = 1000 DEFAULT_OVERLAP = 200 RANDOM_SEED = 42 # Supported file formats SUPPORTED_FORMATS = {".txt", ".md", ".markdown"} # Corpus directory CORPUS_DIRECTORY = "synthetic_policies" # Vector Database Settings VECTOR_STORAGE_TYPE = os.getenv("VECTOR_STORAGE_TYPE", "chroma") # "chroma" or "postgres" VECTOR_DB_PERSIST_PATH = "data/chroma_db" # Used for ChromaDB DATABASE_URL = os.getenv("DATABASE_URL") # Used for PostgreSQL COLLECTION_NAME = "policy_documents" EMBEDDING_DIMENSION = 1024 # intfloat/multilingual-e5-large dimension (UPDATED: Oct 25, 2025) SIMILARITY_METRIC = "cosine" # ChromaDB Configuration for Memory Optimization (when using ChromaDB) CHROMA_SETTINGS = { "anonymized_telemetry": False, "allow_reset": False, "is_persistent": True, } # PostgreSQL Configuration (when using PostgreSQL) POSTGRES_TABLE_NAME = "document_embeddings" POSTGRES_MAX_CONNECTIONS = 10 # Embedding Model Settings EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-large" # HF Inference API model EMBEDDING_BATCH_SIZE = 1 # Absolute minimum for extreme memory constraints EMBEDDING_DEVICE = "cpu" # Use CPU for free tier compatibility EMBEDDING_USE_QUANTIZED = os.getenv("EMBEDDING_USE_QUANTIZED", "false").lower() == "true" # Document Processing Settings (for memory optimization) MAX_DOCUMENT_LENGTH = 1000 # Truncate documents to reduce memory usage MAX_DOCUMENTS_IN_MEMORY = 100 # Process documents in small batches # Memory Management Settings ENABLE_MEMORY_MONITORING = os.getenv("ENABLE_MEMORY_MONITORING", "true").lower() == "true" MEMORY_LIMIT_MB = int(os.getenv("MEMORY_LIMIT_MB", "400")) # Conservative limit for 512MB instances # Search Settings DEFAULT_TOP_K = 5 MAX_TOP_K = 20 MIN_SIMILARITY_THRESHOLD = 0.3 # OpenAI Embedding configuration (toggle to use remote embeddings to save memory) USE_OPENAI_EMBEDDING = os.getenv("USE_OPENAI_EMBEDDING", "false").lower() == "true" # CRITICAL OVERRIDE: Force HF embeddings when HF_TOKEN is available # This ensures HF Spaces always uses free HF services instead of paid OpenAI HF_TOKEN_AVAILABLE = bool(os.getenv("HF_TOKEN")) if HF_TOKEN_AVAILABLE: print( "🔧 CONFIG OVERRIDE: HF_TOKEN detected - FORCING HF embeddings " f"(was USE_OPENAI_EMBEDDING={USE_OPENAI_EMBEDDING})" ) USE_OPENAI_EMBEDDING = False print( "🔧 CONFIG DEBUG: USE_OPENAI_EMBEDDING env var = '", os.getenv("USE_OPENAI_EMBEDDING", "NOT_SET"), "->", USE_OPENAI_EMBEDDING, ) print("🔧 CONFIG DEBUG: HF_TOKEN available =", HF_TOKEN_AVAILABLE) OPENAI_EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small") # Dimension for the chosen OpenAI embedding model. Adjust if you change models. OPENAI_EMBEDDING_DIMENSION = int(os.getenv("OPENAI_EMBEDDING_DIMENSION", "1536")) # If using OpenAI embeddings, override EMBEDDING_DIMENSION to keep checks consistent # Note: We're using HF embeddings (1024) by default, OpenAI is optional override if USE_OPENAI_EMBEDDING: EMBEDDING_DIMENSION = OPENAI_EMBEDDING_DIMENSION print(f"🔧 CONFIG: Using OpenAI embeddings, dimension overridden to {EMBEDDING_DIMENSION}") else: print(f"🔧 CONFIG: Using HF embeddings, dimension is {EMBEDDING_DIMENSION}") # Flask configuration classes class Config: """Base configuration""" SECRET_KEY = os.getenv("SECRET_KEY", "dev-secret-key-change-in-production") ENABLE_HF_SERVICES = os.getenv("ENABLE_HF_SERVICES", "false").lower() == "true" HF_TOKEN = os.getenv("HF_TOKEN") # Force HF services when token is available if HF_TOKEN: ENABLE_HF_SERVICES = True class DevelopmentConfig(Config): """Development configuration""" DEBUG = True ENABLE_HF_PROCESSING = os.getenv("ENABLE_HF_PROCESSING", "true").lower() == "true" class ProductionConfig(Config): """Production configuration""" DEBUG = False ENABLE_HF_PROCESSING = os.getenv("ENABLE_HF_PROCESSING", "true").lower() == "true" class TestConfig(Config): """Testing configuration""" TESTING = True DEBUG = True ENABLE_HF_SERVICES = False ENABLE_HF_PROCESSING = False # Configuration dictionary config = { "default": DevelopmentConfig, "development": DevelopmentConfig, "production": ProductionConfig, "test": TestConfig, "testing": TestConfig, }