"""Configuration settings for the ingestion pipeline"""
import os
# Default ingestion settings (document chunking parameters)
DEFAULT_CHUNK_SIZE = 1000  # size per chunk (units set by the splitter — presumably characters; confirm)
DEFAULT_OVERLAP = 200  # amount shared between consecutive chunks
RANDOM_SEED = 42  # fixed seed for reproducible runs
# Supported file formats: extensions accepted by the ingestion pipeline
SUPPORTED_FORMATS = {".txt", ".md", ".markdown"}
# Corpus directory: location of the source documents (relative path)
CORPUS_DIRECTORY = "synthetic_policies"
# Vector Database Settings
VECTOR_STORAGE_TYPE = os.getenv("VECTOR_STORAGE_TYPE", "chroma")  # "chroma" or "postgres"
VECTOR_DB_PERSIST_PATH = "data/chroma_db"  # Used for ChromaDB on-disk persistence
DATABASE_URL = os.getenv("DATABASE_URL")  # Used for PostgreSQL (None when unset)
COLLECTION_NAME = "policy_documents"  # vector collection/table logical name
EMBEDDING_DIMENSION = 1024  # intfloat/multilingual-e5-large dimension (UPDATED: Oct 25, 2025)
SIMILARITY_METRIC = "cosine"  # distance metric for vector search
# ChromaDB Configuration for Memory Optimization (when using ChromaDB)
CHROMA_SETTINGS = {
    "anonymized_telemetry": False,  # disable phoning home
    "allow_reset": False,  # guard against accidental collection wipes
    "is_persistent": True,  # keep data on disk at VECTOR_DB_PERSIST_PATH
}
# PostgreSQL Configuration (when using PostgreSQL)
POSTGRES_TABLE_NAME = "document_embeddings"
POSTGRES_MAX_CONNECTIONS = 10  # connection-pool cap
# Embedding Model Settings
EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-large"  # HF Inference API model
EMBEDDING_BATCH_SIZE = 1  # Absolute minimum for extreme memory constraints
EMBEDDING_DEVICE = "cpu"  # Use CPU for free tier compatibility
EMBEDDING_USE_QUANTIZED = os.getenv("EMBEDDING_USE_QUANTIZED", "false").lower() == "true"
# Document Processing Settings (for memory optimization)
MAX_DOCUMENT_LENGTH = 1000  # Truncate documents to reduce memory usage
MAX_DOCUMENTS_IN_MEMORY = 100  # Process documents in small batches
# Memory Management Settings
ENABLE_MEMORY_MONITORING = os.getenv("ENABLE_MEMORY_MONITORING", "true").lower() == "true"
MEMORY_LIMIT_MB = int(os.getenv("MEMORY_LIMIT_MB", "400"))  # Conservative limit for 512MB instances
# Search Settings
DEFAULT_TOP_K = 5  # results returned when caller does not specify k
MAX_TOP_K = 20  # hard upper bound on requested results
MIN_SIMILARITY_THRESHOLD = 0.3  # results below this similarity are presumably filtered out — confirm against search code
# OpenAI Embedding configuration (toggle to use remote embeddings to save memory)
USE_OPENAI_EMBEDDING = os.getenv("USE_OPENAI_EMBEDDING", "false").lower() == "true"

# CRITICAL OVERRIDE: Force HF embeddings when HF_TOKEN is available.
# This ensures HF Spaces always uses free HF services instead of paid OpenAI.
HF_TOKEN_AVAILABLE = bool(os.getenv("HF_TOKEN"))
if HF_TOKEN_AVAILABLE:
    print(
        "🔧 CONFIG OVERRIDE: HF_TOKEN detected - FORCING HF embeddings "
        f"(was USE_OPENAI_EMBEDDING={USE_OPENAI_EMBEDDING})"
    )
    USE_OPENAI_EMBEDDING = False

# FIX: the original debug print passed several comma-separated arguments,
# which rendered with a stray unbalanced quote and odd spacing
# ("env var = ' false -> False"). A single f-string produces a
# balanced, readable message.
print(
    "🔧 CONFIG DEBUG: USE_OPENAI_EMBEDDING env var = "
    f"'{os.getenv('USE_OPENAI_EMBEDDING', 'NOT_SET')}' -> {USE_OPENAI_EMBEDDING}"
)
print(f"🔧 CONFIG DEBUG: HF_TOKEN available = {HF_TOKEN_AVAILABLE}")

OPENAI_EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small")
# Dimension for the chosen OpenAI embedding model. Adjust if you change models.
OPENAI_EMBEDDING_DIMENSION = int(os.getenv("OPENAI_EMBEDDING_DIMENSION", "1536"))

# If using OpenAI embeddings, override EMBEDDING_DIMENSION to keep checks consistent.
# Note: HF embeddings (1024) are the default; OpenAI is an optional override.
if USE_OPENAI_EMBEDDING:
    EMBEDDING_DIMENSION = OPENAI_EMBEDDING_DIMENSION
    print(f"🔧 CONFIG: Using OpenAI embeddings, dimension overridden to {EMBEDDING_DIMENSION}")
else:
    print(f"🔧 CONFIG: Using HF embeddings, dimension is {EMBEDDING_DIMENSION}")
# Flask configuration classes
class Config:
    """Base configuration shared by every environment-specific config."""

    SECRET_KEY = os.environ.get("SECRET_KEY", "dev-secret-key-change-in-production")
    HF_TOKEN = os.environ.get("HF_TOKEN")
    # HF services are forced on whenever a token is present; otherwise the
    # ENABLE_HF_SERVICES env flag (default "false") decides.
    ENABLE_HF_SERVICES = bool(HF_TOKEN) or (
        os.environ.get("ENABLE_HF_SERVICES", "false").lower() == "true"
    )
class DevelopmentConfig(Config):
    """Configuration used for local development (debug enabled)."""

    DEBUG = True
    # HF processing defaults to on; export ENABLE_HF_PROCESSING=false to turn it off.
    ENABLE_HF_PROCESSING = (
        os.environ.get("ENABLE_HF_PROCESSING", "true").lower() == "true"
    )
class ProductionConfig(Config):
    """Configuration used in production deployments (debug disabled)."""

    DEBUG = False
    # HF processing defaults to on; export ENABLE_HF_PROCESSING=false to turn it off.
    ENABLE_HF_PROCESSING = (
        os.environ.get("ENABLE_HF_PROCESSING", "true").lower() == "true"
    )
class TestConfig(Config):
    """Configuration used by the test suite."""

    TESTING = True
    DEBUG = True
    # Hard-disable the HF integrations so tests never depend on external services,
    # regardless of environment variables or the base-class override.
    ENABLE_HF_SERVICES = False
    ENABLE_HF_PROCESSING = False
# Registry mapping environment names (e.g. FLASK_ENV values) to config classes.
config = dict(
    default=DevelopmentConfig,
    development=DevelopmentConfig,
    production=ProductionConfig,
    test=TestConfig,
    testing=TestConfig,
)