"""Configuration settings for the ingestion pipeline"""
import os
# Default ingestion settings (document chunking parameters)
DEFAULT_CHUNK_SIZE = 1000  # size per chunk (units set by the splitter — presumably characters; confirm)
DEFAULT_OVERLAP = 200  # amount shared between consecutive chunks
RANDOM_SEED = 42  # fixed seed for reproducible runs
# Supported file formats: extensions accepted by the ingestion pipeline
SUPPORTED_FORMATS = {".txt", ".md", ".markdown"}
# Corpus directory: location of the source documents (relative path)
CORPUS_DIRECTORY = "synthetic_policies"
# Vector Database Settings
VECTOR_STORAGE_TYPE = os.getenv("VECTOR_STORAGE_TYPE", "chroma")  # "chroma" or "postgres"
VECTOR_DB_PERSIST_PATH = "data/chroma_db"  # Used for ChromaDB on-disk persistence
DATABASE_URL = os.getenv("DATABASE_URL")  # Used for PostgreSQL (None when unset)
COLLECTION_NAME = "policy_documents"  # vector collection/table logical name
EMBEDDING_DIMENSION = 1024  # intfloat/multilingual-e5-large dimension (UPDATED: Oct 25, 2025)
SIMILARITY_METRIC = "cosine"  # distance metric for vector search
# ChromaDB Configuration for Memory Optimization (when using ChromaDB)
CHROMA_SETTINGS = {
    "anonymized_telemetry": False,  # disable phoning home
    "allow_reset": False,  # guard against accidental collection wipes
    "is_persistent": True,  # keep data on disk at VECTOR_DB_PERSIST_PATH
}
# PostgreSQL Configuration (when using PostgreSQL)
POSTGRES_TABLE_NAME = "document_embeddings"
POSTGRES_MAX_CONNECTIONS = 10  # connection-pool cap
# Embedding Model Settings
EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-large"  # HF Inference API model
EMBEDDING_BATCH_SIZE = 1  # Absolute minimum for extreme memory constraints
EMBEDDING_DEVICE = "cpu"  # Use CPU for free tier compatibility
EMBEDDING_USE_QUANTIZED = os.getenv("EMBEDDING_USE_QUANTIZED", "false").lower() == "true"
# Document Processing Settings (for memory optimization)
MAX_DOCUMENT_LENGTH = 1000  # Truncate documents to reduce memory usage
MAX_DOCUMENTS_IN_MEMORY = 100  # Process documents in small batches
# Memory Management Settings
ENABLE_MEMORY_MONITORING = os.getenv("ENABLE_MEMORY_MONITORING", "true").lower() == "true"
MEMORY_LIMIT_MB = int(os.getenv("MEMORY_LIMIT_MB", "400"))  # Conservative limit for 512MB instances
# Search Settings
DEFAULT_TOP_K = 5  # results returned when caller does not specify k
MAX_TOP_K = 20  # hard upper bound on requested results
MIN_SIMILARITY_THRESHOLD = 0.3  # results below this similarity are presumably filtered out — confirm against search code
# OpenAI Embedding configuration (toggle to use remote embeddings to save memory)
USE_OPENAI_EMBEDDING = os.getenv("USE_OPENAI_EMBEDDING", "false").lower() == "true"

# CRITICAL OVERRIDE: Force HF embeddings when HF_TOKEN is available.
# This ensures HF Spaces always uses free HF services instead of paid OpenAI.
HF_TOKEN_AVAILABLE = bool(os.getenv("HF_TOKEN"))
if HF_TOKEN_AVAILABLE:
    print(
        "🔧 CONFIG OVERRIDE: HF_TOKEN detected - FORCING HF embeddings "
        f"(was USE_OPENAI_EMBEDDING={USE_OPENAI_EMBEDDING})"
    )
    USE_OPENAI_EMBEDDING = False

# FIX: the original debug print passed several comma-separated arguments,
# which rendered with a stray unbalanced quote and odd spacing
# ("env var = ' false -> False"). A single f-string produces a
# balanced, readable message.
print(
    "🔧 CONFIG DEBUG: USE_OPENAI_EMBEDDING env var = "
    f"'{os.getenv('USE_OPENAI_EMBEDDING', 'NOT_SET')}' -> {USE_OPENAI_EMBEDDING}"
)
print(f"🔧 CONFIG DEBUG: HF_TOKEN available = {HF_TOKEN_AVAILABLE}")

OPENAI_EMBEDDING_MODEL = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small")
# Dimension for the chosen OpenAI embedding model. Adjust if you change models.
OPENAI_EMBEDDING_DIMENSION = int(os.getenv("OPENAI_EMBEDDING_DIMENSION", "1536"))

# If using OpenAI embeddings, override EMBEDDING_DIMENSION to keep checks consistent.
# Note: HF embeddings (1024) are the default; OpenAI is an optional override.
if USE_OPENAI_EMBEDDING:
    EMBEDDING_DIMENSION = OPENAI_EMBEDDING_DIMENSION
    print(f"🔧 CONFIG: Using OpenAI embeddings, dimension overridden to {EMBEDDING_DIMENSION}")
else:
    print(f"🔧 CONFIG: Using HF embeddings, dimension is {EMBEDDING_DIMENSION}")
# Flask configuration classes
class Config:
    """Base configuration shared by every environment-specific config."""

    SECRET_KEY = os.environ.get("SECRET_KEY", "dev-secret-key-change-in-production")
    HF_TOKEN = os.environ.get("HF_TOKEN")
    # HF services are forced on whenever a token is present; otherwise the
    # ENABLE_HF_SERVICES env flag (default "false") decides.
    ENABLE_HF_SERVICES = bool(HF_TOKEN) or (
        os.environ.get("ENABLE_HF_SERVICES", "false").lower() == "true"
    )
class DevelopmentConfig(Config):
    """Configuration used for local development (debug enabled)."""

    DEBUG = True
    # HF processing defaults to on; export ENABLE_HF_PROCESSING=false to turn it off.
    ENABLE_HF_PROCESSING = (
        os.environ.get("ENABLE_HF_PROCESSING", "true").lower() == "true"
    )
class ProductionConfig(Config):
    """Configuration used in production deployments (debug disabled)."""

    DEBUG = False
    # HF processing defaults to on; export ENABLE_HF_PROCESSING=false to turn it off.
    ENABLE_HF_PROCESSING = (
        os.environ.get("ENABLE_HF_PROCESSING", "true").lower() == "true"
    )
class TestConfig(Config):
    """Configuration used by the test suite."""

    TESTING = True
    DEBUG = True
    # Hard-disable the HF integrations so tests never depend on external services,
    # regardless of environment variables or the base-class override.
    ENABLE_HF_SERVICES = False
    ENABLE_HF_PROCESSING = False
# Registry mapping environment names (e.g. FLASK_ENV values) to config classes.
config = dict(
    default=DevelopmentConfig,
    development=DevelopmentConfig,
    production=ProductionConfig,
    test=TestConfig,
    testing=TestConfig,
)