import os
from pathlib import Path
from typing import Dict, Any, Optional

# Streamlit is an optional runtime dependency: this module is also imported
# from scripts and tests that run outside a Streamlit app, so degrade
# gracefully when it is not installed.
try:
    import streamlit as st
except ImportError:
    st = None


class Config:
    """
    Centralized configuration management for BLUESCARF AI HR Assistant.
    Provides environment-aware settings with sensible defaults and validation.

    Every setting can be overridden through an environment variable of the
    same name; `_validate_configuration` enforces sane ranges and creates the
    vector-store and log directories as a side effect of construction.
    """

    def __init__(self):
        """Initialize configuration with environment-specific optimizations.

        Raises:
            ValueError: if any setting fails validation (see
                `_validate_configuration`).
        """
        self._load_environment_config()
        self._validate_configuration()

    @staticmethod
    def _emit_error(message: str) -> None:
        """Surface an error via Streamlit when available, else plain print.

        NOTE: the previous check `'streamlit' in globals()` never matched
        because the module is imported under the alias `st`, so errors were
        silently dropped in Streamlit contexts.
        """
        if st is not None:
            st.error(message)
        else:
            print(f"ERROR: {message}")

    def _load_environment_config(self):
        """Load configuration from environment variables with intelligent defaults."""
        # === Core Application Settings ===
        self.APP_NAME = "BLUESCARF AI HR Assistant"
        self.APP_VERSION = "1.0.0"
        self.COMPANY_NAME = "BLUESCARF ARTIFICIAL INTELLIGENCE"

        # === Document Processing Configuration ===
        # Optimal chunk size for semantic coherence (384-512 tokens typical)
        self.CHUNK_SIZE = int(os.getenv('CHUNK_SIZE', 1000))
        # Overlap for context continuity (10-20% of chunk size)
        self.CHUNK_OVERLAP = int(os.getenv('CHUNK_OVERLAP', 200))
        # Minimum viable chunk size to filter noise
        self.MIN_CHUNK_SIZE = int(os.getenv('MIN_CHUNK_SIZE', 100))
        # Maximum file size (50MB default for enterprise documents)
        self.MAX_FILE_SIZE = int(os.getenv('MAX_FILE_SIZE', 50 * 1024 * 1024))

        # === Vector Store Configuration ===
        # Persistent storage path with environment fallback
        default_db_path = Path("vector_db")
        self.VECTOR_DB_PATH = Path(os.getenv('VECTOR_DB_PATH', default_db_path))
        # Maximum context chunks for retrieval (balance between context and noise)
        self.MAX_CONTEXT_CHUNKS = int(os.getenv('MAX_CONTEXT_CHUNKS', 5))
        # Similarity search parameters
        self.SIMILARITY_THRESHOLD = float(os.getenv('SIMILARITY_THRESHOLD', 0.5))
        self.MAX_SEARCH_RESULTS = int(os.getenv('MAX_SEARCH_RESULTS', 10))

        # === API Configuration ===
        # Gemini model selection (optimized for reasoning and context)
        self.GEMINI_MODEL = os.getenv('GEMINI_MODEL', 'gemini-pro')
        # Response generation parameters
        self.MAX_RESPONSE_TOKENS = int(os.getenv('MAX_RESPONSE_TOKENS', 1024))
        self.TEMPERATURE = float(os.getenv('TEMPERATURE', 0.3))  # Conservative for factual responses
        # API rate limiting and retry configuration
        self.API_RETRY_ATTEMPTS = int(os.getenv('API_RETRY_ATTEMPTS', 3))
        self.API_TIMEOUT_SECONDS = int(os.getenv('API_TIMEOUT_SECONDS', 30))

        # === Security Configuration ===
        # Session and authentication settings
        self.SESSION_TIMEOUT_HOURS = int(os.getenv('SESSION_TIMEOUT_HOURS', 8))
        self.ADMIN_SESSION_TIMEOUT_HOURS = int(os.getenv('ADMIN_SESSION_TIMEOUT_HOURS', 2))

        # === Logging and Monitoring ===
        # Application logging configuration
        self.LOG_LEVEL = os.getenv('LOG_LEVEL', 'INFO')
        self.LOG_FILE_PATH = Path(os.getenv('LOG_FILE_PATH', 'logs/hr_assistant.log'))
        self.ENABLE_INTERACTION_LOGGING = os.getenv('ENABLE_INTERACTION_LOGGING', 'true').lower() == 'true'

        # === Performance Optimization ===
        # Embedding model caching and batch processing
        self.EMBEDDING_BATCH_SIZE = int(os.getenv('EMBEDDING_BATCH_SIZE', 32))
        self.ENABLE_MODEL_CACHING = os.getenv('ENABLE_MODEL_CACHING', 'true').lower() == 'true'
        # Streamlit performance settings
        self.STREAMLIT_THEME = os.getenv('STREAMLIT_THEME', 'light')
        self.ENABLE_CACHING = os.getenv('ENABLE_CACHING', 'true').lower() == 'true'

        # === Deployment Configuration ===
        # Environment detection for deployment-specific optimizations
        self.ENVIRONMENT = os.getenv('ENVIRONMENT', 'development')
        self.IS_PRODUCTION = self.ENVIRONMENT.lower() == 'production'
        # Hugging Face Spaces sets SPACE_ID in the container environment
        self.IS_HUGGINGFACE = os.getenv('SPACE_ID') is not None

        # Resource limits for cloud deployment
        if self.IS_HUGGINGFACE:
            self._apply_huggingface_optimizations()

    def _apply_huggingface_optimizations(self):
        """Apply Hugging Face Spaces specific optimizations.

        Clamps memory-hungry settings downward for the shared cloud tier;
        never raises a user-configured value above its explicit setting.
        """
        # Reduce memory footprint for cloud deployment
        self.CHUNK_SIZE = min(self.CHUNK_SIZE, 800)
        self.MAX_CONTEXT_CHUNKS = min(self.MAX_CONTEXT_CHUNKS, 4)
        self.EMBEDDING_BATCH_SIZE = min(self.EMBEDDING_BATCH_SIZE, 16)
        self.MAX_FILE_SIZE = min(self.MAX_FILE_SIZE, 25 * 1024 * 1024)  # 25MB limit
        # Optimize for limited computational resources
        self.ENABLE_MODEL_CACHING = True
        self.API_TIMEOUT_SECONDS = 60  # More lenient timeout for cloud

    def _validate_configuration(self):
        """Validate configuration parameters and ensure system compatibility.

        Creates the vector-store and log directories if missing.

        Raises:
            ValueError: listing every setting that is out of range or any
                directory that could not be created.
        """
        validation_errors = []

        # Validate numeric ranges
        if self.CHUNK_SIZE < 100 or self.CHUNK_SIZE > 2000:
            validation_errors.append("CHUNK_SIZE must be between 100 and 2000")
        if self.CHUNK_OVERLAP >= self.CHUNK_SIZE:
            validation_errors.append("CHUNK_OVERLAP must be less than CHUNK_SIZE")
        if self.SIMILARITY_THRESHOLD < 0 or self.SIMILARITY_THRESHOLD > 1:
            validation_errors.append("SIMILARITY_THRESHOLD must be between 0 and 1")
        if self.TEMPERATURE < 0 or self.TEMPERATURE > 1:
            validation_errors.append("TEMPERATURE must be between 0 and 1")

        # Validate paths and create directories (mkdir raises OSError on failure)
        try:
            self.VECTOR_DB_PATH.mkdir(parents=True, exist_ok=True)
            self.LOG_FILE_PATH.parent.mkdir(parents=True, exist_ok=True)
        except OSError as e:
            validation_errors.append(f"Cannot create required directories: {str(e)}")

        # Report validation errors
        if validation_errors:
            error_message = "Configuration validation failed:\n" + "\n".join(validation_errors)
            self._emit_error(error_message)
            raise ValueError(error_message)

    def get_hr_context_prompt(self) -> str:
        """
        Generate context-aware system prompt for HR assistant interactions.

        Returns:
            Optimized system prompt for Gemini API
        """
        return f"""
You are an intelligent HR Assistant for {self.COMPANY_NAME}.

CORE IDENTITY:
- Professional, helpful, and knowledgeable about company policies
- Exclusively focused on HR-related matters using provided company documents
- Maintain confidentiality and provide accurate, policy-based guidance

RESPONSE GUIDELINES:
1. SCOPE: Only answer questions related to company HR policies, procedures, and benefits
2. SOURCE: Base responses exclusively on provided company documents
3. CLARITY: Provide clear, actionable guidance with specific policy references
4. BOUNDARIES: Politely redirect non-HR questions to appropriate resources
5. ACCURACY: If information isn't in the documents, state this clearly
6. TONE: Professional yet approachable, maintaining company values

STRUCTURED RESPONSE FORMAT:
- Direct answer to the question
- Relevant policy/document references
- Next steps or additional resources if applicable
- Contact information for complex cases requiring human intervention

Remember: You represent {self.COMPANY_NAME} and should reflect our commitment to supporting employees through clear, accurate HR guidance.
"""

    def get_similarity_search_config(self) -> Dict[str, Any]:
        """
        Get optimized configuration for vector similarity search.

        Returns:
            Dictionary with search parameters
        """
        return {
            'k': self.MAX_CONTEXT_CHUNKS,
            'similarity_threshold': self.SIMILARITY_THRESHOLD,
            'max_results': self.MAX_SEARCH_RESULTS,
            'include_metadata': True,
            'score_threshold': 0.3,  # Minimum relevance score
            'diversity_penalty': 0.1  # Encourage diverse results
        }

    def get_gemini_config(self) -> Dict[str, Any]:
        """
        Get optimized configuration for Gemini API calls.

        Returns:
            Dictionary with API parameters
        """
        return {
            'model': self.GEMINI_MODEL,
            'temperature': self.TEMPERATURE,
            'max_output_tokens': self.MAX_RESPONSE_TOKENS,
            'top_p': 0.8,  # Nucleus sampling for balanced creativity
            'top_k': 40,  # Limit token consideration for consistency
            'stop_sequences': ["Human:", "Assistant:", "---"],
        }

    def get_document_processing_config(self) -> Dict[str, Any]:
        """
        Get optimized configuration for document processing pipeline.

        Returns:
            Dictionary with processing parameters
        """
        return {
            'chunk_size': self.CHUNK_SIZE,
            'chunk_overlap': self.CHUNK_OVERLAP,
            'min_chunk_size': self.MIN_CHUNK_SIZE,
            'max_file_size': self.MAX_FILE_SIZE,
            'embedding_batch_size': self.EMBEDDING_BATCH_SIZE,
            'enable_caching': self.ENABLE_MODEL_CACHING,
            'supported_formats': ['pdf'],
            'content_filters': {
                'min_word_count': 10,
                'max_word_count': 2000,
                'remove_headers_footers': True,
                'normalize_whitespace': True
            }
        }

    def get_streamlit_config(self) -> Dict[str, str]:
        """
        Get Streamlit-specific configuration for optimal UI performance.

        Returns:
            Dictionary with Streamlit settings
        """
        return {
            'page_title': self.APP_NAME,
            'page_icon': '🔷',
            'layout': 'wide',
            'initial_sidebar_state': 'collapsed',
            'menu_items': {
                'Get Help': f'mailto:support@{self.COMPANY_NAME.lower().replace(" ", "")}.com',
                'Report a bug': None,
                'About': f'{self.APP_NAME} v{self.APP_VERSION} - Powered by Google Gemini AI'
            }
        }

    def get_logging_config(self) -> Dict[str, Any]:
        """
        Get comprehensive logging configuration for monitoring and debugging.

        Returns:
            Dictionary with logging parameters
        """
        return {
            'level': self.LOG_LEVEL,
            'file_path': str(self.LOG_FILE_PATH),
            'enable_interaction_logging': self.ENABLE_INTERACTION_LOGGING,
            'log_format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            'max_file_size': 10 * 1024 * 1024,  # 10MB
            'backup_count': 5,
            'console_output': not self.IS_PRODUCTION
        }

    def get_security_config(self) -> Dict[str, Any]:
        """
        Get security configuration for admin access and session management.

        Returns:
            Dictionary with security parameters
        """
        return {
            'session_timeout_hours': self.SESSION_TIMEOUT_HOURS,
            'admin_session_timeout_hours': self.ADMIN_SESSION_TIMEOUT_HOURS,
            'password_min_length': 8,
            'password_complexity_required': self.IS_PRODUCTION,
            'enable_rate_limiting': self.IS_PRODUCTION,
            'max_failed_attempts': 3,
            'lockout_duration_minutes': 15
        }

    def create_environment_file(self, file_path: Optional[str] = None) -> str:
        """
        Generate .env file template with all configuration options.

        Args:
            file_path: Optional path for .env file (defaults to '.env')

        Returns:
            Path to created .env file, or "" if the file could not be written
        """
        if not file_path:
            file_path = '.env'

        env_content = f"""# {self.APP_NAME} Configuration
# Generated automatically - modify as needed for your deployment

# === Application Settings ===
APP_NAME="{self.APP_NAME}"
APP_VERSION="{self.APP_VERSION}"
COMPANY_NAME="{self.COMPANY_NAME}"
ENVIRONMENT=production

# === Document Processing ===
CHUNK_SIZE={self.CHUNK_SIZE}
CHUNK_OVERLAP={self.CHUNK_OVERLAP}
MIN_CHUNK_SIZE={self.MIN_CHUNK_SIZE}
MAX_FILE_SIZE={self.MAX_FILE_SIZE}

# === Vector Database ===
VECTOR_DB_PATH=./vector_db
MAX_CONTEXT_CHUNKS={self.MAX_CONTEXT_CHUNKS}
SIMILARITY_THRESHOLD={self.SIMILARITY_THRESHOLD}

# === API Configuration ===
GEMINI_MODEL={self.GEMINI_MODEL}
TEMPERATURE={self.TEMPERATURE}
MAX_RESPONSE_TOKENS={self.MAX_RESPONSE_TOKENS}

# === Security ===
SESSION_TIMEOUT_HOURS={self.SESSION_TIMEOUT_HOURS}
ADMIN_SESSION_TIMEOUT_HOURS={self.ADMIN_SESSION_TIMEOUT_HOURS}

# === Logging ===
LOG_LEVEL={self.LOG_LEVEL}
LOG_FILE_PATH=./logs/hr_assistant.log
ENABLE_INTERACTION_LOGGING=true

# === Performance ===
EMBEDDING_BATCH_SIZE={self.EMBEDDING_BATCH_SIZE}
ENABLE_MODEL_CACHING=true
ENABLE_CACHING=true
"""
        try:
            with open(file_path, 'w') as f:
                f.write(env_content)
            return file_path
        except OSError as e:
            # Previously this failure could be completely silent; always report.
            self._emit_error(f"Failed to create .env file: {str(e)}")
            return ""

    def __str__(self) -> str:
        """String representation for debugging and logging."""
        return f"{self.APP_NAME} Config (Environment: {self.ENVIRONMENT})"

    def __repr__(self) -> str:
        """Developer-friendly representation."""
        return f"Config(app='{self.APP_NAME}', env='{self.ENVIRONMENT}', version='{self.APP_VERSION}')"