# NOTE(review): the three lines previously here ("Spaces:" / "Sleeping" /
# "Sleeping") were Hugging Face Spaces page residue from a copy/paste, not
# Python source. Replaced with this comment so the file parses.
import os
from pathlib import Path
from typing import Dict, Any, Optional

# Streamlit is only needed to surface errors in the UI. Importing it lazily
# keeps this config module usable in non-Streamlit contexts (tests, CLI
# tooling, batch jobs) where streamlit is not installed.
try:
    import streamlit as st
except ImportError:  # pragma: no cover - streamlit absent outside the app
    st = None


class Config:
    """
    Centralized configuration management for BLUESCARF AI HR Assistant.

    Provides environment-aware settings with sensible defaults and validation.
    All settings are read from environment variables at construction time;
    Hugging Face Spaces deployments get reduced resource limits automatically.
    """

    def __init__(self):
        """Initialize configuration with environment-specific optimizations.

        Raises:
            ValueError: If any configuration parameter fails validation
                (see :meth:`_validate_configuration`).
        """
        self._load_environment_config()
        self._validate_configuration()

    def _load_environment_config(self):
        """Load configuration from environment variables with intelligent defaults."""
        # === Core Application Settings ===
        self.APP_NAME = "BLUESCARF AI HR Assistant"
        self.APP_VERSION = "1.0.0"
        self.COMPANY_NAME = "BLUESCARF ARTIFICIAL INTELLIGENCE"

        # === Document Processing Configuration ===
        # Optimal chunk size for semantic coherence (384-512 tokens typical)
        self.CHUNK_SIZE = int(os.getenv('CHUNK_SIZE', 1000))
        # Overlap for context continuity (10-20% of chunk size)
        self.CHUNK_OVERLAP = int(os.getenv('CHUNK_OVERLAP', 200))
        # Minimum viable chunk size to filter noise
        self.MIN_CHUNK_SIZE = int(os.getenv('MIN_CHUNK_SIZE', 100))
        # Maximum file size (50MB default for enterprise documents)
        self.MAX_FILE_SIZE = int(os.getenv('MAX_FILE_SIZE', 50 * 1024 * 1024))

        # === Vector Store Configuration ===
        # Persistent storage path with environment fallback.
        # os.getenv expects a str default, so the Path is stringified first.
        self.VECTOR_DB_PATH = Path(os.getenv('VECTOR_DB_PATH', str(Path("vector_db"))))
        # Maximum context chunks for retrieval (balance between context and noise)
        self.MAX_CONTEXT_CHUNKS = int(os.getenv('MAX_CONTEXT_CHUNKS', 5))
        # Similarity search parameters
        self.SIMILARITY_THRESHOLD = float(os.getenv('SIMILARITY_THRESHOLD', 0.5))
        self.MAX_SEARCH_RESULTS = int(os.getenv('MAX_SEARCH_RESULTS', 10))

        # === API Configuration ===
        # Gemini model selection (optimized for reasoning and context)
        self.GEMINI_MODEL = os.getenv('GEMINI_MODEL', 'gemini-pro')
        # Response generation parameters
        self.MAX_RESPONSE_TOKENS = int(os.getenv('MAX_RESPONSE_TOKENS', 1024))
        self.TEMPERATURE = float(os.getenv('TEMPERATURE', 0.3))  # Conservative for factual responses
        # API rate limiting and retry configuration
        self.API_RETRY_ATTEMPTS = int(os.getenv('API_RETRY_ATTEMPTS', 3))
        self.API_TIMEOUT_SECONDS = int(os.getenv('API_TIMEOUT_SECONDS', 30))

        # === Security Configuration ===
        # Session and authentication settings
        self.SESSION_TIMEOUT_HOURS = int(os.getenv('SESSION_TIMEOUT_HOURS', 8))
        self.ADMIN_SESSION_TIMEOUT_HOURS = int(os.getenv('ADMIN_SESSION_TIMEOUT_HOURS', 2))

        # === Logging and Monitoring ===
        # Application logging configuration
        self.LOG_LEVEL = os.getenv('LOG_LEVEL', 'INFO')
        self.LOG_FILE_PATH = Path(os.getenv('LOG_FILE_PATH', 'logs/hr_assistant.log'))
        self.ENABLE_INTERACTION_LOGGING = os.getenv('ENABLE_INTERACTION_LOGGING', 'true').lower() == 'true'

        # === Performance Optimization ===
        # Embedding model caching and batch processing
        self.EMBEDDING_BATCH_SIZE = int(os.getenv('EMBEDDING_BATCH_SIZE', 32))
        self.ENABLE_MODEL_CACHING = os.getenv('ENABLE_MODEL_CACHING', 'true').lower() == 'true'
        # Streamlit performance settings
        self.STREAMLIT_THEME = os.getenv('STREAMLIT_THEME', 'light')
        self.ENABLE_CACHING = os.getenv('ENABLE_CACHING', 'true').lower() == 'true'

        # === Deployment Configuration ===
        # Environment detection for deployment-specific optimizations.
        # SPACE_ID is set automatically by Hugging Face Spaces runtimes.
        self.ENVIRONMENT = os.getenv('ENVIRONMENT', 'development')
        self.IS_PRODUCTION = self.ENVIRONMENT.lower() == 'production'
        self.IS_HUGGINGFACE = os.getenv('SPACE_ID') is not None

        # Resource limits for cloud deployment
        if self.IS_HUGGINGFACE:
            self._apply_huggingface_optimizations()

    def _apply_huggingface_optimizations(self):
        """Apply Hugging Face Spaces specific optimizations.

        Caps memory-sensitive settings (chunking, batch size, upload size)
        and relaxes the API timeout; never raises user-supplied values.
        """
        # Reduce memory footprint for cloud deployment
        self.CHUNK_SIZE = min(self.CHUNK_SIZE, 800)
        self.MAX_CONTEXT_CHUNKS = min(self.MAX_CONTEXT_CHUNKS, 4)
        self.EMBEDDING_BATCH_SIZE = min(self.EMBEDDING_BATCH_SIZE, 16)
        self.MAX_FILE_SIZE = min(self.MAX_FILE_SIZE, 25 * 1024 * 1024)  # 25MB limit
        # Optimize for limited computational resources
        self.ENABLE_MODEL_CACHING = True
        self.API_TIMEOUT_SECONDS = 60  # More lenient timeout for cloud

    def _validate_configuration(self):
        """Validate configuration parameters and ensure system compatibility.

        Checks numeric ranges and creates required directories.

        Raises:
            ValueError: With an aggregated message if any check fails.
        """
        validation_errors = []
        # Validate numeric ranges
        if self.CHUNK_SIZE < 100 or self.CHUNK_SIZE > 2000:
            validation_errors.append("CHUNK_SIZE must be between 100 and 2000")
        if self.CHUNK_OVERLAP >= self.CHUNK_SIZE:
            validation_errors.append("CHUNK_OVERLAP must be less than CHUNK_SIZE")
        if self.SIMILARITY_THRESHOLD < 0 or self.SIMILARITY_THRESHOLD > 1:
            validation_errors.append("SIMILARITY_THRESHOLD must be between 0 and 1")
        if self.TEMPERATURE < 0 or self.TEMPERATURE > 1:
            validation_errors.append("TEMPERATURE must be between 0 and 1")
        # Validate paths and create directories
        try:
            self.VECTOR_DB_PATH.mkdir(parents=True, exist_ok=True)
            self.LOG_FILE_PATH.parent.mkdir(parents=True, exist_ok=True)
        except Exception as e:
            validation_errors.append(f"Cannot create required directories: {str(e)}")
        # Report validation errors
        if validation_errors:
            error_message = "Configuration validation failed:\n" + "\n".join(validation_errors)
            # BUG FIX: the original checked `'streamlit' in globals()`, which
            # was always False (the module is imported as `st`), so errors
            # never reached the Streamlit UI. Report via Streamlit when it is
            # actually available, otherwise fall back to stderr-style print.
            if st is not None:
                st.error(error_message)
            else:
                print(f"ERROR: {error_message}")
            raise ValueError(error_message)

    def get_hr_context_prompt(self) -> str:
        """
        Generate context-aware system prompt for HR assistant interactions.

        Returns:
            Optimized system prompt for Gemini API
        """
        return f"""
You are an intelligent HR Assistant for {self.COMPANY_NAME}.
CORE IDENTITY:
- Professional, helpful, and knowledgeable about company policies
- Exclusively focused on HR-related matters using provided company documents
- Maintain confidentiality and provide accurate, policy-based guidance
RESPONSE GUIDELINES:
1. SCOPE: Only answer questions related to company HR policies, procedures, and benefits
2. SOURCE: Base responses exclusively on provided company documents
3. CLARITY: Provide clear, actionable guidance with specific policy references
4. BOUNDARIES: Politely redirect non-HR questions to appropriate resources
5. ACCURACY: If information isn't in the documents, state this clearly
6. TONE: Professional yet approachable, maintaining company values
STRUCTURED RESPONSE FORMAT:
- Direct answer to the question
- Relevant policy/document references
- Next steps or additional resources if applicable
- Contact information for complex cases requiring human intervention
Remember: You represent {self.COMPANY_NAME} and should reflect our commitment to supporting employees through clear, accurate HR guidance.
"""

    def get_similarity_search_config(self) -> Dict[str, Any]:
        """
        Get optimized configuration for vector similarity search.

        Returns:
            Dictionary with search parameters
        """
        return {
            'k': self.MAX_CONTEXT_CHUNKS,
            'similarity_threshold': self.SIMILARITY_THRESHOLD,
            'max_results': self.MAX_SEARCH_RESULTS,
            'include_metadata': True,
            'score_threshold': 0.3,  # Minimum relevance score
            'diversity_penalty': 0.1  # Encourage diverse results
        }

    def get_gemini_config(self) -> Dict[str, Any]:
        """
        Get optimized configuration for Gemini API calls.

        Returns:
            Dictionary with API parameters
        """
        return {
            'model': self.GEMINI_MODEL,
            'temperature': self.TEMPERATURE,
            'max_output_tokens': self.MAX_RESPONSE_TOKENS,
            'top_p': 0.8,  # Nucleus sampling for balanced creativity
            'top_k': 40,  # Limit token consideration for consistency
            'stop_sequences': ["Human:", "Assistant:", "---"],
        }

    def get_document_processing_config(self) -> Dict[str, Any]:
        """
        Get optimized configuration for document processing pipeline.

        Returns:
            Dictionary with processing parameters
        """
        return {
            'chunk_size': self.CHUNK_SIZE,
            'chunk_overlap': self.CHUNK_OVERLAP,
            'min_chunk_size': self.MIN_CHUNK_SIZE,
            'max_file_size': self.MAX_FILE_SIZE,
            'embedding_batch_size': self.EMBEDDING_BATCH_SIZE,
            'enable_caching': self.ENABLE_MODEL_CACHING,
            'supported_formats': ['pdf'],
            'content_filters': {
                'min_word_count': 10,
                'max_word_count': 2000,
                'remove_headers_footers': True,
                'normalize_whitespace': True
            }
        }

    def get_streamlit_config(self) -> Dict[str, Any]:
        """
        Get Streamlit-specific configuration for optimal UI performance.

        Returns:
            Dictionary with Streamlit settings (note: 'menu_items' is a
            nested dict, hence the Dict[str, Any] annotation — the original
            Dict[str, str] was incorrect).
        """
        return {
            'page_title': self.APP_NAME,
            'page_icon': '🔷',
            'layout': 'wide',
            'initial_sidebar_state': 'collapsed',
            'menu_items': {
                'Get Help': f'mailto:support@{self.COMPANY_NAME.lower().replace(" ", "")}.com',
                'Report a bug': None,
                'About': f'{self.APP_NAME} v{self.APP_VERSION} - Powered by Google Gemini AI'
            }
        }

    def get_logging_config(self) -> Dict[str, Any]:
        """
        Get comprehensive logging configuration for monitoring and debugging.

        Returns:
            Dictionary with logging parameters
        """
        return {
            'level': self.LOG_LEVEL,
            'file_path': str(self.LOG_FILE_PATH),
            'enable_interaction_logging': self.ENABLE_INTERACTION_LOGGING,
            'log_format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            'max_file_size': 10 * 1024 * 1024,  # 10MB
            'backup_count': 5,
            'console_output': not self.IS_PRODUCTION
        }

    def get_security_config(self) -> Dict[str, Any]:
        """
        Get security configuration for admin access and session management.

        Returns:
            Dictionary with security parameters
        """
        return {
            'session_timeout_hours': self.SESSION_TIMEOUT_HOURS,
            'admin_session_timeout_hours': self.ADMIN_SESSION_TIMEOUT_HOURS,
            'password_min_length': 8,
            'password_complexity_required': self.IS_PRODUCTION,
            'enable_rate_limiting': self.IS_PRODUCTION,
            'max_failed_attempts': 3,
            'lockout_duration_minutes': 15
        }

    def create_environment_file(self, file_path: Optional[str] = None) -> str:
        """
        Generate .env file template with all configuration options.

        Args:
            file_path: Optional path for .env file

        Returns:
            Path to created .env file, or "" if the write failed.
        """
        if not file_path:
            file_path = '.env'
        env_content = f"""# {self.APP_NAME} Configuration
# Generated automatically - modify as needed for your deployment
# === Application Settings ===
APP_NAME="{self.APP_NAME}"
APP_VERSION="{self.APP_VERSION}"
COMPANY_NAME="{self.COMPANY_NAME}"
ENVIRONMENT=production
# === Document Processing ===
CHUNK_SIZE={self.CHUNK_SIZE}
CHUNK_OVERLAP={self.CHUNK_OVERLAP}
MIN_CHUNK_SIZE={self.MIN_CHUNK_SIZE}
MAX_FILE_SIZE={self.MAX_FILE_SIZE}
# === Vector Database ===
VECTOR_DB_PATH=./vector_db
MAX_CONTEXT_CHUNKS={self.MAX_CONTEXT_CHUNKS}
SIMILARITY_THRESHOLD={self.SIMILARITY_THRESHOLD}
# === API Configuration ===
GEMINI_MODEL={self.GEMINI_MODEL}
TEMPERATURE={self.TEMPERATURE}
MAX_RESPONSE_TOKENS={self.MAX_RESPONSE_TOKENS}
# === Security ===
SESSION_TIMEOUT_HOURS={self.SESSION_TIMEOUT_HOURS}
ADMIN_SESSION_TIMEOUT_HOURS={self.ADMIN_SESSION_TIMEOUT_HOURS}
# === Logging ===
LOG_LEVEL={self.LOG_LEVEL}
LOG_FILE_PATH=./logs/hr_assistant.log
ENABLE_INTERACTION_LOGGING=true
# === Performance ===
EMBEDDING_BATCH_SIZE={self.EMBEDDING_BATCH_SIZE}
ENABLE_MODEL_CACHING=true
ENABLE_CACHING=true
"""
        try:
            with open(file_path, 'w') as f:
                f.write(env_content)
            return file_path
        except Exception as e:
            # BUG FIX: same always-False `'streamlit' in globals()` check as in
            # _validate_configuration — guard on actual Streamlit availability.
            if st is not None:
                st.error(f"Failed to create .env file: {str(e)}")
            return ""

    def __str__(self) -> str:
        """String representation for debugging and logging."""
        return f"{self.APP_NAME} Config (Environment: {self.ENVIRONMENT})"

    def __repr__(self) -> str:
        """Developer-friendly representation."""
        return f"Config(app='{self.APP_NAME}', env='{self.ENVIRONMENT}', version='{self.APP_VERSION}')"