# HR-Assistant / config.py
import os
from pathlib import Path
from typing import Dict, Any, Optional
import streamlit as st
class Config:
    """
    Centralized configuration management for BLUESCARF AI HR Assistant.

    Provides environment-aware settings with sensible defaults and validation.
    All values can be overridden via environment variables; Hugging Face
    Spaces deployments (detected via SPACE_ID) get reduced resource limits.
    """

    def __init__(self):
        """Initialize configuration with environment-specific optimizations.

        Raises:
            ValueError: If any configuration value fails validation.
        """
        self._load_environment_config()
        self._validate_configuration()

    def _load_environment_config(self):
        """Load configuration from environment variables with intelligent defaults."""
        # === Core Application Settings ===
        self.APP_NAME = "BLUESCARF AI HR Assistant"
        self.APP_VERSION = "1.0.0"
        self.COMPANY_NAME = "BLUESCARF ARTIFICIAL INTELLIGENCE"

        # === Document Processing Configuration ===
        # Optimal chunk size for semantic coherence (384-512 tokens typical)
        self.CHUNK_SIZE = int(os.getenv('CHUNK_SIZE', 1000))
        # Overlap for context continuity (10-20% of chunk size)
        self.CHUNK_OVERLAP = int(os.getenv('CHUNK_OVERLAP', 200))
        # Minimum viable chunk size to filter noise
        self.MIN_CHUNK_SIZE = int(os.getenv('MIN_CHUNK_SIZE', 100))
        # Maximum file size (50MB default for enterprise documents)
        self.MAX_FILE_SIZE = int(os.getenv('MAX_FILE_SIZE', 50 * 1024 * 1024))

        # === Vector Store Configuration ===
        # Persistent storage path with environment fallback.
        # os.getenv expects a string default, so the Path is stringified first.
        default_db_path = Path("vector_db")
        self.VECTOR_DB_PATH = Path(os.getenv('VECTOR_DB_PATH', str(default_db_path)))
        # Maximum context chunks for retrieval (balance between context and noise)
        self.MAX_CONTEXT_CHUNKS = int(os.getenv('MAX_CONTEXT_CHUNKS', 5))
        # Similarity search parameters
        self.SIMILARITY_THRESHOLD = float(os.getenv('SIMILARITY_THRESHOLD', 0.5))
        self.MAX_SEARCH_RESULTS = int(os.getenv('MAX_SEARCH_RESULTS', 10))

        # === API Configuration ===
        # Gemini model selection (optimized for reasoning and context)
        self.GEMINI_MODEL = os.getenv('GEMINI_MODEL', 'gemini-pro')
        # Response generation parameters
        self.MAX_RESPONSE_TOKENS = int(os.getenv('MAX_RESPONSE_TOKENS', 1024))
        self.TEMPERATURE = float(os.getenv('TEMPERATURE', 0.3))  # Conservative for factual responses
        # API rate limiting and retry configuration
        self.API_RETRY_ATTEMPTS = int(os.getenv('API_RETRY_ATTEMPTS', 3))
        self.API_TIMEOUT_SECONDS = int(os.getenv('API_TIMEOUT_SECONDS', 30))

        # === Security Configuration ===
        # Session and authentication settings
        self.SESSION_TIMEOUT_HOURS = int(os.getenv('SESSION_TIMEOUT_HOURS', 8))
        self.ADMIN_SESSION_TIMEOUT_HOURS = int(os.getenv('ADMIN_SESSION_TIMEOUT_HOURS', 2))

        # === Logging and Monitoring ===
        # Application logging configuration
        self.LOG_LEVEL = os.getenv('LOG_LEVEL', 'INFO')
        self.LOG_FILE_PATH = Path(os.getenv('LOG_FILE_PATH', 'logs/hr_assistant.log'))
        self.ENABLE_INTERACTION_LOGGING = os.getenv('ENABLE_INTERACTION_LOGGING', 'true').lower() == 'true'

        # === Performance Optimization ===
        # Embedding model caching and batch processing
        self.EMBEDDING_BATCH_SIZE = int(os.getenv('EMBEDDING_BATCH_SIZE', 32))
        self.ENABLE_MODEL_CACHING = os.getenv('ENABLE_MODEL_CACHING', 'true').lower() == 'true'
        # Streamlit performance settings
        self.STREAMLIT_THEME = os.getenv('STREAMLIT_THEME', 'light')
        self.ENABLE_CACHING = os.getenv('ENABLE_CACHING', 'true').lower() == 'true'

        # === Deployment Configuration ===
        # Environment detection for deployment-specific optimizations
        self.ENVIRONMENT = os.getenv('ENVIRONMENT', 'development')
        self.IS_PRODUCTION = self.ENVIRONMENT.lower() == 'production'
        # SPACE_ID is set automatically inside Hugging Face Spaces containers
        self.IS_HUGGINGFACE = os.getenv('SPACE_ID') is not None

        # Resource limits for cloud deployment
        if self.IS_HUGGINGFACE:
            self._apply_huggingface_optimizations()

    def _apply_huggingface_optimizations(self):
        """Apply Hugging Face Spaces specific optimizations.

        Caps memory- and compute-heavy settings; uses min() so explicit
        environment overrides below the cap are still honored.
        """
        # Reduce memory footprint for cloud deployment
        self.CHUNK_SIZE = min(self.CHUNK_SIZE, 800)
        self.MAX_CONTEXT_CHUNKS = min(self.MAX_CONTEXT_CHUNKS, 4)
        self.EMBEDDING_BATCH_SIZE = min(self.EMBEDDING_BATCH_SIZE, 16)
        self.MAX_FILE_SIZE = min(self.MAX_FILE_SIZE, 25 * 1024 * 1024)  # 25MB limit

        # Optimize for limited computational resources
        self.ENABLE_MODEL_CACHING = True
        self.API_TIMEOUT_SECONDS = 60  # More lenient timeout for cloud

    def _validate_configuration(self):
        """Validate configuration parameters and ensure system compatibility.

        Creates the vector-store and log directories as a side effect.

        Raises:
            ValueError: Listing every failed validation rule.
        """
        validation_errors = []

        # Validate numeric ranges
        if self.CHUNK_SIZE < 100 or self.CHUNK_SIZE > 2000:
            validation_errors.append("CHUNK_SIZE must be between 100 and 2000")
        if self.CHUNK_OVERLAP >= self.CHUNK_SIZE:
            validation_errors.append("CHUNK_OVERLAP must be less than CHUNK_SIZE")
        if self.SIMILARITY_THRESHOLD < 0 or self.SIMILARITY_THRESHOLD > 1:
            validation_errors.append("SIMILARITY_THRESHOLD must be between 0 and 1")
        if self.TEMPERATURE < 0 or self.TEMPERATURE > 1:
            validation_errors.append("TEMPERATURE must be between 0 and 1")

        # Validate paths and create directories
        try:
            self.VECTOR_DB_PATH.mkdir(parents=True, exist_ok=True)
            self.LOG_FILE_PATH.parent.mkdir(parents=True, exist_ok=True)
        except Exception as e:
            validation_errors.append(f"Cannot create required directories: {str(e)}")

        # Report validation errors
        if validation_errors:
            error_message = "Configuration validation failed:\n" + "\n".join(validation_errors)
            # BUGFIX: streamlit is imported as `st` (see module imports), so the
            # bound name to check for is 'st' — 'streamlit' is never in globals().
            if 'st' in globals():
                st.error(error_message)
            else:
                print(f"ERROR: {error_message}")
            raise ValueError(error_message)

    def get_hr_context_prompt(self) -> str:
        """
        Generate context-aware system prompt for HR assistant interactions.

        Returns:
            Optimized system prompt for Gemini API
        """
        return f"""
You are an intelligent HR Assistant for {self.COMPANY_NAME}.
CORE IDENTITY:
- Professional, helpful, and knowledgeable about company policies
- Exclusively focused on HR-related matters using provided company documents
- Maintain confidentiality and provide accurate, policy-based guidance
RESPONSE GUIDELINES:
1. SCOPE: Only answer questions related to company HR policies, procedures, and benefits
2. SOURCE: Base responses exclusively on provided company documents
3. CLARITY: Provide clear, actionable guidance with specific policy references
4. BOUNDARIES: Politely redirect non-HR questions to appropriate resources
5. ACCURACY: If information isn't in the documents, state this clearly
6. TONE: Professional yet approachable, maintaining company values
STRUCTURED RESPONSE FORMAT:
- Direct answer to the question
- Relevant policy/document references
- Next steps or additional resources if applicable
- Contact information for complex cases requiring human intervention
Remember: You represent {self.COMPANY_NAME} and should reflect our commitment to supporting employees through clear, accurate HR guidance.
"""

    def get_similarity_search_config(self) -> Dict[str, Any]:
        """
        Get optimized configuration for vector similarity search.

        Returns:
            Dictionary with search parameters
        """
        return {
            'k': self.MAX_CONTEXT_CHUNKS,
            'similarity_threshold': self.SIMILARITY_THRESHOLD,
            'max_results': self.MAX_SEARCH_RESULTS,
            'include_metadata': True,
            'score_threshold': 0.3,  # Minimum relevance score
            'diversity_penalty': 0.1  # Encourage diverse results
        }

    def get_gemini_config(self) -> Dict[str, Any]:
        """
        Get optimized configuration for Gemini API calls.

        Returns:
            Dictionary with API parameters
        """
        return {
            'model': self.GEMINI_MODEL,
            'temperature': self.TEMPERATURE,
            'max_output_tokens': self.MAX_RESPONSE_TOKENS,
            'top_p': 0.8,  # Nucleus sampling for balanced creativity
            'top_k': 40,  # Limit token consideration for consistency
            'stop_sequences': ["Human:", "Assistant:", "---"],
        }

    def get_document_processing_config(self) -> Dict[str, Any]:
        """
        Get optimized configuration for document processing pipeline.

        Returns:
            Dictionary with processing parameters
        """
        return {
            'chunk_size': self.CHUNK_SIZE,
            'chunk_overlap': self.CHUNK_OVERLAP,
            'min_chunk_size': self.MIN_CHUNK_SIZE,
            'max_file_size': self.MAX_FILE_SIZE,
            'embedding_batch_size': self.EMBEDDING_BATCH_SIZE,
            'enable_caching': self.ENABLE_MODEL_CACHING,
            'supported_formats': ['pdf'],
            'content_filters': {
                'min_word_count': 10,
                'max_word_count': 2000,
                'remove_headers_footers': True,
                'normalize_whitespace': True
            }
        }

    def get_streamlit_config(self) -> Dict[str, Any]:
        """
        Get Streamlit-specific configuration for optimal UI performance.

        Returns:
            Dictionary with Streamlit settings. Note: values are not all
            strings ('menu_items' is a nested dict with a None entry), so the
            return type is Dict[str, Any].
        """
        return {
            'page_title': self.APP_NAME,
            'page_icon': '🔷',
            'layout': 'wide',
            'initial_sidebar_state': 'collapsed',
            'menu_items': {
                'Get Help': f'mailto:support@{self.COMPANY_NAME.lower().replace(" ", "")}.com',
                'Report a bug': None,
                'About': f'{self.APP_NAME} v{self.APP_VERSION} - Powered by Google Gemini AI'
            }
        }

    def get_logging_config(self) -> Dict[str, Any]:
        """
        Get comprehensive logging configuration for monitoring and debugging.

        Returns:
            Dictionary with logging parameters
        """
        return {
            'level': self.LOG_LEVEL,
            'file_path': str(self.LOG_FILE_PATH),
            'enable_interaction_logging': self.ENABLE_INTERACTION_LOGGING,
            'log_format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            'max_file_size': 10 * 1024 * 1024,  # 10MB
            'backup_count': 5,
            'console_output': not self.IS_PRODUCTION
        }

    def get_security_config(self) -> Dict[str, Any]:
        """
        Get security configuration for admin access and session management.

        Returns:
            Dictionary with security parameters
        """
        return {
            'session_timeout_hours': self.SESSION_TIMEOUT_HOURS,
            'admin_session_timeout_hours': self.ADMIN_SESSION_TIMEOUT_HOURS,
            'password_min_length': 8,
            'password_complexity_required': self.IS_PRODUCTION,
            'enable_rate_limiting': self.IS_PRODUCTION,
            'max_failed_attempts': 3,
            'lockout_duration_minutes': 15
        }

    def create_environment_file(self, file_path: Optional[str] = None) -> str:
        """
        Generate .env file template with all configuration options.

        Args:
            file_path: Optional path for .env file (defaults to './.env')

        Returns:
            Path to created .env file, or "" if writing failed.
        """
        if not file_path:
            file_path = '.env'
        env_content = f"""# {self.APP_NAME} Configuration
# Generated automatically - modify as needed for your deployment
# === Application Settings ===
APP_NAME="{self.APP_NAME}"
APP_VERSION="{self.APP_VERSION}"
COMPANY_NAME="{self.COMPANY_NAME}"
ENVIRONMENT=production
# === Document Processing ===
CHUNK_SIZE={self.CHUNK_SIZE}
CHUNK_OVERLAP={self.CHUNK_OVERLAP}
MIN_CHUNK_SIZE={self.MIN_CHUNK_SIZE}
MAX_FILE_SIZE={self.MAX_FILE_SIZE}
# === Vector Database ===
VECTOR_DB_PATH=./vector_db
MAX_CONTEXT_CHUNKS={self.MAX_CONTEXT_CHUNKS}
SIMILARITY_THRESHOLD={self.SIMILARITY_THRESHOLD}
# === API Configuration ===
GEMINI_MODEL={self.GEMINI_MODEL}
TEMPERATURE={self.TEMPERATURE}
MAX_RESPONSE_TOKENS={self.MAX_RESPONSE_TOKENS}
# === Security ===
SESSION_TIMEOUT_HOURS={self.SESSION_TIMEOUT_HOURS}
ADMIN_SESSION_TIMEOUT_HOURS={self.ADMIN_SESSION_TIMEOUT_HOURS}
# === Logging ===
LOG_LEVEL={self.LOG_LEVEL}
LOG_FILE_PATH=./logs/hr_assistant.log
ENABLE_INTERACTION_LOGGING=true
# === Performance ===
EMBEDDING_BATCH_SIZE={self.EMBEDDING_BATCH_SIZE}
ENABLE_MODEL_CACHING=true
ENABLE_CACHING=true
"""
        try:
            with open(file_path, 'w') as f:
                f.write(env_content)
            return file_path
        except Exception as e:
            # BUGFIX: original checked `'streamlit' in globals()`, which is
            # always False (the module is bound as `st`), so write failures
            # were silently swallowed. Report via st when available, else stderr-style print.
            if 'st' in globals():
                st.error(f"Failed to create .env file: {str(e)}")
            else:
                print(f"ERROR: Failed to create .env file: {str(e)}")
            return ""

    def __str__(self) -> str:
        """String representation for debugging and logging."""
        return f"{self.APP_NAME} Config (Environment: {self.ENVIRONMENT})"

    def __repr__(self) -> str:
        """Developer-friendly representation."""
        return f"Config(app='{self.APP_NAME}', env='{self.ENVIRONMENT}', version='{self.APP_VERSION}')"