# HR-Assistant / config.py
import os
from pathlib import Path
from typing import Dict, Any, Optional
import streamlit as st
class Config:
    """
    Centralized configuration management for BLUESCARF AI HR Assistant.

    Provides environment-aware settings with sensible defaults and validation.
    All values can be overridden via environment variables; Hugging Face
    Spaces deployments (detected via SPACE_ID) get reduced resource limits.
    """

    def __init__(self):
        """Initialize configuration with environment-specific optimizations.

        Raises:
            ValueError: If any configuration value fails validation.
        """
        self._load_environment_config()
        self._validate_configuration()

    def _load_environment_config(self):
        """Load configuration from environment variables with intelligent defaults."""
        # === Core Application Settings ===
        self.APP_NAME = "BLUESCARF AI HR Assistant"
        self.APP_VERSION = "1.0.0"
        self.COMPANY_NAME = "BLUESCARF ARTIFICIAL INTELLIGENCE"

        # === Document Processing Configuration ===
        # Optimal chunk size for semantic coherence (384-512 tokens typical)
        self.CHUNK_SIZE = int(os.getenv('CHUNK_SIZE', 1000))
        # Overlap for context continuity (10-20% of chunk size)
        self.CHUNK_OVERLAP = int(os.getenv('CHUNK_OVERLAP', 200))
        # Minimum viable chunk size to filter noise
        self.MIN_CHUNK_SIZE = int(os.getenv('MIN_CHUNK_SIZE', 100))
        # Maximum file size (50MB default for enterprise documents)
        self.MAX_FILE_SIZE = int(os.getenv('MAX_FILE_SIZE', 50 * 1024 * 1024))

        # === Vector Store Configuration ===
        # Persistent storage path with environment fallback.
        # os.getenv expects a string default, so the Path is stringified first.
        default_db_path = Path("vector_db")
        self.VECTOR_DB_PATH = Path(os.getenv('VECTOR_DB_PATH', str(default_db_path)))
        # Maximum context chunks for retrieval (balance between context and noise)
        self.MAX_CONTEXT_CHUNKS = int(os.getenv('MAX_CONTEXT_CHUNKS', 5))
        # Similarity search parameters
        self.SIMILARITY_THRESHOLD = float(os.getenv('SIMILARITY_THRESHOLD', 0.5))
        self.MAX_SEARCH_RESULTS = int(os.getenv('MAX_SEARCH_RESULTS', 10))

        # === API Configuration ===
        # Gemini model selection (optimized for reasoning and context)
        self.GEMINI_MODEL = os.getenv('GEMINI_MODEL', 'gemini-pro')
        # Response generation parameters
        self.MAX_RESPONSE_TOKENS = int(os.getenv('MAX_RESPONSE_TOKENS', 1024))
        self.TEMPERATURE = float(os.getenv('TEMPERATURE', 0.3))  # Conservative for factual responses
        # API rate limiting and retry configuration
        self.API_RETRY_ATTEMPTS = int(os.getenv('API_RETRY_ATTEMPTS', 3))
        self.API_TIMEOUT_SECONDS = int(os.getenv('API_TIMEOUT_SECONDS', 30))

        # === Security Configuration ===
        # Session and authentication settings
        self.SESSION_TIMEOUT_HOURS = int(os.getenv('SESSION_TIMEOUT_HOURS', 8))
        self.ADMIN_SESSION_TIMEOUT_HOURS = int(os.getenv('ADMIN_SESSION_TIMEOUT_HOURS', 2))

        # === Logging and Monitoring ===
        # Application logging configuration
        self.LOG_LEVEL = os.getenv('LOG_LEVEL', 'INFO')
        self.LOG_FILE_PATH = Path(os.getenv('LOG_FILE_PATH', 'logs/hr_assistant.log'))
        self.ENABLE_INTERACTION_LOGGING = os.getenv('ENABLE_INTERACTION_LOGGING', 'true').lower() == 'true'

        # === Performance Optimization ===
        # Embedding model caching and batch processing
        self.EMBEDDING_BATCH_SIZE = int(os.getenv('EMBEDDING_BATCH_SIZE', 32))
        self.ENABLE_MODEL_CACHING = os.getenv('ENABLE_MODEL_CACHING', 'true').lower() == 'true'
        # Streamlit performance settings
        self.STREAMLIT_THEME = os.getenv('STREAMLIT_THEME', 'light')
        self.ENABLE_CACHING = os.getenv('ENABLE_CACHING', 'true').lower() == 'true'

        # === Deployment Configuration ===
        # Environment detection for deployment-specific optimizations
        self.ENVIRONMENT = os.getenv('ENVIRONMENT', 'development')
        self.IS_PRODUCTION = self.ENVIRONMENT.lower() == 'production'
        # SPACE_ID is set automatically inside Hugging Face Spaces containers
        self.IS_HUGGINGFACE = os.getenv('SPACE_ID') is not None

        # Resource limits for cloud deployment
        if self.IS_HUGGINGFACE:
            self._apply_huggingface_optimizations()

    def _apply_huggingface_optimizations(self):
        """Apply Hugging Face Spaces specific optimizations.

        Caps memory- and compute-heavy settings; uses min() so explicit
        environment overrides below the cap are still honored.
        """
        # Reduce memory footprint for cloud deployment
        self.CHUNK_SIZE = min(self.CHUNK_SIZE, 800)
        self.MAX_CONTEXT_CHUNKS = min(self.MAX_CONTEXT_CHUNKS, 4)
        self.EMBEDDING_BATCH_SIZE = min(self.EMBEDDING_BATCH_SIZE, 16)
        self.MAX_FILE_SIZE = min(self.MAX_FILE_SIZE, 25 * 1024 * 1024)  # 25MB limit

        # Optimize for limited computational resources
        self.ENABLE_MODEL_CACHING = True
        self.API_TIMEOUT_SECONDS = 60  # More lenient timeout for cloud

    def _validate_configuration(self):
        """Validate configuration parameters and ensure system compatibility.

        Creates the vector-store and log directories as a side effect.

        Raises:
            ValueError: Listing every failed validation rule.
        """
        validation_errors = []

        # Validate numeric ranges
        if self.CHUNK_SIZE < 100 or self.CHUNK_SIZE > 2000:
            validation_errors.append("CHUNK_SIZE must be between 100 and 2000")
        if self.CHUNK_OVERLAP >= self.CHUNK_SIZE:
            validation_errors.append("CHUNK_OVERLAP must be less than CHUNK_SIZE")
        if self.SIMILARITY_THRESHOLD < 0 or self.SIMILARITY_THRESHOLD > 1:
            validation_errors.append("SIMILARITY_THRESHOLD must be between 0 and 1")
        if self.TEMPERATURE < 0 or self.TEMPERATURE > 1:
            validation_errors.append("TEMPERATURE must be between 0 and 1")

        # Validate paths and create directories
        try:
            self.VECTOR_DB_PATH.mkdir(parents=True, exist_ok=True)
            self.LOG_FILE_PATH.parent.mkdir(parents=True, exist_ok=True)
        except Exception as e:
            validation_errors.append(f"Cannot create required directories: {str(e)}")

        # Report validation errors
        if validation_errors:
            error_message = "Configuration validation failed:\n" + "\n".join(validation_errors)
            # BUGFIX: streamlit is imported as `st` (see module imports), so the
            # bound name to check for is 'st' — 'streamlit' is never in globals().
            if 'st' in globals():
                st.error(error_message)
            else:
                print(f"ERROR: {error_message}")
            raise ValueError(error_message)

    def get_hr_context_prompt(self) -> str:
        """
        Generate context-aware system prompt for HR assistant interactions.

        Returns:
            Optimized system prompt for Gemini API
        """
        return f"""
You are an intelligent HR Assistant for {self.COMPANY_NAME}.
CORE IDENTITY:
- Professional, helpful, and knowledgeable about company policies
- Exclusively focused on HR-related matters using provided company documents
- Maintain confidentiality and provide accurate, policy-based guidance
RESPONSE GUIDELINES:
1. SCOPE: Only answer questions related to company HR policies, procedures, and benefits
2. SOURCE: Base responses exclusively on provided company documents
3. CLARITY: Provide clear, actionable guidance with specific policy references
4. BOUNDARIES: Politely redirect non-HR questions to appropriate resources
5. ACCURACY: If information isn't in the documents, state this clearly
6. TONE: Professional yet approachable, maintaining company values
STRUCTURED RESPONSE FORMAT:
- Direct answer to the question
- Relevant policy/document references
- Next steps or additional resources if applicable
- Contact information for complex cases requiring human intervention
Remember: You represent {self.COMPANY_NAME} and should reflect our commitment to supporting employees through clear, accurate HR guidance.
"""

    def get_similarity_search_config(self) -> Dict[str, Any]:
        """
        Get optimized configuration for vector similarity search.

        Returns:
            Dictionary with search parameters
        """
        return {
            'k': self.MAX_CONTEXT_CHUNKS,
            'similarity_threshold': self.SIMILARITY_THRESHOLD,
            'max_results': self.MAX_SEARCH_RESULTS,
            'include_metadata': True,
            'score_threshold': 0.3,  # Minimum relevance score
            'diversity_penalty': 0.1  # Encourage diverse results
        }

    def get_gemini_config(self) -> Dict[str, Any]:
        """
        Get optimized configuration for Gemini API calls.

        Returns:
            Dictionary with API parameters
        """
        return {
            'model': self.GEMINI_MODEL,
            'temperature': self.TEMPERATURE,
            'max_output_tokens': self.MAX_RESPONSE_TOKENS,
            'top_p': 0.8,  # Nucleus sampling for balanced creativity
            'top_k': 40,  # Limit token consideration for consistency
            'stop_sequences': ["Human:", "Assistant:", "---"],
        }

    def get_document_processing_config(self) -> Dict[str, Any]:
        """
        Get optimized configuration for document processing pipeline.

        Returns:
            Dictionary with processing parameters
        """
        return {
            'chunk_size': self.CHUNK_SIZE,
            'chunk_overlap': self.CHUNK_OVERLAP,
            'min_chunk_size': self.MIN_CHUNK_SIZE,
            'max_file_size': self.MAX_FILE_SIZE,
            'embedding_batch_size': self.EMBEDDING_BATCH_SIZE,
            'enable_caching': self.ENABLE_MODEL_CACHING,
            'supported_formats': ['pdf'],
            'content_filters': {
                'min_word_count': 10,
                'max_word_count': 2000,
                'remove_headers_footers': True,
                'normalize_whitespace': True
            }
        }

    def get_streamlit_config(self) -> Dict[str, Any]:
        """
        Get Streamlit-specific configuration for optimal UI performance.

        Returns:
            Dictionary with Streamlit settings. Note: values are not all
            strings ('menu_items' is a nested dict with a None entry), so the
            return type is Dict[str, Any].
        """
        return {
            'page_title': self.APP_NAME,
            'page_icon': '🔷',
            'layout': 'wide',
            'initial_sidebar_state': 'collapsed',
            'menu_items': {
                'Get Help': f'mailto:support@{self.COMPANY_NAME.lower().replace(" ", "")}.com',
                'Report a bug': None,
                'About': f'{self.APP_NAME} v{self.APP_VERSION} - Powered by Google Gemini AI'
            }
        }

    def get_logging_config(self) -> Dict[str, Any]:
        """
        Get comprehensive logging configuration for monitoring and debugging.

        Returns:
            Dictionary with logging parameters
        """
        return {
            'level': self.LOG_LEVEL,
            'file_path': str(self.LOG_FILE_PATH),
            'enable_interaction_logging': self.ENABLE_INTERACTION_LOGGING,
            'log_format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            'max_file_size': 10 * 1024 * 1024,  # 10MB
            'backup_count': 5,
            'console_output': not self.IS_PRODUCTION
        }

    def get_security_config(self) -> Dict[str, Any]:
        """
        Get security configuration for admin access and session management.

        Returns:
            Dictionary with security parameters
        """
        return {
            'session_timeout_hours': self.SESSION_TIMEOUT_HOURS,
            'admin_session_timeout_hours': self.ADMIN_SESSION_TIMEOUT_HOURS,
            'password_min_length': 8,
            'password_complexity_required': self.IS_PRODUCTION,
            'enable_rate_limiting': self.IS_PRODUCTION,
            'max_failed_attempts': 3,
            'lockout_duration_minutes': 15
        }

    def create_environment_file(self, file_path: Optional[str] = None) -> str:
        """
        Generate .env file template with all configuration options.

        Args:
            file_path: Optional path for .env file (defaults to './.env')

        Returns:
            Path to created .env file, or "" if writing failed.
        """
        if not file_path:
            file_path = '.env'
        env_content = f"""# {self.APP_NAME} Configuration
# Generated automatically - modify as needed for your deployment
# === Application Settings ===
APP_NAME="{self.APP_NAME}"
APP_VERSION="{self.APP_VERSION}"
COMPANY_NAME="{self.COMPANY_NAME}"
ENVIRONMENT=production
# === Document Processing ===
CHUNK_SIZE={self.CHUNK_SIZE}
CHUNK_OVERLAP={self.CHUNK_OVERLAP}
MIN_CHUNK_SIZE={self.MIN_CHUNK_SIZE}
MAX_FILE_SIZE={self.MAX_FILE_SIZE}
# === Vector Database ===
VECTOR_DB_PATH=./vector_db
MAX_CONTEXT_CHUNKS={self.MAX_CONTEXT_CHUNKS}
SIMILARITY_THRESHOLD={self.SIMILARITY_THRESHOLD}
# === API Configuration ===
GEMINI_MODEL={self.GEMINI_MODEL}
TEMPERATURE={self.TEMPERATURE}
MAX_RESPONSE_TOKENS={self.MAX_RESPONSE_TOKENS}
# === Security ===
SESSION_TIMEOUT_HOURS={self.SESSION_TIMEOUT_HOURS}
ADMIN_SESSION_TIMEOUT_HOURS={self.ADMIN_SESSION_TIMEOUT_HOURS}
# === Logging ===
LOG_LEVEL={self.LOG_LEVEL}
LOG_FILE_PATH=./logs/hr_assistant.log
ENABLE_INTERACTION_LOGGING=true
# === Performance ===
EMBEDDING_BATCH_SIZE={self.EMBEDDING_BATCH_SIZE}
ENABLE_MODEL_CACHING=true
ENABLE_CACHING=true
"""
        try:
            with open(file_path, 'w') as f:
                f.write(env_content)
            return file_path
        except Exception as e:
            # BUGFIX: original checked `'streamlit' in globals()`, which is
            # always False (the module is bound as `st`), so write failures
            # were silently swallowed. Report via st when available, else stderr-style print.
            if 'st' in globals():
                st.error(f"Failed to create .env file: {str(e)}")
            else:
                print(f"ERROR: Failed to create .env file: {str(e)}")
            return ""

    def __str__(self) -> str:
        """String representation for debugging and logging."""
        return f"{self.APP_NAME} Config (Environment: {self.ENVIRONMENT})"

    def __repr__(self) -> str:
        """Developer-friendly representation."""
        return f"Config(app='{self.APP_NAME}', env='{self.ENVIRONMENT}', version='{self.APP_VERSION}')"