# Spaces:
# Sleeping
# Sleeping
# llm_clients/performance_utils.py
"""
Performance optimization utilities to reduce startup time and memory usage.
"""
import os
import warnings
def apply_performance_optimizations():
    """Set process-wide environment flags and warning filters for a lean startup.

    Every environment variable listed below is assigned unconditionally, so a
    value already present in the process environment is overwritten. Three
    "ignore" warning filters are then installed and a confirmation line is
    printed to stdout.

    NOTE(review): these variables are presumably read by the respective
    libraries when they are imported or initialised, so this function most
    likely has to run before TensorFlow, PyTorch or tokenizers are imported --
    confirm against the application's import order.

    Returns:
        None
    """
    env_overrides = {
        # TensorFlow: switch off oneDNN optimizations, log errors only.
        "TF_ENABLE_ONEDNN_OPTS": "0",
        "TF_CPP_MIN_LOG_LEVEL": "3",
        # PyTorch: disable compilation for CPU-only inference.
        "TORCH_COMPILE_DISABLE": "1",
        "TORCHDYNAMO_DISABLE": "1",
        # Reduce tokenizer and CPU threading overhead.
        "TOKENIZERS_PARALLELISM": "false",
        "OMP_NUM_THREADS": "1",
    }
    os.environ.update(env_overrides)

    # Console-noise reduction. Filters are installed in the same order as
    # before because each call inserts at the front of the filter list.
    warnings.filterwarnings("ignore", category=FutureWarning)
    for noisy_module in ("transformers", "torch"):
        warnings.filterwarnings("ignore", category=UserWarning, module=noisy_module)

    print("⚡ Applied performance optimizations")
def setup_model_sharing():
    """Import the shared model manager early so it can control loading order.

    Returns:
        The ``shared_model_manager`` object exported by the sibling
        ``shared_models`` module, or ``None`` when importing it raises
        ``ImportError`` (a warning line is printed in that case).
    """
    try:
        # Function-scope import: the module is only touched when this
        # function is actually called.
        from .shared_models import shared_model_manager
    except ImportError:
        print("⚠️ Could not initialize shared model manager")
        return None

    print("🔗 Shared model manager initialized")
    return shared_model_manager
def optimize_transformers():
    """Restrict ``transformers`` library logging to errors only.

    Best-effort: when ``transformers`` cannot be imported the function
    returns silently without printing anything.

    Returns:
        None
    """
    try:
        import transformers

        # Only error-level messages from transformers remain visible.
        transformers.logging.set_verbosity_error()
        print("🤖 Transformers logging optimized")
    except ImportError:
        # Optional dependency -- nothing to tune when it is absent.
        pass
def optimize_for_cpu():
    """Configure PyTorch for single-threaded, inference-only CPU use.

    Sets the torch thread count to 1 and switches gradient tracking off.
    ``set_grad_enabled`` is called as a plain function rather than used as a
    context manager, so the setting stays in effect after this function
    returns.

    NOTE(review): PyTorch documents grad mode as thread-local, so threads
    other than the calling one may still have autograd enabled -- confirm
    whether that matters for this application.

    Best-effort: when ``torch`` cannot be imported the function returns
    silently without printing anything.

    Returns:
        None
    """
    try:
        import torch

        # Set number of threads for CPU inference.
        torch.set_num_threads(1)
        # Inference only: no gradient bookkeeping needed.
        torch.autograd.set_grad_enabled(False)
        print("🧠 CPU inference optimized")
    except ImportError:
        # Optional dependency -- nothing to tune when it is absent.
        pass
def apply_all_optimizations():
    """Run every optimization step defined in this module, in a fixed order.

    Order: environment flags and warning filters first, then the
    transformers and torch tweaks, and finally the shared model manager
    import. The value returned by ``setup_model_sharing`` is discarded.

    Returns:
        None
    """
    steps = (
        apply_performance_optimizations,
        optimize_transformers,
        optimize_for_cpu,
        setup_model_sharing,
    )
    for step in steps:
        step()