# llm_clients/performance_utils.py
"""
Performance optimization utilities to reduce startup time and memory usage.
"""

import os
import warnings

def apply_performance_optimizations():
    """Apply various performance optimizations to reduce startup time and memory usage.

    Note: the environment variables set here only take effect if this runs
    before TensorFlow and PyTorch are first imported.
    """

    # Disable TensorFlow's oneDNN graph rewrites and quiet its native logging
    os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"  # 3 = suppress INFO, WARNING, and ERROR logs
    
    # Disable PyTorch compilation for CPU-only inference
    os.environ["TORCH_COMPILE_DISABLE"] = "1"
    os.environ["TORCHDYNAMO_DISABLE"] = "1"
    
    # Reduce threading overhead
    os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Silence HF tokenizers fork warning, skip parallel tokenization
    os.environ["OMP_NUM_THREADS"] = "1"  # Single OpenMP thread avoids CPU oversubscription
    
    # Disable various warnings to reduce console noise
    warnings.filterwarnings("ignore", category=FutureWarning)
    warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
    warnings.filterwarnings("ignore", category=UserWarning, module="torch")
    
    print("⚡ Applied performance optimizations")

def setup_model_sharing():
    """Initialize shared model manager early to control loading order."""
    try:
        from .shared_models import shared_model_manager
        print("🔗 Shared model manager initialized")
        return shared_model_manager
    except ImportError:
        print("⚠️  Could not initialize shared model manager")
        return None

def optimize_transformers():
    """Apply transformers-specific optimizations."""
    try:
        import transformers
        # Show only error-level messages from transformers' logger
        transformers.logging.set_verbosity_error()
        print("🤖 Transformers logging optimized")
    except ImportError:
        pass

def optimize_for_cpu():
    """Apply CPU-specific optimizations."""
    try:
        import torch
        # Limit intra-op parallelism to a single thread to cut scheduling overhead
        torch.set_num_threads(1)
        # Globally disable gradient tracking; this process does inference only
        torch.autograd.set_grad_enabled(False)
        print("🧠 CPU inference optimized")
    except ImportError:
        pass

def apply_all_optimizations():
    """Apply all available performance optimizations.

    Call this as early as possible, before torch/transformers are imported
    elsewhere in the process, so the environment variables set in
    apply_performance_optimizations() actually take effect.
    """
    apply_performance_optimizations()
    optimize_transformers()
    optimize_for_cpu()
    setup_model_sharing()
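
# Minimal usage sketch (a hypothetical entrypoint, not part of this module's API):
# call apply_all_optimizations() before torch/transformers are imported anywhere
# else in the process, so the environment variables above take effect.
#
#     from llm_clients.performance_utils import apply_all_optimizations
#     apply_all_optimizations()   # must run before `import torch` elsewhere
#     import torch                # now picks up OMP_NUM_THREADS etc.
#
# Running this module directly applies everything as a quick smoke test:
if __name__ == "__main__":
    apply_all_optimizations()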