Spaces:

zazaman
/

guardrails-final

Sleeping

App Files Files Community

guardrails-final / llm_clients /performance_utils.py

zazaman

Add multilingual translation support with Qwen3-0.6B-GGUF and optimize for Hugging Face Spaces deployment

a2e1879 about 1 month ago

raw

history blame contribute delete

2.36 kB

	# llm_clients/performance_utils.py
	"""
	Performance optimization utilities to reduce startup time and memory usage.
	"""

	import os
	import warnings

	def apply_performance_optimizations():
	    """Set process-wide environment flags and warning filters.

	    Overwrites a fixed set of environment variables (TensorFlow, PyTorch,
	    tokenizers and OpenMP knobs), installs "ignore" warning filters, and
	    prints a confirmation line. Existing values of those variables are
	    replaced unconditionally.

	    NOTE(review): these variables presumably only take effect if this runs
	    before the corresponding libraries are imported -- confirm call order.
	    """
	    os.environ.update(
	        {
	            # TensorFlow: turn off oneDNN custom ops and log errors only.
	            "TF_ENABLE_ONEDNN_OPTS": "0",
	            "TF_CPP_MIN_LOG_LEVEL": "3",
	            # PyTorch: skip compilation for CPU-only inference.
	            "TORCH_COMPILE_DISABLE": "1",
	            "TORCHDYNAMO_DISABLE": "1",
	            # Keep tokenizer and OpenMP threading overhead down.
	            "TOKENIZERS_PARALLELISM": "false",
	            "OMP_NUM_THREADS": "1",
	        }
	    )

	    # Each rule is prepended to the filter list, so the order here matters.
	    silenced = (
	        {"category": FutureWarning},
	        {"category": UserWarning, "module": "transformers"},
	        {"category": UserWarning, "module": "torch"},
	    )
	    for rule in silenced:
	        warnings.filterwarnings("ignore", **rule)

	    print("⚡ Applied performance optimizations")

	def setup_model_sharing():
	    """Import the package's shared model manager and hand it back.

	    Returns:
	        The ``shared_model_manager`` object from the sibling
	        ``shared_models`` module, or ``None`` when that import raises
	        ``ImportError``. A status line is printed in either case.
	    """
	    try:
	        from .shared_models import shared_model_manager as manager
	    except ImportError:
	        print("⚠️ Could not initialize shared model manager")
	        return None

	    print("🔗 Shared model manager initialized")
	    return manager

	def optimize_transformers():
	    """Lower the ``transformers`` log level to errors only, if it is available.

	    Best effort: an ``ImportError`` raised anywhere in the step is swallowed
	    and the function returns ``None`` without printing anything.
	    """
	    try:
	        import transformers as hf_lib

	        # Only error-level messages from transformers remain visible.
	        hf_lib.logging.set_verbosity_error()
	        print("🤖 Transformers logging optimized")
	    except ImportError:
	        return None

	def optimize_for_cpu():
	    """Configure PyTorch for single-threaded, gradient-free use, if available.

	    Sets the thread count to 1 and switches gradient tracking off, then
	    prints a confirmation. Best effort: an ``ImportError`` raised anywhere
	    in the step is swallowed and the function returns ``None`` silently.
	    """
	    try:
	        import torch as pt

	        # One thread for CPU inference.
	        pt.set_num_threads(1)
	        # Inference only: no gradients need to be tracked.
	        pt.autograd.set_grad_enabled(False)
	        print("🧠 CPU inference optimized")
	    except ImportError:
	        return None

	def apply_all_optimizations():
	    """Run every optimization step in this module, in a fixed order.

	    Order: environment flags and warning filters, transformers logging,
	    PyTorch CPU settings, then the shared model manager import. Return
	    values of the individual steps are discarded.
	    """
	    ordered_steps = (
	        apply_performance_optimizations,
	        optimize_transformers,
	        optimize_for_cpu,
	        setup_model_sharing,
	    )
	    for run_step in ordered_steps:
	        run_step()