Text Generation
Transformers
Diffusers
Safetensors
English
gpt_oss
phillnet-2
gpt-oss
multimodal
image-generation
video-generation
speech
audio
custom-code
conversational
custom_code
Instructions to use ayjays132/Phillnet-2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ayjays132/Phillnet-2 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="ayjays132/Phillnet-2", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("ayjays132/Phillnet-2", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("ayjays132/Phillnet-2", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use ayjays132/Phillnet-2 with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "ayjays132/Phillnet-2"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "ayjays132/Phillnet-2",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker
docker model run hf.co/ayjays132/Phillnet-2
- SGLang
How to use ayjays132/Phillnet-2 with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "ayjays132/Phillnet-2" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "ayjays132/Phillnet-2",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "ayjays132/Phillnet-2" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "ayjays132/Phillnet-2",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
- Docker Model Runner
How to use ayjays132/Phillnet-2 with Docker Model Runner:
docker model run hf.co/ayjays132/Phillnet-2
| """ | |
| Unified Memory Manager Module | |
| Main memory management system integrating all components. | |
| """ | |
| import torch | |
| import torch.nn as nn | |
| import logging | |
| from typing import Dict, Any, Optional, Tuple, List | |
| from .config import MemoryOptimizationConfig | |
| from .tensor_pool import TensorPool | |
| from .model_cache import ModelCache | |
| from .cleanup import MemoryCleanup | |
| logger = logging.getLogger(__name__) | |
class UnifiedMemoryManager:
    """
    Unified Memory Manager - central memory optimization system.

    Singleton that wires together a ``TensorPool``, a ``ModelCache`` and a
    ``MemoryCleanup`` helper (all built from one ``MemoryOptimizationConfig``)
    and keeps a small registry of lazily constructed modules.

    Only the first construction applies its ``config``; every later call
    returns the already-initialized instance unchanged.
    """

    _instance = None
    _initialized = False

    def __new__(cls, config: Optional["MemoryOptimizationConfig"] = None):
        # Singleton: allocate once, then always hand back the same object.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, config: Optional["MemoryOptimizationConfig"] = None):
        """
        Initialize the singleton on first construction.

        Args:
            config: Memory optimization settings. When omitted, a default
                ``MemoryOptimizationConfig()`` is created. Ignored (with a
                warning) if the manager has already been initialized.
        """
        if not self._initialized:
            self.config = config if config is not None else MemoryOptimizationConfig()
            self._initialize_memory_manager()
            UnifiedMemoryManager._initialized = True
        elif config is not None and config is not self.config:
            # __init__ runs on every UnifiedMemoryManager(...) call; make the
            # "first config wins" rule visible instead of dropping it silently.
            logger.warning(
                "[MANAGER] Already initialized; ignoring newly supplied config"
            )

    def __getattr__(self, name: str) -> Any:
        """
        Expose loaded lazy modules as attributes (``manager.<module_name>``).

        Python only calls ``__getattr__`` after normal attribute lookup has
        failed, so a lazy module can never shadow a real attribute or method
        of the manager.
        """
        instances = self.__dict__.get("_module_instances")
        if instances is not None and name in instances:
            return instances[name]
        raise AttributeError(
            f"{type(self).__name__!r} object has no attribute {name!r}"
        )

    def _initialize_memory_manager(self):
        """Build the pool, cache and cleanup components from ``self.config``."""
        self.device = torch.device(self.config.device)

        # Initialize components
        self.tensor_pool = TensorPool(
            max_pool_size=self.config.max_pool_size,
            max_tensor_size=self.config.max_tensor_size,
        )
        self.model_cache = ModelCache(
            use_shared_model=self.config.use_shared_model,
            shared_model_name=self.config.shared_model_name,
        )
        self.cleanup = MemoryCleanup(
            memory_threshold=self.config.memory_threshold,
            cleanup_threshold=self.config.cleanup_threshold,
        )

        # Lazy loading registry:
        #   lazy_modules      name -> {'class', 'args', 'kwargs'} construction spec
        #   active_modules    names that have already been instantiated
        #   _module_instances name -> live instance (kept out of the manager's
        #                     own attribute namespace to avoid name clashes)
        self.lazy_modules = {}
        self.active_modules = set()
        self._module_instances = {}

        logger.info("[MANAGER] Unified Memory Manager initialized")
        logger.info("[MANAGER] Device: %s", self.device)
        logger.info("[MANAGER] Shared Model: %s", self.config.use_shared_model)

    def get_shared_model(self, model_name: str, model_type: str = "transformer",
                         device: Optional[str] = None, **kwargs) -> Any:
        """
        Get or create a shared model instance via the model cache.

        Args:
            model_name: Name of the model to load
            model_type: Type of model (transformer, tokenizer, etc.)
            device: Device to load model on; defaults to the manager's device
            **kwargs: Additional model loading parameters, forwarded as-is

        Returns:
            Whatever ``ModelCache.get_shared_model`` returns for these arguments
        """
        if device is None:
            device = str(self.device)
        return self.model_cache.get_shared_model(
            model_name, model_type, device, **kwargs
        )

    def get_tensor(self, shape: Tuple[int, ...], dtype: torch.dtype = torch.float32,
                   requires_grad: bool = False, module_name: str = "default") -> torch.Tensor:
        """
        Get a tensor from the unified pool (the pool creates one if needed).

        Args:
            shape: Tensor shape
            dtype: Tensor data type
            requires_grad: Whether tensor requires gradients
            module_name: Name of requesting module; accepted for interface
                compatibility, currently unused

        Returns:
            Tensor provided by the pool on the manager's device
        """
        # Periodic cleanup every `cleanup_frequency` pool operations. A missing
        # or non-positive frequency disables it instead of dividing by zero.
        frequency = self.config.cleanup_frequency
        if frequency and frequency > 0 \
                and self.tensor_pool.operation_count % frequency == 0:
            self.cleanup.adaptive_cleanup(self.tensor_pool)

        # Check memory pressure before handing out another tensor
        if self.cleanup.check_memory_pressure():
            self.cleanup.emergency_cleanup(self.tensor_pool)

        return self.tensor_pool.get_tensor(shape, dtype, requires_grad, self.device)

    def return_tensor(self, tensor: torch.Tensor, module_name: str = "default") -> None:
        """
        Return tensor to unified pool for reuse.

        Args:
            tensor: Tensor to return to pool
            module_name: Name of returning module; accepted for interface
                compatibility, currently unused
        """
        self.tensor_pool.return_tensor(tensor)

    def register_lazy_module(self, module_name: str, module_class: type,
                             init_args: tuple = (), init_kwargs: Optional[dict] = None) -> None:
        """
        Register a module for lazy loading.

        Nothing is instantiated here; ``get_lazy_module`` builds the module on
        first request. Registering an existing name replaces its spec.

        Args:
            module_name: Name of the module
            module_class: Module class to instantiate
            init_args: Positional arguments for initialization
            init_kwargs: Keyword arguments for initialization
        """
        if init_kwargs is None:
            init_kwargs = {}
        self.lazy_modules[module_name] = {
            'class': module_class,
            'args': init_args,
            'kwargs': init_kwargs,
        }

    def get_lazy_module(self, module_name: str) -> Optional[Any]:
        """
        Get lazy-loaded module, creating it if necessary.

        Args:
            module_name: Name of the module to get

        Returns:
            Module instance or None if the name was never registered
        """
        if module_name in self.active_modules:
            return self._module_instances.get(module_name)

        spec = self.lazy_modules.get(module_name)
        if spec is None:
            return None

        module = spec['class'](*spec['args'], **spec['kwargs'])
        # Store in a dedicated dict rather than setattr(self, ...): a module
        # named e.g. "cleanup" or "get_tensor" must not overwrite the manager's
        # own attributes. Attribute access still works through __getattr__.
        self._module_instances[module_name] = module
        self.active_modules.add(module_name)

        # Check memory pressure after loading
        if self.cleanup.check_memory_pressure():
            self.cleanup.adaptive_cleanup(self.tensor_pool)
        return module

    def optimize_for_inference(self, model: nn.Module) -> nn.Module:
        """
        Optimize model for inference with memory efficiency.

        Switches the model to eval mode, optionally enables gradient
        checkpointing (config-controlled), and casts to half precision when
        the manager is configured for a CUDA device.

        Args:
            model: Model to optimize (modified in place)

        Returns:
            The same model, possibly converted to fp16
        """
        # Set to evaluation mode
        model.eval()

        # NOTE(review): gradient checkpointing trades compute for memory during
        # backward passes; confirm it is actually wanted on an inference path.
        if self.config.use_gradient_checkpointing and hasattr(model, 'gradient_checkpointing_enable'):
            model.gradient_checkpointing_enable()

        # Only drop to fp16 when this manager targets CUDA; a CUDA-capable host
        # alone is not enough if the manager is configured for CPU.
        if hasattr(model, 'half') and self.device.type == 'cuda' \
                and torch.cuda.is_available():
            model = model.half()
        return model

    def register_memory(self, embedding_tensor: torch.Tensor, metadata: Optional[Dict[str, Any]] = None) -> None:
        """
        Register a memory embedding tensor with the optimization system.

        Currently this only runs a memory-pressure check (and an adaptive
        cleanup if needed); neither argument is stored. The method exists so
        callers have a stable interface for future tracking.

        Args:
            embedding_tensor: Memory embedding tensor to register (unused)
            metadata: Optional metadata dictionary (unused)
        """
        if self.cleanup.check_memory_pressure():
            self.cleanup.adaptive_cleanup(self.tensor_pool)

    def get_memory_stats(self) -> Dict[str, Any]:
        """
        Get comprehensive memory statistics.

        Returns:
            Dict with the stats reported by the tensor pool, model cache and
            cleanup helper, plus the names of active and registered modules
        """
        stats = {
            'tensor_pool': self.tensor_pool.get_stats(),
            'model_cache': self.model_cache.get_stats(),
            'cleanup': self.cleanup.get_memory_stats(),
            'active_modules': list(self.active_modules),
            'lazy_modules': list(self.lazy_modules.keys()),
        }
        return stats

    def clear_all_memory(self) -> None:
        """
        Clear all memory and reset the manager.

        Empties the tensor pool and model cache, drops every lazy-module
        registration and instance, then runs garbage collection and releases
        cached CUDA memory.
        """
        logger.info("[MANAGER] Clearing all memory")

        # Clear tensor pools
        self.tensor_pool.clear_all()

        # Clear model cache
        self.model_cache.clear_cache()

        # Clear lazy modules, including the live instances so they can be freed
        self.active_modules.clear()
        self.lazy_modules.clear()
        self._module_instances.clear()

        # Collect garbage first so tensors held only by unreachable objects are
        # released before the CUDA cache is emptied.
        import gc
        gc.collect()

        # Clear PyTorch cache
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        logger.info("[MANAGER] All memory cleared")