Text Generation
Transformers
Diffusers
Safetensors
English
gpt_oss
phillnet-2
gpt-oss
multimodal
image-generation
video-generation
speech
audio
custom-code
conversational
custom_code
Instructions to use ayjays132/Phillnet-2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ayjays132/Phillnet-2 with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("text-generation", model="ayjays132/Phillnet-2", trust_remote_code=True) messages = [ {"role": "user", "content": "Who are you?"}, ] pipe(messages)# Load model directly from transformers import AutoTokenizer, AutoModelForCausalLM tokenizer = AutoTokenizer.from_pretrained("ayjays132/Phillnet-2", trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained("ayjays132/Phillnet-2", trust_remote_code=True) messages = [ {"role": "user", "content": "Who are you?"}, ] inputs = tokenizer.apply_chat_template( messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", ).to(model.device) outputs = model.generate(**inputs, max_new_tokens=40) print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:])) - Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use ayjays132/Phillnet-2 with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "ayjays132/Phillnet-2" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "ayjays132/Phillnet-2", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }'Use Docker
docker model run hf.co/ayjays132/Phillnet-2
- SGLang
How to use ayjays132/Phillnet-2 with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "ayjays132/Phillnet-2" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "ayjays132/Phillnet-2", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "ayjays132/Phillnet-2" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "ayjays132/Phillnet-2", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }' - Docker Model Runner
How to use ayjays132/Phillnet-2 with Docker Model Runner:
docker model run hf.co/ayjays132/Phillnet-2
| """ | |
| Model Cache Module | |
| Shared model caching system with shared Qwen model integration. | |
| """ | |
| import torch | |
| import logging | |
| from typing import Dict, Any, Optional | |
| import sys | |
| import os | |
| logger = logging.getLogger(__name__) | |
| # Try to import shared model | |
| SHARED_MODEL_AVAILABLE = False | |
| get_shared_model_func = None | |
| get_shared_tokenizer_func = None | |
| try: | |
| import importlib.util | |
| shared_model_path = os.path.join(os.path.dirname(__file__), '..', 'Shared Model', 'shared_model.py') | |
| if os.path.exists(shared_model_path): | |
| spec = importlib.util.spec_from_file_location("shared_model", shared_model_path) | |
| if spec and spec.loader: | |
| shared_model_module = importlib.util.module_from_spec(spec) | |
| spec.loader.exec_module(shared_model_module) | |
| SharedModel = shared_model_module.SharedModel | |
| SharedModelConfig = shared_model_module.SharedModelConfig | |
| get_shared_model_func = shared_model_module.get_shared_model | |
| get_shared_tokenizer_func = shared_model_module.get_shared_tokenizer | |
| SHARED_MODEL_AVAILABLE = True | |
| except Exception as e: | |
| logger.debug(f"Shared model not available: {e}") | |
| class ModelCache: | |
| """ | |
| Shared model cache to prevent memory duplication. | |
| Integrates with shared Qwen model for zero memory overhead. | |
| """ | |
| def __init__(self, use_shared_model: bool = True, shared_model_name: str = "Qwen/Qwen3-0.6B"): | |
| self.use_shared_model = use_shared_model | |
| self.shared_model_name = shared_model_name | |
| self.shared_models = {} | |
| self.shared_tokenizers = {} | |
| logger.debug("ModelCache initialized") | |
| def get_shared_model(self, model_name: str, model_type: str = "transformer", | |
| device: Optional[str] = None, **kwargs) -> Any: | |
| """ | |
| Get or create a shared model instance. | |
| Uses shared Qwen model if available for zero memory overhead. | |
| Args: | |
| model_name: Name of the model to load | |
| model_type: Type of model (transformer, tokenizer, etc.) | |
| device: Device to load model on | |
| **kwargs: Additional model loading parameters | |
| Returns: | |
| Shared model instance | |
| """ | |
| if device is None: | |
| device = "cuda" if torch.cuda.is_available() else "cpu" | |
| # Try to use shared Qwen model first | |
| if (self.use_shared_model and model_type == "transformer" and | |
| SHARED_MODEL_AVAILABLE and get_shared_model_func is not None): | |
| try: | |
| shared_model = get_shared_model_func() | |
| if shared_model is not None: | |
| logger.info(f"[CACHE] Using shared Qwen model (zero memory overhead)") | |
| return shared_model | |
| except Exception as e: | |
| logger.debug(f"[CACHE] Shared model not available: {e}") | |
| # Try to use shared tokenizer | |
| if (self.use_shared_model and model_type == "tokenizer" and | |
| SHARED_MODEL_AVAILABLE and get_shared_tokenizer_func is not None): | |
| try: | |
| shared_tokenizer = get_shared_tokenizer_func() | |
| if shared_tokenizer is not None: | |
| logger.info(f"[CACHE] Using shared Qwen tokenizer (zero memory overhead)") | |
| return shared_tokenizer | |
| except Exception as e: | |
| logger.debug(f"[CACHE] Shared tokenizer not available: {e}") | |
| # Fallback to cached models | |
| cache_key = f"{model_name}_{model_type}_{device}_{hash(str(sorted(kwargs.items())))}" | |
| if model_type == "tokenizer": | |
| cache_dict = self.shared_tokenizers | |
| else: | |
| cache_dict = self.shared_models | |
| if cache_key not in cache_dict: | |
| logger.info(f"[CACHE] Loading {model_type} model: {model_name}") | |
| try: | |
| if model_type == "transformer": | |
| model = self._load_transformer_model(model_name, device, **kwargs) | |
| elif model_type == "tokenizer": | |
| model = self._load_tokenizer_model(model_name, device, **kwargs) | |
| else: | |
| raise ValueError(f"Unknown model type: {model_type}") | |
| cache_dict[cache_key] = model | |
| logger.info(f"[CACHE] {model_type} model cached: {cache_key}") | |
| except Exception as e: | |
| logger.error(f"[CACHE] Failed to load {model_type} model {model_name}: {e}") | |
| raise | |
| else: | |
| logger.debug(f"[CACHE] Using cached {model_type} model: {cache_key}") | |
| return cache_dict[cache_key] | |
| def _load_transformer_model(self, model_name: str, device: str, **kwargs) -> Any: | |
| """Load transformer model with memory optimizations.""" | |
| from transformers import AutoModelForCausalLM, BitsAndBytesConfig | |
| # Memory-optimized loading configuration | |
| load_config = { | |
| "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32, | |
| "device_map": "auto" if torch.cuda.is_available() else None, | |
| "trust_remote_code": True, | |
| "attn_implementation": "eager", # More memory efficient | |
| } | |
| # Add quantization if available and beneficial | |
| if kwargs.get('use_4bit_quantization', True) and torch.cuda.is_available(): | |
| try: | |
| # Check if bitsandbytes is properly installed with CUDA support | |
| import bitsandbytes as bnb | |
| if hasattr(bnb, 'libbitsandbytes_cuda'): | |
| quantization_config = BitsAndBytesConfig( | |
| load_in_4bit=True, | |
| bnb_4bit_compute_dtype=torch.float16, | |
| bnb_4bit_use_double_quant=True, | |
| bnb_4bit_quant_type="nf4" | |
| ) | |
| load_config["quantization_config"] = quantization_config | |
| else: | |
| logger.debug("[CACHE] BitsAndBytes CUDA support not available, skipping quantization") | |
| except (ImportError, AttributeError, Exception) as e: | |
| logger.debug(f"[CACHE] 4-bit quantization not available: {e}") | |
| # Remove problematic kwargs | |
| filtered_kwargs = {k: v for k, v in kwargs.items() | |
| if k not in ['use_4bit', 'dtype', 'use_4bit_quantization']} | |
| # Merge with user-provided kwargs | |
| load_config.update(filtered_kwargs) | |
| return AutoModelForCausalLM.from_pretrained(model_name, **load_config) | |
| def _load_tokenizer_model(self, model_name: str, device: str, **kwargs) -> Any: | |
| """Load tokenizer with memory optimizations.""" | |
| from transformers import AutoTokenizer | |
| load_config = { | |
| "trust_remote_code": True, | |
| } | |
| load_config.update({k: v for k, v in kwargs.items() if k != 'use_4bit_quantization'}) | |
| return AutoTokenizer.from_pretrained(model_name, **load_config) | |
| def clear_cache(self) -> None: | |
| """Clear all cached models.""" | |
| self.shared_models.clear() | |
| self.shared_tokenizers.clear() | |
| logger.info("[CACHE] Model cache cleared") | |
| def get_stats(self) -> Dict: | |
| """Get cache statistics.""" | |
| return { | |
| 'shared_models': list(self.shared_models.keys()), | |
| 'shared_tokenizers': list(self.shared_tokenizers.keys()), | |
| 'use_shared_model': self.use_shared_model | |
| } | |