Text Generation
Transformers
Diffusers
Safetensors
English
gpt_oss
phillnet-2
gpt-oss
multimodal
image-generation
video-generation
speech
audio
custom-code
conversational
custom_code
Instructions to use ayjays132/Phillnet-2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ayjays132/Phillnet-2 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="ayjays132/Phillnet-2", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("ayjays132/Phillnet-2", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("ayjays132/Phillnet-2", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use ayjays132/Phillnet-2 with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "ayjays132/Phillnet-2"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "ayjays132/Phillnet-2",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker
docker model run hf.co/ayjays132/Phillnet-2
- SGLang
How to use ayjays132/Phillnet-2 with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "ayjays132/Phillnet-2" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "ayjays132/Phillnet-2",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "ayjays132/Phillnet-2" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "ayjays132/Phillnet-2",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
- Docker Model Runner
How to use ayjays132/Phillnet-2 with Docker Model Runner:
docker model run hf.co/ayjays132/Phillnet-2
| import torch | |
| import torch.nn as nn | |
| from typing import Optional, Any, Dict, List | |
class SharedModelConfig:
    """Plain settings holder for the shared backbone.

    The four constructor arguments are stored unchanged as instance
    attributes of the same name; nothing is validated or acted on here.
    """

    def __init__(self, model_name=None, model_type=None, device="cuda", use_fp16=True):
        # Identity of the backbone.
        self.model_name, self.model_type = model_name, model_type
        # Placement / precision preferences (recorded only).
        self.device, self.use_fp16 = device, use_fp16
| _GLOBAL_SHARED_MODEL = None | |
| _GLOBAL_SHARED_TOKENIZER = None | |
| _GLOBAL_SHARED_ADAPTER = None | |
class SharedModel:
    """
    🗜️ Zero-Overhead Shared Model Backbone
    Provides a unified interface for multiple neural subsystems
    to share the same VRAM-resident weights.

    The backbone, tokenizer and adapter live in module-level globals, so
    every instance (and the module-level helper functions) observe the same
    registration. Call ``SharedModel.register(model, tokenizer)`` once, then
    use the ``get_*`` accessors anywhere.
    """

    def __init__(self, config: Optional["SharedModelConfig"] = None):
        """Create a handle onto the shared registry.

        Args:
            config: Optional settings object; stored as ``self.config`` and
                not otherwise used by this class.
        """
        self.config = config
        # Snapshot of the registry at construction time. These are NOT
        # refreshed by a later register(); use the get_* accessors for the
        # current values.
        self._model = _GLOBAL_SHARED_MODEL
        self._tokenizer = _GLOBAL_SHARED_TOKENIZER

    # register() is invoked on the class itself (SharedModel.register(model,
    # tokenizer)), so it must be a classmethod; as a plain method the model
    # would be bound to ``cls`` and the tokenizer registered as the model.
    @classmethod
    def register(cls, model: nn.Module, tokenizer: Any = None):
        """Register the primary model as the shared backbone.

        Replaces any previous registration and builds a fresh
        ``SharedBackboneAdapter`` around the given pair.

        Args:
            model: The already-loaded backbone module.
            tokenizer: Tokenizer paired with ``model`` (may be ``None``).
        """
        global _GLOBAL_SHARED_MODEL, _GLOBAL_SHARED_TOKENIZER, _GLOBAL_SHARED_ADAPTER
        _GLOBAL_SHARED_MODEL = model
        _GLOBAL_SHARED_TOKENIZER = tokenizer
        _GLOBAL_SHARED_ADAPTER = SharedBackboneAdapter(model, tokenizer)

    def get_model(self) -> Optional[nn.Module]:
        """Return the currently registered backbone, or ``None``."""
        return _GLOBAL_SHARED_MODEL

    def get_tokenizer(self) -> Any:
        """Return the currently registered tokenizer, or ``None``."""
        return _GLOBAL_SHARED_TOKENIZER

    def get_adapter(self) -> Optional["SharedBackboneAdapter"]:
        """Return the adapter built at registration time, or ``None``."""
        return _GLOBAL_SHARED_ADAPTER

    def model(self) -> Optional[nn.Module]:
        """Return the currently registered backbone (same as ``get_model``)."""
        return _GLOBAL_SHARED_MODEL
class SharedBackboneAdapter:
    """
    Read-only bridge to the already-loaded backbone model.
    Subsystems use this instead of creating private layers or fallback models.

    ``device``, ``dtype``, ``config`` and ``hidden_size`` are properties: the
    rest of the class reads them as attributes (``self.device`` is passed to
    ``Tensor.to``, ``self.hidden_size`` is placed in ``summary()``), which
    only works when they are not plain methods.
    """

    def __init__(self, model: nn.Module, tokenizer: Any = None):
        """Wrap ``model`` (and optionally its ``tokenizer``) without copying."""
        self.model = model
        self.tokenizer = tokenizer

    @property
    def device(self) -> torch.device:
        """Device of the wrapped model's first parameter."""
        return next(self.model.parameters()).device

    @property
    def dtype(self) -> torch.dtype:
        """Dtype of the wrapped model's first parameter."""
        return next(self.model.parameters()).dtype

    @property
    def config(self):
        """The wrapped model's ``config`` attribute, or ``None`` if absent."""
        return getattr(self.model, "config", None)

    @property
    def hidden_size(self) -> int:
        """``config.hidden_size`` as an int; 1024 when it is missing or None."""
        value = getattr(self.config, "hidden_size", None)
        return 1024 if value is None else int(value)

    def get_model(self) -> nn.Module:
        """Return the wrapped model."""
        return self.model

    def get_tokenizer(self) -> Any:
        """Return the wrapped tokenizer (may be ``None``)."""
        return self.tokenizer

    def get_text_model(self) -> nn.Module:
        """Return the innermost text module.

        Unwraps ``model.model`` and then ``.language_model`` when those
        attributes exist; otherwise falls back to the object one level up.
        """
        model = getattr(self.model, "model", self.model)
        return getattr(model, "language_model", model)

    def get_layers(self) -> List[nn.Module]:
        """Return the text model's ``layers`` as a list (empty if absent)."""
        text_model = self.get_text_model()
        return list(getattr(text_model, "layers", []))

    def get_layer(self, index: int = 0) -> Optional[nn.Module]:
        """Return the layer at ``index``, clamped into the valid range.

        Returns ``None`` when the model exposes no layers. Out-of-range and
        negative indices are clamped to the last / first layer respectively.
        """
        layers = self.get_layers()
        if not layers:
            return None
        index = max(0, min(int(index), len(layers) - 1))
        return layers[index]

    def get_embedding_layer(self) -> Optional[nn.Module]:
        """Return the input embedding module, or ``None`` if none is found.

        Prefers ``model.get_input_embeddings()`` when the model provides it,
        otherwise looks for ``embed_tokens`` on the text model.
        """
        if hasattr(self.model, "get_input_embeddings"):
            return self.model.get_input_embeddings()
        text_model = self.get_text_model()
        return getattr(text_model, "embed_tokens", None)

    def encode_text(self, text: str, max_length: int = 256, use_hidden: bool = False) -> torch.Tensor:
        """Encode ``text`` into one pooled vector per input row.

        Args:
            text: Text passed to the tokenizer.
            max_length: Truncation length given to the tokenizer.
            use_hidden: When True, run the full model and mean-pool the last
                hidden state; when False (or when the model returns no hidden
                states), pool the input token embeddings instead.

        Returns:
            Mean over the sequence dimension (``dim=1``) of the chosen states.
            The embedding path weights tokens by the attention mask when the
            tokenizer supplies one.

        Raises:
            RuntimeError: If no tokenizer is registered, or the embedding
                path is needed and no input embedding layer is found.
        """
        if self.tokenizer is None:
            raise RuntimeError("Shared tokenizer is not registered.")
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=max_length,
            add_special_tokens=True,
        )
        device = self.device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            if use_hidden:
                outputs = self.model(**inputs, output_hidden_states=True, use_cache=False)
                hidden = getattr(outputs, "hidden_states", None)
                if hidden:
                    states = hidden[-1]
                else:
                    states = getattr(outputs, "last_hidden_state", None)
                if states is not None:
                    return states.mean(dim=1).to(dtype=self.dtype)
                # No usable hidden states: fall through to embedding pooling.
            embed = self.get_embedding_layer()
            if embed is None:
                raise RuntimeError("Shared model has no input embedding layer.")
            token_embeddings = embed(inputs["input_ids"])
            attention_mask = inputs.get("attention_mask")
            if attention_mask is None:
                return token_embeddings.mean(dim=1)
            mask = attention_mask.unsqueeze(-1).to(dtype=token_embeddings.dtype)
            # clamp_min(1) avoids division by zero for an all-zero mask row.
            return (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp_min(1)

    def summary(self) -> Dict[str, Any]:
        """Return a small dict describing the wrapped model and tokenizer."""
        return {
            "model_type": type(self.model).__name__,
            "tokenizer_type": type(self.tokenizer).__name__ if self.tokenizer is not None else None,
            "hidden_size": self.hidden_size,
            "num_layers": len(self.get_layers()),
            "device": str(self.device),
            "dtype": str(self.dtype),
        }
def get_shared_model() -> Optional[nn.Module]:
    """Return the backbone currently held in the module-level registry.

    The value is ``None`` until a model has been registered.
    """
    backbone = _GLOBAL_SHARED_MODEL
    return backbone
def get_shared_tokenizer() -> Any:
    """Return the tokenizer currently held in the module-level registry.

    The value is ``None`` until a tokenizer has been registered.
    """
    tokenizer = _GLOBAL_SHARED_TOKENIZER
    return tokenizer
def get_shared_adapter() -> Optional[SharedBackboneAdapter]:
    """Return the adapter currently held in the module-level registry.

    The value is ``None`` until a model has been registered.
    """
    adapter = _GLOBAL_SHARED_ADAPTER
    return adapter
def register_shared_model(model: nn.Module, tokenizer: Any = None) -> SharedBackboneAdapter:
    """Register ``model``/``tokenizer`` as the shared backbone.

    Delegates to ``SharedModel.register`` and then hands back whatever
    adapter the module-level registry holds afterwards.
    """
    SharedModel.register(model, tokenizer)
    adapter = _GLOBAL_SHARED_ADAPTER
    return adapter