Text Generation
Transformers
Diffusers
Safetensors
English
gpt_oss
phillnet-2
gpt-oss
multimodal
image-generation
video-generation
speech
audio
custom-code
conversational
custom_code
Instructions to use ayjays132/Phillnet-2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ayjays132/Phillnet-2 with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="ayjays132/Phillnet-2", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("ayjays132/Phillnet-2", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("ayjays132/Phillnet-2", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use ayjays132/Phillnet-2 with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "ayjays132/Phillnet-2"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "ayjays132/Phillnet-2",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
Use Docker
docker model run hf.co/ayjays132/Phillnet-2
- SGLang
How to use ayjays132/Phillnet-2 with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "ayjays132/Phillnet-2" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "ayjays132/Phillnet-2",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
Use Docker images
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "ayjays132/Phillnet-2" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "ayjays132/Phillnet-2",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
- Docker Model Runner
How to use ayjays132/Phillnet-2 with Docker Model Runner:
docker model run hf.co/ayjays132/Phillnet-2
"""Qwen3.5-aligned text refiner.

Mirrors the per-layer tensor shapes of ``Qwen3_5TextModel`` so that
``transplant_qwen_text_weights.py`` can load real Qwen3.5 weights into our
own modules. The mirror is intentionally minimal and architecture-faithful
where it can be (RMSNorm, SwiGLU MLP, GQA + rotary), and approximate where
Qwen3.5 uses an exotic op (Gated DeltaNet). Layers that mirror DeltaNet
keep the input/post norms and MLP weights (which transplant 1:1) but
replace the linear-attention mixing with an identity pass, letting the
6 standard ``self_attn`` layers carry the cross-token mixing.

This module shares activation singletons (``SHARED_SILU``) and keeps
weight names aligned with Qwen's ``layers.{i}.{...}`` paths so transplant
is a direct key map. It is dim-agnostic at construction time; defaults
match Qwen3.5-0.8B exactly.
"""
from __future__ import annotations

import math
from typing import List, Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
# ---------------------------------------------------------------------------
# Shared activation singletons. Importing modules can grab these instead of
# instantiating their own; all share the same nn.Module instance so the
# adapter has one canonical SiLU rather than thirty.
#
# These activations hold no parameters or buffers, so sharing one instance
# between many parent modules does not tie any weights together.
# ---------------------------------------------------------------------------
SHARED_SILU = nn.SiLU()
SHARED_GELU = nn.GELU()
SHARED_SIGMOID = nn.Sigmoid()
# ---------------------------------------------------------------------------
# Primitives
# ---------------------------------------------------------------------------
class QwenRMSNorm(nn.Module):
    """Root-mean-square layer norm with a learnable scale and no bias.

    The only parameter is ``weight`` (shape ``(dim,)``, initialised to ones),
    so the state-dict key is ``<prefix>.weight``. The original author intends
    this to match Qwen3.5's RMSNorm (weight only, eps default ``1e-6``).

    Args:
        dim: Size of the last (normalised) dimension.
        eps: Constant added to the mean square before ``rsqrt`` for stability.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalise ``x`` over its last dimension and scale by ``weight``.

        The statistics and the scaling are computed in float32 regardless of
        the input dtype; the result is cast back to the input dtype.
        """
        in_dtype = x.dtype
        x_f = x.float()
        inv_rms = torch.rsqrt(x_f.pow(2).mean(-1, keepdim=True) + self.eps)
        return (x_f * inv_rms * self.weight.float()).to(in_dtype)

    def extra_repr(self) -> str:
        """Show the normalised size and eps in ``repr(module)``."""
        return f"dim={self.weight.shape[0]}, eps={self.eps}"
def _build_inv_freq(rope_dim: int, base: float, device, dtype) -> torch.Tensor:
    """Return the rotary inverse frequencies ``base ** -(i / half)``.

    ``half`` is ``rope_dim // 2`` and ``i`` runs over ``0 .. half - 1``, so
    the result has shape ``(half,)``.

    The exponent and the power are evaluated in float32 (float64 when that is
    the requested dtype) and only the final result is cast to ``dtype``.
    Evaluating them directly in fp16/bf16 quantises the frequencies badly.

    Args:
        rope_dim: Number of head dimensions that receive rotary embedding.
        base: Rotary base (theta).
        device: Device for the returned tensor.
        dtype: Desired floating dtype of the returned tensor.
    """
    half = rope_dim // 2
    compute_dtype = torch.float64 if dtype == torch.float64 else torch.float32
    exponent = torch.arange(0, half, device=device, dtype=compute_dtype) / half
    inv_freq = 1.0 / (base ** exponent)
    # Only cast for floating targets; anything else keeps the float result.
    if dtype is not None and dtype.is_floating_point:
        inv_freq = inv_freq.to(dtype)
    return inv_freq
def _apply_rotary(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    """Apply rotary to the first ``cos.shape[-1] * 2`` dims of head_dim.

    The rotated slice is split into two halves ``(x1, x2)`` and mapped to
    ``(x1*cos - x2*sin, x1*sin + x2*cos)``. Any remaining trailing dims of
    ``x`` are passed through unchanged and concatenated back on.

    Args:
        x: Tensor whose last dimension is the head dimension.
        cos: Cosine table, broadcastable against one half of the rotated slice.
        sin: Sine table with the same shape as ``cos``.

    Raises:
        ValueError: If the rotary width exceeds the last dimension of ``x``.
    """
    rope_dim = cos.shape[-1] * 2
    if rope_dim > x.shape[-1]:
        # Slicing would clamp silently and the halves would no longer line up
        # with cos/sin, so fail with a clear message instead.
        raise ValueError(
            f"rotary width {rope_dim} exceeds head dim {x.shape[-1]}"
        )
    x_rope, x_pass = x[..., :rope_dim], x[..., rope_dim:]
    x1, x2 = x_rope.chunk(2, dim=-1)
    rotated = torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)
    return torch.cat([rotated, x_pass], dim=-1)
class QwenSwiGLU(nn.Module):
    """Gated MLP: ``down_proj(act(gate_proj(x)) * up_proj(x))``.

    Mirrors the Qwen3.5 ``mlp`` layer naming (``gate_proj``, ``up_proj``,
    ``down_proj``); all three projections are bias-free. The activation is
    the module-level ``SHARED_SILU`` instance.

    Args:
        hidden_size: Input and output feature size.
        intermediate_size: Width of the gated inner projection.
    """

    def __init__(self, hidden_size: int, intermediate_size: int):
        super().__init__()
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
        self.act = SHARED_SILU

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the gated-MLP output; the last dim stays ``hidden_size``."""
        gated = self.act(self.gate_proj(x)) * self.up_proj(x)
        return self.down_proj(gated)
class QwenGatedGQA(nn.Module):
    """Mirrors Qwen3.5 ``self_attn``: GQA + rotary + per-head q/k norm + a
    halved ``o_proj`` input (the q projection is split into attn/gate halves).

    Shapes for the default arguments (stated by the original author to be
    Qwen3.5-0.8B):
        q_proj: (q_heads*head_dim, hidden)      = (4096, 1024)
        k_proj: (kv_heads*head_dim, hidden)     = ( 512, 1024)
        v_proj: (kv_heads*head_dim, hidden)     = ( 512, 1024)
        o_proj: (hidden, (q_heads//2)*head_dim) = (1024, 2048)
        q_norm: (head_dim,)                     = (256,)
        k_norm: (head_dim,)                     = (256,)

    The first ``num_q_heads // 2`` projected heads are used as attention
    queries; the remaining heads pass through SiLU and multiply the attention
    output element-wise before ``o_proj``. Attention is not causal: the only
    masking applied is the optional key-padding mask.

    NOTE(review): the head-wise split and the SiLU gate are this module's
    choice; verify against the reference Qwen implementation before relying
    on numerical parity with transplanted weights.

    Args:
        hidden_size: Model width.
        num_q_heads: Heads produced by ``q_proj`` (attention + gate); even.
        num_kv_heads: Key/value heads shared across query groups.
        head_dim: Per-head feature size.
        rope_dim: Leading dims of each head that receive rotary embedding.
        rope_base: Rotary base (theta).

    Raises:
        AssertionError: If ``num_q_heads`` is odd or not a multiple of
            ``num_kv_heads``.
        ValueError: If ``num_q_heads // 2`` is not a multiple of
            ``num_kv_heads``, or if ``rope_dim`` exceeds ``head_dim``.
    """

    def __init__(
        self,
        hidden_size: int = 1024,
        num_q_heads: int = 16,
        num_kv_heads: int = 2,
        head_dim: int = 256,
        rope_dim: int = 64,
        rope_base: float = 1_000_000.0,
    ):
        super().__init__()
        # Kept as asserts so existing callers see the same exception type.
        assert num_q_heads % 2 == 0, "q heads must be even for Qwen3.5 gated split"
        assert num_q_heads % num_kv_heads == 0, "q heads must be a multiple of kv heads"
        num_attn_heads = num_q_heads // 2
        if num_attn_heads % num_kv_heads != 0:
            # Otherwise kv_repeat floors and k/v end up with the wrong head
            # count, which only surfaces as a shape error inside forward().
            raise ValueError(
                f"attention heads ({num_attn_heads}) must be a multiple of "
                f"kv heads ({num_kv_heads})"
            )
        if rope_dim > head_dim:
            raise ValueError(f"rope_dim ({rope_dim}) cannot exceed head_dim ({head_dim})")

        self.hidden_size = hidden_size
        self.num_q_heads = num_q_heads
        self.num_kv_heads = num_kv_heads
        self.num_attn_heads = num_attn_heads  # half routed through attention
        self.head_dim = head_dim
        self.rope_dim = rope_dim
        self.rope_base = rope_base
        self.kv_repeat = num_attn_heads // num_kv_heads

        q_dim = num_q_heads * head_dim
        kv_dim = num_kv_heads * head_dim
        attn_out = num_attn_heads * head_dim
        self.q_proj = nn.Linear(hidden_size, q_dim, bias=False)
        self.k_proj = nn.Linear(hidden_size, kv_dim, bias=False)
        self.v_proj = nn.Linear(hidden_size, kv_dim, bias=False)
        self.o_proj = nn.Linear(attn_out, hidden_size, bias=False)
        self.q_norm = QwenRMSNorm(head_dim)
        self.k_norm = QwenRMSNorm(head_dim)

    def _rotary(self, seq_len: int, device, dtype) -> Tuple[torch.Tensor, torch.Tensor]:
        """Build cos/sin tables of shape ``(1, 1, seq_len, rope_dim // 2)``.

        Positions and angles are computed in float32 (float64 if requested)
        and only the final cos/sin are cast to ``dtype``. Building positions
        directly in bf16 would collapse distinct positions beyond 256.
        """
        compute_dtype = torch.float64 if dtype == torch.float64 else torch.float32
        inv_freq = _build_inv_freq(self.rope_dim, self.rope_base, device, compute_dtype)
        pos = torch.arange(seq_len, device=device, dtype=compute_dtype)
        freqs = torch.einsum("i,j->ij", pos, inv_freq)  # (T, rope_dim/2)
        cos = freqs.cos().to(dtype)
        sin = freqs.sin().to(dtype)
        return cos[None, None, :, :], sin[None, None, :, :]  # broadcast over (B, H, T, D/2)

    def forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Run gated grouped-query attention.

        Args:
            x: Input of shape ``(batch, seq_len, hidden_size)``.
            attention_mask: Optional ``(batch, seq_len)`` key mask where
                non-zero means keep and zero means ignore that key.

        Returns:
            Tensor of shape ``(batch, seq_len, hidden_size)``.
        """
        bsz, seq_len, _ = x.shape
        q = self.q_proj(x).view(bsz, seq_len, self.num_q_heads, self.head_dim)
        k = self.k_proj(x).view(bsz, seq_len, self.num_kv_heads, self.head_dim)
        v = self.v_proj(x).view(bsz, seq_len, self.num_kv_heads, self.head_dim)

        # Split the projected heads into the attention half and the gate half.
        q_attn = q[:, :, : self.num_attn_heads, :]
        q_gate = q[:, :, self.num_attn_heads :, :]

        # Per-head norm, then (B, H, T, D) layout for attention.
        q_attn = self.q_norm(q_attn).transpose(1, 2)
        k = self.k_norm(k).transpose(1, 2)
        v = v.transpose(1, 2)

        cos, sin = self._rotary(seq_len, x.device, q_attn.dtype)
        q_attn = _apply_rotary(q_attn, cos, sin)
        k = _apply_rotary(k, cos, sin)

        # Expand kv heads to match attn heads (GQA).
        if self.kv_repeat > 1:
            k = k.repeat_interleave(self.kv_repeat, dim=1)
            v = v.repeat_interleave(self.kv_repeat, dim=1)

        scale = 1.0 / math.sqrt(self.head_dim)
        attn_scores = torch.matmul(q_attn, k.transpose(-2, -1)) * scale
        if attention_mask is not None:
            # 1 = keep, 0 = mask; applied to KEYS and broadcast across heads
            # and queries. The most negative finite value is used instead of
            # -inf so a row with every key masked yields finite output, not NaN.
            key_mask = attention_mask[:, None, None, :].to(attn_scores.dtype)
            fill_value = torch.finfo(attn_scores.dtype).min
            attn_scores = attn_scores.masked_fill(key_mask == 0, fill_value)
        attn = attn_scores.softmax(dim=-1)

        out = torch.matmul(attn, v)  # (B, H, T, D)
        out = out.transpose(1, 2).reshape(bsz, seq_len, self.num_attn_heads * self.head_dim)

        # Gate the attention output with SiLU of the gate half.
        q_gate = SHARED_SILU(q_gate).reshape(bsz, seq_len, self.num_attn_heads * self.head_dim)
        out = out * q_gate
        return self.o_proj(out)
class QwenAlignedBlock(nn.Module):
    """Mirrors a Qwen3.5 transformer block.

    ``layer_kind="attention"`` mirrors the 6 standard ``self_attn`` layers
    (3, 7, 11, 15, 19, 23). ``layer_kind="deltanet"`` mirrors the 18
    ``linear_attn`` layers structurally but uses identity for the mix-token
    op so we do not depend on flash-linear-attention. Both kinds keep the
    Qwen-shaped MLP + norms so weight transplant is 1:1 for those tensors.

    For ``"deltanet"`` blocks ``self_attn`` is ``None``. ``input_layernorm``
    is still created (so its weight has a slot in the state dict) but is not
    used in ``forward``.

    Args:
        hidden_size: Model width.
        intermediate_size: Inner width of the gated MLP.
        layer_kind: ``"attention"`` or ``"deltanet"``.
        num_q_heads: Forwarded to ``QwenGatedGQA`` (attention blocks only).
        num_kv_heads: Forwarded to ``QwenGatedGQA`` (attention blocks only).
        head_dim: Forwarded to ``QwenGatedGQA`` (attention blocks only).
        rope_dim: Forwarded to ``QwenGatedGQA`` (attention blocks only).
        rope_base: Forwarded to ``QwenGatedGQA`` (attention blocks only).

    Raises:
        ValueError: If ``layer_kind`` is not one of the two supported values.
    """

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        layer_kind: str = "attention",
        num_q_heads: int = 16,
        num_kv_heads: int = 2,
        head_dim: int = 256,
        rope_dim: int = 64,
        rope_base: float = 1_000_000.0,
    ):
        super().__init__()
        if layer_kind not in {"attention", "deltanet"}:
            raise ValueError(f"unknown layer_kind: {layer_kind}")
        self.layer_kind = layer_kind
        self.input_layernorm = QwenRMSNorm(hidden_size)
        self.post_attention_layernorm = QwenRMSNorm(hidden_size)
        if layer_kind == "attention":
            self.self_attn = QwenGatedGQA(
                hidden_size=hidden_size,
                num_q_heads=num_q_heads,
                num_kv_heads=num_kv_heads,
                head_dim=head_dim,
                rope_dim=rope_dim,
                rope_base=rope_base,
            )
        else:
            self.self_attn = None
        self.mlp = QwenSwiGLU(hidden_size, intermediate_size)

    def forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Apply the (optional) attention residual, then the MLP residual.

        Args:
            x: Hidden states; the last dimension is ``hidden_size``.
            attention_mask: Optional key mask forwarded to the attention
                module; ignored by ``"deltanet"`` blocks.

        Returns:
            Tensor with the same shape as ``x``.
        """
        if self.self_attn is not None:
            h = self.input_layernorm(x)
            x = x + self.self_attn(h, attention_mask=attention_mask)
        # deltanet layers contribute only their MLP (mix happens at the
        # attention layers; this gives a real residual transformer signal)
        h = self.post_attention_layernorm(x)
        x = x + self.mlp(h)
        return x
class QwenAlignedTextRefiner(nn.Module):
    """Stack of Qwen-aligned blocks.

    Designed to sit on top of a host text encoder's hidden states and
    produce a Qwen-conditioned representation at the same hidden dim. The
    block layout mirrors Qwen3.5-0.8B: ``num_layers=24`` with attention at
    every 4th position (indices 3, 7, 11, 15, 19, 23), but is configurable
    so smaller refiners can be transplanted from a Qwen subset.

    Outputs are projected to ``out_dim`` (defaults to hidden_size) via a
    final ``norm`` + ``proj`` so the refiner can plug into any
    downstream conditioning bridge.

    Args:
        hidden_size: Width of the incoming hidden states and of every block.
        intermediate_size: Inner width of each block's gated MLP.
        num_layers: Number of blocks in the stack.
        attention_indices: Layer indices built as attention blocks; every
            other index is a ``"deltanet"`` block. ``None`` selects
            ``DEFAULT_ATTENTION_INDICES``. Indices ``>= num_layers`` have no
            effect.
        num_q_heads: Forwarded to each block.
        num_kv_heads: Forwarded to each block.
        head_dim: Forwarded to each block.
        rope_dim: Forwarded to each block.
        rope_base: Forwarded to each block.
        out_dim: Output width. ``None`` or ``hidden_size`` keeps the width
            and makes ``proj`` an ``nn.Identity``; any other value adds a
            bias-free linear projection.
    """

    DEFAULT_ATTENTION_INDICES = (3, 7, 11, 15, 19, 23)

    # Weight-name suffixes present in every block, in transplant order.
    _COMMON_SUFFIXES = (
        "input_layernorm.weight",
        "post_attention_layernorm.weight",
        "mlp.gate_proj.weight",
        "mlp.up_proj.weight",
        "mlp.down_proj.weight",
    )
    # Additional suffixes that exist only in attention blocks.
    _ATTENTION_SUFFIXES = (
        "self_attn.q_proj.weight",
        "self_attn.k_proj.weight",
        "self_attn.v_proj.weight",
        "self_attn.o_proj.weight",
        "self_attn.q_norm.weight",
        "self_attn.k_norm.weight",
    )

    def __init__(
        self,
        hidden_size: int = 1024,
        intermediate_size: int = 3584,
        num_layers: int = 24,
        attention_indices: Optional[Tuple[int, ...]] = None,
        num_q_heads: int = 16,
        num_kv_heads: int = 2,
        head_dim: int = 256,
        rope_dim: int = 64,
        rope_base: float = 1_000_000.0,
        out_dim: Optional[int] = None,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_layers = num_layers
        self.attention_indices = tuple(
            self.DEFAULT_ATTENTION_INDICES if attention_indices is None else attention_indices
        )
        self.num_q_heads = num_q_heads
        self.num_kv_heads = num_kv_heads
        self.head_dim = head_dim
        self.rope_dim = rope_dim
        self.rope_base = rope_base

        attention_set = set(self.attention_indices)
        self.layers = nn.ModuleList(
            [
                QwenAlignedBlock(
                    hidden_size=hidden_size,
                    intermediate_size=intermediate_size,
                    layer_kind="attention" if i in attention_set else "deltanet",
                    num_q_heads=num_q_heads,
                    num_kv_heads=num_kv_heads,
                    head_dim=head_dim,
                    rope_dim=rope_dim,
                    rope_base=rope_base,
                )
                for i in range(num_layers)
            ]
        )
        self.norm = QwenRMSNorm(hidden_size)

        target_dim = hidden_size if out_dim is None else int(out_dim)
        self.out_dim = target_dim
        if target_dim == hidden_size:
            self.proj = nn.Identity()
        else:
            self.proj = nn.Linear(hidden_size, target_dim, bias=False)
        self.gate = nn.Parameter(torch.zeros(()))  # learned residual gate, init 0 (identity)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Refine ``hidden_states`` through the block stack.

        Args:
            hidden_states: Tensor whose last dimension equals ``hidden_size``.
            attention_mask: Optional key mask forwarded to every block.

        Returns:
            When ``proj`` is an identity: ``x + tanh(gate) * (h - x)``, which
            equals the input while ``gate`` is at its zero init. Otherwise the
            projected output scaled by ``1 + tanh(gate)``, with last dimension
            ``out_dim``.

        Raises:
            ValueError: If the last dimension of ``hidden_states`` is not
                ``hidden_size``.
        """
        if hidden_states.shape[-1] != self.hidden_size:
            raise ValueError(
                f"QwenAlignedTextRefiner expected dim {self.hidden_size}, got {hidden_states.shape[-1]}"
            )
        residual = hidden_states
        h = hidden_states
        for layer in self.layers:
            h = layer(h, attention_mask=attention_mask)
        h = self.norm(h)
        h = self.proj(h)
        if isinstance(self.proj, nn.Identity):
            # gate=0 init: refiner starts as identity; training learns to mix it in
            return residual + torch.tanh(self.gate) * (h - residual)
        # When projecting to a new dim, residual is not addable, so return h directly.
        # gate is still a learnable scalar so downstream training can dampen this path.
        return h * (1.0 + torch.tanh(self.gate))

    def get_qwen_state_dict_map(self) -> List[Tuple[str, str]]:
        """Return ``(qwen_key, our_key)`` pairs for weight transplant.

        Every layer contributes its two norms and three MLP weights; layers
        listed in ``attention_indices`` additionally contribute the six
        ``self_attn`` weights. The final ``norm.weight`` comes last. Both
        sides use the same ``layers.{i}.…`` naming.

        Keys are paired by name only; no tensor shapes are inspected here, so
        the caller is responsible for checking shapes before copying.
        """
        attention_set = set(self.attention_indices)  # built once, not per layer
        pairs: List[Tuple[str, str]] = []
        for i in range(self.num_layers):
            ours = f"layers.{i}"
            qwen = f"layers.{i}"
            suffixes = self._COMMON_SUFFIXES
            if i in attention_set:
                suffixes = suffixes + self._ATTENTION_SUFFIXES
            pairs.extend((f"{qwen}.{suffix}", f"{ours}.{suffix}") for suffix in suffixes)
        pairs.append(("norm.weight", "norm.weight"))
        return pairs