Instructions to use ayjays132/Phillnet-2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use ayjays132/Phillnet-2 with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

# trust_remote_code=True lets Transformers execute Python code shipped in the
# model repository; review that code before enabling the flag.
pipe = pipeline("text-generation", model="ayjays132/Phillnet-2", trust_remote_code=True)
# Chat-style input: a list of message dicts with "role" and "content" keys.
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# trust_remote_code=True lets Transformers execute Python code shipped in the
# model repository; review that code before enabling the flag.
tokenizer = AutoTokenizer.from_pretrained("ayjays132/Phillnet-2", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("ayjays132/Phillnet-2", trust_remote_code=True)
# Chat-style input: a list of message dicts with "role" and "content" keys.
messages = [
    {"role": "user", "content": "Who are you?"},
]
# Render the chat template to tensors and move them to the model's device.
inputs = tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
# Slice off the prompt tokens so only the newly generated text is printed.
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use ayjays132/Phillnet-2 with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "ayjays132/Phillnet-2"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "ayjays132/Phillnet-2",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/ayjays132/Phillnet-2

SGLang

How to use ayjays132/Phillnet-2 with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "ayjays132/Phillnet-2" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "ayjays132/Phillnet-2",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "ayjays132/Phillnet-2" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "ayjays132/Phillnet-2",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use ayjays132/Phillnet-2 with Docker Model Runner:
```
docker model run hf.co/ayjays132/Phillnet-2
```

Phillnet-2 / ImageGen / model / qwen_aligned_text.py

ayjays132

Upload 478 files

101858b verified 3 days ago

raw

history blame contribute delete

14.8 kB

	"""Qwen3.5-aligned text refiner.

	Mirrors the per-layer tensor shapes of ``Qwen3_5TextModel`` so that
	``transplant_qwen_text_weights.py`` can load real Qwen3.5 weights into our
	own modules. The mirror is intentionally minimal and architecture-faithful
	where it can be (RMSNorm, SwiGLU MLP, GQA + rotary), and approximate where
	Qwen3.5 uses an exotic op (Gated DeltaNet). Layers that mirror DeltaNet
	keep the input/post norms and MLP weights (which transplant 1:1) but
	replace the linear-attention mixing with an identity pass — letting the
	6 standard ``self_attn`` layers carry the cross-token mixing.

	This module shares activation singletons (``SHARED_SILU``) and keeps
	weight names aligned with Qwen's ``layers.{i}.{...}`` paths so transplant
	is a direct key map. It is dim-agnostic at construction time; defaults
	match Qwen3.5-0.8B exactly.
	"""

	from __future__ import annotations

	import math
	from typing import List, Optional, Tuple

	import torch
	import torch.nn as nn
	import torch.nn.functional as F


	# ---------------------------------------------------------------------------
	# Shared activation singletons. Importing modules can grab these instead of
	# instantiating their own; all share the same nn.Module instance so the
	# adapter has one canonical SiLU rather than thirty.
	#
	# Only SHARED_SILU is referenced by the code visible in this file;
	# SHARED_GELU and SHARED_SIGMOID are presumably consumed by other modules
	# (TODO confirm before removing either of them).
	# ---------------------------------------------------------------------------

	SHARED_SILU = nn.SiLU()
	SHARED_GELU = nn.GELU()
	SHARED_SIGMOID = nn.Sigmoid()


	# ---------------------------------------------------------------------------
	# Primitives
	# ---------------------------------------------------------------------------


	class QwenRMSNorm(nn.Module):
	    """Root-mean-square norm over the last axis with a learnable scale.

	    The module owns a single ``weight`` parameter of shape ``(dim,)``
	    initialised to ones; there is no bias. ``eps`` (default ``1e-6``) is
	    added to the mean square before the reciprocal square root.

	    NOTE(review): ``weight`` multiplies the normalised input directly.
	    Confirm the checkpoint being transplanted uses the same convention
	    (some RMSNorm variants scale by ``1 + weight``) before relying on a
	    raw weight copy.
	    """

	    def __init__(self, dim: int, eps: float = 1e-6):
	        super().__init__()
	        self.eps = eps
	        self.weight = nn.Parameter(torch.ones(dim))

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Normalise ``x`` along its last axis; the result keeps ``x``'s dtype."""
	        in_dtype = x.dtype
	        # All arithmetic runs in float32; only the final result is cast back.
	        x32 = x.float()
	        mean_square = x32.pow(2).mean(-1, keepdim=True)
	        normed = x32 * torch.rsqrt(mean_square + self.eps)
	        scaled = normed * self.weight.float()
	        return scaled.to(in_dtype)


	def _build_inv_freq(rope_dim: int, base: float, device, dtype) -> torch.Tensor:
	half = rope_dim // 2
	return 1.0 / (base ** (torch.arange(0, half, device=device, dtype=dtype) / half))


	def _apply_rotary(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
	"""Apply rotary to the first ``cos.shape[-1] * 2`` dims of head_dim."""
	rope_dim = cos.shape[-1] * 2
	x_rope, x_pass = x[..., :rope_dim], x[..., rope_dim:]
	x1, x2 = x_rope.chunk(2, dim=-1)
	rotated = torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)
	return torch.cat([rotated, x_pass], dim=-1)


	class QwenSwiGLU(nn.Module):
	    """Bias-free gated MLP: ``down_proj(act(gate_proj(x)) * up_proj(x))``.

	    Parameter names (``gate_proj``, ``up_proj``, ``down_proj``) follow the
	    Qwen ``mlp`` layout so weights can be copied by key. The activation is
	    the module-level ``SHARED_SILU`` instance rather than a fresh one.
	    """

	    def __init__(self, hidden_size: int, intermediate_size: int):
	        super().__init__()
	        self.hidden_size = hidden_size
	        self.intermediate_size = intermediate_size
	        # Expand twice (gate and value paths), then contract back.
	        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
	        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
	        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
	        self.act = SHARED_SILU

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Apply the gated MLP to the last axis of ``x``."""
	        gate = self.act(self.gate_proj(x))
	        value = self.up_proj(x)
	        return self.down_proj(gate * value)


	class QwenGatedGQA(nn.Module):
	    """Grouped-query attention with partial rotary, per-head q/k RMSNorm and
	    an output gate taken from half of the ``q_proj`` heads.

	    ``q_proj`` emits ``num_q_heads`` heads. The first ``num_q_heads // 2``
	    are normalised, rotated and attended against the (repeated) key/value
	    heads; the remaining half goes through SiLU and is multiplied onto the
	    attention output before ``o_proj``. No causal mask is applied: only the
	    optional key-padding mask restricts attention.

	    Shapes for the defaults (documented as Qwen3.5-0.8B):
	        q_proj: (q_heads*head_dim, hidden)       = (4096, 1024)
	        k_proj: (kv_heads*head_dim, hidden)      = ( 512, 1024)
	        v_proj: (kv_heads*head_dim, hidden)      = ( 512, 1024)
	        o_proj: (hidden, (q_heads//2)*head_dim)  = (1024, 2048)
	        q_norm: (head_dim,)                      = (256,)
	        k_norm: (head_dim,)                      = (256,)

	    NOTE(review): splitting ``q_proj`` into a leading "attention" block of
	    heads and a trailing "gate" block, and gating with SiLU, is this
	    module's own convention. Confirm against the reference implementation
	    (which may interleave query/gate per head and use a sigmoid gate)
	    before assuming transplanted ``q_proj`` weights behave identically.

	    Raises:
	        AssertionError: if ``num_q_heads`` is odd or not a multiple of
	            ``num_kv_heads`` (kept as assertions for backward compatibility).
	        ValueError: if the attending half (``num_q_heads // 2``) is not a
	            positive multiple of ``num_kv_heads``.
	    """

	    def __init__(
	        self,
	        hidden_size: int = 1024,
	        num_q_heads: int = 16,
	        num_kv_heads: int = 2,
	        head_dim: int = 256,
	        rope_dim: int = 64,
	        rope_base: float = 1_000_000.0,
	    ):
	        super().__init__()
	        assert num_q_heads % 2 == 0, "q heads must be even for Qwen3.5 gated split"
	        assert num_q_heads % num_kv_heads == 0, "q heads must be a multiple of kv heads"
	        num_attn_heads = num_q_heads // 2  # half routed through attention
	        # The heads that actually attend are the *halved* set, so that is the
	        # count that must divide evenly across kv heads. Without this check
	        # configs such as (q=6, kv=2) or (q=2, kv=2) construct fine and then
	        # fail inside forward() with an opaque shape error.
	        if num_attn_heads < num_kv_heads or num_attn_heads % num_kv_heads != 0:
	            raise ValueError(
	                f"attention heads (num_q_heads // 2 = {num_attn_heads}) must be a "
	                f"positive multiple of num_kv_heads ({num_kv_heads})"
	            )
	        self.hidden_size = hidden_size
	        self.num_q_heads = num_q_heads
	        self.num_kv_heads = num_kv_heads
	        self.num_attn_heads = num_attn_heads
	        self.head_dim = head_dim
	        self.rope_dim = rope_dim
	        self.rope_base = rope_base
	        self.kv_repeat = num_attn_heads // num_kv_heads

	        q_dim = num_q_heads * head_dim
	        kv_dim = num_kv_heads * head_dim
	        attn_out = num_attn_heads * head_dim
	        self.q_proj = nn.Linear(hidden_size, q_dim, bias=False)
	        self.k_proj = nn.Linear(hidden_size, kv_dim, bias=False)
	        self.v_proj = nn.Linear(hidden_size, kv_dim, bias=False)
	        self.o_proj = nn.Linear(attn_out, hidden_size, bias=False)
	        self.q_norm = QwenRMSNorm(head_dim)
	        self.k_norm = QwenRMSNorm(head_dim)

	    def _rotary(self, seq_len: int, device, dtype) -> Tuple[torch.Tensor, torch.Tensor]:
	        """Return ``(cos, sin)`` tables of shape ``(1, 1, seq_len, rope_dim // 2)``.

	        Angles are always computed in float32 and only the final tables are
	        cast to ``dtype``: building positions directly in fp16/bf16 cannot
	        represent larger integer indices exactly, so distinct positions
	        would collapse onto the same rotation.
	        """
	        inv_freq = _build_inv_freq(self.rope_dim, self.rope_base, device, torch.float32)
	        pos = torch.arange(seq_len, device=device, dtype=torch.float32)
	        freqs = torch.einsum("i,j->ij", pos, inv_freq)  # (T, rope_dim/2)
	        cos = freqs.cos().to(dtype)
	        sin = freqs.sin().to(dtype)
	        # Leading singleton axes broadcast over (B, H, T, D/2).
	        return cos[None, None, :, :], sin[None, None, :, :]

	    def forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
	        """Run gated attention over ``x``.

	        Args:
	            x: hidden states of shape ``(batch, seq_len, hidden_size)``.
	            attention_mask: optional ``(batch, seq_len)`` key mask where a
	                zero entry hides that key from every query.

	        Returns:
	            Tensor of shape ``(batch, seq_len, hidden_size)``.
	        """
	        bsz, seq_len, _ = x.shape
	        q = self.q_proj(x).view(bsz, seq_len, self.num_q_heads, self.head_dim)
	        k = self.k_proj(x).view(bsz, seq_len, self.num_kv_heads, self.head_dim)
	        v = self.v_proj(x).view(bsz, seq_len, self.num_kv_heads, self.head_dim)

	        # Leading heads attend, trailing heads become the output gate.
	        q_attn = q[:, :, : self.num_attn_heads, :]
	        q_gate = q[:, :, self.num_attn_heads :, :]

	        # Per-head norm, then (B, H, T, D) layout for attention.
	        q_attn = self.q_norm(q_attn).transpose(1, 2)
	        k = self.k_norm(k).transpose(1, 2)
	        v = v.transpose(1, 2)

	        cos, sin = self._rotary(seq_len, x.device, q_attn.dtype)
	        q_attn = _apply_rotary(q_attn, cos, sin)
	        k = _apply_rotary(k, cos, sin)

	        # Expand kv heads to match attn heads (GQA).
	        if self.kv_repeat > 1:
	            k = k.repeat_interleave(self.kv_repeat, dim=1)
	            v = v.repeat_interleave(self.kv_repeat, dim=1)

	        scale = 1.0 / math.sqrt(self.head_dim)
	        attn_scores = torch.matmul(q_attn, k.transpose(-2, -1)) * scale
	        if attention_mask is not None:
	            # 1 = keep, 0 = mask; broadcast across heads and queries.
	            masked_keys = attention_mask[:, None, None, :].to(attn_scores.dtype) == 0
	            # Use the most negative finite value instead of -inf: masked keys
	            # still receive exactly zero weight whenever any key is kept,
	            # but a row with every key masked no longer softmaxes to NaN.
	            attn_scores = attn_scores.masked_fill(
	                masked_keys, torch.finfo(attn_scores.dtype).min
	            )
	        attn = attn_scores.softmax(dim=-1)
	        out = torch.matmul(attn, v)  # (B, H, T, D)
	        out = out.transpose(1, 2).reshape(bsz, seq_len, self.num_attn_heads * self.head_dim)

	        # Gate the attention output with SiLU of the trailing q heads.
	        gate = SHARED_SILU(q_gate).reshape(bsz, seq_len, self.num_attn_heads * self.head_dim)
	        return self.o_proj(out * gate)


	class QwenAlignedBlock(nn.Module):
	    """One pre-norm residual block shaped like a Qwen3.5 decoder layer.

	    ``layer_kind="attention"`` builds a ``QwenGatedGQA`` under the name
	    ``self_attn`` (the docstring of the original maps these to Qwen layers
	    3, 7, 11, 15, 19, 23). ``layer_kind="deltanet"`` stands in for the
	    ``linear_attn`` layers: it has no token-mixing module at all
	    (``self_attn`` is ``None``), so no dependency on flash-linear-attention
	    is needed. Both kinds own ``input_layernorm``,
	    ``post_attention_layernorm`` and ``mlp`` so those tensors map 1:1.

	    Raises:
	        ValueError: if ``layer_kind`` is neither ``"attention"`` nor
	            ``"deltanet"``.
	    """

	    def __init__(
	        self,
	        hidden_size: int,
	        intermediate_size: int,
	        layer_kind: str = "attention",
	        num_q_heads: int = 16,
	        num_kv_heads: int = 2,
	        head_dim: int = 256,
	        rope_dim: int = 64,
	        rope_base: float = 1_000_000.0,
	    ):
	        super().__init__()
	        if layer_kind not in {"attention", "deltanet"}:
	            raise ValueError(f"unknown layer_kind: {layer_kind}")
	        self.layer_kind = layer_kind
	        self.input_layernorm = QwenRMSNorm(hidden_size)
	        self.post_attention_layernorm = QwenRMSNorm(hidden_size)

	        attn_kwargs = dict(
	            hidden_size=hidden_size,
	            num_q_heads=num_q_heads,
	            num_kv_heads=num_kv_heads,
	            head_dim=head_dim,
	            rope_dim=rope_dim,
	            rope_base=rope_base,
	        )
	        # Only attention blocks get a mixer; deltanet blocks keep the slot empty.
	        self.self_attn = QwenGatedGQA(**attn_kwargs) if layer_kind == "attention" else None
	        self.mlp = QwenSwiGLU(hidden_size, intermediate_size)

	    def forward(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
	        """Apply the (optional) attention residual, then the MLP residual."""
	        mixer = self.self_attn
	        if mixer is not None:
	            x = x + mixer(self.input_layernorm(x), attention_mask=attention_mask)
	        # Deltanet blocks skip straight here: they contribute only their MLP,
	        # leaving cross-token mixing to the attention blocks.
	        return x + self.mlp(self.post_attention_layernorm(x))


	class QwenAlignedTextRefiner(nn.Module):
	    """Stack of Qwen-aligned blocks.

	    Designed to sit on top of a host text encoder's hidden states and
	    produce a Qwen-conditioned representation at the same hidden dim. The
	    default layout is ``num_layers=24`` with attention blocks at indices
	    3, 7, 11, 15, 19, 23 and "deltanet" (MLP-only) blocks everywhere else;
	    both the depth and the attention indices are configurable so smaller
	    refiners can be transplanted from a Qwen subset. Indices that fall
	    outside ``range(num_layers)`` simply never match a layer.

	    After the stack, a final ``norm`` and ``proj`` are applied. ``proj`` is
	    ``nn.Identity`` when ``out_dim`` is ``None`` or equals ``hidden_size``,
	    otherwise a bias-free ``nn.Linear(hidden_size, out_dim)``. A scalar
	    ``gate`` parameter (initialised to zero) controls how much of the
	    refined signal is mixed in.
	    """

	    DEFAULT_ATTENTION_INDICES = (3, 7, 11, 15, 19, 23)

	    def __init__(
	        self,
	        hidden_size: int = 1024,
	        intermediate_size: int = 3584,
	        num_layers: int = 24,
	        attention_indices: Optional[Tuple[int, ...]] = None,
	        num_q_heads: int = 16,
	        num_kv_heads: int = 2,
	        head_dim: int = 256,
	        rope_dim: int = 64,
	        rope_base: float = 1_000_000.0,
	        out_dim: Optional[int] = None,
	    ):
	        super().__init__()
	        self.hidden_size = hidden_size
	        self.intermediate_size = intermediate_size
	        self.num_layers = num_layers
	        self.attention_indices = tuple(
	            self.DEFAULT_ATTENTION_INDICES if attention_indices is None else attention_indices
	        )
	        self.num_q_heads = num_q_heads
	        self.num_kv_heads = num_kv_heads
	        self.head_dim = head_dim
	        self.rope_dim = rope_dim
	        self.rope_base = rope_base

	        attention_set = set(self.attention_indices)
	        self.layers = nn.ModuleList(
	            [
	                QwenAlignedBlock(
	                    hidden_size=hidden_size,
	                    intermediate_size=intermediate_size,
	                    layer_kind="attention" if i in attention_set else "deltanet",
	                    num_q_heads=num_q_heads,
	                    num_kv_heads=num_kv_heads,
	                    head_dim=head_dim,
	                    rope_dim=rope_dim,
	                    rope_base=rope_base,
	                )
	                for i in range(num_layers)
	            ]
	        )
	        self.norm = QwenRMSNorm(hidden_size)
	        target_dim = hidden_size if out_dim is None else int(out_dim)
	        self.out_dim = target_dim
	        if target_dim == hidden_size:
	            self.proj = nn.Identity()
	        else:
	            self.proj = nn.Linear(hidden_size, target_dim, bias=False)
	        # Learned scalar gate; tanh(0) == 0, so the same-dim path starts as identity.
	        self.gate = nn.Parameter(torch.zeros(()))

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	    ) -> torch.Tensor:
	        """Refine ``hidden_states`` through every block, then norm and project.

	        Args:
	            hidden_states: tensor whose last axis equals ``hidden_size``.
	            attention_mask: optional mask forwarded unchanged to every block.

	        Returns:
	            With an identity ``proj``: ``residual + tanh(gate) * (h - residual)``,
	            i.e. the input itself while ``gate`` is zero. With a linear
	            ``proj``: ``h * (1 + tanh(gate))`` at ``out_dim`` channels.

	        Raises:
	            ValueError: if the last axis of ``hidden_states`` is not
	                ``hidden_size``.
	        """
	        if hidden_states.shape[-1] != self.hidden_size:
	            raise ValueError(
	                f"QwenAlignedTextRefiner expected dim {self.hidden_size}, got {hidden_states.shape[-1]}"
	            )
	        residual = hidden_states
	        h = hidden_states
	        for layer in self.layers:
	            h = layer(h, attention_mask=attention_mask)
	        h = self.proj(self.norm(h))
	        mix = torch.tanh(self.gate)
	        if isinstance(self.proj, nn.Identity):
	            # gate=0 init: refiner starts as identity; training learns to mix it in.
	            return residual + mix * (h - residual)
	        # When projecting to a new dim the residual is not addable, so h is
	        # returned on its own; the gate still lets training rescale this path.
	        return h * (1.0 + mix)

	    def get_qwen_state_dict_map(self) -> List[Tuple[str, str]]:
	        """Return ``(qwen_key, our_key)`` pairs for weight transplant.

	        Every layer contributes its two norms and three MLP projections;
	        layers listed in ``attention_indices`` additionally contribute the
	        six ``self_attn`` tensors. The final ``norm.weight`` pair comes
	        last. Both sides of each pair use the same ``layers.{i}.…`` path;
	        any checkpoint-specific key prefix is presumably handled by the
	        caller (TODO confirm against the transplant script).
	        """
	        shared_suffixes = (
	            "input_layernorm.weight",
	            "post_attention_layernorm.weight",
	            "mlp.gate_proj.weight",
	            "mlp.up_proj.weight",
	            "mlp.down_proj.weight",
	        )
	        attention_suffixes = (
	            "self_attn.q_proj.weight",
	            "self_attn.k_proj.weight",
	            "self_attn.v_proj.weight",
	            "self_attn.o_proj.weight",
	            "self_attn.q_norm.weight",
	            "self_attn.k_norm.weight",
	        )
	        # Built once; the membership test below runs for every layer.
	        attention_set = set(self.attention_indices)

	        pairs: List[Tuple[str, str]] = []
	        for i in range(self.num_layers):
	            ours = f"layers.{i}"
	            qwen = f"layers.{i}"
	            suffixes = shared_suffixes
	            if i in attention_set:
	                suffixes = shared_suffixes + attention_suffixes
	            pairs.extend((f"{qwen}.{suffix}", f"{ours}.{suffix}") for suffix in suffixes)
	        pairs.append(("norm.weight", "norm.weight"))
	        return pairs