base_IIXIV

Instructions to use mainline777/base_IIXIV with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use mainline777/base_IIXIV with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="mainline777/base_IIXIV", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("mainline777/base_IIXIV", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use mainline777/base_IIXIV with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "mainline777/base_IIXIV"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "mainline777/base_IIXIV",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/mainline777/base_IIXIV

SGLang

How to use mainline777/base_IIXIV with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "mainline777/base_IIXIV" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "mainline777/base_IIXIV",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "mainline777/base_IIXIV" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "mainline777/base_IIXIV",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use mainline777/base_IIXIV with Docker Model Runner:
```
docker model run hf.co/mainline777/base_IIXIV
```

base_IIXIV / fla /layers /based.py

mainline777

Duplicate from silx-ai/Quasar-Preview

41865df 24 days ago

Raw

History Blame Contribute Delete

3.75 kB

	# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang

	"""
	Linear attention in Based.
	https://github.com/HazyResearch/zoology/blob/main/zoology/mixers/based.py
	"""

	import torch
	import torch.nn as nn
	from einops import rearrange

	from fla.modules.feature_map import TaylorFeatureMap
	from fla.ops.based import parallel_based
	from fla.ops.linear_attn import chunk_linear_attn, fused_chunk_linear_attn


	class BasedLinearAttention(nn.Module):
	    """Linear-attention mixer in the style of Based.

	    Hidden states are projected to queries/keys with ``feature_dim`` channels
	    per head and to values with ``head_dim`` channels per head. The attention
	    itself is delegated to one of three kernels selected by ``mode``; the
	    result is merged across heads and passed through ``o_proj``.

	    Args:
	        hidden_size: Size of the last dimension of the input and output.
	        feature_dim: Per-head width of the query/key projections; also the
	            size handed to ``TaylorFeatureMap``.
	        num_key_value_heads: Number of value heads. ``head_dim`` is derived
	            as ``hidden_size // num_key_value_heads``.
	        num_heads: Number of query/key heads.
	        feature_name: Stored on the module; not otherwise read by this class.
	        eps: Added to the normaliser in :meth:`forward_reference`.
	        causal: Selects the cumulative (causal) or global sum in
	            :meth:`forward_reference`; :meth:`forward` does not read it.
	        mode: Kernel used by :meth:`forward`: ``"parallel"``, ``"chunk"`` or
	            ``"fused_chunk"``.
	    """

	    # Modes that `forward` knows how to dispatch.
	    _SUPPORTED_MODES = ("fused_chunk", "chunk", "parallel")

	    def __init__(
	        self,
	        hidden_size: int,
	        feature_dim: int = 16,
	        num_key_value_heads: int = 12,
	        num_heads: int = 12,
	        feature_name: str = "taylor_exp",
	        eps: float = 1e-12,
	        causal: bool = True,
	        mode: str = "parallel",
	    ):
	        super().__init__()

	        self.hidden_size = hidden_size
	        self.mode = mode
	        self.feature_name = feature_name
	        self.feature_dim = feature_dim
	        self.num_key_value_heads = num_key_value_heads
	        self.num_heads = num_heads
	        self.head_dim = self.hidden_size // self.num_key_value_heads
	        # NOTE(review): this checks divisibility by the derived head_dim, not by
	        # num_key_value_heads; kept unchanged so existing configurations that
	        # pass today keep passing.
	        assert self.hidden_size % self.head_dim == 0
	        self.causal = causal

	        # q/k live in the (small) feature space, v in the regular head space.
	        self.q_proj = nn.Linear(self.hidden_size, self.feature_dim * self.num_heads, bias=False)
	        self.k_proj = nn.Linear(self.hidden_size, self.feature_dim * self.num_heads, bias=False)
	        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
	        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
	        self.dropout = nn.Identity()
	        self.feature_map = TaylorFeatureMap(feature_dim)
	        self.eps = eps

	    def forward(self, hidden_states: torch.Tensor, **kwargs):
	        """Run the kernel selected by ``self.mode``.

	        Args:
	            hidden_states: Input whose last dimension is ``hidden_size``.
	                The final merge uses the pattern ``'b t h d -> b t (h d)'``,
	                so a ``(batch, time, hidden_size)`` layout is assumed.
	            **kwargs: Accepted for interface compatibility and ignored.

	        Returns:
	            The output of ``o_proj`` applied to the merged attention heads.

	        Raises:
	            NotImplementedError: If ``self.mode`` is not one of
	                ``"fused_chunk"``, ``"chunk"`` or ``"parallel"``.
	        """
	        mode = self.mode
	        # Fail early with a clear message instead of an UnboundLocalError on
	        # `o` after the projections have already been computed.
	        if mode not in self._SUPPORTED_MODES:
	            raise NotImplementedError(f"Not supported mode `{mode}`.")

	        q, k, v = self.q_proj(hidden_states), self.k_proj(hidden_states), self.v_proj(hidden_states)
	        # q/k are projected to `feature_dim` channels per head, v to `head_dim`
	        # channels per head, so each must be split with its own per-head width
	        # (this matches the views taken in `forward_reference`).
	        q = rearrange(q, "... (h d) -> ... h d", d=self.feature_dim)
	        k = rearrange(k, "... (h d) -> ... h d", d=self.feature_dim)
	        v = rearrange(v, "... (h d) -> ... h d", d=self.head_dim)

	        if mode == "fused_chunk":
	            q, k = self.feature_map(q), self.feature_map(k)
	            o, _ = fused_chunk_linear_attn(q, k, v, normalize=True, scale=1)
	        elif mode == "chunk":
	            q, k = self.feature_map(q), self.feature_map(k)
	            o, _ = chunk_linear_attn(q, k, v, normalize=True, scale=1)
	        else:  # mode == "parallel"
	            # q/k are passed without the explicit feature map in this mode.
	            # NOTE(review): the 128 bound is presumably a kernel limit - confirm.
	            assert q.shape[-1] <= 128
	            o = parallel_based(q, k, v, scale=1, use_norm=True)

	        o = rearrange(o, 'b t h d -> b t (h d)')
	        o = self.o_proj(o)
	        o = self.dropout(o)
	        return o

	    def forward_reference(self, hidden_states: torch.Tensor, **kwargs):
	        """Reference implementation written with plain tensor operations.

	        Args:
	            hidden_states: Tensor of shape ``(b, t, hidden_size)``.
	            **kwargs: Accepted for interface compatibility and ignored.

	        Returns:
	            Tensor produced by ``o_proj``, cast to the dtype of
	            ``hidden_states``.

	        NOTE(review): ``k`` is viewed with ``num_key_value_heads`` although
	        ``k_proj`` emits ``feature_dim * num_heads`` channels, so this path
	        appears to require ``num_heads == num_key_value_heads`` - confirm.
	        """
	        b, t, _ = hidden_states.size()
	        q, k, v = self.q_proj(hidden_states), self.k_proj(hidden_states), self.v_proj(hidden_states)

	        # Split heads and move the head axis ahead of time: (b, h, t, channels).
	        q = q.view(b, t, self.num_heads, self.feature_dim).transpose(1, 2)
	        k = k.view(b, t, self.num_key_value_heads, self.feature_dim).transpose(1, 2)
	        v = v.view(b, t, self.num_key_value_heads, self.head_dim).transpose(1, 2)

	        # Linear attention
	        q, k = self.feature_map(q), self.feature_map(k)
	        # Insert singleton axes so k * v broadcasts to an outer product over
	        # (value channel, feature channel) at every position.
	        q, k, v = q.unsqueeze(-2), k.unsqueeze(-2), v.unsqueeze(-1)

	        # Compute attention
	        if self.causal:
	            # Running sums along the time axis (dim 2) give the causal prefix.
	            y = ((q * (k * v).cumsum(2)).sum(-1) / ((q * k.cumsum(2)).sum(-1) + self.eps))
	        else:
	            # Global sums along the time axis: every position sees all others.
	            y = ((q * (k * v).sum(2, True)).sum(-1) / ((q * k.sum(2, True)).sum(-1) + self.eps))
	        y = rearrange(y, 'b h t d -> b t (h d)')
	        y = self.o_proj(y.to(hidden_states.dtype))
	        y = self.dropout(y)
	        return y.to(hidden_states.dtype)