base_IIXIV

Instructions to use mainline777/base_IIXIV with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use mainline777/base_IIXIV with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="mainline777/base_IIXIV", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("mainline777/base_IIXIV", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use mainline777/base_IIXIV with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "mainline777/base_IIXIV"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "mainline777/base_IIXIV",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/mainline777/base_IIXIV

SGLang

How to use mainline777/base_IIXIV with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "mainline777/base_IIXIV" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "mainline777/base_IIXIV",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "mainline777/base_IIXIV" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "mainline777/base_IIXIV",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use mainline777/base_IIXIV with Docker Model Runner:
```
docker model run hf.co/mainline777/base_IIXIV
```

base_IIXIV / fla / layers / utils.py

mainline777

Duplicate from silx-ai/Quasar-Preview

41865df 25 days ago

Raw

History Blame Contribute Delete

7.92 kB

	# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang

	# Code is adapted from flash-attn.bert_padding.py


	import torch
	from einops import rearrange, repeat

	from fla.ops.utils.index import prepare_cu_seqlens_from_mask, prepare_lens_from_mask
	from fla.utils import tensor_cache

	# Error-message template used by `require_cache_layer_idx`; `{cls}` is filled
	# with the class name of the module that is missing its `layer_idx`.
	_LAYER_IDX_REQUIRED_MSG = "{cls} requires `layer_idx` when `past_key_values` is provided."


	class IndexFirstAxis(torch.autograd.Function):
	    """
	    Autograd function that selects entries of `x` along its first axis.

	    The forward pass gathers the rows listed in `indices`; the backward pass
	    scatters the incoming gradient into a zero tensor whose first axis has the
	    size recorded during the forward pass.
	    """

	    @staticmethod
	    def forward(ctx, x, indices):
	        """
	        Args:
	            x (`torch.Tensor`):
	                Input with at least 2 dimensions; the first axis is the one indexed.
	            indices (`torch.Tensor`):
	                1-D indices into the first axis of `x`.

	        Return:
	            Tensor of shape [len(indices), *x.shape[1:]].
	        """
	        ctx.save_for_backward(indices)
	        assert x.ndim >= 2
	        trailing_shape = x.shape[1:]
	        # Remember the size of the indexed axis so backward can rebuild it.
	        ctx.first_axis_dim = x.shape[0]
	        flat_width = trailing_shape.numel()
	        # Collapse all trailing axes so a single 2-D gather can do the selection.
	        # Upstream note (TD, 2022-03-04): torch.gather was measured to be a bit
	        # faster than plain indexing (`x[indices]`).
	        flat_x = rearrange(x, "b ... -> b (...)")
	        gather_index = repeat(indices, "z -> z d", d=flat_width)
	        gathered = torch.gather(flat_x, 0, gather_index)
	        return gathered.reshape(-1, *trailing_shape)

	    @staticmethod
	    def backward(ctx, do):
	        """
	        Args:
	            do (`torch.Tensor`):
	                Gradient w.r.t. the forward output, with at least 2 dimensions.

	        Return:
	            Gradient w.r.t. `x` (zeros everywhere except the gathered rows) and
	            `None` for `indices`.
	        """
	        (indices,) = ctx.saved_tensors
	        assert do.ndim >= 2
	        trailing_shape = do.shape[1:]
	        flat_do = rearrange(do, "b ... -> b (...)")
	        num_rows, flat_width = ctx.first_axis_dim, flat_do.shape[1]
	        dx = torch.zeros(
	            [num_rows, flat_width],
	            device=flat_do.device,
	            dtype=flat_do.dtype,
	        )
	        # Upstream note (TD, 2022-03-04): torch.scatter was measured to be a bit
	        # faster than plain indexing (`dx[indices] = do`).
	        scatter_index = repeat(indices, "z -> z d", d=flat_width)
	        dx.scatter_(0, scatter_index, flat_do)
	        return dx.reshape(num_rows, *trailing_shape), None


	index_first_axis = IndexFirstAxis.apply


	class IndexPutFirstAxis(torch.autograd.Function):
	    """
	    Autograd function that writes the rows of `x` into a zero tensor.

	    The forward pass allocates zeros with `first_axis_dim` rows and assigns `x`
	    at the positions given by `indices`; the backward pass reads the gradient
	    back from those same positions.
	    """

	    @staticmethod
	    def forward(ctx, x, indices, first_axis_dim):
	        """
	        Args:
	            x (`torch.Tensor`):
	                Values to place, with at least 2 dimensions.
	            indices (`torch.Tensor`):
	                1-D destination positions along the first axis of the output.
	            first_axis_dim (`int`):
	                Size of the first axis of the output.

	        Return:
	            Tensor of shape [first_axis_dim, *x.shape[1:]] with the same device
	            and dtype as `x`.
	        """
	        ctx.save_for_backward(indices)
	        assert indices.ndim == 1
	        assert x.ndim >= 2
	        out_shape = (first_axis_dim, *x.shape[1:])
	        y = torch.zeros(out_shape, device=x.device, dtype=x.dtype)
	        # Upstream TODO (2022-03-04): torch.scatter was measured to be a bit
	        # faster than indexing, i.e.
	        # y.scatter_(0, repeat(indices, 'z -> z d', d=x.shape[1]), x)
	        y[indices] = x
	        return y

	    @staticmethod
	    def backward(ctx, do):
	        """
	        Return the rows of `do` at the saved indices as the gradient for `x`;
	        `indices` and `first_axis_dim` receive no gradient.
	        """
	        (indices,) = ctx.saved_tensors
	        # Upstream TODO (2022-03-04): torch.gather was measured to be a bit
	        # faster than indexing, i.e.
	        # torch.gather(do, 0, repeat(indices, 'z -> z d', d=do.shape[1]))
	        return do[indices], None, None


	index_put_first_axis = IndexPutFirstAxis.apply


	@tensor_cache
	def get_unpad_data(
	    attention_mask: torch.Tensor,
	) -> tuple[torch.Tensor, torch.Tensor, int]:
	    """
	    Computes the indexing data needed to unpad and later repad a padded batch.

	    Args:
	        attention_mask (`torch.Tensor`):
	            Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.

	    Return:
	        indices (`torch.Tensor`):
	            Positions of the non-zero mask entries in the flattened mask.
	        cu_seqlens (`torch.Tensor`):
	            The cumulative sequence lengths, used to index into ragged (unpadded) tensors.
	            `cu_seqlens` shape is [batch_size + 1].
	        max_seqlen_in_batch (`int`):
	            Largest per-sequence length in the batch.
	    """
	    seq_lens = prepare_lens_from_mask(attention_mask)
	    flat_mask = attention_mask.flatten()
	    token_positions = torch.nonzero(flat_mask, as_tuple=False).flatten()
	    # `.item()` turns the 0-dim max into a plain Python number.
	    longest = seq_lens.max().item()
	    cumulative = prepare_cu_seqlens_from_mask(attention_mask)
	    return token_positions, cumulative, longest


	def unpad_input(
	    q: torch.Tensor,
	    states: tuple[torch.Tensor],
	    attention_mask: torch.Tensor,
	    q_len: int,
	    keepdim: bool = False,
	):
	    """
	    Removes padding from the query and the attention states, packing all valid
	    tokens of every sequence into a single leading dimension.

	    Arguments:
	        q (`torch.Tensor`):
	            Query state with padding. Shape: [batch_size, q_len, ...].
	        states (`Tuple[torch.Tensor]`):
	            Attention states with padding. Shape of each: [batch_size, seq_len, ...].
	        attention_mask (`torch.Tensor`):
	            Boolean or int tensor of shape [batch_size, sequence_length], 1 means valid and 0 means not valid.
	        q_len (`int`):
	            Target (query) length. Must equal `seq_len` (prefilling) or 1 (decoding).
	        keepdim (`bool`):
	            Whether to add back a leading dimension of size 1. Default: `False`.

	    Return:
	        q (`torch.Tensor`):
	            Query state without padding.
	            Shape: [1, total_target_length, ...] if `keepdim=True` else [total_target_length, ...].
	        states (`Tuple[torch.Tensor]`):
	            Attention states without padding.
	            Shape: [1, total_source_length, ...] if `keepdim=True` else [total_source_length, ...].
	        indices_q (`torch.Tensor`):
	            The indices of non-masked tokens from the flattened input target sequence.
	        (cu_seqlens_q, cu_seqlens_k) (`Tuple[torch.Tensor]`):
	            The cumulative sequence lengths for the target (query) and source (key, value),
	            used to index into ragged (unpadded) tensors.
	            `cu_seqlens` shape is [batch_size + 1].
	        (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`Tuple[int]`):
	            Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence
	            i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value).

	    Raises:
	        NotImplementedError: if `q_len` is neither `seq_len` nor 1.
	    """
	    indices_k, cu_seqlens_k, max_seqlen_in_batch_k = get_unpad_data(attention_mask)
	    batch_size, seq_len = states[0].shape[:2]

	    def _select_valid(padded):
	        # Merge (batch, seq) into one axis, then keep only the unmasked tokens.
	        return index_first_axis(rearrange(padded, "b s ... -> (b s) ..."), indices_k)

	    unpadded_states = tuple(map(_select_valid, states))

	    if q_len == seq_len:
	        # Prefilling: the query shares the key/value layout, so reuse its metadata.
	        q = _select_valid(q)
	        indices_q = indices_k
	        cu_seqlens_q = cu_seqlens_k
	        max_seqlen_in_batch_q = max_seqlen_in_batch_k
	    elif q_len == 1:
	        # Decoding: exactly one query token per sequence, so the cumulative
	        # lengths are simply 0, 1, ..., batch_size.
	        cu_seqlens_q = torch.arange(batch_size + 1, dtype=torch.int32, device=q.device)
	        indices_q = cu_seqlens_q[:-1]
	        max_seqlen_in_batch_q = 1
	        q = q.squeeze(1)
	    else:
	        raise NotImplementedError("We only support either q_len == k_len (prefilling) or q_len == 1 (decoding)")

	    if keepdim:
	        q = q.unsqueeze(0)
	        unpadded_states = tuple(s.unsqueeze(0) for s in unpadded_states)

	    cu_seqlens = (cu_seqlens_q, cu_seqlens_k)
	    max_seqlens = (max_seqlen_in_batch_q, max_seqlen_in_batch_k)
	    return q, unpadded_states, indices_q, cu_seqlens, max_seqlens


	def pad_input(
	    hidden_states: torch.Tensor,
	    indices: torch.LongTensor,
	    batch_size: int,
	    seq_len: int,
	) -> torch.Tensor:
	    """
	    Scatters unpadded tokens back into a zero-filled padded batch.

	    Args:
	        hidden_states ([total_tokens, ...]):
	            where total_tokens denotes the number of tokens selected in attention_mask.
	        indices ([total_tokens]):
	            the indices that represent the non-masked tokens of the original padded input sequence.
	        batch_size (int):
	            batch size of the padded sequence.
	        seq_len (int):
	            maximum sequence length for the padded sequence.

	    Return:
	        hidden_states of shape [batch_size, seq_len, ...]
	    """
	    # The flat buffer has one row per (batch, position) slot; unlisted slots stay zero.
	    total_slots = batch_size * seq_len
	    flat_padded = index_put_first_axis(hidden_states, indices, total_slots)
	    return rearrange(flat_padded, "(b s) ... -> b s ...", b=batch_size)


	def require_cache_layer_idx(module, past_key_values):
	    """
	    Returns the `layer_idx` attribute of `module`, insisting on it when a cache is in use.

	    Args:
	        module:
	            Object whose optional `layer_idx` attribute is read.
	        past_key_values:
	            Cache object, or `None` when no cache is provided.

	    Return:
	        The value of `module.layer_idx`, or `None` if the attribute is missing or `None`
	        and no cache was provided.

	    Raises:
	        ValueError: if `past_key_values` is provided but `module` has no `layer_idx`.
	    """
	    layer_idx = getattr(module, "layer_idx", None)
	    # Nothing to enforce when the index exists or when no cache is involved.
	    if layer_idx is not None or past_key_values is None:
	        return layer_idx
	    raise ValueError(_LAYER_IDX_REQUIRED_MSG.format(cls=module.__class__.__name__))


	def get_layer_cache(module, past_key_values):
	    """
	    Looks up the cache entry belonging to `module`'s layer.

	    Args:
	        module:
	            Object carrying the `layer_idx` used as the lookup position.
	        past_key_values:
	            Indexable, sized cache object, or `None`.

	    Return:
	        `past_key_values[layer_idx]` when a cache is given and it holds more than
	        `layer_idx` entries; otherwise `None`.

	    Raises:
	        ValueError: if `past_key_values` is provided but `module` has no `layer_idx`.
	    """
	    layer_idx = require_cache_layer_idx(module, past_key_values)
	    if past_key_values is None:
	        return None
	    # The cache may not have an entry for this layer yet.
	    has_entry = len(past_key_values) > layer_idx
	    return past_key_values[layer_idx] if has_entry else None


	def update_layer_cache(module, past_key_values, **kwargs):
	    """
	    Forwards an update for `module`'s layer to the cache.

	    Args:
	        module:
	            Object carrying the `layer_idx` passed to the cache.
	        past_key_values:
	            Cache object exposing `update(layer_idx=..., **kwargs)`, or `None`.
	        **kwargs:
	            Extra keyword arguments passed through to `past_key_values.update`.

	    Return:
	        Whatever `past_key_values.update` returns, or `None` when no cache is given.

	    Raises:
	        ValueError: if `past_key_values` is provided but `module` has no `layer_idx`.
	    """
	    layer_idx = require_cache_layer_idx(module, past_key_values)
	    if past_key_values is None:
	        return None
	    return past_key_values.update(layer_idx=layer_idx, **kwargs)