Text Generation
Transformers
Safetensors
English
Arabic
quasar_long
silx-ai
quasar-preview
quasar
foundation-model
Mixture of Experts
18b
2b-active
long-context
bittensor
sn24
decentralized-training
distillation
hybrid-transformer
loop-transformer
safe-nope
drope
conversational
custom_code
Instructions to use mainline777/base_IIXIV with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use mainline777/base_IIXIV with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="mainline777/base_IIXIV", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("mainline777/base_IIXIV", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use mainline777/base_IIXIV with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "mainline777/base_IIXIV"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "mainline777/base_IIXIV",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker
docker model run hf.co/mainline777/base_IIXIV
- SGLang
How to use mainline777/base_IIXIV with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "mainline777/base_IIXIV" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "mainline777/base_IIXIV",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "mainline777/base_IIXIV" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "mainline777/base_IIXIV",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
- Docker Model Runner
How to use mainline777/base_IIXIV with Docker Model Runner:
docker model run hf.co/mainline777/base_IIXIV
| # REVISED FROM | |
| # https://github.com/shawntan/stickbreaking-attention/blob/main/stickbreaking_attention/sb_varlen/softplus.py | |
| import triton | |
| from triton import language as tl | |
| from fla.utils import IS_NVIDIA | |
| def _generate_softplus(num_pack): | |
| template = """ | |
| .reg .pred p; | |
| setp.gt.f32 p, ${in_reg}, 20.; | |
| @p mov.f32 ${out_reg}, ${in_reg}; | |
| @!p mul.f32 ${out_reg}, ${in_reg}, 1.4426950408889634; | |
| @!p ex2.approx.ftz.f32 ${out_reg}, ${out_reg}; | |
| @!p add.f32 ${out_reg}, ${out_reg}, 1.0; | |
| @!p lg2.approx.ftz.f32 ${out_reg}, ${out_reg}; | |
| @!p mul.f32 ${out_reg}, ${out_reg}, 0.6931471805599453; | |
| """ | |
| out_str = "" | |
| for i in range(num_pack): | |
| inner_str = template.format(out_reg=i, in_reg=i + num_pack) | |
| out_str += "{" + inner_str + "}\n" | |
| # flatten out because torch.compile doesn't like newlines | |
| out_str = " ".join(out_str.split("\n")) | |
| return out_str | |
| def _generate_softplus2(num_pack): | |
| template = """ | |
| .reg .pred p; | |
| setp.gt.f32 p, ${in_reg}, 15.; | |
| @p mov.f32 ${out_reg}, ${in_reg}; | |
| @!p ex2.approx.ftz.f32 ${out_reg}, ${in_reg}; | |
| @!p add.f32 ${out_reg}, ${out_reg}, 1.0; | |
| @!p lg2.approx.ftz.f32 ${out_reg}, ${out_reg}; | |
| """ | |
| out_str = "" | |
| for i in range(num_pack): | |
| inner_str = template.format(out_reg=i, in_reg=i + num_pack) | |
| out_str += "{" + inner_str + "}\n" | |
| # flatten out because torch.compile doesn't like newlines | |
| out_str = " ".join(out_str.split("\n")) | |
| return out_str | |
| def _generate_constraints(num_pack): | |
| return ",".join("=r" for i in range(num_pack)) + "," + ",".join("r" for i in range(num_pack)) | |
# Number of elements processed per inline-asm invocation; it sizes the
# generated PTX and constraint strings and is passed as `pack=` below.
_NUM_REG = 1
# Pre-rendered PTX source and constraint strings, wrapped as tl.constexpr
# so they are available as compile-time constants.
s_softplus: tl.constexpr = tl.constexpr(_generate_softplus(_NUM_REG))
s_softplus2: tl.constexpr = tl.constexpr(_generate_softplus2(_NUM_REG))
s_constraints: tl.constexpr = tl.constexpr(_generate_constraints(_NUM_REG))
# constexpr mirror of _NUM_REG.
NUM_REG: tl.constexpr = tl.constexpr(_NUM_REG)
@triton.jit
def softplus_nv(x):
    # Natural-log softplus implemented with inline PTX.
    #
    # `tl.inline_asm_elementwise` is a Triton device-side builtin, so this
    # function has to be JIT-compiled to be usable from other kernels.
    #
    # equivalent to:
    # return tl.where(x < 20.0, tl.math.log(1 + tl.math.exp(x)), x)
    #
    # x: input tensor; the result is produced as float32.
    return tl.inline_asm_elementwise(
        asm=s_softplus,
        constraints=s_constraints,
        pack=NUM_REG,
        args=[
            x,
        ],
        dtype=tl.float32,
        is_pure=True,
    )
@triton.jit
def softplus_triton(x):
    # Natural-log softplus written with plain Triton ops (no inline asm):
    # log(1 + exp(x)) where x < 20, and x itself otherwise.
    # Uses `tl` device-side builtins, so it must be JIT-compiled.
    return tl.where(x < 20.0, tl.math.log(1 + tl.math.exp(x)), x)
@triton.jit
def softplus2_nv(x):
    # Base-2 softplus implemented with inline PTX.
    #
    # `tl.inline_asm_elementwise` is a Triton device-side builtin, so this
    # function has to be JIT-compiled to be usable from other kernels.
    #
    # equivalent to:
    # return tl.where(x < 15.0, tl.math.log2(1 + tl.math.exp2(x)), x)
    #
    # x: input tensor; the result is produced as float32.
    return tl.inline_asm_elementwise(
        asm=s_softplus2,
        constraints=s_constraints,
        pack=NUM_REG,
        args=[
            x,
        ],
        dtype=tl.float32,
        is_pure=True,
    )
@triton.jit
def softplus2_triton(x):
    # Base-2 softplus written with plain Triton ops (no inline asm):
    # log2(1 + exp2(x)) where x < 15, and x itself otherwise.
    # Uses `tl` device-side builtins, so it must be JIT-compiled.
    return tl.where(x < 15.0, tl.math.log2(1 + tl.math.exp2(x)), x)
# Select the public implementations once at import time: the inline-PTX
# variants when IS_NVIDIA is truthy, the pure-Triton variants otherwise.
softplus = softplus_nv if IS_NVIDIA else softplus_triton
softplus2 = softplus2_nv if IS_NVIDIA else softplus2_triton