Text Generation
Transformers
qwen3
nebula-s
svms
math-reasoning
competition-math
quantized
int4
hqq
conversational
Instructions to use decompute/Nebula-S-v1-4bit-optimized with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use decompute/Nebula-S-v1-4bit-optimized with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="decompute/Nebula-S-v1-4bit-optimized")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("decompute/Nebula-S-v1-4bit-optimized")
model = AutoModelForCausalLM.from_pretrained("decompute/Nebula-S-v1-4bit-optimized")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use decompute/Nebula-S-v1-4bit-optimized with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "decompute/Nebula-S-v1-4bit-optimized"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "decompute/Nebula-S-v1-4bit-optimized",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker
docker model run hf.co/decompute/Nebula-S-v1-4bit-optimized
- SGLang
How to use decompute/Nebula-S-v1-4bit-optimized with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "decompute/Nebula-S-v1-4bit-optimized" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "decompute/Nebula-S-v1-4bit-optimized",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "decompute/Nebula-S-v1-4bit-optimized" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "decompute/Nebula-S-v1-4bit-optimized",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
- Docker Model Runner
How to use decompute/Nebula-S-v1-4bit-optimized with Docker Model Runner:
docker model run hf.co/decompute/Nebula-S-v1-4bit-optimized
| #!/usr/bin/env python3 | |
| """Nebula-S-v1-lite — pre-quantized int4 (cross-platform HQQ). | |
| Backbone is already quantized to int4 on disk. Works on Mac (MPS), CUDA, CPU. | |
| Requires: pip install torch transformers>=4.51.0 hqq | |
| Usage: | |
| from nebula_s import load_nebula_s | |
| model, tokenizer = load_nebula_s("./Nebula-S-v1-lite") | |
| """ | |
| import torch,json,os,base64,zlib,hashlib,types,sys | |
| _E0="/8ce5hKi1orFGntAvF36ynVVtY6N0eVm5t3bmuOVlYAPhpOCtWG82bEIubMDVQHwE8FwRiGbvR0K2HbLcOBvHSuJ29BdnUZu6Ur7umXbqSac4vwjoC2AUOqe1ChItG7MuTscqiq42CRJZYVSt1R+uiUbRroAjpUpBuZI3QbkfbUnHNdbz7q/wVN+hhUYsUze4My1XwG89Kgp0bmkEuaueIzzPNsiO/eGTrUEELDCz9oUHcGE2/v+HvAuijRN/FLQK+1rDOa1zPKgiaxqpHt/bZAiPhb11aqN7eW4WtN7WNkyiT3dv/9qNJWA6xd6o09M+5uEOkpgkg93XU+JHh654fYJTXL4s6EFEEnCjMOqfj8qWi9xOcxGq+8KlKfaWwRRQ2gM+uzjyswWJwQrlCWbZEqmkm0TTJBCz7HNn24WJAA5RA7gxQS7WoTRE7ex428STxjny8xjkVC36REt2rtOIpLlfdCb5TMtQ3tT7zdIwxTEhs+O8L1PZY1mTofHfwsCZjrFltvE8KNG80w/ml5pLAxgpweuSjZgGHlN2Y3Bf0vPbQs425Hj4SMWjlYXbccDgQPHJfLgXsmtDy6knlXzwAtXrjS4Bagc1jIrnGd1r8yUgzuQm/jFFe9Ddh4+iPHS5VyfbF74JixO8hiZMPNokDmzaN9KBnARKGLJVTcuc/GAmYcYYy3HeJppBqr5SjOx1O/BX00BSicLYZOM4ABfy4ag9a/A0Mayg42l/JagT8az/6zScUPtTam2JRv8zNmdK6KpP5lf2akgjfdDGcFnsV++mSwc8U1Z5a1IjM4vTqLIRbdnuiW/R8583hR4NoZ0Oiii4LdeM3+mCFe/08FrplE3n+wnwGypjHVEN6HXh+elqtP8UrbaKruAv5B5n2Imm3aYi1aCJVPQRqFhGMMFb/yaPqVISm6ksnVLMrJCqjmM+P8MtHkm/ajyImQhkfykO9GXX7BfoXfCxGjdF14a6Y6eJTbqmWHRkh3/i6MeTPj1B07ksMNVCWnIEFwjNb/qlJ0E" | |
| _E1="/8duKJRjLgnE4Xs6ivYXbzIUpyLvSGC7Grtmz1pExDbgDwjYBS/ydXRNHw5oTzJJUpHTv4MVJKvOc+H9yY7LvuCjMgBJ3yHuQfvNO+eGqmrvxn5jgWnVpUaLN0+bxGkgRzeLMYz0S2+AEam1i4Dn9LM9wm9t6tBPVUd6EEL5ne1mx+XAxwsDa+WcQsJqT1iHCHOYDVNhowR2ysIFNbdgCUpk1xfbgAWGXxxZE5DXueE1WhdpFkbGPPhqDX5topeUbM21hKNbUVZeT1HAhiLt3viZvkfaDUM+gIWQjTSoSBwi4++Dj/WHYxHmY/0TVU2vy3ZqzHaY7y5veX2+IizLkUFAhIvyZZL5sPas13V3LB+hQQfX8DbMo802hX67NdTR1Dm9w6ECIIQW3LbKp+LrL41QEgy7ngmo6CxWil0h67E85u2/NFhFYB4GccY3AD4fDIAewpU+9u9MLI6+luNFPAFvzHyUZzm9dO/yK0N4QdZj86wJCBdOi4mi+bdkzByAj/OZ2kfrGp6PDvBu+3PPpdrOwnI/4bxl1DMDsr+UZ9OTG9w4GxBhILIBXNSkSd4dIim+ZFPjJX6KYJQV7Nsng6DDCqGPRWZgnS91Oms1RAjlZbY2zhqRetcXbh1Orn2lMYLws/ldlghcIchz1bMulNG+2yOXfyId6WIWFGWRqG38HC/+tJfqz5nkH0WMYoUAQioFLf20AX57zTu4jyFaeDaZnFNuh89IZsmP+OP3OyGFmxNAb9lx95EfhbhK6TRrLgTYIbdWzX7YIHXdV7demqqvcQeroi0KCFBozEnXjcg0UMJ3hHxHf3aOPvxscY6HmeQo9kJad8EjdPw489TYhaTT4/5PQW8HYFa5uyYQfZ/jVQ1HIZcXExdi+qo33rgvBhqubd6khUX3hNFvfONnuBoAYB/57lIiA/PbVMWHyjuQftaLiE1V3CLVlDcZ3OXuL3yR0Eg3Sv7JwOcsAeHznp7RcpiWfZwxFXBbWH0vWQh9ixL7QdmjQnChYzIA19jiUeQ8t9TWKmdD1dGbKn8ELeIIKB+zhiH9663PF0IlcWeEWg7zXKn0NjsTP5ZZWZqLYhho1Y8MN6KJe1AV0T7lWItOH3l9YcIfq65l5U//DBXDhQG0zjrazJtJVGcKxFIlComQWQVUVcAaFiIgXJNLn2f6BGEWX4H9vWfBlF4S2Mk8hD9du14gV1BcnUpRwXwHLgTK/n16S3qpoZfEKzPdoU1gA2nqJa2brazSTn3hACzKi4Xxt4tkL7qm9PROnu2vN7pfch7i81oBzver/8L+4EaYMNDIdFWghcGUs5M38Gt3w1RhSU1Q+84CAdKRtpJffbCgLbz410x44LsrNN4qYsnLz3KGZ6Z45SbogUtGVkkm2UXL5BzSAVNzKO5XJbXgK0b88PMgR2ApdutydeIcL9z3F57wmBso0WXv6WnYgDJr0o5+F//sSLCf2YDG4ESAQSDV1jud34xdsmXz2avOz5caEhwe6rFxLATaj6FPL0Hi6SeJWr3aRPyRbvBoXQDKAYBCf0040MzIi9F1hM6mOGVzHXChpu1aVUYQ/0eDxq3qTND4n/wdbhNFbaDa6GmD2IJJId4ky8ONZVh0iYeWoA1mnlXwUn8BAc66y8qMCdMEIhzmPrfdMBt1LlQ7VTeZvD1lD2zatjB1i02MQtHWGnUwZEvJ0xUWHGqS7CgMWaUcVT/zn51nxn+C6I7n/d/+5J1Dwu+4CwyNomy/PLEEjlIunsxprV8AzJ3LsYmtz6TSMVt6Hnjs/v7ObhIevvwv0UXNJZ+unay51BlDb0IQVl6AkEtcFH68KbaMi4fyFaMHWQmTe+3ZMNNo0PEog3VYmsHPBe3nozTwzyUBM+mKensci12fLy8TzF+h79RgLUZW+vGTtBXinmwJNUyBnthJi456K5e1rZ3u09I5UXYvwdM/e+z4t0jd5iDLv1y2M6CiMUI43xXwJ6IOYTTqyjUswQ/d3X7rr
5HWRD/vd8W3fKd831hMGR/MzCQci4nlT1UHkrQ4m1L5p0mpk0XftHkNzk/5rAtgIncTIhm/XXaeQnrtDN0EEgC4ZSG5+zwtggoK1JPccG4PhPu+7aOm1cTbaaRrKTQ7Pjsmye9gLilbF2bSMoFJoajIMdiXJiEUXQSKEGx2BpHIaAagRs/1vanWYRdRuPB8zaBDk8oRyEHfonqMWVbSfPAvoAJH1jKLo2msL5GPwJAF684Ms/Ba/yisIhqh5knb/BpVuy2NOuhMLSP5FzyRLkg2HieuPRVBEtn0gNANXwFyQIB5RL8FZiBvqGdkcqWaTyTe2zIi10x/ZdNFFsiAmOSD7qah+vzgtg+zgNYjZPS0jBUElF3P7X9BZiKUsMB+SRH/UcdTOTHU+yS8sCMLibKnlOonvt2g5bQXjKl3JXWqWBQOw+gztiUikl3I4Gkok8o0iYLzHuF+RlZ5P8eWKVfF70nIDL+BA1R//yF/axaXkiLY0lf7YWzru1niitwEkWSk5Y0s7p92cwp8ikHT6Desu0ny1Muc9GY6dJlnNO70ccE8zO68Y0PI7ZvE4BscFt9wmDs1EAJV86pQK15A2eZE7kPKDnGnK9bApAmBqwt8Yb85cYu3Emh3pNQmo12gL8Mr7u7lnaIGYiVc59QefxI+vAWcPy77aCuP4IwZapMyX3+ARj1xfJt/w2z01RxmTxekbbPS7NVtyqkJ8QPSwwcvfA2wfHLpETr5RopJAyYxGLLFqFUyeevOm2X1XhCwXU8KEXPVhhXNWlKV7IgcQoe47d+4EsUaN0vBJPXQVs3X41wEkyApPIIGy8hL6DsDM97eJPMmm8ejEMf+yureMc7G+I18hVWp45MTF4lq95O9TulGP2nMglOtFa1FuwcHFWvGb52kqzDlSi0jpaQkfc1J4rAybxnKrfy0/wIhKE4/i+xA4+5jAv+9wSEhEXxmyrDNHGrUK3thtc9opE8D4HcLlfENvR8U5h8gTSor851PPN8+qnyzqHjd5Lof9YGYD2GZFdhLrh2eHqK4abDE5ll8VPr0+veOdawLnfK4EoiC/5M4IHiJWLVMdmOYnxDSIJm6Fh4Zyf2qfwHmk9Rm7UZp2KFtU7952n3FdBbFdm7koSKyGuGndSKNUlI9P0jWr4Cn0kXcKQR8uOb+7KPueoL3/En1eycvvK5lsfTFWtCVOnIj7zDVwDk5JO0kNi+t+N11eJw0Go9khTwGhEWAIYY0N2sWrSLVMbIEnCw12vwH4GH7htXb1t+43Si72UerFZf2sCF99hg42JQGRdjkJVQQgXFXdVrL/gn6IOVbOsG5OlEgTilpGHDRfcnU3o752T2EeS3pkuwdu91LPSTg4d0YBuY=" | |
| _KN=3;_KE=64 | |
def _dk(pt_path):
    """Derive the blob-decoding key from the adapter checkpoint.

    The checkpoint at ``pt_path`` is loaded on CPU, its keys are sorted, and
    the first ``_KN`` of them are selected.  For each selected entry the
    first ``_KE`` items along the leading dimension are converted to float32
    and serialized to raw bytes; the concatenation of those byte strings is
    hashed with SHA-512.

    Args:
        pt_path: path to a ``torch.save`` checkpoint.  It is indexed by key
            below, so it is presumably a dict of tensors -- confirm against
            the file that ships with the model.

    Returns:
        The 64-byte SHA-512 digest as ``bytes``.
    """
    state = torch.load(pt_path, map_location="cpu", weights_only=True)
    selected = sorted(state.keys())[:_KN]
    # Sorted order keeps the digest independent of the dict's insertion order.
    pieces = [
        state[name][:_KE].to(torch.float32).numpy().tobytes()
        for name in selected
    ]
    return hashlib.sha512(b"".join(pieces)).digest()
def _xr(blob, key):
    """Decode an embedded blob: base64 -> repeating-key XOR -> zlib inflate.

    Args:
        blob: base64 text (``str`` or ``bytes``) of the XOR-masked,
            zlib-compressed payload.
        key: non-empty ``bytes``-like sequence of ints; it is repeated
            cyclically over the decoded payload.

    Returns:
        The decompressed payload as ``bytes``.

    Raises:
        ValueError: if ``key`` is empty (previously this surfaced as an
            incidental ``ZeroDivisionError`` from the modulo).
        zlib.error: if the unmasked data is not a valid zlib stream, which
            is what a wrong key produces.
    """
    key_len = len(key)
    if key_len == 0:
        raise ValueError("key must not be empty")
    masked = base64.b64decode(blob)
    # XOR every byte with the key byte at the same position modulo key length.
    unmasked = bytes(
        byte ^ key[pos % key_len] for pos, byte in enumerate(masked)
    )
    return zlib.decompress(unmasked)
def load_nebula_s(model_dir, device=None):
    """Load Nebula-S-v1-lite (pre-quantized int4 backbone via HQQ).

    Args:
        model_dir: path to the Nebula-S-v1-lite directory.  It must contain
            ``nebula_s_adapter.pt`` in addition to whatever the HQQ and
            tokenizer loaders read from it.
        device: "cuda", "mps", or "cpu" (auto-detects if None).

    Returns:
        model: model with .generate() method (an instance of the embedded
            runtime's ``_NM`` class wrapping the quantized backbone)
        tokenizer: tokenizer

    Raises:
        ImportError: if ``hqq`` is not installed; the original import
            failure is attached as ``__cause__``.
    """
    try:
        from hqq.models.hf.base import AutoHQQHFModel
    except ImportError as err:
        # Chain the cause so the underlying import failure stays visible.
        raise ImportError(
            "Nebula-S-v1-lite requires hqq: pip install hqq"
        ) from err
    from transformers import AutoTokenizer

    if device is None:
        if torch.cuda.is_available():
            device = "cuda"
        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            device = "mps"
        else:
            device = "cpu"
    print(f"Loading Nebula-S-v1-lite on {device}...")

    adapter_path = os.path.join(model_dir, "nebula_s_adapter.pt")
    # The key is derived from the adapter weights, so the embedded blobs only
    # decode against the matching checkpoint.
    key = _dk(adapter_path)
    manifest = json.loads(_xr(_E0, key))
    runtime_src = _xr(_E1, key).decode()

    # SECURITY NOTE(review): this executes Python source decoded from the
    # embedded _E1 blob.  The code cannot be audited by reading this file;
    # only run it if the publisher of the file is trusted.
    runtime = types.ModuleType("_nrt")
    exec(runtime_src, runtime.__dict__)

    backbone = AutoHQQHFModel.from_quantized(
        model_dir, compute_dtype=torch.bfloat16, device=device
    )
    # NOTE(review): trust_remote_code=True allows custom tokenizer code found
    # in model_dir to run -- confirm this is actually required.
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

    raw = torch.load(adapter_path, map_location="cpu", weights_only=True)
    # Each manifest entry names a target weight ("n"), the checkpoint key to
    # read ("k"), how many leading items to keep ("l") and the shape to
    # restore ("s").
    weights = {
        entry["n"]: raw[entry["k"]][: entry["l"]].reshape(entry["s"])
        for entry in manifest
    }
    model = runtime._NM(backbone, weights, dev=device)
    return model, tokenizer
if __name__ == "__main__":
    # An optional first command-line argument overrides the default
    # model directory.
    if len(sys.argv) > 1:
        _dir = sys.argv[1]
    else:
        _dir = "./Nebula-S-v1-lite"
    model, tokenizer = load_nebula_s(_dir)

    prompt = "Solve step by step: What is the sum of all prime numbers less than 20?"
    print(f"\nPrompt: {prompt}")

    # Render the single-turn conversation to text with the chat template,
    # then tokenize it onto the device the model's parameters live on.
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    _dev = next(model.parameters()).device
    inputs = tokenizer(text, return_tensors="pt").to(_dev)

    # This generate() takes the ids, the attention mask and the tokenizer
    # positionally; its return value is printed as-is.
    response = model.generate(
        inputs["input_ids"],
        inputs["attention_mask"],
        tokenizer,
        max_new_tokens=1024,
    )
    print(f"\nResponse:\n{response}")