arthu1 committed
Commit f1c4860 · verified · 1 Parent(s): 8617fd0

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
+gguf_temp/tokenizer.json filter=lfs diff=lfs merge=lfs -text
__pycache__/modeling_wind_arc.cpython-314.pyc ADDED
Binary file (5.63 kB)
config.json CHANGED
@@ -1,6 +1,6 @@
 {
   "architectures": [
-    "Qwen3ForCasualLM"
+    "WindArcForCausalLM"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
@@ -44,7 +44,7 @@
   ],
   "max_position_embeddings": 32768,
   "max_window_layers": 28,
-  "model_type": "qwen3",
+  "model_type": "wind_arc",
   "num_attention_heads": 16,
   "num_hidden_layers": 28,
   "num_key_value_heads": 8,
@@ -59,5 +59,13 @@
   "transformers_version": "5.3.0",
   "use_cache": false,
   "use_sliding_window": false,
-  "vocab_size": 151936
-}
+  "vocab_size": 151936,
+  "model_name": "Wind Arc 1.6",
+  "made_by": "North.ai",
+  "version": "1.6",
+  "description": "Custom architecture LLM by North.ai. YaRN RoPE + MoE FFN + Hybrid Attention.",
+  "auto_map": {
+    "AutoConfig": "modeling_wind_arc.WindArcConfig",
+    "AutoModelForCausalLM": "modeling_wind_arc.WindArcForCausalLM"
+  }
+}
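
The new `auto_map` entries are what make the custom classes discoverable: the Auto classes read them and import `modeling_wind_arc.py` from the repo itself. A minimal loading sketch; `trust_remote_code=True` is required because the model code ships with the checkpoint rather than with transformers:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(
    "arthu1/wind-arc-1-6",    # repo id taken from the module docstring below
    trust_remote_code=True,   # allow importing modeling_wind_arc.py from the repo
)
tokenizer = AutoTokenizer.from_pretrained("arthu1/wind-arc-1-6")
```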
config_qwen3_original.json ADDED
@@ -0,0 +1,63 @@
+{
+  "architectures": [
+    "Qwen3ForCasualLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 6144,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "pad_token_id": null,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.3.0",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
gguf_temp/.gitattributes ADDED
@@ -0,0 +1,36 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
gguf_temp/README.md ADDED
@@ -0,0 +1,199 @@
+---
+library_name: transformers
+tags: []
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
gguf_temp/chat_template.jinja ADDED
@@ -0,0 +1,89 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
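
A quick way to exercise this template is through the standard `apply_chat_template` API, which forwards extra keyword arguments (such as `enable_thinking`) into the template context. A minimal sketch, assuming this template is the one attached to the loaded tokenizer; the message list is made up for illustration:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("arthu1/wind-arc-1-6")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is YaRN RoPE?"},
]

# enable_thinking=False makes the template emit an empty <think></think>
# block before the assistant turn, per the final branch above.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,
)
print(prompt)
```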
gguf_temp/config.json ADDED
@@ -0,0 +1,63 @@
+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 6144,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "pad_token_id": null,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.3.0",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
gguf_temp/config_qwen3_original.json ADDED
@@ -0,0 +1,63 @@
+{
+  "architectures": [
+    "Qwen3ForCasualLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 6144,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "pad_token_id": null,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.3.0",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
gguf_temp/generation_config.json ADDED
@@ -0,0 +1,13 @@
+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.6,
+  "top_k": 20,
+  "top_p": 0.95,
+  "transformers_version": "5.3.0"
+}
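
These sampling defaults travel with the checkpoint, so `generate()` picks them up automatically; spelling them out is equivalent. A minimal sketch, reusing the `model`, `tokenizer`, and `prompt` from the loading and chat-template examples above:

```python
inputs = tokenizer(prompt, return_tensors="pt")

# Explicit settings mirror gguf_temp/generation_config.json
outputs = model.generate(
    **inputs,
    do_sample=True,
    temperature=0.6,
    top_k=20,
    top_p=0.95,
    max_new_tokens=128,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```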
gguf_temp/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4fc2bbda6885be3642ab710d26d33c1f046140afae8633cb2211faec9028e336
+size 7234911400
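
Git itself stores only this three-line pointer; the ~7.2 GB payload lives in LFS storage. To fetch just this file without cloning the whole repo, a sketch using `huggingface_hub`:

```python
from huggingface_hub import hf_hub_download

# Resolves the LFS pointer above and downloads the actual weights file
path = hf_hub_download(
    repo_id="arthu1/wind-arc-1-6",
    filename="gguf_temp/model.safetensors",
)
print(path)
```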
gguf_temp/modeling_wind_arc.py ADDED
@@ -0,0 +1,93 @@
+"""
+Wind Arc 1.6 - Custom Model Class
+North.ai
+
+Registers Wind Arc as a proper HuggingFace model type.
+Allows: AutoModelForCausalLM.from_pretrained("arthu1/wind-arc-1-6")
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import AutoConfig, AutoModelForCausalLM
+from transformers.models.qwen3.modeling_qwen3 import (
+    Qwen3ForCausalLM, Qwen3Model, Qwen3DecoderLayer
+)
+from transformers import PretrainedConfig
+
+
+class WindArcConfig(PretrainedConfig):
+    model_type = "wind_arc"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.model_name = kwargs.get("model_name", "Wind Arc 1.6")
+        self.made_by = kwargs.get("made_by", "North.ai")
+
+
+class WindArcMoE(nn.Module):
+    """
+    Wind Arc custom MoE FFN.
+    Replaces standard Qwen3 MLP with 4 routed experts + 1 shared expert.
+    """
+    def __init__(self, config):
+        super().__init__()
+        D = config.hidden_size
+        I = config.intermediate_size // 2
+        self.router = nn.Linear(D, 4, bias=False)
+        self.experts = nn.ModuleList([
+            nn.ModuleDict({
+                "gate": nn.Linear(D, I, bias=False),
+                "up": nn.Linear(D, I, bias=False),
+                "down": nn.Linear(I, D, bias=False),
+            }) for _ in range(4)
+        ])
+        self.shared_gate = nn.Linear(D, I, bias=False)
+        self.shared_up = nn.Linear(D, I, bias=False)
+        self.shared_down = nn.Linear(I, D, bias=False)
+
+    def forward(self, x):
+        B, L, D = x.shape
+        flat = x.reshape(-1, D)
+        probs = F.softmax(self.router(flat), dim=-1)
+        idx = torch.argmax(probs, dim=-1)
+        out = torch.zeros_like(flat)
+        for i, expert in enumerate(self.experts):
+            mask = (idx == i)
+            if mask.any():
+                xi = flat[mask]
+                out[mask] += expert["down"](
+                    F.silu(expert["gate"](xi)) * expert["up"](xi)
+                ) * probs[mask, i:i+1]
+        shared = self.shared_down(
+            F.silu(self.shared_gate(flat)) * self.shared_up(flat)
+        )
+        return (out + shared).reshape(B, L, D)
+
+
+class WindArcForCausalLM(Qwen3ForCausalLM):
+    """
+    Wind Arc 1.6 — Custom architecture by North.ai.
+    Extends Qwen3 with MoE FFN layers.
+    """
+    config_class = WindArcConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        # Replace all MLP layers with Wind Arc MoE
+        for layer in self.model.layers:
+            layer.mlp = WindArcMoE(config)
+        self.post_init()
+
+    @property
+    def model_identity(self):
+        return {
+            "name": "Wind Arc 1.6",
+            "made_by": "North.ai",
+            "arch": "YaRN RoPE + MoE FFN (4+1 experts) + Hybrid Attention",
+        }
+
+
+# Register with HuggingFace AutoClass
+AutoConfig.register("wind_arc", WindArcConfig)
+AutoModelForCausalLM.register(WindArcConfig, WindArcForCausalLM)
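
To sanity-check the top-1 routing in isolation, here is a toy sketch; `SimpleNamespace` stands in for the real config (WindArcMoE only reads `hidden_size` and `intermediate_size`), and the sizes are illustrative, not the shipped 2048/6144:

```python
import torch
from types import SimpleNamespace
from modeling_wind_arc import WindArcMoE

# Hypothetical miniature config, just to exercise the forward pass
cfg = SimpleNamespace(hidden_size=32, intermediate_size=64)
moe = WindArcMoE(cfg)

x = torch.randn(2, 5, 32)   # (batch, seq_len, hidden_size)
y = moe(x)
assert y.shape == x.shape   # each token goes to one routed expert plus the shared expert
```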
gguf_temp/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650
gguf_temp/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": false,
+  "model_max_length": 131072,
+  "pad_token": "<|im_end|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}
modeling_wind_arc.py ADDED
@@ -0,0 +1,93 @@
+"""
+Wind Arc 1.6 - Custom Model Class
+North.ai
+
+Registers Wind Arc as a proper HuggingFace model type.
+Allows: AutoModelForCausalLM.from_pretrained("arthu1/wind-arc-1-6")
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import AutoConfig, AutoModelForCausalLM
+from transformers.models.qwen3.modeling_qwen3 import (
+    Qwen3ForCausalLM, Qwen3Model, Qwen3DecoderLayer
+)
+from transformers import PretrainedConfig
+
+
+class WindArcConfig(PretrainedConfig):
+    model_type = "wind_arc"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.model_name = kwargs.get("model_name", "Wind Arc 1.6")
+        self.made_by = kwargs.get("made_by", "North.ai")
+
+
+class WindArcMoE(nn.Module):
+    """
+    Wind Arc custom MoE FFN.
+    Replaces standard Qwen3 MLP with 4 routed experts + 1 shared expert.
+    """
+    def __init__(self, config):
+        super().__init__()
+        D = config.hidden_size
+        I = config.intermediate_size // 2
+        self.router = nn.Linear(D, 4, bias=False)
+        self.experts = nn.ModuleList([
+            nn.ModuleDict({
+                "gate": nn.Linear(D, I, bias=False),
+                "up": nn.Linear(D, I, bias=False),
+                "down": nn.Linear(I, D, bias=False),
+            }) for _ in range(4)
+        ])
+        self.shared_gate = nn.Linear(D, I, bias=False)
+        self.shared_up = nn.Linear(D, I, bias=False)
+        self.shared_down = nn.Linear(I, D, bias=False)
+
+    def forward(self, x):
+        B, L, D = x.shape
+        flat = x.reshape(-1, D)
+        probs = F.softmax(self.router(flat), dim=-1)
+        idx = torch.argmax(probs, dim=-1)
+        out = torch.zeros_like(flat)
+        for i, expert in enumerate(self.experts):
+            mask = (idx == i)
+            if mask.any():
+                xi = flat[mask]
+                out[mask] += expert["down"](
+                    F.silu(expert["gate"](xi)) * expert["up"](xi)
+                ) * probs[mask, i:i+1]
+        shared = self.shared_down(
+            F.silu(self.shared_gate(flat)) * self.shared_up(flat)
+        )
+        return (out + shared).reshape(B, L, D)
+
+
+class WindArcForCausalLM(Qwen3ForCausalLM):
+    """
+    Wind Arc 1.6 — Custom architecture by North.ai.
+    Extends Qwen3 with MoE FFN layers.
+    """
+    config_class = WindArcConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        # Replace all MLP layers with Wind Arc MoE
+        for layer in self.model.layers:
+            layer.mlp = WindArcMoE(config)
+        self.post_init()
+
+    @property
+    def model_identity(self):
+        return {
+            "name": "Wind Arc 1.6",
+            "made_by": "North.ai",
+            "arch": "YaRN RoPE + MoE FFN (4+1 experts) + Hybrid Attention",
+        }
+
+
+# Register with HuggingFace AutoClass
+AutoConfig.register("wind_arc", WindArcConfig)
+AutoModelForCausalLM.register(WindArcConfig, WindArcForCausalLM)
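
Since `AutoConfig.register` and `AutoModelForCausalLM.register` run at import time, importing this module is enough to build a fresh (randomly initialized) Wind Arc model straight from a config. A sketch, assuming `modeling_wind_arc.py` is importable locally and the repo's config.json supplies the usual Qwen3 hyperparameters:

```python
from transformers import AutoConfig, AutoModelForCausalLM
import modeling_wind_arc  # noqa: F401  (side effect: registers the "wind_arc" type)

config = AutoConfig.from_pretrained("arthu1/wind-arc-1-6", trust_remote_code=True)
model = AutoModelForCausalLM.from_config(config)  # random-init WindArcForCausalLM
print(model.model_identity)
```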