# File size: 4,835 Bytes
# revision: d25c5f6
---
model:
  name: "DeepXR/Helion-2.5-Rnd"
  version: "2.5.0-research"
  type: "transformer"
  architecture: "llama"
  description: "Helion-2.5 Research & Development - Advanced multimodal language model"
  
  capabilities:
    - text_generation
    - code_generation
    - mathematical_reasoning
    - multilingual_understanding
    - instruction_following
    - context_understanding
    - creative_writing
    - analytical_reasoning
    - scientific_computation
    - conversational_ai

  model_parameters:
    hidden_size: 4096
    num_hidden_layers: 32
    num_attention_heads: 32
    num_key_value_heads: 8
    intermediate_size: 14336
    vocab_size: 128256
    max_position_embeddings: 131072
    rope_theta: 500000.0
    rope_scaling:
      type: "yarn"
      factor: 8.0
      original_max_position_embeddings: 16384
    attention_bias: false
    attention_dropout: 0.0
    mlp_bias: false
    
  tokenizer:
    type: "sentencepiece"
    model_max_length: 131072
    padding_side: "right"
    truncation_side: "right"
    chat_template: "{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>\n' }}{% endfor %}{{ '<|im_start|>assistant\n' }}"

  training:
    base_model: "meta-llama/Meta-Llama-3.1-70B"
    training_data:
      - "scientific_papers"
      - "code_repositories"
      - "mathematical_proofs"
      - "conversational_data"
      - "multilingual_corpus"
      - "technical_documentation"
    total_tokens: "2.5T"
    training_steps: 150000
    warmup_steps: 2000
    learning_rate: 2.0e-5
    weight_decay: 0.01
    gradient_accumulation_steps: 8
    per_device_batch_size: 4
    fp16: false
    bf16: true
    
  optimization:
    optimizer: "adamw_torch_fused"
    scheduler: "cosine_with_restarts"
    gradient_checkpointing: true
    flash_attention: true
    tensor_parallel_size: 4
    pipeline_parallel_size: 2
    
  quantization:
    bits: 16
    supported_formats:
      - "fp16"
      - "bf16"
      - "int8"
      - "int4"
      - "awq"
      - "gptq"
      - "gguf"

inference:
  default_parameters:
    temperature: 0.7
    top_p: 0.9
    top_k: 50
    repetition_penalty: 1.1
    max_new_tokens: 4096
    do_sample: true
    num_beams: 1
    
  generation_config:
    pad_token_id: 128001
    bos_token_id: 128000
    eos_token_id: 128009
    use_cache: true
    output_attentions: false
    output_hidden_states: false
    return_dict_in_generate: true
    
  performance:
    batch_size: 1
    max_batch_size: 32
    streaming: true
    gpu_memory_utilization: 0.95
    tensor_parallel: true
    
  special_tokens:
    bos_token: "<|begin_of_text|>"
    eos_token: "<|end_of_text|>"
    pad_token: "<|pad|>"
    unk_token: "<|unk|>"
    system_token: "<|im_start|>system"
    user_token: "<|im_start|>user"
    assistant_token: "<|im_start|>assistant"
    end_token: "<|im_end|>"

deployment:
  framework: "transformers"
  recommended_hardware:
    gpu: "A100 80GB (minimum 2x)"
    vram: "160GB+"
    ram: "256GB+"
    storage: "500GB+ NVMe SSD"
    
  serving:
    engine: "vllm"
    max_concurrent_requests: 128
    max_model_len: 131072
    gpu_memory_utilization: 0.9
    swap_space: 16
    
  endpoints:
    - name: "completions"
      path: "/v1/completions"
      methods: ["POST"]
    - name: "chat_completions"
      path: "/v1/chat/completions"
      methods: ["POST"]
    - name: "embeddings"
      path: "/v1/embeddings"
      methods: ["POST"]
      
research:
  status: "experimental"
  stage: "development"
  evaluation_metrics:
    perplexity: 2.34
    accuracy_mmlu: 0.847
    accuracy_gsm8k: 0.892
    accuracy_humaneval: 0.756
    accuracy_mbpp: 0.723
    
  benchmarks:
    reasoning:
      arc_challenge: 0.834
      hellaswag: 0.889
      winogrande: 0.823
    code:
      humaneval: 0.756
      mbpp: 0.723
      ds1000: 0.645
    mathematics:
      gsm8k: 0.892
      math: 0.567
      minerva: 0.534
    knowledge:
      mmlu: 0.847
      truthfulqa: 0.612
      
  limitations:
    - "Model is in research phase - outputs should be verified"
    - "May exhibit biases present in training data"
    - "Performance on specialized domains may vary"
    - "Long context performance degrades beyond 64K tokens"
    
  license: "Apache-2.0"
  citation: |
    @misc{helion-2.5-rnd,
      title={Helion-2.5-Rnd: Advanced Research Language Model},
      author={DeepXR Team},
      year={2025},
      publisher={DeepXR},
      url={https://huggingface.co/DeepXR/Helion-2.5-Rnd}
    }

safety:
  content_filtering: true
  toxicity_threshold: 0.5
  pii_detection: true
  prompt_injection_protection: true
  
metadata:
  created_at: "2025-01-15"
  updated_at: "2025-01-30"
  status: "research"
  visibility: "public"
  tags:
    - "language-model"
    - "research"
    - "multimodal"
    - "instruction-tuned"
    - "long-context"