dixisouls committed on
Commit
6d542c5
·
1 Parent(s): b90308d

Added model files

Browse files
Files changed (3) hide show
  1. README.md +40 -0
  2. config.json +59 -0
  3. pytorch_model.bin +3 -0
README.md ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ tags:
4
+ - text-generation
5
+ - pytorch
6
+ - transformer
7
+ - custom-model
8
+ language:
9
+ - en
10
+ pipeline_tag: text-generation
11
+ ---
12
+
13
+ # VelocityLM - 2B Parameter Language Model
14
+
15
+ A custom transformer model with 2B parameters trained for text generation.
16
+
17
+ ## Model Details
18
+
19
+ - **Parameters:** ~2 billion
20
+ - **Architecture:** Custom Transformer with RoPE, RMSNorm, SwiGLU
21
+ - **Context Length:** 2,048 tokens
22
+ - **Tokenizer:** GPT-2 compatible
23
+ - **Training:** Falcon RefinedWeb dataset
24
+
25
+ ## Usage
26
+
27
+ ```python
28
+ from transformers import AutoTokenizer
29
+ import torch
30
+
31
+ # Load tokenizer
32
+ tokenizer = AutoTokenizer.from_pretrained("gpt2")
33
+
34
+ # Load model (you'll need custom loading code)
35
+ # See the Space implementation for details
36
+
37
+ ```
+
+ ## Files
38
+
39
+ - config.json - Model configuration
40
+ - pytorch_model.bin - Model weights
config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": {
3
+ "vocab_size": 50257,
4
+ "hidden_size": 2048,
5
+ "num_hidden_layers": 24,
6
+ "num_attention_heads": 32,
7
+ "intermediate_size": 8192,
8
+ "hidden_act": "silu",
9
+ "max_position_embeddings": 2048,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_eps": 1e-05,
12
+ "use_cache": false,
13
+ "rope_theta": 10000.0,
14
+ "attention_dropout": 0.0,
15
+ "hidden_dropout": 0.0
16
+ },
17
+ "training": {
18
+ "batch_size_per_device": 4,
19
+ "gradient_accumulation_steps": 8,
20
+ "learning_rate": 0.0006,
21
+ "min_learning_rate": 6e-05,
22
+ "weight_decay": 0.1,
23
+ "adam_beta1": 0.9,
24
+ "adam_beta2": 0.95,
25
+ "adam_epsilon": 1e-08,
26
+ "max_grad_norm": 1.0,
27
+ "warmup_steps": 2000,
28
+ "num_training_steps": 100000,
29
+ "logging_steps": 10,
30
+ "save_steps": 1000,
31
+ "eval_steps": 500,
32
+ "save_total_limit": 5,
33
+ "fp16": true,
34
+ "gradient_checkpointing": true
35
+ },
36
+ "data": {
37
+ "dataset_name": "tiiuae/falcon-refinedweb",
38
+ "dataset_config": "default",
39
+ "text_column": "content",
40
+ "max_seq_length": 2048,
41
+ "num_workers": 8,
42
+ "preprocessing_num_workers": 16
43
+ },
44
+ "tokenizer": {
45
+ "tokenizer_name": "gpt2",
46
+ "add_special_tokens": true
47
+ },
48
+ "infrastructure": {
49
+ "num_gpus": 4,
50
+ "seed": 42,
51
+ "output_dir": "./checkpoints",
52
+ "logging_dir": "./logs",
53
+ "resume_from_checkpoint": "./checkpoints/checkpoint-4000"
54
+ },
55
+ "distributed": {
56
+ "backend": "nccl",
57
+ "find_unused_parameters": false
58
+ }
59
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:194cb060512a6a56887d2e0d3f623627cdef7231e8de1655d0f7c013883c21f0
3
+ size 6854652150