dd101bb committed on
Commit
ca5eaf8
·
verified ·
1 Parent(s): 141f845

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -1,35 +1,35 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,205 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # COCONUT Model
2
+
3
+ <div align="center">
4
+
5
+ [![HuggingFace](https://img.shields.io/badge/🤗%20HuggingFace-Model-fcc21b?style=for-the-badge&logo=huggingface&logoColor=white)](https://huggingface.co/dd101bb/latent-tts-coconut)
6
+
7
+ </div>
8
+
9
+ ## Overview
10
+
11
+ **COCONUT** (Chain of Continuous Thought) is a latent reasoning model based on GPT-2 that enables continuous thought generation in latent space. This model is part of the [Parallel Test-Time Scaling for Latent Reasoning Models](https://arxiv.org/abs/2510.07745) framework.
12
+
13
+ ## Model Details
14
+
15
+ - **Base Architecture**: GPT-2 Language Model
16
+ - **Model Class**: `COCONUTGPT2` (extends `GPT2LMHeadModel`)
17
+ - **Latent Tokens**: Uses special tokens `<|latent|>`, `<|start-latent|>`, `<|end-latent|>` for latent reasoning
18
+ - **Input Format**: Requires newline after input question before `<|start-latent|>` token
19
+
20
+ ## Related Models
21
+
22
+ This repository includes other latent reasoning models that you might find useful:
23
+
24
+ - **[CODI Model](../codi/README.md)** - GPT-2 based model with optional projector module for enhanced hidden states
25
+ - **[CoLaR Model](../colar/README.md)** - LLaMA based model with specialized LatentHead module
26
+
27
+ ## Installation
28
+
29
+ Download the model from HuggingFace:
30
+
31
+ ```bash
32
+ huggingface-cli download dd101bb/latent-tts-coconut --local-dir checkpoints/coconut
33
+ ```
34
+
35
+ ## Quick Start
36
+
37
+ ### Basic Usage
38
+
39
+ ```python
40
+ from transformers import AutoTokenizer
41
+ from src.generation_mixin import LatentGenerationMixin, LatentGenerationConfig
42
+ from src.paths import MODELS
43
+
44
+ # Load tokenizer
45
+ model_id = "checkpoints/coconut"
46
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
47
+ if tokenizer.pad_token is None:
48
+ tokenizer.pad_token = tokenizer.eos_token
49
+
50
+ # Get latent token IDs
51
+ latent_id = tokenizer.convert_tokens_to_ids("<|latent|>")
52
+ start_id = tokenizer.convert_tokens_to_ids("<|start-latent|>")
53
+ end_id = tokenizer.convert_tokens_to_ids("<|end-latent|>")
54
+
55
+ # Create model class with generation mixin
56
+ class LatentCOCONUT(MODELS["coconut"]["class"], LatentGenerationMixin):
57
+ def __init__(self, config):
58
+ super().__init__(config)
59
+
60
+ # Load model
61
+ model = LatentCOCONUT.from_pretrained(
62
+ model_id,
63
+ latent_id=latent_id,
64
+ latent_start_id=start_id,
65
+ latent_end_id=end_id,
66
+ device_map="auto",
67
+ )
68
+
69
+ # Prepare input (note: newline before <|start-latent|>)
70
+ question = "What is 2 + 2?\n<|start-latent|>"
71
+ inputs = tokenizer(question, return_tensors="pt").to(model.device)
72
+
73
+ # Configure generation
74
+ generation_config = LatentGenerationConfig(
75
+ max_new_tokens=512,
76
+ latent_length=6,
77
+ latent_do_sample=True,
78
+ latent_do_sample_by="dropout", # or "noise"
79
+ dropout_p=0.1,
80
+ pad_token_id=tokenizer.pad_token_id,
81
+ eos_token_id=tokenizer.eos_token_id,
82
+ )
83
+
84
+ # Generate
85
+ output = model.generate(
86
+ **inputs,
87
+ generation_config=generation_config,
88
+ num_return_sequences=1,
89
+ )
90
+
91
+ # Decode result
92
+ result = tokenizer.decode(output[0], skip_special_tokens=True)
93
+ print(result)
94
+ ```
95
+
96
+ ### Batch Processing
97
+
98
+ The model fully supports batch processing:
99
+
100
+ ```python
101
+ # Prepare batch inputs
102
+ questions = [
103
+ "What is 2 + 2?\n<|start-latent|>",
104
+ "What is 5 * 3?\n<|start-latent|>",
105
+ "What is 10 - 4?\n<|start-latent|>",
106
+ ]
107
+ inputs = tokenizer(questions, return_tensors="pt", padding=True).to(model.device)
108
+
109
+ # Generate for batch
110
+ outputs = model.generate(
111
+ **inputs,
112
+ generation_config=generation_config,
113
+ num_return_sequences=1,
114
+ )
115
+
116
+ # Decode batch results
117
+ results = tokenizer.batch_decode(outputs, skip_special_tokens=True)
118
+ for result in results:
119
+ print(result)
120
+ ```
121
+
122
+ ## Generation Parameters
123
+
124
+ ### LatentGenerationConfig
125
+
126
+ - `max_new_tokens` (int): Maximum number of tokens to generate
127
+ - `latent_length` (int): Number of latent tokens (default: 6)
128
+ - `latent_do_sample` (bool): Whether to use stochastic sampling
129
+ - `latent_do_sample_by` (str): Sampling method - `"dropout"` or `"noise"`
130
+ - `dropout_p` (float): Dropout probability for Monte Carlo Dropout (e.g., 0.1)
131
+ - `noise_std` (float): Standard deviation for Additive Gaussian Noise
132
+
133
+ ### Sampling Methods
134
+
135
+ 1. **Monte Carlo Dropout**: Randomly drops activations during forward passes
136
+
137
+ ```python
138
+ generation_config = LatentGenerationConfig(
139
+ latent_do_sample_by="dropout",
140
+ dropout_p=0.1,
141
+ # ...
142
+ )
143
+ ```
144
+ 2. **Additive Gaussian Noise**: Injects noise into latent embeddings
145
+
146
+ ```python
147
+ generation_config = LatentGenerationConfig(
148
+ latent_do_sample_by="noise",
149
+ noise_std=0.1,
150
+ # ...
151
+ )
152
+ ```
153
+
154
+ ## Answer Extraction
155
+
156
+ COCONUT uses a special answer format with `#` separator:
157
+
158
+ ```python
159
+ from src.paths import coconut_extract_answer_number
160
+
161
+ # Extract answer from generated text
162
+ answer = coconut_extract_answer_number(result)
163
+ print(f"Answer: {answer}")
164
+ ```
165
+
166
+ ## Evaluation
167
+
168
+ Run evaluation using the provided scripts:
169
+
170
+ ```bash
171
+ # For COCONUT (GPT-2 based models)
172
+ ./run_tests.sh
173
+ ```
174
+
175
+ ## Model Card
176
+
177
+ - **Paper**: [Parallel Test-Time Scaling for Latent Reasoning Models](https://arxiv.org/abs/2510.07745)
178
+ - **HuggingFace**: [dd101bb/latent-tts-coconut](https://huggingface.co/dd101bb/latent-tts-coconut)
179
+ - **Benchmarks**: GSM8K Test, GSM8K Hard, MultiArith
180
+
181
+ ## Citation
182
+
183
+ If you use this model, please cite:
184
+
185
+ ```bibtex
186
+ @misc{you2025paralleltesttimescalinglatent,
187
+ title={Parallel Test-Time Scaling for Latent Reasoning Models},
188
+ author={Runyang You and Yongqi Li and Meng Liu and Wenjie Wang and Liqiang Nie and Wenjie Li},
189
+ year={2025},
190
+ eprint={2510.07745},
191
+ archivePrefix={arXiv},
192
+ primaryClass={cs.CL},
193
+ url={https://arxiv.org/abs/2510.07745},
194
+ }
195
+
196
+ @misc{hao2025traininglargelanguagemodels,
197
+ title={Training Large Language Models to Reason in a Continuous Latent Space},
198
+ author={Shibo Hao and Sainbayar Sukhbaatar and DiJia Su and Xian Li and Zhiting Hu and Jason Weston and Yuandong Tian},
199
+ year={2025},
200
+ eprint={2412.06769},
201
+ archivePrefix={arXiv},
202
+ primaryClass={cs.CL},
203
+ url={https://arxiv.org/abs/2412.06769},
204
+ }
205
+ ```
added_tokens.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "<|end-latent|>": 50258,
3
+ "<|latent|>": 50259,
4
+ "<|start-latent|>": 50257
5
+ }
config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "COCONUTGPT2"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 50256,
10
+ "initializer_range": 0.02,
11
+ "latent_end_id": -100,
12
+ "latent_id": -100,
13
+ "latent_start_id": -100,
14
+ "layer_norm_epsilon": 1e-05,
15
+ "model_type": "gpt2",
16
+ "n_ctx": 1024,
17
+ "n_embd": 768,
18
+ "n_head": 12,
19
+ "n_inner": null,
20
+ "n_layer": 12,
21
+ "n_positions": 1024,
22
+ "reorder_and_upcast_attn": false,
23
+ "resid_pdrop": 0.1,
24
+ "scale_attn_by_inverse_layer_idx": false,
25
+ "scale_attn_weights": true,
26
+ "summary_activation": null,
27
+ "summary_first_dropout": 0.1,
28
+ "summary_proj_to_labels": true,
29
+ "summary_type": "cls_index",
30
+ "summary_use_proj": true,
31
+ "target_id": -100,
32
+ "task_specific_params": {
33
+ "text-generation": {
34
+ "do_sample": true,
35
+ "max_length": 50
36
+ }
37
+ },
38
+ "torch_dtype": "float32",
39
+ "transformers_version": "4.52.4",
40
+ "use_cache": true,
41
+ "vocab_size": 50260
42
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.52.4"
6
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad7f3ce896a610a4293556a30b7bf2b9e9c0d463a248eea448d44a7e962be390
3
+ size 497783424
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "50257": {
13
+ "content": "<|start-latent|>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": false
19
+ },
20
+ "50258": {
21
+ "content": "<|end-latent|>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": false
27
+ },
28
+ "50259": {
29
+ "content": "<|latent|>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": false
35
+ }
36
+ },
37
+ "bos_token": "<|endoftext|>",
38
+ "clean_up_tokenization_spaces": false,
39
+ "eos_token": "<|endoftext|>",
40
+ "extra_special_tokens": {},
41
+ "model_max_length": 1024,
42
+ "pad_token": "<|endoftext|>",
43
+ "tokenizer_class": "GPT2Tokenizer",
44
+ "unk_token": "<|endoftext|>"
45
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff