Add pipeline tag, library name, link to paper

#1
by nielsr HF Staff - opened
Files changed (1) hide show
  1. README.md +174 -1
README.md CHANGED
@@ -1,9 +1,13 @@
1
  ---
2
  license: mit
 
 
3
  ---
4
 
5
  # Cuckoo 🐦 [[Github]](https://github.com/KomeijiForce/Cuckoo)
6
 
 
 
7
  Cuckoo is a small (300M) information extraction (IE) model that imitates the next token prediction paradigm of large language models. Instead of retrieving from the vocabulary, Cuckoo predicts the next tokens by tagging them in the given input context as shown below:
8
 
9
  ![cuckoo](https://github.com/user-attachments/assets/d000f275-82a7-4939-aca8-341c61a774dc)
@@ -155,4 +159,173 @@ sea ['blue']
155
  fire ['red']
156
  night []
157
  ```
158
- which shows Cuckoo is not extracting any plausible spans but has the knowledge to understand the context.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: mit
3
+ library_name: transformers
4
+ pipeline_tag: token-classification
5
  ---
6
 
7
  # Cuckoo 🐦 [[Github]](https://github.com/KomeijiForce/Cuckoo)
8
 
9
+ This repository contains the model presented in the paper [Cuckoo: An IE Free Rider Hatched by Massive Nutrition in LLM's Nest](https://huggingface.co/papers/2502.11275).
10
+
11
  Cuckoo is a small (300M) information extraction (IE) model that imitates the next token prediction paradigm of large language models. Instead of retrieving from the vocabulary, Cuckoo predicts the next tokens by tagging them in the given input context as shown below:
12
 
13
  ![cuckoo](https://github.com/user-attachments/assets/d000f275-82a7-4939-aca8-341c61a774dc)
 
159
  fire ['red']
160
  night []
161
  ```
162
+ which shows that Cuckoo does not extract implausible spans (it returns nothing when no plausible span exists) yet still has the knowledge to understand the context.
163
+
164
+ # File information
165
+
166
+ The repository contains the following file information:
167
+
168
+ Filename: special_tokens_map.json
169
+ Content: {
170
+ "bos_token": {
171
+ "content": "<s>",
172
+ "lstrip": false,
173
+ "normalized": true,
174
+ "rstrip": false,
175
+ "single_word": false
176
+ },
177
+ "cls_token": {
178
+ "content": "<s>",
179
+ "lstrip": false,
180
+ "normalized": true,
181
+ "rstrip": false,
182
+ "single_word": false
183
+ },
184
+ "eos_token": {
185
+ "content": "</s>",
186
+ "lstrip": false,
187
+ "normalized": true,
188
+ "rstrip": false,
189
+ "single_word": false
190
+ },
191
+ "mask_token": {
192
+ "content": "<mask>",
193
+ "lstrip": true,
194
+ "normalized": false,
195
+ "rstrip": false,
196
+ "single_word": false
197
+ },
198
+ "pad_token": {
199
+ "content": "<pad>",
200
+ "lstrip": false,
201
+ "normalized": true,
202
+ "rstrip": false,
203
+ "single_word": false
204
+ },
205
+ "sep_token": {
206
+ "content": "</s>",
207
+ "lstrip": false,
208
+ "normalized": true,
209
+ "rstrip": false,
210
+ "single_word": false
211
+ },
212
+ "unk_token": {
213
+ "content": "<unk>",
214
+ "lstrip": false,
215
+ "normalized": true,
216
+ "rstrip": false,
217
+ "single_word": false
218
+ }
219
+ }
220
+
221
+ Filename: tokenizer_config.json
222
+ Content: {
223
+ "add_prefix_space": true,
224
+ "added_tokens_decoder": {
225
+ "0": {
226
+ "content": "<s>",
227
+ "lstrip": false,
228
+ "normalized": true,
229
+ "rstrip": false,
230
+ "single_word": false,
231
+ "special": true
232
+ },
233
+ "1": {
234
+ "content": "<pad>",
235
+ "lstrip": false,
236
+ "normalized": true,
237
+ "rstrip": false,
238
+ "single_word": false,
239
+ "special": true
240
+ },
241
+ "2": {
242
+ "content": "</s>",
243
+ "lstrip": false,
244
+ "normalized": true,
245
+ "rstrip": false,
246
+ "single_word": false,
247
+ "special": true
248
+ },
249
+ "3": {
250
+ "content": "<unk>",
251
+ "lstrip": false,
252
+ "normalized": true,
253
+ "rstrip": false,
254
+ "single_word": false,
255
+ "special": true
256
+ },
257
+ "50264": {
258
+ "content": "<mask>",
259
+ "lstrip": true,
260
+ "normalized": false,
261
+ "rstrip": false,
262
+ "single_word": false,
263
+ "special": true
264
+ }
265
+ },
266
+ "bos_token": "<s>",
267
+ "clean_up_tokenization_spaces": false,
268
+ "cls_token": "<s>",
269
+ "eos_token": "</s>",
270
+ "errors": "replace",
271
+ "mask_token": "<mask>",
272
+ "max_length": 512,
273
+ "model_max_length": 512,
274
+ "pad_token": "<pad>",
275
+ "sep_token": "</s>",
276
+ "stride": 0,
277
+ "tokenizer_class": "RobertaTokenizer",
278
+ "trim_offsets": true,
279
+ "truncation_side": "right",
280
+ "truncation_strategy": "longest_first",
281
+ "unk_token": "<unk>"
282
+ }
283
+
284
+ Filename: merges.txt
285
+ Content: "Content of the file is larger than 50 KB, too long to display."
286
+
287
+ Filename: vocab.json
288
+ Content: "Content of the file is larger than 50 KB, too long to display."
289
+
290
+ Filename: config.json
291
+ Content: {
292
+ "_name_or_path": "models/ptr-large-c4-stage9",
293
+ "architectures": [
294
+ "RobertaForTokenClassification"
295
+ ],
296
+ "attention_probs_dropout_prob": 0.1,
297
+ "bos_token_id": 0,
298
+ "classifier_dropout": null,
299
+ "eos_token_id": 2,
300
+ "finetuning_task": "ner",
301
+ "hidden_act": "gelu",
302
+ "hidden_dropout_prob": 0.1,
303
+ "hidden_size": 1024,
304
+ "id2label": {
305
+ "0": "B",
306
+ "1": "I",
307
+ "2": "O"
308
+ },
309
+ "initializer_range": 0.02,
310
+ "intermediate_size": 4096,
311
+ "label2id": {
312
+ "B": 0,
313
+ "I": 1,
314
+ "O": 2
315
+ },
316
+ "layer_norm_eps": 1e-05,
317
+ "max_position_embeddings": 514,
318
+ "model_type": "roberta",
319
+ "num_attention_heads": 16,
320
+ "num_hidden_layers": 24,
321
+ "pad_token_id": 1,
322
+ "position_embedding_type": "absolute",
323
+ "torch_dtype": "float32",
324
+ "transformers_version": "4.45.2",
325
+ "type_vocab_size": 1,
326
+ "use_cache": true,
327
+ "vocab_size": 50265
328
+ }
329
+
330
+ Filename: tokenizer.json
331
+ Content: "Content of the file is larger than 50 KB, too long to display."