CocoLng
committed on
Commit · 3ea7f84
1 Parent(s): 943edcc
Add OSCAR 100G
- cam_run30/model_info.txt +90 -0
- cam_run30/weights/config.json +26 -0
- cam_run30/weights/final_metrics.json +0 -0
- cam_run30/weights/merges.txt +0 -0
- cam_run30/weights/model.safetensors +3 -0
- cam_run30/weights/special_tokens_map.json +15 -0
- cam_run30/weights/tokenizer.json +0 -0
- cam_run30/weights/tokenizer_config.json +57 -0
- cam_run30/weights/vocab.json +0 -0
cam_run30/model_info.txt
ADDED
@@ -0,0 +1,90 @@
+=== Dataset Information ===
+Total tokens: 26,843,545,600
+Estimated size: 100.00 GB
+
+=== Model Architecture ===
+return_dict: True
+output_hidden_states: False
+output_attentions: False
+torchscript: False
+torch_dtype: float32
+use_bfloat16: False
+tf_legacy_loss: False
+pruned_heads: {}
+tie_word_embeddings: True
+chunk_size_feed_forward: 0
+is_encoder_decoder: False
+is_decoder: False
+cross_attention_hidden_size: None
+add_cross_attention: False
+tie_encoder_decoder: False
+max_length: 20
+min_length: 0
+do_sample: False
+early_stopping: False
+num_beams: 1
+num_beam_groups: 1
+diversity_penalty: 0.0
+temperature: 1.0
+top_k: 50
+top_p: 1.0
+typical_p: 1.0
+repetition_penalty: 1.0
+length_penalty: 1.0
+no_repeat_ngram_size: 0
+encoder_no_repeat_ngram_size: 0
+bad_words_ids: None
+num_return_sequences: 1
+output_scores: False
+return_dict_in_generate: False
+forced_bos_token_id: None
+forced_eos_token_id: None
+remove_invalid_values: False
+exponential_decay_length_penalty: None
+suppress_tokens: None
+begin_suppress_tokens: None
+architectures: ['RobertaForMaskedLM']
+finetuning_task: None
+id2label: {0: 'LABEL_0', 1: 'LABEL_1'}
+label2id: {'LABEL_0': 0, 'LABEL_1': 1}
+tokenizer_class: None
+prefix: None
+bos_token_id: 0
+pad_token_id: 1
+eos_token_id: 2
+sep_token_id: None
+decoder_start_token_id: None
+task_specific_params: None
+problem_type: None
+_name_or_path:
+_attn_implementation_autoset: False
+transformers_version: 4.46.3
+vocab_size: 50265
+hidden_size: 768
+num_hidden_layers: 12
+num_attention_heads: 12
+hidden_act: gelu
+intermediate_size: 3072
+hidden_dropout_prob: 0.1
+attention_probs_dropout_prob: 0.1
+max_position_embeddings: 514
+type_vocab_size: 1
+initializer_range: 0.02
+layer_norm_eps: 1e-05
+position_embedding_type: absolute
+use_cache: True
+classifier_dropout: None
+model_type: roberta
+
+=== Training Parameters ===
+learning_rate: 0.0007
+batch_size: 64
+gradient_accumulation: 5
+warmup_steps: 10000
+total_steps: 100000
+
+=== Dataset Configuration ===
+Dataset: oscar-corpus/OSCAR-2301
+Subset: fr
+Split: train
+Streaming: True
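Note: the figures in model_info.txt can be cross-checked with a minimal Python sketch (not part of the commit). The 100.00 GB estimate is consistent with roughly 4 bytes per token in GiB units, and the effective batch size follows from batch_size × gradient_accumulation; the load_dataset call mirrors the "Dataset Configuration" block and assumes the gated OSCAR-2301 terms have been accepted on the Hub.

# Minimal sketch: cross-check model_info.txt and stream the same corpus.
from datasets import load_dataset

TOTAL_TOKENS = 26_843_545_600

# 100.00 GB is consistent with ~4 bytes per token, measured in GiB.
print(f"Estimated size: {TOTAL_TOKENS * 4 / 2**30:.2f} GiB")   # -> 100.00 GiB

# Effective batch size implied by the training parameters.
print("Effective batch size:", 64 * 5)                          # batch_size * gradient_accumulation -> 320

# Streaming the French split of OSCAR-2301, as in the configuration above
# (requires Hub authentication and acceptance of the dataset's terms).
dataset = load_dataset("oscar-corpus/OSCAR-2301", "fr", split="train", streaming=True)
print(next(iter(dataset))["text"][:200])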
cam_run30/weights/config.json
ADDED
@@ -0,0 +1,26 @@
+{
+  "architectures": [
+    "RobertaForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.46.3",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 50265
+}
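Note: with config.json and model.safetensors in the committed cam_run30/weights directory, the checkpoint can be loaded with transformers. A minimal sketch, assuming a local clone of this repository; it is illustrative and not taken from the training code.

# Illustrative only: load the RoBERTa masked-LM checkpoint added in this commit.
from transformers import AutoConfig, AutoModelForMaskedLM

config = AutoConfig.from_pretrained("cam_run30/weights")
model = AutoModelForMaskedLM.from_pretrained("cam_run30/weights")

print(config.model_type)                            # roberta
print(sum(p.numel() for p in model.parameters()))   # ~125M params (~498 MB in float32, matching model.safetensors)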
cam_run30/weights/final_metrics.json
ADDED
Binary file (1.02 kB)
cam_run30/weights/merges.txt
ADDED
The diff for this file is too large to render.
cam_run30/weights/model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dde2920dde116000e56b077c7f4b1f567967425441e1b3993fea0fe2b8f2328b
+size 498813948
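Note: model.safetensors is stored as a Git LFS pointer, so only the sha256 oid and byte size appear in the diff. A minimal sketch for verifying a locally downloaded copy against the pointer; the local path is an assumption.

# Illustrative check: compare a downloaded model.safetensors against the LFS pointer above.
import hashlib
from pathlib import Path

EXPECTED_OID = "dde2920dde116000e56b077c7f4b1f567967425441e1b3993fea0fe2b8f2328b"
EXPECTED_SIZE = 498_813_948

path = Path("cam_run30/weights/model.safetensors")   # assumed local path
assert path.stat().st_size == EXPECTED_SIZE, "size mismatch"

sha = hashlib.sha256()
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)
assert sha.hexdigest() == EXPECTED_OID, "hash mismatch"
print("model.safetensors matches the LFS pointer")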
cam_run30/weights/special_tokens_map.json
ADDED
@@ -0,0 +1,15 @@
+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "unk_token": "<unk>"
+}
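Note: these special tokens correspond to the ids declared in tokenizer_config.json below (0, 1, 2, 3 and 50264). A quick hedged check, assuming the tokenizer is loaded from the committed weights directory:

# Illustrative only: confirm how the special tokens map to vocabulary ids.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("cam_run30/weights")   # assumed local path
for name in ["bos_token", "pad_token", "eos_token", "unk_token", "mask_token"]:
    token = getattr(tok, name)
    print(name, token, tok.convert_tokens_to_ids(token))
# Expected: <s> 0, <pad> 1, </s> 2, <unk> 3, <mask> 50264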
cam_run30/weights/tokenizer.json
ADDED
The diff for this file is too large to render.
cam_run30/weights/tokenizer_config.json
ADDED
@@ -0,0 +1,57 @@
+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50264": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "errors": "replace",
+  "mask_token": "<mask>",
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "RobertaTokenizer",
+  "trim_offsets": true,
+  "unk_token": "<unk>"
+}
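Note: with the tokenizer files and masked-LM weights above in one directory, the checkpoint can be exercised through the standard fill-mask pipeline. A minimal sketch, assuming a local clone of this repository; the French example sentence is purely illustrative.

# Illustrative usage of the committed checkpoint as a fill-mask model.
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="cam_run30/weights",        # assumed local clone of this repo
    tokenizer="cam_run30/weights",
)

# The tokenizer's mask token is <mask> (id 50264).
for pred in fill_mask("La capitale de la France est <mask>."):
    print(f"{pred['token_str']!r}  score={pred['score']:.3f}")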
cam_run30/weights/vocab.json
ADDED
The diff for this file is too large to render.