Yaning1001 committed on
Commit
54f7697
·
verified ·
1 Parent(s): 711e738

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. babylm_dataset.py +141 -0
  2. babylm_dataset_llama.py +141 -0
  3. babylm_dataset_test.py +145 -0
  4. run.sh +82 -0
  5. run_train.sh +28 -0
  6. train_accelerate.py +99 -0
  7. train_deep.py +233 -0
  8. train_ftp.py +137 -0
  9. train_gpt2.py +117 -0
  10. train_llama.py +99 -0
  11. train_llama_1B.py +117 -0
  12. train_llama_3B.py +117 -0
  13. train_qwen.py +97 -0
  14. train_qwen_lora.py +93 -0
  15. wandb/debug-cli.chunhui.log +0 -0
  16. wandb/debug-internal.log +17 -0
  17. wandb/debug.log +33 -0
  18. wandb/run-20241030_010306-uhzyjdga/run-uhzyjdga.wandb +0 -0
  19. wandb/run-20241030_011013-8qrwqf2b/files/config.yaml +47 -0
  20. wandb/run-20241030_011013-8qrwqf2b/files/wandb-metadata.json +97 -0
  21. wandb/run-20241030_011509-3dp0dtmk/files/output.log +15 -0
  22. wandb/run-20241030_011509-3dp0dtmk/logs/debug.log +26 -0
  23. wandb/run-20241030_011509-cqcwsj7s/logs/debug.log +26 -0
  24. wandb/run-20241030_013141-v317zdzd/files/config.yaml +47 -0
  25. wandb/run-20241030_013141-v317zdzd/files/output.log +46 -0
  26. wandb/run-20241030_013141-v317zdzd/files/requirements.txt +147 -0
  27. wandb/run-20241030_013141-v317zdzd/files/wandb-metadata.json +97 -0
  28. wandb/run-20241030_013141-v317zdzd/logs/debug-internal.log +11 -0
  29. wandb/run-20241030_013141-v317zdzd/logs/debug.log +27 -0
  30. wandb/run-20241030_222932-l8nv7d2l/files/output.log +14 -0
  31. wandb/run-20241030_222932-l8nv7d2l/logs/debug-internal.log +10 -0
  32. wandb/run-20241030_222932-l8nv7d2l/logs/debug.log +26 -0
  33. wandb/run-20241030_222932-lsfm0d2q/files/wandb-metadata.json +97 -0
  34. wandb/run-20241030_222932-lsfm0d2q/logs/debug-internal.log +10 -0
  35. wandb/run-20241030_222932-lsfm0d2q/logs/debug.log +26 -0
  36. wandb/run-20241101_012733-4u8e027p/files/output.log +16 -0
  37. wandb/run-20241101_012733-4u8e027p/files/requirements.txt +147 -0
  38. wandb/run-20241101_012733-4u8e027p/files/wandb-metadata.json +97 -0
  39. wandb/run-20241101_012733-4u8e027p/logs/debug.log +26 -0
  40. wandb/run-20241101_012733-e3zsr634/files/output.log +20 -0
  41. wandb/run-20241101_012733-e3zsr634/files/wandb-metadata.json +97 -0
  42. wandb/run-20241101_200502-28ivel81/files/output.log +1 -0
  43. wandb/run-20241101_200502-28ivel81/files/wandb-metadata.json +97 -0
  44. wandb/run-20241101_201708-b4wkk29o/files/output.log +13 -0
  45. wandb/run-20241101_201708-b4wkk29o/files/wandb-metadata.json +97 -0
  46. wandb/run-20241101_201708-b4wkk29o/logs/debug.log +26 -0
  47. wandb/run-20241101_201708-b4wkk29o/run-b4wkk29o.wandb +0 -0
  48. wandb/run-20241101_201926-5y6ulxig/files/output.log +13 -0
  49. wandb/run-20241101_201926-5y6ulxig/files/requirements.txt +147 -0
  50. wandb/run-20241101_201926-5y6ulxig/files/wandb-metadata.json +97 -0
babylm_dataset.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # babylm_dataset.py
2
+ # author: Julie Kallini
3
+
4
+ import datasets
5
+ import os
6
+ import glob
7
+ import tqdm
8
+ from numpy.random import default_rng
9
+ from itertools import product
10
+
11
+ logger = datasets.logging.get_logger(__name__)
12
+
13
+ _DESCRIPTION = """\
14
+ Pre-tokenized BabyLM HuggingFace dataset for verb perturbations.
15
+ """
16
+ MODEL_NAME = "Llama-3.2-3B"
17
+ _PERTURBED_DATA_PATH = f"../data/Perturbed_data/{MODEL_NAME}"
18
+ _PERTURBATIONS = ["hop_control", "hop_tokens4", "hop_words4",
19
+ "reverse_control", "reverse_partial", "reverse_full",
20
+ "shuffle_control", "shuffle_nondeterministic",
21
+ "shuffle_deterministic21", "shuffle_deterministic57", "shuffle_deterministic84",
22
+ "shuffle_local3", "shuffle_local5", "shuffle_local10",
23
+ "shuffle_even_odd"]
24
+ # _RANDOM_SEEDS = [0, 14, 41, 53, 96]
25
+ _RANDOM_SEEDS = [0]
26
+ # _TRAIN_SETS = ["100M", "10M"]
27
+ _TRAIN_SETS = ["10M"]
28
+ _EOS_TOKEN_ID = 50256
29
+
30
+
31
+ class BabyConfig(datasets.BuilderConfig):
32
+
33
+ def __init__(self, data_dir, babylm_train_set, random_seed, **kwargs):
34
+ """BuilderConfig for IzParens
35
+
36
+ Args:
37
+ data_dir: path to directory of tokenized, perturbed BabyLM dataset
38
+ """
39
+ super(BabyConfig, self).__init__(
40
+ **kwargs,
41
+ )
42
+ self.data_dir = data_dir
43
+ self.babylm_train_set = babylm_train_set
44
+ self.random_seed = random_seed
45
+
46
+
47
class BabyLMCorpus(datasets.GeneratorBasedBuilder):
    """Builder for the pre-tokenized, perturbed BabyLM corpus.

    One config exists per (perturbation, train-set size, random seed)
    combination; each example is a space-separated string of 1024 token ids.
    """

    BUILDER_CONFIGS = [
        BabyConfig(
            name=f"babylm_{perturbation}_{train_set}_seed{random_seed}",
            data_dir=os.path.join(
                _PERTURBED_DATA_PATH, "babylm_" + perturbation),
            babylm_train_set=train_set,
            random_seed=random_seed,
        )
        # product() can be iterated directly; wrapping it in list() was redundant.
        for perturbation, train_set, random_seed in product(_PERTURBATIONS, _TRAIN_SETS, _RANDOM_SEEDS)
    ]

    def _info(self):
        """Return dataset metadata: a single string feature named "text"."""
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "text": datasets.Value("string")
                }
            ),
            # No (input, target) pairing; examples are plain text sequences.
            supervised_keys=None,
        )

    def _split_generators(self, dl_manager):
        """Create the TRAIN split generator.

        The validation split is deliberately disabled in this module (see
        babylm_dataset_test.py for the variant with valid/test splits).
        """
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "data_dir": os.path.join(
                        self.config.data_dir, "babylm_" + self.config.babylm_train_set),
                    "random_seed": self.config.random_seed,
                    "split": "train",
                },
            ),
        ]

    def __chunk(self, sentences, eos_token):
        """Parse pre-tokenized sentences, join them with eos_token, and return
        full chunks of 1024 token ids each (a short trailing chunk is dropped).
        """
        # Parse each line of space-separated token ids into an int list.
        logger.info("Loading pre-tokenized data")
        tokenized_sentences = []
        for sent in tqdm.tqdm(sentences):
            tokenized_sentences.append([int(tok) for tok in sent.split()])

        # Concatenate the tokenized sentences using the EOS token.
        logger.info("Concatenating tokenized data using EOS token")
        all_tokens = []
        for tokens in tqdm.tqdm(tokenized_sentences):
            all_tokens.extend(tokens)
            all_tokens.append(eos_token)

        # Chunk the tokens into sublists of max_seq_len tokens each.
        logger.info("Chunking tokens into sublists of 1024")
        max_seq_len = 1024
        chunked_tokens = []
        for i in tqdm.tqdm(range(0, len(all_tokens), max_seq_len)):
            chunked_tokens.append(all_tokens[i:i + max_seq_len])

        # Drop the last chunk if it is not a full max_seq_len tokens.
        # Guard against empty input, which previously raised IndexError
        # on chunked_tokens[-1].
        if chunked_tokens and len(chunked_tokens[-1]) < max_seq_len:
            chunked_tokens.pop()

        return chunked_tokens

    def _generate_examples(self, data_dir, random_seed, split):
        """Yield (index, {"text": ...}) examples of space-separated token ids."""
        logger.info("Generating examples from = %s", data_dir)
        infiles = sorted(glob.glob(os.path.join(data_dir, "*")))

        # Collect all sentences; use a context manager so each file handle is
        # closed deterministically (the original leaked open files).
        all_sentences = []
        for infile in infiles:
            with open(infile, encoding="utf-8") as f:
                all_sentences.extend(f.readlines())
        logger.info("Total sentences: {}".format(len(all_sentences)))

        # Shuffle because the data is pre-tokenized (no later shuffling stage).
        rng = default_rng(seed=random_seed)
        rng.shuffle(all_sentences)

        # Tokenize and chunk into fixed-length sequences.
        tokenized_lines = self.__chunk(all_sentences, _EOS_TOKEN_ID)

        # Generate data.
        logger.info("Writing dataset as space-separated sequences of tokens")
        for idx, line in enumerate(tokenized_lines):
            l = " ".join([str(tok) for tok in line]) + "\n"
            yield idx, {"text": l}
babylm_dataset_llama.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # babylm_dataset.py
2
+ # author: Julie Kallini
3
+
4
+ import datasets
5
+ import os
6
+ import glob
7
+ import tqdm
8
+ from numpy.random import default_rng
9
+ from itertools import product
10
+
11
+ logger = datasets.logging.get_logger(__name__)
12
+
13
+ _DESCRIPTION = """\
14
+ Pre-tokenized BabyLM HuggingFace dataset for verb perturbations.
15
+ """
16
+ MODEL_NAME = "Llama-3.2-3B"
17
+ _PERTURBED_DATA_PATH = f"../data/Perturbed_data/{MODEL_NAME}"
18
+ _PERTURBATIONS = ["hop_control", "hop_tokens4", "hop_words4",
19
+ "reverse_control", "reverse_partial", "reverse_full",
20
+ "shuffle_control", "shuffle_nondeterministic",
21
+ "shuffle_deterministic21", "shuffle_deterministic57", "shuffle_deterministic84",
22
+ "shuffle_local3", "shuffle_local5", "shuffle_local10",
23
+ "shuffle_even_odd"]
24
+ # _RANDOM_SEEDS = [0, 14, 41, 53, 96]
25
+ _RANDOM_SEEDS = [0]
26
+ # _TRAIN_SETS = ["100M", "10M"]
27
+ _TRAIN_SETS = ["10M"]
28
+ _EOS_TOKEN_ID = 50256
29
+
30
+
31
+ class BabyConfig(datasets.BuilderConfig):
32
+
33
+ def __init__(self, data_dir, babylm_train_set, random_seed, **kwargs):
34
+ """BuilderConfig for IzParens
35
+
36
+ Args:
37
+ data_dir: path to directory of tokenized, perturbed BabyLM dataset
38
+ """
39
+ super(BabyConfig, self).__init__(
40
+ **kwargs,
41
+ )
42
+ self.data_dir = data_dir
43
+ self.babylm_train_set = babylm_train_set
44
+ self.random_seed = random_seed
45
+
46
+
47
class BabyLMCorpus(datasets.GeneratorBasedBuilder):
    """Builder for the pre-tokenized, perturbed BabyLM corpus.

    One config exists per (perturbation, train-set size, random seed)
    combination; each example is a space-separated string of 1024 token ids.
    """

    BUILDER_CONFIGS = [
        BabyConfig(
            name=f"babylm_{perturbation}_{train_set}_seed{random_seed}",
            data_dir=os.path.join(
                _PERTURBED_DATA_PATH, "babylm_" + perturbation),
            babylm_train_set=train_set,
            random_seed=random_seed,
        )
        # product() can be iterated directly; wrapping it in list() was redundant.
        for perturbation, train_set, random_seed in product(_PERTURBATIONS, _TRAIN_SETS, _RANDOM_SEEDS)
    ]

    def _info(self):
        """Return dataset metadata: a single string feature named "text"."""
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "text": datasets.Value("string")
                }
            ),
            # No (input, target) pairing; examples are plain text sequences.
            supervised_keys=None,
        )

    def _split_generators(self, dl_manager):
        """Create the TRAIN split generator.

        The validation split is deliberately disabled in this module (see
        babylm_dataset_test.py for the variant with valid/test splits).
        """
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "data_dir": os.path.join(
                        self.config.data_dir, "babylm_" + self.config.babylm_train_set),
                    "random_seed": self.config.random_seed,
                    "split": "train",
                },
            ),
        ]

    def __chunk(self, sentences, eos_token):
        """Parse pre-tokenized sentences, join them with eos_token, and return
        full chunks of 1024 token ids each (a short trailing chunk is dropped).
        """
        # Parse each line of space-separated token ids into an int list.
        logger.info("Loading pre-tokenized data")
        tokenized_sentences = []
        for sent in tqdm.tqdm(sentences):
            tokenized_sentences.append([int(tok) for tok in sent.split()])

        # Concatenate the tokenized sentences using the EOS token.
        logger.info("Concatenating tokenized data using EOS token")
        all_tokens = []
        for tokens in tqdm.tqdm(tokenized_sentences):
            all_tokens.extend(tokens)
            all_tokens.append(eos_token)

        # Chunk the tokens into sublists of max_seq_len tokens each.
        logger.info("Chunking tokens into sublists of 1024")
        max_seq_len = 1024
        chunked_tokens = []
        for i in tqdm.tqdm(range(0, len(all_tokens), max_seq_len)):
            chunked_tokens.append(all_tokens[i:i + max_seq_len])

        # Drop the last chunk if it is not a full max_seq_len tokens.
        # Guard against empty input, which previously raised IndexError
        # on chunked_tokens[-1].
        if chunked_tokens and len(chunked_tokens[-1]) < max_seq_len:
            chunked_tokens.pop()

        return chunked_tokens

    def _generate_examples(self, data_dir, random_seed, split):
        """Yield (index, {"text": ...}) examples of space-separated token ids."""
        logger.info("Generating examples from = %s", data_dir)
        infiles = sorted(glob.glob(os.path.join(data_dir, "*")))

        # Collect all sentences; use a context manager so each file handle is
        # closed deterministically (the original leaked open files).
        all_sentences = []
        for infile in infiles:
            with open(infile, encoding="utf-8") as f:
                all_sentences.extend(f.readlines())
        logger.info("Total sentences: {}".format(len(all_sentences)))

        # Shuffle because the data is pre-tokenized (no later shuffling stage).
        rng = default_rng(seed=random_seed)
        rng.shuffle(all_sentences)

        # Tokenize and chunk into fixed-length sequences.
        tokenized_lines = self.__chunk(all_sentences, _EOS_TOKEN_ID)

        # Generate data.
        logger.info("Writing dataset as space-separated sequences of tokens")
        for idx, line in enumerate(tokenized_lines):
            l = " ".join([str(tok) for tok in line]) + "\n"
            yield idx, {"text": l}
babylm_dataset_test.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # babylm_dataset_test.py
2
+
3
+ import datasets
4
+ import os
5
+ import glob
6
+ import tqdm
7
+ from numpy.random import default_rng
8
+ from itertools import product
9
+
10
+ logger = datasets.logging.get_logger(__name__)
11
+
12
+ _DESCRIPTION = """\
13
+ Pre-tokenized BabyLM HuggingFace dataset for verb perturbations.
14
+ """
15
+ MODEL_NAME = "Llama-3.2-3B"
16
+ _PERTURBED_DATA_PATH = f"../data/Perturbed_data/{MODEL_NAME}"
17
+ _PERTURBATIONS = ["hop_control", "hop_tokens4", "hop_words4",
18
+ "reverse_control", "reverse_partial", "reverse_full",
19
+ "shuffle_control", "shuffle_nondeterministic",
20
+ "shuffle_deterministic21", "shuffle_deterministic57", "shuffle_deterministic84",
21
+ "shuffle_local3", "shuffle_local5", "shuffle_local10",
22
+ "shuffle_even_odd"]
23
+ # _RANDOM_SEEDS = [0, 14, 41, 53, 96]
24
+ _RANDOM_SEEDS = [0]
25
+ # _TRAIN_SETS = ["100M", "10M"]
26
+ _TRAIN_SETS = ["10M"]
27
+ _EOS_TOKEN_ID = 50256
28
+
29
+
30
+ class BabyConfig(datasets.BuilderConfig):
31
+
32
+ def __init__(self, data_dir, babylm_train_set, random_seed, **kwargs):
33
+ """BuilderConfig for IzParens
34
+
35
+ Args:
36
+ data_dir: path to directory of tokenized, perturbed BabyLM dataset
37
+ """
38
+ super(BabyConfig, self).__init__(
39
+ **kwargs,
40
+ )
41
+ self.data_dir = data_dir
42
+ self.babylm_train_set = babylm_train_set
43
+ self.random_seed = random_seed
44
+
45
+
46
class BabyLMCorpus(datasets.GeneratorBasedBuilder):
    """Builder for the pre-tokenized, perturbed BabyLM corpus.

    This variant exposes train, validation, and test splits. One config
    exists per (perturbation, train-set size, random seed) combination;
    each example is a space-separated string of 1024 token ids.
    """

    BUILDER_CONFIGS = [
        BabyConfig(
            name=f"babylm_{perturbation}_{train_set}_seed{random_seed}",
            data_dir=os.path.join(
                _PERTURBED_DATA_PATH, "babylm_" + perturbation),
            babylm_train_set=train_set,
            random_seed=random_seed,
        )
        # product() can be iterated directly; wrapping it in list() was redundant.
        for perturbation, train_set, random_seed in product(_PERTURBATIONS, _TRAIN_SETS, _RANDOM_SEEDS)
    ]

    def _info(self):
        """Return dataset metadata: a single string feature named "text"."""
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "text": datasets.Value("string")
                }
            ),
            # No (input, target) pairing; examples are plain text sequences.
            supervised_keys=None,
        )

    def _split_generators(self, dl_manager):
        """Create TRAIN, VALIDATION, and TEST split generators.

        Each split reads a different subdirectory of the config's data_dir:
        the size-tagged training set, "babylm_dev", and "babylm_test_affected".
        """
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "data_dir": os.path.join(
                        self.config.data_dir, "babylm_" + self.config.babylm_train_set),
                    "random_seed": self.config.random_seed,
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "data_dir": os.path.join(
                        self.config.data_dir, "babylm_dev"),
                    "random_seed": self.config.random_seed,
                    "split": "valid",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "data_dir": os.path.join(
                        self.config.data_dir, "babylm_test_affected"),
                    "random_seed": self.config.random_seed,
                    "split": "test",
                },
            ),
        ]

    def __chunk(self, sentences, eos_token):
        """Parse pre-tokenized sentences, join them with eos_token, and return
        full chunks of 1024 token ids each (a short trailing chunk is dropped).
        """
        # Parse each line of space-separated token ids into an int list.
        logger.info("Loading pre-tokenized data")
        tokenized_sentences = []
        for sent in tqdm.tqdm(sentences):
            tokenized_sentences.append([int(tok) for tok in sent.split()])

        # Concatenate the tokenized sentences using the EOS token.
        logger.info("Concatenating tokenized data using EOS token")
        all_tokens = []
        for tokens in tqdm.tqdm(tokenized_sentences):
            all_tokens.extend(tokens)
            all_tokens.append(eos_token)

        # Chunk the tokens into sublists of max_seq_len tokens each.
        logger.info("Chunking tokens into sublists of 1024")
        max_seq_len = 1024
        chunked_tokens = []
        for i in tqdm.tqdm(range(0, len(all_tokens), max_seq_len)):
            chunked_tokens.append(all_tokens[i:i + max_seq_len])

        # Drop the last chunk if it is not a full max_seq_len tokens.
        # Guard against empty input, which previously raised IndexError
        # on chunked_tokens[-1].
        if chunked_tokens and len(chunked_tokens[-1]) < max_seq_len:
            chunked_tokens.pop()

        return chunked_tokens

    def _generate_examples(self, data_dir, random_seed, split):
        """Yield (index, {"text": ...}) examples of space-separated token ids."""
        logger.info("Generating examples from = %s", data_dir)
        infiles = sorted(glob.glob(os.path.join(data_dir, "*")))

        # Collect all sentences; use a context manager so each file handle is
        # closed deterministically (the original leaked open files).
        all_sentences = []
        for infile in infiles:
            with open(infile, encoding="utf-8") as f:
                all_sentences.extend(f.readlines())
        logger.info("Total sentences: {}".format(len(all_sentences)))

        # Shuffle because the data is pre-tokenized (no later shuffling stage).
        rng = default_rng(seed=random_seed)
        rng.shuffle(all_sentences)

        # Tokenize and chunk into fixed-length sequences.
        tokenized_lines = self.__chunk(all_sentences, _EOS_TOKEN_ID)

        # Generate data.
        logger.info("Writing dataset as space-separated sequences of tokens")
        for idx, line in enumerate(tokenized_lines):
            l = " ".join([str(tok) for tok in line]) + "\n"
            yield idx, {"text": l}
run.sh ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Launch the first task in the background
4
+ CUDA_VISIBLE_DEVICES=1,2,3 torchrun --nproc_per_node=3 --master_port=22224 train_deep_wandb.py --perturbation reverse_full --train_set 10M --batch_size 3 --epoch 3 --seed 0 > log_1.out 2>&1 &
5
+
6
+ # Launch the second task in the background
7
+ CUDA_VISIBLE_DEVICES=5,6,7 torchrun --nproc_per_node=3 --master_port=22225 train_deep_wandb.py --perturbation reverse_partial --train_set 10M --batch_size 3 --epoch 3 --seed 0 > log_2.out 2>&1 &
8
+
9
+ # Launch the second task in the background
10
+ # tmux attach-session -t impo1-0
11
+ CUDA_VISIBLE_DEVICES=1,2,3 torchrun --nproc_per_node=3 --master_port=22226 train_deep_wandb.py --perturbation reverse_control --train_set 10M --batch_size 3 --epoch 3 --seed 0 > log_3.out 2>&1 &
12
+
13
+
14
+ # Launch the second task in the background
15
+ # tmux attach-session -t impo3
16
+ CUDA_VISIBLE_DEVICES=5,6,7 torchrun --nproc_per_node=3 --master_port=22227 train_deep_hop.py --perturbation hop_control --train_set 10M --batch_size 3 --epoch 3 --seed 0 > log_3.out 2>&1 &
17
+
18
+ # tmux attach-session -t impo1-0
19
+ CUDA_VISIBLE_DEVICES=1,2,3 torchrun --nproc_per_node=3 --master_port=22228 train_deep_hop.py --perturbation hop_words4 --train_set 10M --batch_size 3 --epoch 3 --seed 0 > log_3.out 2>&1 &
20
+
21
+ # tmux attach-session -t impo3
22
+ CUDA_VISIBLE_DEVICES=5,6,7 torchrun --nproc_per_node=3 --master_port=22229 train_deep_hop.py --perturbation hop_tokens4 --train_set 10M --batch_size 3 --epoch 3 --seed 0 > log_3.out 2>&1 &
23
+
24
+ # -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
25
+ # tmux attach-session -t impo1-0
26
+ CUDA_VISIBLE_DEVICES=2,3,4 torchrun --nproc_per_node=3 --master_port=22230 train_deep_wandb.py --perturbation shuffle_deterministic21 --train_set 10M --batch_size 3 --epoch 3 --seed 0 > log_3.out 2>&1 &
27
+
28
+ # tmux attach-session -t impo2
29
+ CUDA_VISIBLE_DEVICES=5,6,7 torchrun --nproc_per_node=3 --master_port=22231 train_deep_wandb.py --perturbation shuffle_deterministic57 --train_set 10M --batch_size 3 --epoch 3 --seed 0 > log_3.out 2>&1 &
30
+
31
+ # tmux attach-session -t impo2-1
32
+ CUDA_VISIBLE_DEVICES=5,6,7 torchrun --nproc_per_node=3 --master_port=22231 train_deep_wandb.py --perturbation shuffle_deterministic84 --train_set 10M --batch_size 3 --epoch 3 --seed 0 > log_3.out 2>&1 &
33
+
34
+ # tmux attach-session -t impo3
35
+ CUDA_VISIBLE_DEVICES=2,3,4 torchrun --nproc_per_node=3 --master_port=22231 train_deep_wandb.py --perturbation shuffle_even_odd --train_set 10M --batch_size 3 --epoch 3 --seed 0 > log_3.out 2>&1 &
36
+
37
+ #----------------------------------------------------------------------------------------------------------------------------------------------------
38
+ #----------------------------------------------------------------------------------------------------------------------------------------------------
39
+ # tmux attach-session -t impo1-0
40
+ CUDA_VISIBLE_DEVICES=0,1,2 torchrun --nproc_per_node=3 --master_port=22229 train_ftp.py --perturbation reverse_full --train_set 10M --batch_size 3 --epoch 3 --seed 0
41
+
42
+ # *tmux attach-session -t impo2-1
43
+ CUDA_VISIBLE_DEVICES=3,4,5 torchrun --nproc_per_node=3 --master_port=22230 train_ftp.py --perturbation reverse_partial --train_set 10M --batch_size 3 --epoch 3 --seed 0
44
+
45
+ # *tmux attach-session -t impo2-1
46
+ CUDA_VISIBLE_DEVICES=3,4,5 torchrun --nproc_per_node=3 --master_port=22230 train_ftp.py --perturbation reverse_control --train_set 10M --batch_size 3 --epoch 3 --seed 0
47
+
48
+ # tmux attach-session -t impo1-0
49
+ CUDA_VISIBLE_DEVICES=3,4,5 torchrun --nproc_per_node=3 --master_port=22230 train_ftp.py --perturbation shuffle_deterministic84 --train_set 10M --batch_size 3 --epoch 3 --seed 0
50
+
51
+ # tmux attach-session -t impo1-0
52
+ CUDA_VISIBLE_DEVICES=1,2,3 torchrun --nproc_per_node=3 --master_port=22230 train_ftp.py --perturbation shuffle_nondeterministic --train_set 10M --batch_size 3 --epoch 3 --seed 0
53
+
54
+ # Wait for all background processes to complete
55
+
56
+ ###
57
+ # LLama3.2-1B
58
+ ###
59
+ # tmux attach-session -t impo1-0
60
+ CUDA_VISIBLE_DEVICES=0,1,2 torchrun --nproc_per_node=3 --master_port=22229 train_llama_1B.py --perturbation reverse_control --train_set 10M --batch_size 3 --epoch 3 --seed 0
61
+
62
+ # tmux attach-session -t impo2-7
63
+ CUDA_VISIBLE_DEVICES=3,4,5 torchrun --nproc_per_node=3 --master_port=22230 train_llama_1B.py --perturbation reverse_full --train_set 10M --batch_size 3 --epoch 3 --seed 0
64
+
65
+ ###
66
+ # GPT-2
67
+ ###
68
+
69
+ # tmux attach-session -t impo1-0
70
+ CUDA_VISIBLE_DEVICES=0,1,2 torchrun --nproc_per_node=3 --master_port=22235 train_gpt2.py --perturbation reverse_control --train_set 10M --batch_size 3 --epoch 3 --seed 0
71
+
72
+ # tmux attach-session -t impo2-1
73
+ CUDA_VISIBLE_DEVICES=3,4,5 torchrun --nproc_per_node=3 --master_port=22236 train_gpt2.py --perturbation reverse_full --train_set 10M --batch_size 3 --epoch 3 --seed 0
74
+
75
+ # tmux attach-session -t impo1-0
76
+ CUDA_VISIBLE_DEVICES=0,1,2 torchrun --nproc_per_node=3 --master_port=22237 train_gpt2.py --perturbation reverse_partial --train_set 10M --batch_size 3 --epoch 3 --seed 0
77
+
78
+
79
+
80
+ wait
81
+
82
+ echo "Both tasks have been launched."
run_train.sh ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Run the three reverse-perturbation trainings sequentially on GPUs
# 1,2,3,5,6,7, aborting the pipeline as soon as any run fails.

if CUDA_VISIBLE_DEVICES=1,2,3,5,6,7 torchrun --nproc_per_node=6 train_deep_wandb.py --perturbation reverse_full --train_set 10M --batch_size 3 --epoch 7 --seed 0; then
    echo "First script completed successfully."
else
    echo "First script failed."
    exit 1
fi

if CUDA_VISIBLE_DEVICES=1,2,3,5,6,7 torchrun --nproc_per_node=6 train_deep_wandb.py --perturbation reverse_partial --train_set 10M --batch_size 3 --epoch 7 --seed 0; then
    echo "Second script completed successfully."
else
    echo "Second script failed."
    exit 1
fi

if CUDA_VISIBLE_DEVICES=1,2,3,5,6,7 torchrun --nproc_per_node=6 train_deep_wandb.py --perturbation reverse_control --train_set 10M --batch_size 3 --epoch 7 --seed 0; then
    echo "Third script completed successfully."
else
    echo "Third script failed."
    exit 1
fi
train_accelerate.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import torch
3
+ sys.path.append("..")
4
+
5
+ import os
6
+ from datasets import load_dataset
7
+ from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
8
+ from utils_llama import PERTURBATIONS, BABYLM_SPLITS, BABYLM_DATA_PATH, \
9
+ GENRES, MARKER_TOKEN_IDS, marker_sg_token, marker_pl_token, marker_rev_token, write_file
10
+ import argparse
11
+
12
+ # import wandb
13
+
14
+ # Setup for Weights & Biases
15
+ # wandb.init(project="kallini", group="babylm-perturbation-experiments", name=run_id)
16
+
17
+ if __name__ == "__main__":
18
+
19
+ # === CONFIGURATION SETTINGS ===
20
+ parser = argparse.ArgumentParser(description="Training configuration.")
21
+
22
+ parser.add_argument('--perturbation', type=str, default='hop_tokens4', help='Type of perturbation to use.')
23
+ parser.add_argument('--train_set', type=str, default='10M', help='Dataset size for training.')
24
+ parser.add_argument('--batch_size', type=int, default=4, help='Batch size for training.')
25
+ parser.add_argument('--epoch', type=int, default=20, help='train epoch')
26
+ parser.add_argument('--seed', type=int, default=0, help='Random seed.')
27
+
28
+ args = parser.parse_args()
29
+
30
+ # no_pos_encodings_underscore = "" # Ex: "_nopos" if needed
31
+ ckpt_path = "./checkpoints"
32
+ # effective_bsz = 512
33
+
34
+ model_name = "meta-llama/Llama-3.2-3B"
35
+
36
+ model_save_name = "Llama-3.2-3B"
37
+ # === FILE PATHS BASED ON CONFIGURATION ===
38
+ run_id = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}"
39
+ cache_dir = os.path.join(ckpt_path, f"{model_save_name}", run_id, "artifacts")
40
+ run_dir = os.path.join(ckpt_path, f"{model_save_name}", run_id, "runs")
41
+ os.makedirs(cache_dir, exist_ok=True)
42
+ os.makedirs(run_dir, exist_ok=True)
43
+
44
+ # === DATASET LOADING ===
45
+ dataset_name = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}"
46
+ dataset = load_dataset('babylm_dataset_llama.py', name=dataset_name, trust_remote_code=True)
47
+ train_dataset = dataset['train']
48
+
49
+ # === TOKENIZER & MODEL LOADING ===
50
+ # model_name = f"gpt2{'' if no_pos_encodings_underscore == '' else '-no-pos'}-small-{perturbation}-{paren_model}"
51
+
52
+ # tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
53
+ tokenizer = PERTURBATIONS[args.perturbation]['llama_tokenizer']
54
+ model = AutoModelForCausalLM.from_pretrained(model_name,
55
+ device_map="auto",
56
+ cache_dir=cache_dir)
57
+
58
+ # print("model:", model)
59
+ # === TOKENIZATION ===
60
+ def tokenize_function(examples):
61
+ return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=1024)
62
+ tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
63
+
64
+ # === DATA COLLATOR ===
65
+ data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
66
+
67
+ # === TRAINING ARGUMENTS ===
68
+ training_args = TrainingArguments(
69
+ output_dir=run_dir,
70
+ # evaluation_strategy="steps",
71
+ evaluation_strategy="no",
72
+ # per_device_train_batch_size=int(effective_bsz / 1), # Assuming 1 GPU for this example
73
+ per_device_train_batch_size=args.batch_size, # Assuming 1 GPU for this example
74
+ logging_dir='./logs',
75
+ logging_steps=1000,
76
+ save_steps=1000,
77
+ # save_total_limit=5,
78
+ learning_rate=2e-5,
79
+ num_train_epochs=args.epoch,
80
+ seed=args.seed,
81
+ # load_best_model_at_end=True,
82
+ gradient_accumulation_steps=1, # help reduce gpu memory
83
+ fp16 = True, # Enable mixed precision training
84
+ report_to="none",
85
+ )
86
+
87
+ # === TRAINER ===
88
+ trainer = Trainer(
89
+ model=model,
90
+ args=training_args,
91
+ train_dataset=tokenized_train,
92
+ tokenizer=tokenizer,
93
+ data_collator=data_collator
94
+ )
95
+
96
+ # === TRAIN MODEL ===
97
+ trainer.train()
98
+ # End logging
99
+ # wandb.finish()
train_deep.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Fine-tune Llama-3.2-3B on a perturbed BabyLM corpus under DeepSpeed.

Launch through the DeepSpeed runner.  Batch size, gradient accumulation,
learning rate and fp16 below must stay consistent with
deepspeed_config/train_dp_config.json.
"""
import sys
import torch
sys.path.append("..")

import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from utils_llama import PERTURBATIONS, BABYLM_SPLITS, BABYLM_DATA_PATH, \
    GENRES, MARKER_TOKEN_IDS, marker_sg_token, marker_pl_token, marker_rev_token, write_file
import argparse

# Avoid tokenizer fork warnings/deadlocks when datasets.map spawns workers.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

if __name__ == "__main__":

    # === CONFIGURATION SETTINGS ===
    parser = argparse.ArgumentParser(description="Training configuration.")

    parser.add_argument('--perturbation', type=str, default='hop_tokens4', help='Type of perturbation to use.')
    parser.add_argument('--train_set', type=str, default='10M', help='Dataset size for training.')
    parser.add_argument('--batch_size', type=int, default=4, help='Batch size for training.')
    parser.add_argument('--epoch', type=int, default=20, help='train epoch')
    parser.add_argument('--seed', type=int, default=0, help='Random seed.')

    args = parser.parse_args()

    ckpt_path = "./checkpoints"

    model_name = "meta-llama/Llama-3.2-3B"
    model_save_name = "Llama-3.2-3B"

    # === FILE PATHS BASED ON CONFIGURATION ===
    run_id = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}"
    cache_dir = os.path.join(ckpt_path, f"{model_save_name}", run_id, "artifacts")
    run_dir = os.path.join(ckpt_path, f"{model_save_name}", run_id, "runs")
    os.makedirs(cache_dir, exist_ok=True)
    os.makedirs(run_dir, exist_ok=True)

    # === DATASET LOADING ===
    dataset_name = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}"
    dataset = load_dataset('babylm_dataset_test.py', name=dataset_name, trust_remote_code=True)
    train_dataset = dataset['train']
    val_dataset = dataset['validation']

    print(train_dataset)

    # === TOKENIZER & MODEL LOADING ===
    # Perturbation-specific tokenizer supplied by utils_llama.
    tokenizer = PERTURBATIONS[args.perturbation]['llama_tokenizer']
    # NOTE: no device_map here -- DeepSpeed manages device placement itself.
    model = AutoModelForCausalLM.from_pretrained(model_name,
                                                 cache_dir=cache_dir)

    # === TOKENIZATION ===
    def tokenize_function(examples):
        """Tokenize a batch of raw text to fixed-length (1024-token) sequences."""
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=1024)
    tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    tokenized_valid = val_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

    # Evaluate on a small subset to keep eval cheap.  Seed the shuffle so the
    # subset is reproducible across runs with the same --seed, and bound the
    # size so a validation split smaller than 600 rows does not raise.
    n_eval = min(600, len(tokenized_valid))
    tokenized_valid = tokenized_valid.shuffle(seed=args.seed).select(range(n_eval))

    # === DATA COLLATOR ===
    # mlm=False -> causal-LM labels (inputs shifted by one position).
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    # === TRAINING ARGUMENTS ===
    training_args = TrainingArguments(
        output_dir=run_dir,
        evaluation_strategy="steps",
        eval_steps=10,
        per_device_train_batch_size=args.batch_size,  # set "auto" in deepspeed config, adjust it in trainer
        logging_dir='./logs',
        logging_steps=10,
        save_steps=150,
        learning_rate=5e-5,  # align with deepspeed
        num_train_epochs=args.epoch,
        seed=args.seed,
        gradient_accumulation_steps=2,  # set "auto" in deepspeed config, adjust it in trainer
        fp16=True,  # align with deepspeed
        report_to="none",
        deepspeed="deepspeed_config/train_dp_config.json"
    )

    # === TRAINER ===
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    # === TRAIN MODEL ===
    trainer.train()
train_ftp.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Fine-tune Llama-3.2-3B with the FTP AdamP optimizer under DeepSpeed."""
import sys
import torch
sys.path.append("..")

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from utils_llama import PERTURBATIONS, BABYLM_SPLITS, BABYLM_DATA_PATH, \
    GENRES, MARKER_TOKEN_IDS, marker_sg_token, marker_pl_token, marker_rev_token, write_file
from datasets import load_dataset
from FTP import AdamP

import wandb
import argparse
import copy
import math
import os

# Avoid tokenizer fork warnings/deadlocks when datasets.map spawns workers.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Regularization parameter forwarded to the FTP AdamP optimizer.
ftp_k = 1


class TrainerAdamP(Trainer):
    """Trainer that replaces the default optimizer with FTP's AdamP.

    AdamP receives a deep copy of the initial (pre-trained) weights as an
    anchor ('pre') so optimization can be regularized towards the starting
    model.
    """

    def create_optimizer(self):
        """Build the AdamP optimizer over all trainable parameters.

        BUG FIX: use ``self.args.learning_rate`` (driven by the --lr CLI
        flag) instead of a hard-coded 5e-6, so the flag that is logged to
        wandb is the rate actually used.  Behavior is unchanged at the
        default ``--lr 5e-6``.
        """
        optimizer_params = {
            "lr": self.args.learning_rate,
            "weight_decay": 0.0,
            "k": ftp_k,               # FTP-specific regularization strength
            "exclude_set": set()      # empty set -> no parameters excluded
        }

        # Cache pre-trained model weights to serve as the FTP anchor.
        params_to_opt = [x[1] for x in self.model.named_parameters() if x[1].requires_grad]
        params_to_opt_name = [x[0] for x in self.model.named_parameters() if x[1].requires_grad]
        params_anchor = copy.deepcopy(params_to_opt)
        param_group = [{'params': params_to_opt, 'pre': params_anchor, 'name': params_to_opt_name}]

        # Initialize the AdamP optimizer; Trainer.create_optimizer is
        # expected to return the optimizer it installs.
        self.optimizer = AdamP(param_group, **optimizer_params)
        return self.optimizer


if __name__ == "__main__":

    # === CONFIGURATION SETTINGS ===
    parser = argparse.ArgumentParser(description="Training configuration.")

    parser.add_argument('--perturbation', type=str, default='hop_tokens4', help='Type of perturbation to use.')
    parser.add_argument('--train_set', type=str, default='10M', help='Dataset size for training.')
    parser.add_argument('--batch_size', type=int, default=3, help='Batch size for training.')
    parser.add_argument('--epoch', type=int, default=3, help='train epoch')
    parser.add_argument('--seed', type=int, default=0, help='Random seed.')
    parser.add_argument('--lr', type=float, default=5e-6, help='Learning rate.')

    args = parser.parse_args()

    ckpt_path = "./checkpoints"

    model_name = "meta-llama/Llama-3.2-3B"
    model_save_name = "Llama-3.2-3B-FTP"

    # === EXPERIMENT TRACKING ===
    wandb_id = f"{model_save_name}_{args.perturbation}_train_set_{args.train_set}_epoch_{args.epoch}_batch_size_{args.batch_size}_seed_{args.seed}_lr_{args.lr}_wandb_ftp_{ftp_k}"
    wandb.init(project="exp-impo-shuffle", group="ftp-1", name=wandb_id)
    wandb.config.update(args)

    # === FILE PATHS BASED ON CONFIGURATION ===
    run_id = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}"
    cache_dir = os.path.join(ckpt_path, f"{model_save_name}", run_id, "artifacts")
    run_dir = os.path.join(ckpt_path, f"{model_save_name}", run_id, "runs")
    os.makedirs(cache_dir, exist_ok=True)
    os.makedirs(run_dir, exist_ok=True)

    # === DATASET LOADING ===
    dataset_name = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}"
    dataset = load_dataset('babylm_dataset_test.py', name=dataset_name, trust_remote_code=True)
    train_dataset = dataset['train']
    valid_dataset = dataset['validation']

    # === TOKENIZER & MODEL LOADING ===
    tokenizer = PERTURBATIONS[args.perturbation]['llama_tokenizer']
    # NOTE: no device_map here -- DeepSpeed manages device placement itself.
    model = AutoModelForCausalLM.from_pretrained(model_name,
                                                 cache_dir=cache_dir)

    # === TOKENIZATION ===
    def tokenize_function(examples):
        """Tokenize a batch of raw text to fixed-length (1024-token) sequences."""
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=1024)
    tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    tokenized_valid = valid_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

    # Evaluate on a small subset to keep eval cheap.  Seed the shuffle so the
    # subset is reproducible across runs with the same --seed, and bound the
    # size so a validation split smaller than 1000 rows does not raise.
    n_eval = min(1000, len(tokenized_valid))
    tokenized_valid = tokenized_valid.shuffle(seed=args.seed).select(range(n_eval))
    print("tokenized_valid:", tokenized_valid)

    # === DATA COLLATOR ===
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    # === TRAINING ARGUMENTS ===
    training_args = TrainingArguments(
        output_dir=run_dir,
        evaluation_strategy="steps",
        eval_steps=10,
        per_device_train_batch_size=args.batch_size,  # set "auto" in deepspeed config, adjust it in trainer
        logging_dir='./logs',
        logging_steps=1,
        save_steps=100,
        learning_rate=args.lr,  # align with deepspeed
        num_train_epochs=args.epoch,
        seed=args.seed,
        gradient_accumulation_steps=2,  # set "auto" in deepspeed config, adjust it in trainer
        fp16=True,  # align with deepspeed
        report_to="wandb",
        warmup_ratio=0.1,
        deepspeed="deepspeed_config/train_dp_config.json"
    )

    # === TRAINER ===
    trainer = TrainerAdamP(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    # === TRAIN MODEL ===
    trainer.train()
    # End logging
    wandb.finish()
train_gpt2.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Fine-tune GPT-2 on a perturbed BabyLM corpus under DeepSpeed, logged to wandb."""
import sys
import torch
sys.path.append("..")

import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from utils_gpt2 import PERTURBATIONS, BABYLM_SPLITS, BABYLM_DATA_PATH, \
    GENRES, MARKER_TOKEN_IDS, marker_sg_token, marker_pl_token, marker_rev_token, write_file
import wandb
import argparse

# Avoid tokenizer fork warnings/deadlocks when datasets.map spawns workers.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

if __name__ == "__main__":

    # === CONFIGURATION SETTINGS ===
    parser = argparse.ArgumentParser(description="Training configuration.")

    parser.add_argument('--perturbation', type=str, default='hop_tokens4', help='Type of perturbation to use.')
    parser.add_argument('--train_set', type=str, default='10M', help='Dataset size for training.')
    parser.add_argument('--batch_size', type=int, default=4, help='Batch size for training.')
    parser.add_argument('--epoch', type=int, default=3, help='train epoch')
    parser.add_argument('--seed', type=int, default=0, help='Random seed.')
    parser.add_argument('--lr', type=float, default=5e-6, help='Learning rate.')

    args = parser.parse_args()

    ckpt_path = "./checkpoints"

    model_name = "gpt2"
    model_save_name = "GPT2"

    # === EXPERIMENT TRACKING ===
    wandb_id = f"{model_save_name}_{args.perturbation}_train_set_{args.train_set}_epoch_{args.epoch}_batch_size_{args.batch_size}_seed_{args.seed}_lr_{args.lr}_wandb"
    wandb.init(project="exp-impo-reverse", group="reverse-gpt2", name=wandb_id)
    wandb.config.update(args)

    # === FILE PATHS BASED ON CONFIGURATION ===
    run_id = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}"
    cache_dir = os.path.join(ckpt_path, f"{model_save_name}", run_id, "artifacts")
    run_dir = os.path.join(ckpt_path, f"{model_save_name}", run_id, "runs")
    os.makedirs(cache_dir, exist_ok=True)
    os.makedirs(run_dir, exist_ok=True)

    # === DATASET LOADING ===
    dataset_name = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}"
    dataset = load_dataset('babylm_dataset_test.py', name=dataset_name, trust_remote_code=True)
    train_dataset = dataset['train']
    valid_dataset = dataset['validation']

    # === TOKENIZER & MODEL LOADING ===
    tokenizer = PERTURBATIONS[args.perturbation]['gpt2_tokenizer']
    # NOTE: no device_map here -- DeepSpeed manages device placement itself.
    model = AutoModelForCausalLM.from_pretrained(model_name,
                                                 cache_dir=cache_dir)

    # === TOKENIZATION ===
    def tokenize_function(examples):
        """Tokenize a batch of raw text to fixed-length (1024-token) sequences."""
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=1024)
    tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    tokenized_valid = valid_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

    # Evaluate on a small subset to keep eval cheap.  Seed the shuffle so the
    # subset is reproducible across runs with the same --seed, and bound the
    # size so a validation split smaller than 1000 rows does not raise.
    n_eval = min(1000, len(tokenized_valid))
    tokenized_valid = tokenized_valid.shuffle(seed=args.seed).select(range(n_eval))
    print("tokenized_valid:", tokenized_valid)

    # === DATA COLLATOR ===
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    # === TRAINING ARGUMENTS ===
    training_args = TrainingArguments(
        output_dir=run_dir,
        evaluation_strategy="steps",
        eval_steps=10,
        per_device_train_batch_size=args.batch_size,  # set "auto" in deepspeed config, adjust it in trainer
        logging_dir='./logs',
        logging_steps=1,
        save_steps=100,
        learning_rate=args.lr,  # align with deepspeed
        num_train_epochs=args.epoch,
        seed=args.seed,
        gradient_accumulation_steps=2,  # set "auto" in deepspeed config, adjust it in trainer
        fp16=True,  # align with deepspeed
        report_to="wandb",
        warmup_ratio=0.1,
        deepspeed="deepspeed_config/train_dp_config.json"
    )

    # === TRAINER ===
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    # === TRAIN MODEL ===
    trainer.train()
    # End logging
    wandb.finish()
train_llama.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Fine-tune Llama-3.2-3B on one perturbed BabyLM corpus (no eval loop)."""
import sys
import torch
sys.path.append("..")

import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from utils_llama import PERTURBATIONS, BABYLM_SPLITS, BABYLM_DATA_PATH, \
    GENRES, MARKER_TOKEN_IDS, marker_sg_token, marker_pl_token, marker_rev_token, write_file
import argparse

if __name__ == "__main__":

    # --- command-line configuration ---
    parser = argparse.ArgumentParser(description="Training configuration.")
    parser.add_argument('--perturbation', type=str, default='hop_tokens4', help='Type of perturbation to use.')
    parser.add_argument('--train_set', type=str, default='10M', help='Dataset size for training.')
    parser.add_argument('--batch_size', type=int, default=4, help='Batch size for training.')
    parser.add_argument('--epoch', type=int, default=20, help='train epoch')
    parser.add_argument('--seed', type=int, default=0, help='Random seed.')
    args = parser.parse_args()

    hf_model_name = "meta-llama/Llama-3.2-3B"
    save_name = "Llama-3.2-3B"

    # --- derived output locations ---
    run_id = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}"
    run_root = os.path.join("./checkpoints", f"{save_name}", run_id)
    cache_dir = os.path.join(run_root, "artifacts")
    run_dir = os.path.join(run_root, "runs")
    for directory in (cache_dir, run_dir):
        os.makedirs(directory, exist_ok=True)

    # --- dataset ---
    dataset_name = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}"
    corpus = load_dataset('babylm_dataset_llama.py', name=dataset_name, trust_remote_code=True)
    train_split = corpus['train']

    # --- tokenizer & model ---
    # Perturbation-specific tokenizer supplied by utils_llama.
    tokenizer = PERTURBATIONS[args.perturbation]['llama_tokenizer']
    model = AutoModelForCausalLM.from_pretrained(hf_model_name,
                                                 device_map="auto",
                                                 cache_dir=cache_dir)

    # --- tokenization: fixed-length 1024-token sequences ---
    def encode_batch(batch):
        """Tokenize one batch of raw text rows."""
        return tokenizer(batch['text'], padding="max_length", truncation=True, max_length=1024)

    train_tokens = train_split.map(encode_batch, batched=True, remove_columns=["text"])

    # Causal-LM collator (mlm=False -> labels are shifted inputs).
    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    # --- training configuration (no evaluation pass) ---
    train_config = TrainingArguments(
        output_dir=run_dir,
        evaluation_strategy="no",
        per_device_train_batch_size=args.batch_size,
        logging_dir='./logs',
        logging_steps=1000,
        save_steps=1000,
        learning_rate=2e-5,
        num_train_epochs=args.epoch,
        seed=args.seed,
        gradient_accumulation_steps=1,  # keeps per-step GPU memory low
        fp16=True,                      # mixed-precision training
        report_to="none",
    )

    # --- run training ---
    Trainer(
        model=model,
        args=train_config,
        train_dataset=train_tokens,
        tokenizer=tokenizer,
        data_collator=collator,
    ).train()
train_llama_1B.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Fine-tune Llama-3.2-1B on a perturbed BabyLM corpus under DeepSpeed, logged to wandb."""
import sys
import torch
sys.path.append("..")

import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from utils_llama_1B import PERTURBATIONS, BABYLM_SPLITS, BABYLM_DATA_PATH, \
    GENRES, MARKER_TOKEN_IDS, marker_sg_token, marker_pl_token, marker_rev_token, write_file
import wandb
import argparse

# Avoid tokenizer fork warnings/deadlocks when datasets.map spawns workers.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

if __name__ == "__main__":

    # === CONFIGURATION SETTINGS ===
    parser = argparse.ArgumentParser(description="Training configuration.")

    parser.add_argument('--perturbation', type=str, default='hop_tokens4', help='Type of perturbation to use.')
    parser.add_argument('--train_set', type=str, default='10M', help='Dataset size for training.')
    parser.add_argument('--batch_size', type=int, default=4, help='Batch size for training.')
    parser.add_argument('--epoch', type=int, default=3, help='train epoch')
    parser.add_argument('--seed', type=int, default=0, help='Random seed.')
    parser.add_argument('--lr', type=float, default=5e-6, help='Learning rate.')

    args = parser.parse_args()

    ckpt_path = "./checkpoints"

    model_name = "meta-llama/Llama-3.2-1B"
    model_save_name = "Llama-3.2-1B"

    # === EXPERIMENT TRACKING ===
    wandb_id = f"{model_save_name}_{args.perturbation}_train_set_{args.train_set}_epoch_{args.epoch}_batch_size_{args.batch_size}_seed_{args.seed}_lr_{args.lr}_wandb"
    wandb.init(project="exp-impo-reverse", group="reverse-1B", name=wandb_id)
    wandb.config.update(args)

    # === FILE PATHS BASED ON CONFIGURATION ===
    run_id = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}"
    cache_dir = os.path.join(ckpt_path, f"{model_save_name}", run_id, "artifacts")
    run_dir = os.path.join(ckpt_path, f"{model_save_name}", run_id, "runs")
    os.makedirs(cache_dir, exist_ok=True)
    os.makedirs(run_dir, exist_ok=True)

    # === DATASET LOADING ===
    dataset_name = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}"
    dataset = load_dataset('babylm_dataset_test.py', name=dataset_name, trust_remote_code=True)
    train_dataset = dataset['train']
    valid_dataset = dataset['validation']

    # === TOKENIZER & MODEL LOADING ===
    tokenizer = PERTURBATIONS[args.perturbation]['llama_tokenizer']
    # NOTE: no device_map here -- DeepSpeed manages device placement itself.
    model = AutoModelForCausalLM.from_pretrained(model_name,
                                                 cache_dir=cache_dir)

    # === TOKENIZATION ===
    def tokenize_function(examples):
        """Tokenize a batch of raw text to fixed-length (1024-token) sequences."""
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=1024)
    tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    tokenized_valid = valid_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

    # Evaluate on a small subset to keep eval cheap.  Seed the shuffle so the
    # subset is reproducible across runs with the same --seed, and bound the
    # size so a validation split smaller than 1000 rows does not raise.
    n_eval = min(1000, len(tokenized_valid))
    tokenized_valid = tokenized_valid.shuffle(seed=args.seed).select(range(n_eval))
    print("tokenized_valid:", tokenized_valid)

    # === DATA COLLATOR ===
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    # === TRAINING ARGUMENTS ===
    training_args = TrainingArguments(
        output_dir=run_dir,
        evaluation_strategy="steps",
        eval_steps=10,
        per_device_train_batch_size=args.batch_size,  # set "auto" in deepspeed config, adjust it in trainer
        logging_dir='./logs',
        logging_steps=1,
        save_steps=100,
        learning_rate=args.lr,  # align with deepspeed
        num_train_epochs=args.epoch,
        seed=args.seed,
        gradient_accumulation_steps=2,  # set "auto" in deepspeed config, adjust it in trainer
        fp16=True,  # align with deepspeed
        report_to="wandb",
        warmup_ratio=0.1,
        deepspeed="deepspeed_config/train_dp_config.json"
    )

    # === TRAINER ===
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    # === TRAIN MODEL ===
    trainer.train()
    # End logging
    wandb.finish()
train_llama_3B.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Fine-tune Llama-3.2-3B on a perturbed BabyLM corpus under DeepSpeed, logged to wandb."""
import sys
import torch
sys.path.append("..")

import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from utils_llama_3B import PERTURBATIONS, BABYLM_SPLITS, BABYLM_DATA_PATH, \
    GENRES, MARKER_TOKEN_IDS, marker_sg_token, marker_pl_token, marker_rev_token, write_file
import wandb
import argparse

# Avoid tokenizer fork warnings/deadlocks when datasets.map spawns workers.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

if __name__ == "__main__":

    # === CONFIGURATION SETTINGS ===
    parser = argparse.ArgumentParser(description="Training configuration.")

    parser.add_argument('--perturbation', type=str, default='hop_tokens4', help='Type of perturbation to use.')
    parser.add_argument('--train_set', type=str, default='10M', help='Dataset size for training.')
    parser.add_argument('--batch_size', type=int, default=4, help='Batch size for training.')
    parser.add_argument('--epoch', type=int, default=3, help='train epoch')
    parser.add_argument('--seed', type=int, default=0, help='Random seed.')
    parser.add_argument('--lr', type=float, default=5e-6, help='Learning rate.')

    args = parser.parse_args()

    ckpt_path = "./checkpoints"

    model_name = "meta-llama/Llama-3.2-3B"
    model_save_name = "Llama-3.2-3B"

    # === EXPERIMENT TRACKING ===
    wandb_id = f"{model_save_name}_{args.perturbation}_train_set_{args.train_set}_epoch_{args.epoch}_batch_size_{args.batch_size}_seed_{args.seed}_lr_{args.lr}_wandb"
    wandb.init(project="exp-impo-shuffle", group="shuffle", name=wandb_id)
    wandb.config.update(args)

    # === FILE PATHS BASED ON CONFIGURATION ===
    run_id = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}"
    cache_dir = os.path.join(ckpt_path, f"{model_save_name}", run_id, "artifacts")
    run_dir = os.path.join(ckpt_path, f"{model_save_name}", run_id, "runs")
    os.makedirs(cache_dir, exist_ok=True)
    os.makedirs(run_dir, exist_ok=True)

    # === DATASET LOADING ===
    dataset_name = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}"
    dataset = load_dataset('babylm_dataset_test.py', name=dataset_name, trust_remote_code=True)
    train_dataset = dataset['train']
    valid_dataset = dataset['validation']

    # === TOKENIZER & MODEL LOADING ===
    tokenizer = PERTURBATIONS[args.perturbation]['llama_tokenizer']
    # NOTE: no device_map here -- DeepSpeed manages device placement itself.
    model = AutoModelForCausalLM.from_pretrained(model_name,
                                                 cache_dir=cache_dir)

    # === TOKENIZATION ===
    def tokenize_function(examples):
        """Tokenize a batch of raw text to fixed-length (1024-token) sequences."""
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=1024)
    tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    tokenized_valid = valid_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

    # Evaluate on a small subset to keep eval cheap.  Seed the shuffle so the
    # subset is reproducible across runs with the same --seed, and bound the
    # size so a validation split smaller than 1000 rows does not raise.
    n_eval = min(1000, len(tokenized_valid))
    tokenized_valid = tokenized_valid.shuffle(seed=args.seed).select(range(n_eval))
    print("tokenized_valid:", tokenized_valid)

    # === DATA COLLATOR ===
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    # === TRAINING ARGUMENTS ===
    training_args = TrainingArguments(
        output_dir=run_dir,
        evaluation_strategy="steps",
        eval_steps=10,
        per_device_train_batch_size=args.batch_size,  # set "auto" in deepspeed config, adjust it in trainer
        logging_dir='./logs',
        logging_steps=1,
        save_steps=100,
        learning_rate=args.lr,  # align with deepspeed
        num_train_epochs=args.epoch,
        seed=args.seed,
        gradient_accumulation_steps=2,  # set "auto" in deepspeed config, adjust it in trainer
        fp16=True,  # align with deepspeed
        report_to="wandb",
        warmup_ratio=0.1,
        deepspeed="deepspeed_config/train_dp_config.json"
    )

    # === TRAINER ===
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    # === TRAIN MODEL ===
    trainer.train()
    # End logging
    wandb.finish()
train_qwen.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys
sys.path.append("..")

import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from utils_qwen import PERTURBATIONS, BABYLM_SPLITS, BABYLM_DATA_PATH, \
    GENRES, MARKER_TOKEN_IDS, marker_sg_token, marker_pl_token, marker_rev_token, write_file
import argparse
# import wandb

# Setup for Weights & Biases
# wandb.init(project="kallini", group="babylm-perturbation-experiments", name=run_id)

if __name__ == "__main__":
    # Fine-tunes Qwen2.5-7B on a perturbed BabyLM dataset with a causal-LM
    # objective. All heavy work (downloads, tokenization, training) happens
    # under this guard.

    # === CONFIGURATION SETTINGS ===
    parser = argparse.ArgumentParser(description="Training configuration.")

    parser.add_argument('--perturbation', type=str, default='hop_tokens4', help='Type of perturbation to use.')
    parser.add_argument('--train_set', type=str, default='10M', help='Dataset size for training.')
    parser.add_argument('--batch_size', type=int, default=4, help='Batch size for training.')
    parser.add_argument('--epoch', type=int, default=20, help='train epoch')
    parser.add_argument('--seed', type=int, default=0, help='Random seed.')
    # Previously hard-coded as learning_rate=2e-5 below; exposed as a flag
    # with the same default so runs can sweep it without editing the script
    # (the companion training scripts in this repo already take --lr).
    parser.add_argument('--lr', type=float, default=2e-5, help='Learning rate.')

    args = parser.parse_args()

    # no_pos_encodings_underscore = "" # Ex: "_nopos" if needed
    ckpt_path = "./checkpoints"
    # effective_bsz = 512

    model_name = "Qwen/Qwen2.5-7B"
    model_save_name = "Qwen2.5-7B"

    # === FILE PATHS BASED ON CONFIGURATION ===
    # One directory tree per (perturbation, train_set, seed) run.
    run_id = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}"
    cache_dir = os.path.join(ckpt_path, f"{model_save_name}", run_id, "artifacts")
    run_dir = os.path.join(ckpt_path, f"{model_save_name}", run_id, "runs")
    os.makedirs(cache_dir, exist_ok=True)
    os.makedirs(run_dir, exist_ok=True)

    # === DATASET LOADING ===
    dataset_name = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}"
    dataset = load_dataset('babylm_dataset.py', name=dataset_name, trust_remote_code=True)
    train_dataset = dataset['train']

    # === TOKENIZER & MODEL LOADING ===
    # The perturbation-specific tokenizer comes from utils_qwen's PERTURBATIONS
    # table (not from the hub checkpoint), so marker tokens line up with the
    # perturbed data.
    # tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
    tokenizer = PERTURBATIONS[args.perturbation]['qwen_tokenizer']
    model = AutoModelForCausalLM.from_pretrained(model_name,
                                                 device_map="auto",  # Place different layers of the model on different GPUs
                                                 cache_dir=cache_dir)

    # === TOKENIZATION ===
    def tokenize_function(examples):
        # Fixed-length padding: every example becomes exactly 1024 tokens.
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=1024)

    tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

    # === DATA COLLATOR ===
    # mlm=False -> labels are the input ids (causal language modeling).
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    # === TRAINING ARGUMENTS ===
    training_args = TrainingArguments(
        output_dir=run_dir,
        # evaluation_strategy="steps",
        evaluation_strategy="no",
        per_device_train_batch_size=args.batch_size,
        logging_dir='./logs',
        logging_steps=1000,
        save_steps=1000,
        # save_total_limit=5,
        learning_rate=args.lr,
        num_train_epochs=args.epoch,
        seed=args.seed,
        # load_best_model_at_end=True,
        gradient_accumulation_steps=1,  # help reduce gpu memory
        fp16=True,  # Enable mixed precision training
        # report_to="wandb"
    )

    # === TRAINER ===
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    # === TRAIN MODEL ===
    trainer.train()
    # End logging
    # wandb.finish()
train_qwen_lora.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys
sys.path.append("..")

import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from utils_qwen import PERTURBATIONS, BABYLM_SPLITS, BABYLM_DATA_PATH, \
    GENRES, MARKER_TOKEN_IDS, marker_sg_token, marker_pl_token, marker_rev_token, write_file
from peft import get_peft_model, LoraConfig, TaskType  # Import PEFT components for LoRA
# import wandb

if __name__ == "__main__":
    # LoRA fine-tuning of Qwen2.5-0.5B on a perturbed BabyLM dataset.
    # The script body is wrapped in a __main__ guard (consistent with
    # train_qwen.py) so importing this module cannot accidentally kick off
    # downloads or training.

    # === CONFIGURATION SETTINGS ===
    perturbation = "shuffle_deterministic21"
    train_set = "10M"
    seed = 0
    ckpt_path = "./checkpoints"
    effective_bsz = 512

    # === FILE PATHS BASED ON CONFIGURATION ===
    run_id = f"babylm_{perturbation}_{train_set}_seed{seed}"
    cache_dir = os.path.join(ckpt_path, "babylm_lora", run_id, "artifacts")
    run_dir = os.path.join(ckpt_path, "babylm_lora", run_id, "runs")
    os.makedirs(cache_dir, exist_ok=True)
    os.makedirs(run_dir, exist_ok=True)

    # Setup for Weights & Biases
    # wandb.init(project="kallini", group="babylm-perturbation-experiments", name=run_id)

    # === DATASET LOADING ===
    dataset_name = f"babylm_{perturbation}_{train_set}_seed{seed}"
    dataset = load_dataset('babylm_dataset.py', name=dataset_name, trust_remote_code=True)
    train_dataset = dataset['train']

    # === TOKENIZER & MODEL LOADING ===
    # Perturbation-specific tokenizer from utils_qwen; the base weights come
    # from the hub checkpoint.
    model_name = "Qwen/Qwen2.5-0.5B"
    tokenizer = PERTURBATIONS[perturbation]['qwen_tokenizer']
    model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)

    # === APPLYING LoRA ===
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,  # This specifies the task type
        r=16,               # Rank of the decomposed matrices
        lora_alpha=16,      # Amplitude of the LoRA updates
        lora_dropout=0.1,   # Dropout for LoRA layers
    )
    model = get_peft_model(model, lora_config)

    # print("model:", model)
    # for name, param in model.named_parameters():
    #     if param.requires_grad:
    #         print(f"Trainable parameter: {name}, shape: {param.shape}")

    # === TOKENIZATION ===
    def tokenize_function(examples):
        # Fixed-length padding: every example becomes exactly 1024 tokens.
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=1024)

    tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

    # === DATA COLLATOR ===
    # mlm=False -> causal language modeling labels.
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    # === TRAINING ARGUMENTS ===
    training_args = TrainingArguments(
        output_dir=run_dir,
        # evaluation_strategy="steps", # use with load_best_model_at_end=True
        evaluation_strategy="no",
        per_device_train_batch_size=1,  # Set based on your hardware capabilities
        logging_dir='./logs',
        logging_steps=10,
        save_steps=10,
        # save_total_limit=5,
        learning_rate=5e-4,  # You may want to tune this for LoRA
        num_train_epochs=10,  # Fewer epochs might be sufficient due to the efficiency of LoRA
        seed=seed,
        # load_best_model_at_end=True,
        gradient_accumulation_steps=1,
        fp16=True,
        warmup_ratio=0.1,
        # report_to="wandb"
    )

    # === TRAINER ===
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    # === TRAIN MODEL ===
    trainer.train()

    # End logging
    # wandb.finish()
wandb/debug-cli.chunhui.log ADDED
File without changes
wandb/debug-internal.log ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2024-11-30T01:12:00.584497778-05:00","level":"INFO","msg":"using version","core version":"0.18.5"}
2
+ {"time":"2024-11-30T01:12:00.584512378-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241130_011200-7p4fy9o8/logs/debug-core.log"}
3
+ {"time":"2024-11-30T01:12:00.690595233-05:00","level":"INFO","msg":"created new stream","id":"7p4fy9o8"}
4
+ {"time":"2024-11-30T01:12:00.690619613-05:00","level":"INFO","msg":"stream: started","id":"7p4fy9o8"}
5
+ {"time":"2024-11-30T01:12:00.690681993-05:00","level":"INFO","msg":"sender: started","stream_id":"7p4fy9o8"}
6
+ {"time":"2024-11-30T01:12:00.690644643-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"7p4fy9o8"}}
7
+ {"time":"2024-11-30T01:12:00.690641003-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"7p4fy9o8"}}
8
+ {"time":"2024-11-30T01:12:00.859481271-05:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2024-11-30T02:11:11.340074116-05:00","level":"INFO","msg":"Stopping system monitor"}
10
+ {"time":"2024-11-30T02:11:11.340981582-05:00","level":"INFO","msg":"Stopped system monitor"}
11
+ {"time":"2024-11-30T02:11:11.766871033-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
12
+ {"time":"2024-11-30T02:11:11.948700811-05:00","level":"INFO","msg":"handler: operation stats","stats":{}}
13
+ {"time":"2024-11-30T02:11:12.958514865-05:00","level":"INFO","msg":"stream: closing","id":"7p4fy9o8"}
14
+ {"time":"2024-11-30T02:11:12.958555006-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"7p4fy9o8"}}
15
+ {"time":"2024-11-30T02:11:12.958588266-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"7p4fy9o8"}}
16
+ {"time":"2024-11-30T02:11:12.958624136-05:00","level":"INFO","msg":"sender: closed","stream_id":"7p4fy9o8"}
17
+ {"time":"2024-11-30T02:11:12.958703497-05:00","level":"INFO","msg":"stream: closed","id":"7p4fy9o8"}
wandb/debug.log ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-11-30 01:12:00,579 INFO MainThread:3204336 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
2
+ 2024-11-30 01:12:00,580 INFO MainThread:3204336 [wandb_setup.py:_flush():79] Configure stats pid to 3204336
3
+ 2024-11-30 01:12:00,580 INFO MainThread:3204336 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
4
+ 2024-11-30 01:12:00,580 INFO MainThread:3204336 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
5
+ 2024-11-30 01:12:00,580 INFO MainThread:3204336 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
6
+ 2024-11-30 01:12:00,580 INFO MainThread:3204336 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
7
+ 2024-11-30 01:12:00,580 INFO MainThread:3204336 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_gpt2.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_gpt2.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_gpt2.py'}
8
+ 2024-11-30 01:12:00,580 INFO MainThread:3204336 [wandb_setup.py:_flush():79] Applying login settings: {}
9
+ 2024-11-30 01:12:00,580 INFO MainThread:3204336 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241130_011200-7p4fy9o8/logs/debug.log
10
+ 2024-11-30 01:12:00,580 INFO MainThread:3204336 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241130_011200-7p4fy9o8/logs/debug-internal.log
11
+ 2024-11-30 01:12:00,580 INFO MainThread:3204336 [wandb_init.py:init():621] calling init triggers
12
+ 2024-11-30 01:12:00,580 INFO MainThread:3204336 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
13
+ config: {}
14
+ 2024-11-30 01:12:00,580 INFO MainThread:3204336 [wandb_init.py:init():671] starting backend
15
+ 2024-11-30 01:12:00,580 INFO MainThread:3204336 [wandb_init.py:init():675] sending inform_init request
16
+ 2024-11-30 01:12:00,581 INFO MainThread:3204336 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-11-30 01:12:00,582 INFO MainThread:3204336 [wandb_init.py:init():688] backend started and connected
18
+ 2024-11-30 01:12:00,585 INFO MainThread:3204336 [wandb_init.py:init():783] updated telemetry
19
+ 2024-11-30 01:12:00,613 INFO MainThread:3204336 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
20
+ 2024-11-30 01:12:00,856 INFO MainThread:3204336 [wandb_init.py:init():867] starting run threads in backend
21
+ 2024-11-30 01:12:00,949 INFO MainThread:3204336 [wandb_run.py:_console_start():2463] atexit reg
22
+ 2024-11-30 01:12:00,949 INFO MainThread:3204336 [wandb_run.py:_redirect():2311] redirect: wrap_raw
23
+ 2024-11-30 01:12:00,949 INFO MainThread:3204336 [wandb_run.py:_redirect():2376] Wrapping output streams.
24
+ 2024-11-30 01:12:00,949 INFO MainThread:3204336 [wandb_run.py:_redirect():2401] Redirects installed.
25
+ 2024-11-30 01:12:00,951 INFO MainThread:3204336 [wandb_init.py:init():911] run started, returning control to user process
26
+ 2024-11-30 01:12:00,951 INFO MainThread:3204336 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_partial', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06}
27
+ 2024-11-30 02:11:11,338 INFO MainThread:3204336 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/exp-impo-reverse/7p4fy9o8
28
+ 2024-11-30 02:11:11,339 INFO MainThread:3204336 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0
29
+ 2024-11-30 02:11:11,339 INFO MainThread:3204336 [wandb_run.py:_restore():2408] restore
30
+ 2024-11-30 02:11:11,339 INFO MainThread:3204336 [wandb_run.py:_restore():2414] restore done
31
+ 2024-11-30 02:11:12,952 INFO MainThread:3204336 [wandb_run.py:_footer_history_summary_info():3975] rendering history
32
+ 2024-11-30 02:11:12,952 INFO MainThread:3204336 [wandb_run.py:_footer_history_summary_info():4007] rendering summary
33
+ 2024-11-30 02:11:12,957 INFO MainThread:3204336 [wandb_run.py:_footer_sync_info():3934] logging synced files
wandb/run-20241030_010306-uhzyjdga/run-uhzyjdga.wandb ADDED
Binary file (1.6 kB). View file
 
wandb/run-20241030_011013-8qrwqf2b/files/config.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.18.5
4
+ m: []
5
+ python_version: 3.9.19
6
+ t:
7
+ "1":
8
+ - 1
9
+ - 5
10
+ - 11
11
+ - 49
12
+ - 51
13
+ - 53
14
+ - 55
15
+ - 71
16
+ - 98
17
+ "2":
18
+ - 1
19
+ - 5
20
+ - 11
21
+ - 49
22
+ - 51
23
+ - 53
24
+ - 55
25
+ - 71
26
+ - 98
27
+ "3":
28
+ - 13
29
+ - 23
30
+ - 55
31
+ "4": 3.9.19
32
+ "5": 0.18.5
33
+ "6": 4.45.1
34
+ "8":
35
+ - 5
36
+ "12": 0.18.5
37
+ "13": linux-x86_64
38
+ batch_size:
39
+ value: 3
40
+ epoch:
41
+ value: 7
42
+ perturbation:
43
+ value: reverse_control
44
+ seed:
45
+ value: 0
46
+ train_set:
47
+ value: 10M
wandb/run-20241030_011013-8qrwqf2b/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
3
+ "python": "3.9.19",
4
+ "startedAt": "2024-10-30T05:10:13.809520Z",
5
+ "args": [
6
+ "--perturbation",
7
+ "reverse_control",
8
+ "--train_set",
9
+ "10M",
10
+ "--batch_size",
11
+ "3",
12
+ "--epoch",
13
+ "7",
14
+ "--seed",
15
+ "0"
16
+ ],
17
+ "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
18
+ "codePath": "train/train_deep_wandb.py",
19
+ "git": {
20
+ "remote": "git@hf.co:Yaning1001/Impossible_llm.git",
21
+ "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
22
+ },
23
+ "email": "yaning1001@gmail.com",
24
+ "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
25
+ "host": "mms-large-2",
26
+ "username": "chunhui",
27
+ "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
28
+ "codePathLocal": "train_deep_wandb.py",
29
+ "cpu_count": 32,
30
+ "cpu_count_logical": 64,
31
+ "gpu": "NVIDIA RTX A6000",
32
+ "gpu_count": 8,
33
+ "disk": {
34
+ "/": {
35
+ "total": "1888559353856",
36
+ "used": "1719200362496"
37
+ }
38
+ },
39
+ "memory": {
40
+ "total": "202617098240"
41
+ },
42
+ "cpu": {
43
+ "count": 32,
44
+ "countLogical": 64
45
+ },
46
+ "gpu_nvidia": [
47
+ {
48
+ "name": "NVIDIA RTX A6000",
49
+ "memoryTotal": "51527024640",
50
+ "cudaCores": 10752,
51
+ "architecture": "Ampere"
52
+ },
53
+ {
54
+ "name": "NVIDIA RTX A6000",
55
+ "memoryTotal": "51527024640",
56
+ "cudaCores": 10752,
57
+ "architecture": "Ampere"
58
+ },
59
+ {
60
+ "name": "NVIDIA RTX A6000",
61
+ "memoryTotal": "51527024640",
62
+ "cudaCores": 10752,
63
+ "architecture": "Ampere"
64
+ },
65
+ {
66
+ "name": "NVIDIA RTX A6000",
67
+ "memoryTotal": "51527024640",
68
+ "cudaCores": 10752,
69
+ "architecture": "Ampere"
70
+ },
71
+ {
72
+ "name": "NVIDIA RTX A6000",
73
+ "memoryTotal": "51527024640",
74
+ "cudaCores": 10752,
75
+ "architecture": "Ampere"
76
+ },
77
+ {
78
+ "name": "NVIDIA RTX A6000",
79
+ "memoryTotal": "51527024640",
80
+ "cudaCores": 10752,
81
+ "architecture": "Ampere"
82
+ },
83
+ {
84
+ "name": "NVIDIA RTX A6000",
85
+ "memoryTotal": "51527024640",
86
+ "cudaCores": 10752,
87
+ "architecture": "Ampere"
88
+ },
89
+ {
90
+ "name": "NVIDIA RTX A6000",
91
+ "memoryTotal": "51527024640",
92
+ "cudaCores": 10752,
93
+ "architecture": "Ampere"
94
+ }
95
+ ],
96
+ "cudaVersion": "11.8"
97
+ }
wandb/run-20241030_011509-3dp0dtmk/files/output.log ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00, 2.29s/it]
2
+ Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17519/17519 [00:55<00:00, 313.12 examples/s]
3
+ Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:54<00:00, 330.90 examples/s]
4
+ tokenized_valid: Dataset({
5
+ features: ['input_ids', 'attention_mask'],
6
+ num_rows: 600
7
+ })
8
+ /mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
9
+ warnings.warn(
10
+ [2024-10-30 01:17:06,721] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
11
+ [2024-10-30 01:17:14,061] [INFO] [comm.py:652:init_distributed] cdb=None
12
+ Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
13
+ Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
14
+ Loading extension module cpu_adam...
15
+ Time to load cpu_adam op: 4.238509893417358 seconds
wandb/run-20241030_011509-3dp0dtmk/logs/debug.log ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-10-30 01:15:09,509 INFO MainThread:324927 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
2
+ 2024-10-30 01:15:09,509 INFO MainThread:324927 [wandb_setup.py:_flush():79] Configure stats pid to 324927
3
+ 2024-10-30 01:15:09,509 INFO MainThread:324927 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
4
+ 2024-10-30 01:15:09,509 INFO MainThread:324927 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
5
+ 2024-10-30 01:15:09,509 INFO MainThread:324927 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
6
+ 2024-10-30 01:15:09,509 INFO MainThread:324927 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
7
+ 2024-10-30 01:15:09,509 INFO MainThread:324927 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
8
+ 2024-10-30 01:15:09,509 INFO MainThread:324927 [wandb_setup.py:_flush():79] Applying login settings: {}
9
+ 2024-10-30 01:15:09,509 INFO MainThread:324927 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_011509-3dp0dtmk/logs/debug.log
10
+ 2024-10-30 01:15:09,510 INFO MainThread:324927 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_011509-3dp0dtmk/logs/debug-internal.log
11
+ 2024-10-30 01:15:09,510 INFO MainThread:324927 [wandb_init.py:init():621] calling init triggers
12
+ 2024-10-30 01:15:09,510 INFO MainThread:324927 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
13
+ config: {}
14
+ 2024-10-30 01:15:09,510 INFO MainThread:324927 [wandb_init.py:init():671] starting backend
15
+ 2024-10-30 01:15:09,510 INFO MainThread:324927 [wandb_init.py:init():675] sending inform_init request
16
+ 2024-10-30 01:15:09,510 INFO MainThread:324927 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-10-30 01:15:09,511 INFO MainThread:324927 [wandb_init.py:init():688] backend started and connected
18
+ 2024-10-30 01:15:09,514 INFO MainThread:324927 [wandb_init.py:init():783] updated telemetry
19
+ 2024-10-30 01:15:09,557 INFO MainThread:324927 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
20
+ 2024-10-30 01:15:09,824 INFO MainThread:324927 [wandb_init.py:init():867] starting run threads in backend
21
+ 2024-10-30 01:15:09,917 INFO MainThread:324927 [wandb_run.py:_console_start():2463] atexit reg
22
+ 2024-10-30 01:15:09,917 INFO MainThread:324927 [wandb_run.py:_redirect():2311] redirect: wrap_raw
23
+ 2024-10-30 01:15:09,917 INFO MainThread:324927 [wandb_run.py:_redirect():2376] Wrapping output streams.
24
+ 2024-10-30 01:15:09,917 INFO MainThread:324927 [wandb_run.py:_redirect():2401] Redirects installed.
25
+ 2024-10-30 01:15:09,919 INFO MainThread:324927 [wandb_init.py:init():911] run started, returning control to user process
26
+ 2024-10-30 01:15:09,919 INFO MainThread:324927 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0}
wandb/run-20241030_011509-cqcwsj7s/logs/debug.log ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-10-30 01:15:09,346 INFO MainThread:324930 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
2
+ 2024-10-30 01:15:09,346 INFO MainThread:324930 [wandb_setup.py:_flush():79] Configure stats pid to 324930
3
+ 2024-10-30 01:15:09,346 INFO MainThread:324930 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
4
+ 2024-10-30 01:15:09,346 INFO MainThread:324930 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
5
+ 2024-10-30 01:15:09,346 INFO MainThread:324930 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
6
+ 2024-10-30 01:15:09,346 INFO MainThread:324930 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
7
+ 2024-10-30 01:15:09,346 INFO MainThread:324930 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
8
+ 2024-10-30 01:15:09,346 INFO MainThread:324930 [wandb_setup.py:_flush():79] Applying login settings: {}
9
+ 2024-10-30 01:15:09,346 INFO MainThread:324930 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_011509-cqcwsj7s/logs/debug.log
10
+ 2024-10-30 01:15:09,346 INFO MainThread:324930 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_011509-cqcwsj7s/logs/debug-internal.log
11
+ 2024-10-30 01:15:09,346 INFO MainThread:324930 [wandb_init.py:init():621] calling init triggers
12
+ 2024-10-30 01:15:09,346 INFO MainThread:324930 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
13
+ config: {}
14
+ 2024-10-30 01:15:09,346 INFO MainThread:324930 [wandb_init.py:init():671] starting backend
15
+ 2024-10-30 01:15:09,346 INFO MainThread:324930 [wandb_init.py:init():675] sending inform_init request
16
+ 2024-10-30 01:15:09,347 INFO MainThread:324930 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-10-30 01:15:09,348 INFO MainThread:324930 [wandb_init.py:init():688] backend started and connected
18
+ 2024-10-30 01:15:09,351 INFO MainThread:324930 [wandb_init.py:init():783] updated telemetry
19
+ 2024-10-30 01:15:09,378 INFO MainThread:324930 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
20
+ 2024-10-30 01:15:09,675 INFO MainThread:324930 [wandb_init.py:init():867] starting run threads in backend
21
+ 2024-10-30 01:15:09,766 INFO MainThread:324930 [wandb_run.py:_console_start():2463] atexit reg
22
+ 2024-10-30 01:15:09,766 INFO MainThread:324930 [wandb_run.py:_redirect():2311] redirect: wrap_raw
23
+ 2024-10-30 01:15:09,766 INFO MainThread:324930 [wandb_run.py:_redirect():2376] Wrapping output streams.
24
+ 2024-10-30 01:15:09,766 INFO MainThread:324930 [wandb_run.py:_redirect():2401] Redirects installed.
25
+ 2024-10-30 01:15:09,767 INFO MainThread:324930 [wandb_init.py:init():911] run started, returning control to user process
26
+ 2024-10-30 01:15:09,768 INFO MainThread:324930 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0}
wandb/run-20241030_013141-v317zdzd/files/config.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.18.5
4
+ m: []
5
+ python_version: 3.9.19
6
+ t:
7
+ "1":
8
+ - 1
9
+ - 5
10
+ - 11
11
+ - 49
12
+ - 51
13
+ - 53
14
+ - 55
15
+ - 71
16
+ - 98
17
+ "2":
18
+ - 1
19
+ - 5
20
+ - 11
21
+ - 49
22
+ - 51
23
+ - 53
24
+ - 55
25
+ - 71
26
+ - 98
27
+ "3":
28
+ - 13
29
+ - 23
30
+ - 55
31
+ "4": 3.9.19
32
+ "5": 0.18.5
33
+ "6": 4.45.1
34
+ "8":
35
+ - 5
36
+ "12": 0.18.5
37
+ "13": linux-x86_64
38
+ batch_size:
39
+ value: 3
40
+ epoch:
41
+ value: 7
42
+ perturbation:
43
+ value: reverse_full
44
+ seed:
45
+ value: 0
46
+ train_set:
47
+ value: 10M
wandb/run-20241030_013141-v317zdzd/files/output.log ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1098360/1098360 [00:04<00:00, 230478.22it/s]
2
+ 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1098360/1098360 [00:00<00:00, 2642280.53it/s]
3
+ 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17520/17520 [00:00<00:00, 30255.44it/s]
4
+ Generating train split: 17519 examples [00:08, 1969.90 examples/s]█████████████████████████████████████████████████▎ | 14106/17520 [00:00<00:00, 30804.20it/s]
5
+ 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1086121/1086121 [00:05<00:00, 181782.17it/s]
6
+ 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1086121/1086121 [00:00<00:00, 2919420.19it/s]
7
+ 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18141/18141 [00:00<00:00, 30801.29it/s]
8
+ Generating validation split: 18140 examples [00:10, 1711.07 examples/s]███████████████████████████████████████████████████████▌ | 16094/18141 [00:00<00:00, 32755.54it/s]
9
+ 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1031323/1031323 [00:05<00:00, 192774.00it/s]
10
+ 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1031323/1031323 [00:00<00:00, 1666459.47it/s]
11
+ 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16483/16483 [00:00<00:00, 23461.88it/s]
12
+ Generating test split: 16482 examples [00:09, 1649.33 examples/s]███████████████████████████████████████████████████████████████████▌ | 15349/16483 [00:00<00:00, 26830.11it/s]
13
+ Downloading shards: 0%| | 0/2 [01:04<?, ?it/s]
14
+ Error in sys.excepthook:
15
+ Traceback (most recent call last):
16
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/lib/exit_hooks.py", line 41, in exc_handler
17
+ def exc_handler(
18
+ KeyboardInterrupt
19
+
20
+ Original exception was:
21
+ Traceback (most recent call last):
22
+ File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 172, in <module>
23
+ model = AutoModelForCausalLM.from_pretrained(model_name,
24
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained
25
+ return model_class.from_pretrained(
26
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained
27
+ resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
28
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files
29
+ cached_filename = cached_file(
30
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file
31
+ resolved_file = hf_hub_download(
32
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f
33
+ return f(*args, **kwargs)
34
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
35
+ return fn(*args, **kwargs)
36
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download
37
+ return _hf_hub_download_to_cache_dir(
38
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1380, in _hf_hub_download_to_cache_dir
39
+ with WeakFileLock(lock_path):
40
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/contextlib.py", line 119, in __enter__
41
+ return next(self.gen)
42
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_fixes.py", line 98, in WeakFileLock
43
+ lock.acquire()
44
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/filelock/_api.py", line 225, in acquire
45
+ time.sleep(poll_interval)
46
+ KeyboardInterrupt
wandb/run-20241030_013141-v317zdzd/files/requirements.txt ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ funcsigs==1.0.2
2
+ sentry-sdk==2.17.0
3
+ multiprocess==0.70.16
4
+ numpy==1.26.2
5
+ pluralizer==1.2.0
6
+ debugpy==1.6.7
7
+ nvidia-cudnn-cu11==8.5.0.96
8
+ deepspeed==0.15.2
9
+ data==0.4
10
+ pandas==2.1.3
11
+ tomli==2.0.1
12
+ charset-normalizer==3.3.2
13
+ attrs==24.2.0
14
+ aiosignal==1.3.1
15
+ fsspec==2023.10.0
16
+ nvidia-cusparse-cu11==11.7.4.91
17
+ zipp==3.12.0
18
+ mypy-extensions==1.0.0
19
+ datasets==3.0.1
20
+ joblib==1.3.2
21
+ hjson==3.1.0
22
+ traitlets==5.7.1
23
+ stack-data==0.6.0
24
+ transformers==4.45.1
25
+ sympy==1.11.1
26
+ Pygments==2.15.0
27
+ docker-pycreds==0.4.0
28
+ dill==0.3.8
29
+ wheel==0.44.0
30
+ prompt-toolkit==3.0.30
31
+ parso==0.8.3
32
+ ipykernel==6.23.1
33
+ pyarrow==17.0.0
34
+ certifi==2023.11.17
35
+ nvidia-cufft-cu11==10.9.0.58
36
+ six==1.16.0
37
+ pydantic==2.9.2
38
+ click==8.1.7
39
+ nest-asyncio==1.5.6
40
+ gmpy2==2.1.0
41
+ matplotlib==3.8.2
42
+ scipy==1.11.4
43
+ typing_extensions==4.12.2
44
+ statsmodels==0.14.0
45
+ huggingface-hub==0.25.0
46
+ frozenlist==1.4.1
47
+ gpustat==1.1.1
48
+ nvidia-nvtx-cu11==11.7.91
49
+ safetensors==0.4.5
50
+ stanza==1.9.2
51
+ decorator==5.1.1
52
+ seaborn==0.13.0
53
+ sentencepiece==0.2.0
54
+ PyYAML==6.0.1
55
+ black==24.8.0
56
+ protobuf==4.25.1
57
+ pickleshare==0.7.5
58
+ peft==0.13.0
59
+ triton==2.0.0
60
+ nvidia-cuda-runtime-cu11==11.7.99
61
+ Jinja2==3.1.2
62
+ nvidia-cusolver-cu11==11.4.0.1
63
+ executing==1.2.0
64
+ jupyter_client==8.1.0
65
+ pluggy==1.3.0
66
+ cmake==3.30.3
67
+ pytz==2023.3.post1
68
+ aiohappyeyeballs==2.4.2
69
+ kiwisolver==1.4.5
70
+ py-cpuinfo==9.0.0
71
+ Pillow==10.1.0
72
+ ptyprocess==0.7.0
73
+ importlib_resources==6.4.5
74
+ GitPython==3.1.43
75
+ importlib-metadata==6.0.0
76
+ iniconfig==2.0.0
77
+ scikit-learn==1.3.2
78
+ exceptiongroup==1.1.0
79
+ networkx==2.8.6
80
+ accelerate==1.0.0
81
+ nltk==3.8.1
82
+ shutilwhich==1.1.0
83
+ fonttools==4.45.1
84
+ future==0.18.3
85
+ aiohttp==3.10.6
86
+ wcwidth==0.2.5
87
+ idna==3.6
88
+ filelock==3.12.2
89
+ pathspec==0.12.1
90
+ jupyter_core==5.1.0
91
+ lit==18.1.8
92
+ nvidia-curand-cu11==10.2.10.91
93
+ nvidia-cublas-cu11==11.10.3.66
94
+ nvidia-ml-py==12.560.30
95
+ msgpack==1.1.0
96
+ python-dateutil==2.8.2
97
+ blessed==1.20.0
98
+ packaging==23.0
99
+ gitdb==4.0.11
100
+ yarl==1.13.0
101
+ emoji==2.8.0
102
+ tzdata==2023.3
103
+ cycler==0.12.1
104
+ tornado==6.2
105
+ backcall==0.2.0
106
+ plotnine==0.12.4
107
+ ninja==1.11.1.1
108
+ latex==0.7.0
109
+ wandb==0.18.5
110
+ setproctitle==1.3.3
111
+ threadpoolctl==3.2.0
112
+ requests==2.32.3
113
+ pyparsing==3.1.1
114
+ smmap==5.0.1
115
+ pyzmq==23.0.0
116
+ async-timeout==4.0.3
117
+ annotated-types==0.7.0
118
+ matplotlib-inline==0.1.6
119
+ latexcodec==1.0.0
120
+ ipython==8.0.0
121
+ patsy==0.5.3
122
+ contourpy==1.2.0
123
+ multidict==6.1.0
124
+ mizani==0.9.3
125
+ urllib3==2.1.0
126
+ tokenizers==0.20.0
127
+ MarkupSafe==2.1.2
128
+ pip==24.2
129
+ pexpect==4.8.0
130
+ tqdm==4.66.5
131
+ jedi==0.18.2
132
+ pydantic_core==2.23.4
133
+ tempdir==0.7.1
134
+ mpmath==1.2.1
135
+ setuptools==72.1.0
136
+ pytest==7.4.3
137
+ pure-eval==0.2.2
138
+ psutil==5.9.1
139
+ comm==0.1.2
140
+ nvidia-cuda-cupti-cu11==11.7.101
141
+ nvidia-cuda-nvrtc-cu11==11.7.99
142
+ regex==2023.10.3
143
+ platformdirs==2.5.2
144
+ asttokens==2.2.1
145
+ torch==2.0.0
146
+ nvidia-nccl-cu11==2.14.3
147
+ xxhash==3.5.0
wandb/run-20241030_013141-v317zdzd/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
3
+ "python": "3.9.19",
4
+ "startedAt": "2024-10-30T05:31:41.692035Z",
5
+ "args": [
6
+ "--perturbation",
7
+ "reverse_full",
8
+ "--train_set",
9
+ "10M",
10
+ "--batch_size",
11
+ "3",
12
+ "--epoch",
13
+ "7",
14
+ "--seed",
15
+ "0"
16
+ ],
17
+ "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
18
+ "codePath": "train/train_deep_wandb.py",
19
+ "git": {
20
+ "remote": "git@hf.co:Yaning1001/Impossible_llm.git",
21
+ "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
22
+ },
23
+ "email": "yaning1001@gmail.com",
24
+ "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
25
+ "host": "mms-large-2",
26
+ "username": "chunhui",
27
+ "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
28
+ "codePathLocal": "train_deep_wandb.py",
29
+ "cpu_count": 32,
30
+ "cpu_count_logical": 64,
31
+ "gpu": "NVIDIA RTX A6000",
32
+ "gpu_count": 8,
33
+ "disk": {
34
+ "/": {
35
+ "total": "1888559353856",
36
+ "used": "1709824413696"
37
+ }
38
+ },
39
+ "memory": {
40
+ "total": "202617098240"
41
+ },
42
+ "cpu": {
43
+ "count": 32,
44
+ "countLogical": 64
45
+ },
46
+ "gpu_nvidia": [
47
+ {
48
+ "name": "NVIDIA RTX A6000",
49
+ "memoryTotal": "51527024640",
50
+ "cudaCores": 10752,
51
+ "architecture": "Ampere"
52
+ },
53
+ {
54
+ "name": "NVIDIA RTX A6000",
55
+ "memoryTotal": "51527024640",
56
+ "cudaCores": 10752,
57
+ "architecture": "Ampere"
58
+ },
59
+ {
60
+ "name": "NVIDIA RTX A6000",
61
+ "memoryTotal": "51527024640",
62
+ "cudaCores": 10752,
63
+ "architecture": "Ampere"
64
+ },
65
+ {
66
+ "name": "NVIDIA RTX A6000",
67
+ "memoryTotal": "51527024640",
68
+ "cudaCores": 10752,
69
+ "architecture": "Ampere"
70
+ },
71
+ {
72
+ "name": "NVIDIA RTX A6000",
73
+ "memoryTotal": "51527024640",
74
+ "cudaCores": 10752,
75
+ "architecture": "Ampere"
76
+ },
77
+ {
78
+ "name": "NVIDIA RTX A6000",
79
+ "memoryTotal": "51527024640",
80
+ "cudaCores": 10752,
81
+ "architecture": "Ampere"
82
+ },
83
+ {
84
+ "name": "NVIDIA RTX A6000",
85
+ "memoryTotal": "51527024640",
86
+ "cudaCores": 10752,
87
+ "architecture": "Ampere"
88
+ },
89
+ {
90
+ "name": "NVIDIA RTX A6000",
91
+ "memoryTotal": "51527024640",
92
+ "cudaCores": 10752,
93
+ "architecture": "Ampere"
94
+ }
95
+ ],
96
+ "cudaVersion": "11.8"
97
+ }
wandb/run-20241030_013141-v317zdzd/logs/debug-internal.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2024-10-30T01:31:41.694124018-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
2
+ {"time":"2024-10-30T01:31:41.694138749-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_013141-v317zdzd/logs/debug-core.log"}
3
+ {"time":"2024-10-30T01:31:41.802315796-04:00","level":"INFO","msg":"created new stream","id":"v317zdzd"}
4
+ {"time":"2024-10-30T01:31:41.802356857-04:00","level":"INFO","msg":"stream: started","id":"v317zdzd"}
5
+ {"time":"2024-10-30T01:31:41.802407437-04:00","level":"INFO","msg":"sender: started","stream_id":"v317zdzd"}
6
+ {"time":"2024-10-30T01:31:41.802396467-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"v317zdzd"}}
7
+ {"time":"2024-10-30T01:31:41.802381677-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"v317zdzd"}}
8
+ {"time":"2024-10-30T01:31:42.031691859-04:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2024-10-30T01:33:16.596421902-04:00","level":"INFO","msg":"stream: closing","id":"v317zdzd"}
10
+ {"time":"2024-10-30T01:33:16.596523562-04:00","level":"INFO","msg":"Stopping system monitor"}
11
+ {"time":"2024-10-30T01:33:16.59760984-04:00","level":"INFO","msg":"Stopped system monitor"}
wandb/run-20241030_013141-v317zdzd/logs/debug.log ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-10-30 01:31:41,690 INFO MainThread:335756 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
2
+ 2024-10-30 01:31:41,690 INFO MainThread:335756 [wandb_setup.py:_flush():79] Configure stats pid to 335756
3
+ 2024-10-30 01:31:41,690 INFO MainThread:335756 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
4
+ 2024-10-30 01:31:41,690 INFO MainThread:335756 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
5
+ 2024-10-30 01:31:41,690 INFO MainThread:335756 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
6
+ 2024-10-30 01:31:41,690 INFO MainThread:335756 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
7
+ 2024-10-30 01:31:41,690 INFO MainThread:335756 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
8
+ 2024-10-30 01:31:41,690 INFO MainThread:335756 [wandb_setup.py:_flush():79] Applying login settings: {}
9
+ 2024-10-30 01:31:41,690 INFO MainThread:335756 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_013141-v317zdzd/logs/debug.log
10
+ 2024-10-30 01:31:41,690 INFO MainThread:335756 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_013141-v317zdzd/logs/debug-internal.log
11
+ 2024-10-30 01:31:41,690 INFO MainThread:335756 [wandb_init.py:init():621] calling init triggers
12
+ 2024-10-30 01:31:41,690 INFO MainThread:335756 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
13
+ config: {}
14
+ 2024-10-30 01:31:41,690 INFO MainThread:335756 [wandb_init.py:init():671] starting backend
15
+ 2024-10-30 01:31:41,690 INFO MainThread:335756 [wandb_init.py:init():675] sending inform_init request
16
+ 2024-10-30 01:31:41,691 INFO MainThread:335756 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-10-30 01:31:41,691 INFO MainThread:335756 [wandb_init.py:init():688] backend started and connected
18
+ 2024-10-30 01:31:41,694 INFO MainThread:335756 [wandb_init.py:init():783] updated telemetry
19
+ 2024-10-30 01:31:41,727 INFO MainThread:335756 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
20
+ 2024-10-30 01:31:42,028 INFO MainThread:335756 [wandb_init.py:init():867] starting run threads in backend
21
+ 2024-10-30 01:31:42,134 INFO MainThread:335756 [wandb_run.py:_console_start():2463] atexit reg
22
+ 2024-10-30 01:31:42,134 INFO MainThread:335756 [wandb_run.py:_redirect():2311] redirect: wrap_raw
23
+ 2024-10-30 01:31:42,135 INFO MainThread:335756 [wandb_run.py:_redirect():2376] Wrapping output streams.
24
+ 2024-10-30 01:31:42,135 INFO MainThread:335756 [wandb_run.py:_redirect():2401] Redirects installed.
25
+ 2024-10-30 01:31:42,136 INFO MainThread:335756 [wandb_init.py:init():911] run started, returning control to user process
26
+ 2024-10-30 01:31:42,136 INFO MainThread:335756 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0}
27
+ 2024-10-30 01:33:16,596 WARNING MsgRouterThr:335756 [router.py:message_loop():77] message_loop has been closed
wandb/run-20241030_222932-l8nv7d2l/files/output.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:18<00:00, 9.39s/it]
2
+ tokenized_valid: Dataset({
3
+ features: ['input_ids', 'attention_mask'],
4
+ num_rows: 600
5
+ })
6
+ /mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
7
+ warnings.warn(
8
+ [2024-10-30 22:29:54,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
9
+ [2024-10-30 22:30:03,646] [INFO] [comm.py:652:init_distributed] cdb=None
10
+ Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
11
+ Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
12
+ Loading extension module cpu_adam...
13
+ Time to load cpu_adam op: 5.236328601837158 seconds
14
+ wandb: WARNING Fatal error while uploading data. Some run data will not be synced, but it will still be written to disk. Use `wandb sync` at the end of the run to try uploading.
wandb/run-20241030_222932-l8nv7d2l/logs/debug-internal.log ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2024-10-30T22:29:32.388391759-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
2
+ {"time":"2024-10-30T22:29:32.388402599-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_222932-l8nv7d2l/logs/debug-core.log"}
3
+ {"time":"2024-10-30T22:29:32.494517518-04:00","level":"INFO","msg":"created new stream","id":"l8nv7d2l"}
4
+ {"time":"2024-10-30T22:29:32.494545668-04:00","level":"INFO","msg":"stream: started","id":"l8nv7d2l"}
5
+ {"time":"2024-10-30T22:29:32.494613299-04:00","level":"INFO","msg":"sender: started","stream_id":"l8nv7d2l"}
6
+ {"time":"2024-10-30T22:29:32.494582519-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"l8nv7d2l"}}
7
+ {"time":"2024-10-30T22:29:32.494578598-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"l8nv7d2l"}}
8
+ {"time":"2024-10-30T22:29:32.702441334-04:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2024-10-30T22:56:33.113473032-04:00","level":"ERROR","msg":"HTTP error","status":404,"method":"POST","url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/impossible_llm_reverse/l8nv7d2l/file_stream"}
10
+ {"time":"2024-10-30T22:56:33.117771758-04:00","level":"ERROR+4","msg":"filestream: fatal error: filestream: failed to upload: 404 Not Found path=files/yaning1001-dartmouth-college/impossible_llm_reverse/l8nv7d2l/file_stream: {\"error\":\"run impossible_llm_reverse/l8nv7d2l not found while streaming file\"}"}
wandb/run-20241030_222932-l8nv7d2l/logs/debug.log ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-10-30 22:29:32,383 INFO MainThread:447696 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
2
+ 2024-10-30 22:29:32,384 INFO MainThread:447696 [wandb_setup.py:_flush():79] Configure stats pid to 447696
3
+ 2024-10-30 22:29:32,384 INFO MainThread:447696 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
4
+ 2024-10-30 22:29:32,384 INFO MainThread:447696 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
5
+ 2024-10-30 22:29:32,384 INFO MainThread:447696 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
6
+ 2024-10-30 22:29:32,384 INFO MainThread:447696 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
7
+ 2024-10-30 22:29:32,384 INFO MainThread:447696 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
8
+ 2024-10-30 22:29:32,384 INFO MainThread:447696 [wandb_setup.py:_flush():79] Applying login settings: {}
9
+ 2024-10-30 22:29:32,384 INFO MainThread:447696 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_222932-l8nv7d2l/logs/debug.log
10
+ 2024-10-30 22:29:32,384 INFO MainThread:447696 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_222932-l8nv7d2l/logs/debug-internal.log
11
+ 2024-10-30 22:29:32,384 INFO MainThread:447696 [wandb_init.py:init():621] calling init triggers
12
+ 2024-10-30 22:29:32,384 INFO MainThread:447696 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
13
+ config: {}
14
+ 2024-10-30 22:29:32,384 INFO MainThread:447696 [wandb_init.py:init():671] starting backend
15
+ 2024-10-30 22:29:32,384 INFO MainThread:447696 [wandb_init.py:init():675] sending inform_init request
16
+ 2024-10-30 22:29:32,385 INFO MainThread:447696 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-10-30 22:29:32,385 INFO MainThread:447696 [wandb_init.py:init():688] backend started and connected
18
+ 2024-10-30 22:29:32,388 INFO MainThread:447696 [wandb_init.py:init():783] updated telemetry
19
+ 2024-10-30 22:29:32,418 INFO MainThread:447696 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
20
+ 2024-10-30 22:29:32,698 INFO MainThread:447696 [wandb_init.py:init():867] starting run threads in backend
21
+ 2024-10-30 22:29:32,826 INFO MainThread:447696 [wandb_run.py:_console_start():2463] atexit reg
22
+ 2024-10-30 22:29:32,827 INFO MainThread:447696 [wandb_run.py:_redirect():2311] redirect: wrap_raw
23
+ 2024-10-30 22:29:32,827 INFO MainThread:447696 [wandb_run.py:_redirect():2376] Wrapping output streams.
24
+ 2024-10-30 22:29:32,827 INFO MainThread:447696 [wandb_run.py:_redirect():2401] Redirects installed.
25
+ 2024-10-30 22:29:32,828 INFO MainThread:447696 [wandb_init.py:init():911] run started, returning control to user process
26
+ 2024-10-30 22:29:32,829 INFO MainThread:447696 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0}
wandb/run-20241030_222932-lsfm0d2q/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
3
+ "python": "3.9.19",
4
+ "startedAt": "2024-10-31T02:29:32.440797Z",
5
+ "args": [
6
+ "--perturbation",
7
+ "reverse_control",
8
+ "--train_set",
9
+ "10M",
10
+ "--batch_size",
11
+ "3",
12
+ "--epoch",
13
+ "3",
14
+ "--seed",
15
+ "0"
16
+ ],
17
+ "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
18
+ "codePath": "train/train_deep_wandb.py",
19
+ "git": {
20
+ "remote": "git@hf.co:Yaning1001/Impossible_llm.git",
21
+ "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
22
+ },
23
+ "email": "yaning1001@gmail.com",
24
+ "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
25
+ "host": "mms-large-2",
26
+ "username": "chunhui",
27
+ "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
28
+ "codePathLocal": "train_deep_wandb.py",
29
+ "cpu_count": 32,
30
+ "cpu_count_logical": 64,
31
+ "gpu": "NVIDIA RTX A6000",
32
+ "gpu_count": 8,
33
+ "disk": {
34
+ "/": {
35
+ "total": "1888559353856",
36
+ "used": "1710969503744"
37
+ }
38
+ },
39
+ "memory": {
40
+ "total": "202617098240"
41
+ },
42
+ "cpu": {
43
+ "count": 32,
44
+ "countLogical": 64
45
+ },
46
+ "gpu_nvidia": [
47
+ {
48
+ "name": "NVIDIA RTX A6000",
49
+ "memoryTotal": "51527024640",
50
+ "cudaCores": 10752,
51
+ "architecture": "Ampere"
52
+ },
53
+ {
54
+ "name": "NVIDIA RTX A6000",
55
+ "memoryTotal": "51527024640",
56
+ "cudaCores": 10752,
57
+ "architecture": "Ampere"
58
+ },
59
+ {
60
+ "name": "NVIDIA RTX A6000",
61
+ "memoryTotal": "51527024640",
62
+ "cudaCores": 10752,
63
+ "architecture": "Ampere"
64
+ },
65
+ {
66
+ "name": "NVIDIA RTX A6000",
67
+ "memoryTotal": "51527024640",
68
+ "cudaCores": 10752,
69
+ "architecture": "Ampere"
70
+ },
71
+ {
72
+ "name": "NVIDIA RTX A6000",
73
+ "memoryTotal": "51527024640",
74
+ "cudaCores": 10752,
75
+ "architecture": "Ampere"
76
+ },
77
+ {
78
+ "name": "NVIDIA RTX A6000",
79
+ "memoryTotal": "51527024640",
80
+ "cudaCores": 10752,
81
+ "architecture": "Ampere"
82
+ },
83
+ {
84
+ "name": "NVIDIA RTX A6000",
85
+ "memoryTotal": "51527024640",
86
+ "cudaCores": 10752,
87
+ "architecture": "Ampere"
88
+ },
89
+ {
90
+ "name": "NVIDIA RTX A6000",
91
+ "memoryTotal": "51527024640",
92
+ "cudaCores": 10752,
93
+ "architecture": "Ampere"
94
+ }
95
+ ],
96
+ "cudaVersion": "11.8"
97
+ }
wandb/run-20241030_222932-lsfm0d2q/logs/debug-internal.log ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2024-10-30T22:29:32.443244692-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
2
+ {"time":"2024-10-30T22:29:32.443263202-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_222932-lsfm0d2q/logs/debug-core.log"}
3
+ {"time":"2024-10-30T22:29:32.554317529-04:00","level":"INFO","msg":"created new stream","id":"lsfm0d2q"}
4
+ {"time":"2024-10-30T22:29:32.554346489-04:00","level":"INFO","msg":"stream: started","id":"lsfm0d2q"}
5
+ {"time":"2024-10-30T22:29:32.554411019-04:00","level":"INFO","msg":"sender: started","stream_id":"lsfm0d2q"}
6
+ {"time":"2024-10-30T22:29:32.554371289-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"lsfm0d2q"}}
7
+ {"time":"2024-10-30T22:29:32.554397639-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"lsfm0d2q"}}
8
+ {"time":"2024-10-30T22:29:32.714899908-04:00","level":"INFO","msg":"Starting system monitor"}
9
+ {"time":"2024-10-30T22:56:18.124763411-04:00","level":"ERROR","msg":"HTTP error","status":404,"method":"POST","url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/impossible_llm_reverse/lsfm0d2q/file_stream"}
10
+ {"time":"2024-10-30T22:56:18.129259917-04:00","level":"ERROR+4","msg":"filestream: fatal error: filestream: failed to upload: 404 Not Found path=files/yaning1001-dartmouth-college/impossible_llm_reverse/lsfm0d2q/file_stream: {\"error\":\"run impossible_llm_reverse/lsfm0d2q not found while streaming file\"}"}
wandb/run-20241030_222932-lsfm0d2q/logs/debug.log ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-10-30 22:29:32,439 INFO MainThread:447700 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
2
+ 2024-10-30 22:29:32,439 INFO MainThread:447700 [wandb_setup.py:_flush():79] Configure stats pid to 447700
3
+ 2024-10-30 22:29:32,439 INFO MainThread:447700 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
4
+ 2024-10-30 22:29:32,439 INFO MainThread:447700 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
5
+ 2024-10-30 22:29:32,439 INFO MainThread:447700 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
6
+ 2024-10-30 22:29:32,439 INFO MainThread:447700 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
7
+ 2024-10-30 22:29:32,439 INFO MainThread:447700 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
8
+ 2024-10-30 22:29:32,439 INFO MainThread:447700 [wandb_setup.py:_flush():79] Applying login settings: {}
9
+ 2024-10-30 22:29:32,439 INFO MainThread:447700 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_222932-lsfm0d2q/logs/debug.log
10
+ 2024-10-30 22:29:32,439 INFO MainThread:447700 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_222932-lsfm0d2q/logs/debug-internal.log
11
+ 2024-10-30 22:29:32,439 INFO MainThread:447700 [wandb_init.py:init():621] calling init triggers
12
+ 2024-10-30 22:29:32,439 INFO MainThread:447700 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
13
+ config: {}
14
+ 2024-10-30 22:29:32,439 INFO MainThread:447700 [wandb_init.py:init():671] starting backend
15
+ 2024-10-30 22:29:32,439 INFO MainThread:447700 [wandb_init.py:init():675] sending inform_init request
16
+ 2024-10-30 22:29:32,440 INFO MainThread:447700 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-10-30 22:29:32,440 INFO MainThread:447700 [wandb_init.py:init():688] backend started and connected
18
+ 2024-10-30 22:29:32,443 INFO MainThread:447700 [wandb_init.py:init():783] updated telemetry
19
+ 2024-10-30 22:29:32,469 INFO MainThread:447700 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
20
+ 2024-10-30 22:29:32,711 INFO MainThread:447700 [wandb_init.py:init():867] starting run threads in backend
21
+ 2024-10-30 22:29:32,849 INFO MainThread:447700 [wandb_run.py:_console_start():2463] atexit reg
22
+ 2024-10-30 22:29:32,850 INFO MainThread:447700 [wandb_run.py:_redirect():2311] redirect: wrap_raw
23
+ 2024-10-30 22:29:32,850 INFO MainThread:447700 [wandb_run.py:_redirect():2376] Wrapping output streams.
24
+ 2024-10-30 22:29:32,850 INFO MainThread:447700 [wandb_run.py:_redirect():2401] Redirects installed.
25
+ 2024-10-30 22:29:32,851 INFO MainThread:447700 [wandb_init.py:init():911] run started, returning control to user process
26
+ 2024-10-30 22:29:32,852 INFO MainThread:447700 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0}
wandb/run-20241101_012733-4u8e027p/files/output.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:32<00:00, 76.34s/it]
2
+ Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:09<00:00, 4.62s/it]
3
+ Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16425/16425 [00:54<00:00, 301.45 examples/s]
4
+ Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17013/17013 [00:55<00:00, 307.56 examples/s]
5
+ tokenized_valid: Dataset({
6
+ features: ['input_ids', 'attention_mask'],
7
+ num_rows: 600
8
+ })
9
+ /mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
10
+ warnings.warn(
11
+ [2024-11-01 01:32:35,965] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
12
+ [2024-11-01 01:32:46,292] [INFO] [comm.py:652:init_distributed] cdb=None
13
+ Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
14
+ Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
15
+ Loading extension module cpu_adam...
16
+ Time to load cpu_adam op: 5.491261959075928 seconds
wandb/run-20241101_012733-4u8e027p/files/requirements.txt ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ funcsigs==1.0.2
2
+ sentry-sdk==2.17.0
3
+ multiprocess==0.70.16
4
+ numpy==1.26.2
5
+ pluralizer==1.2.0
6
+ debugpy==1.6.7
7
+ nvidia-cudnn-cu11==8.5.0.96
8
+ deepspeed==0.15.2
9
+ data==0.4
10
+ pandas==2.1.3
11
+ tomli==2.0.1
12
+ charset-normalizer==3.3.2
13
+ attrs==24.2.0
14
+ aiosignal==1.3.1
15
+ fsspec==2023.10.0
16
+ nvidia-cusparse-cu11==11.7.4.91
17
+ zipp==3.12.0
18
+ mypy-extensions==1.0.0
19
+ datasets==3.0.1
20
+ joblib==1.3.2
21
+ hjson==3.1.0
22
+ traitlets==5.7.1
23
+ stack-data==0.6.0
24
+ transformers==4.45.1
25
+ sympy==1.11.1
26
+ Pygments==2.15.0
27
+ docker-pycreds==0.4.0
28
+ dill==0.3.8
29
+ wheel==0.44.0
30
+ prompt-toolkit==3.0.30
31
+ parso==0.8.3
32
+ ipykernel==6.23.1
33
+ pyarrow==17.0.0
34
+ certifi==2023.11.17
35
+ nvidia-cufft-cu11==10.9.0.58
36
+ six==1.16.0
37
+ pydantic==2.9.2
38
+ click==8.1.7
39
+ nest-asyncio==1.5.6
40
+ gmpy2==2.1.0
41
+ matplotlib==3.8.2
42
+ scipy==1.11.4
43
+ typing_extensions==4.12.2
44
+ statsmodels==0.14.0
45
+ huggingface-hub==0.25.0
46
+ frozenlist==1.4.1
47
+ gpustat==1.1.1
48
+ nvidia-nvtx-cu11==11.7.91
49
+ safetensors==0.4.5
50
+ stanza==1.9.2
51
+ decorator==5.1.1
52
+ seaborn==0.13.0
53
+ sentencepiece==0.2.0
54
+ PyYAML==6.0.1
55
+ black==24.8.0
56
+ protobuf==4.25.1
57
+ pickleshare==0.7.5
58
+ peft==0.13.0
59
+ triton==2.0.0
60
+ nvidia-cuda-runtime-cu11==11.7.99
61
+ Jinja2==3.1.2
62
+ nvidia-cusolver-cu11==11.4.0.1
63
+ executing==1.2.0
64
+ jupyter_client==8.1.0
65
+ pluggy==1.3.0
66
+ cmake==3.30.3
67
+ pytz==2023.3.post1
68
+ aiohappyeyeballs==2.4.2
69
+ kiwisolver==1.4.5
70
+ py-cpuinfo==9.0.0
71
+ Pillow==10.1.0
72
+ ptyprocess==0.7.0
73
+ importlib_resources==6.4.5
74
+ GitPython==3.1.43
75
+ importlib-metadata==6.0.0
76
+ iniconfig==2.0.0
77
+ scikit-learn==1.3.2
78
+ exceptiongroup==1.1.0
79
+ networkx==2.8.6
80
+ accelerate==1.0.0
81
+ nltk==3.8.1
82
+ shutilwhich==1.1.0
83
+ fonttools==4.45.1
84
+ future==0.18.3
85
+ aiohttp==3.10.6
86
+ wcwidth==0.2.5
87
+ idna==3.6
88
+ filelock==3.12.2
89
+ pathspec==0.12.1
90
+ jupyter_core==5.1.0
91
+ lit==18.1.8
92
+ nvidia-curand-cu11==10.2.10.91
93
+ nvidia-cublas-cu11==11.10.3.66
94
+ nvidia-ml-py==12.560.30
95
+ msgpack==1.1.0
96
+ python-dateutil==2.8.2
97
+ blessed==1.20.0
98
+ packaging==23.0
99
+ gitdb==4.0.11
100
+ yarl==1.13.0
101
+ emoji==2.8.0
102
+ tzdata==2023.3
103
+ cycler==0.12.1
104
+ tornado==6.2
105
+ backcall==0.2.0
106
+ plotnine==0.12.4
107
+ ninja==1.11.1.1
108
+ latex==0.7.0
109
+ wandb==0.18.5
110
+ setproctitle==1.3.3
111
+ threadpoolctl==3.2.0
112
+ requests==2.32.3
113
+ pyparsing==3.1.1
114
+ smmap==5.0.1
115
+ pyzmq==23.0.0
116
+ async-timeout==4.0.3
117
+ annotated-types==0.7.0
118
+ matplotlib-inline==0.1.6
119
+ latexcodec==1.0.0
120
+ ipython==8.0.0
121
+ patsy==0.5.3
122
+ contourpy==1.2.0
123
+ multidict==6.1.0
124
+ mizani==0.9.3
125
+ urllib3==2.1.0
126
+ tokenizers==0.20.0
127
+ MarkupSafe==2.1.2
128
+ pip==24.2
129
+ pexpect==4.8.0
130
+ tqdm==4.66.5
131
+ jedi==0.18.2
132
+ pydantic_core==2.23.4
133
+ tempdir==0.7.1
134
+ mpmath==1.2.1
135
+ setuptools==72.1.0
136
+ pytest==7.4.3
137
+ pure-eval==0.2.2
138
+ psutil==5.9.1
139
+ comm==0.1.2
140
+ nvidia-cuda-cupti-cu11==11.7.101
141
+ nvidia-cuda-nvrtc-cu11==11.7.99
142
+ regex==2023.10.3
143
+ platformdirs==2.5.2
144
+ asttokens==2.2.1
145
+ torch==2.0.0
146
+ nvidia-nccl-cu11==2.14.3
147
+ xxhash==3.5.0
wandb/run-20241101_012733-4u8e027p/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
3
+ "python": "3.9.19",
4
+ "startedAt": "2024-11-01T05:27:33.993570Z",
5
+ "args": [
6
+ "--perturbation",
7
+ "shuffle_nondeterministic",
8
+ "--train_set",
9
+ "10M",
10
+ "--batch_size",
11
+ "3",
12
+ "--epoch",
13
+ "6",
14
+ "--seed",
15
+ "0"
16
+ ],
17
+ "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
18
+ "codePath": "train/train_deep_wandb.py",
19
+ "git": {
20
+ "remote": "git@hf.co:Yaning1001/Impossible_llm.git",
21
+ "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
22
+ },
23
+ "email": "yaning1001@gmail.com",
24
+ "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
25
+ "host": "mms-large-2",
26
+ "username": "chunhui",
27
+ "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
28
+ "codePathLocal": "train_deep_wandb.py",
29
+ "cpu_count": 32,
30
+ "cpu_count_logical": 64,
31
+ "gpu": "NVIDIA RTX A6000",
32
+ "gpu_count": 8,
33
+ "disk": {
34
+ "/": {
35
+ "total": "1888559353856",
36
+ "used": "1753992269824"
37
+ }
38
+ },
39
+ "memory": {
40
+ "total": "202617098240"
41
+ },
42
+ "cpu": {
43
+ "count": 32,
44
+ "countLogical": 64
45
+ },
46
+ "gpu_nvidia": [
47
+ {
48
+ "name": "NVIDIA RTX A6000",
49
+ "memoryTotal": "51527024640",
50
+ "cudaCores": 10752,
51
+ "architecture": "Ampere"
52
+ },
53
+ {
54
+ "name": "NVIDIA RTX A6000",
55
+ "memoryTotal": "51527024640",
56
+ "cudaCores": 10752,
57
+ "architecture": "Ampere"
58
+ },
59
+ {
60
+ "name": "NVIDIA RTX A6000",
61
+ "memoryTotal": "51527024640",
62
+ "cudaCores": 10752,
63
+ "architecture": "Ampere"
64
+ },
65
+ {
66
+ "name": "NVIDIA RTX A6000",
67
+ "memoryTotal": "51527024640",
68
+ "cudaCores": 10752,
69
+ "architecture": "Ampere"
70
+ },
71
+ {
72
+ "name": "NVIDIA RTX A6000",
73
+ "memoryTotal": "51527024640",
74
+ "cudaCores": 10752,
75
+ "architecture": "Ampere"
76
+ },
77
+ {
78
+ "name": "NVIDIA RTX A6000",
79
+ "memoryTotal": "51527024640",
80
+ "cudaCores": 10752,
81
+ "architecture": "Ampere"
82
+ },
83
+ {
84
+ "name": "NVIDIA RTX A6000",
85
+ "memoryTotal": "51527024640",
86
+ "cudaCores": 10752,
87
+ "architecture": "Ampere"
88
+ },
89
+ {
90
+ "name": "NVIDIA RTX A6000",
91
+ "memoryTotal": "51527024640",
92
+ "cudaCores": 10752,
93
+ "architecture": "Ampere"
94
+ }
95
+ ],
96
+ "cudaVersion": "11.8"
97
+ }
wandb/run-20241101_012733-4u8e027p/logs/debug.log ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-11-01 01:27:33,991 INFO MainThread:678556 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
2
+ 2024-11-01 01:27:33,991 INFO MainThread:678556 [wandb_setup.py:_flush():79] Configure stats pid to 678556
3
+ 2024-11-01 01:27:33,991 INFO MainThread:678556 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
4
+ 2024-11-01 01:27:33,991 INFO MainThread:678556 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
5
+ 2024-11-01 01:27:33,991 INFO MainThread:678556 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
6
+ 2024-11-01 01:27:33,991 INFO MainThread:678556 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
7
+ 2024-11-01 01:27:33,991 INFO MainThread:678556 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
8
+ 2024-11-01 01:27:33,991 INFO MainThread:678556 [wandb_setup.py:_flush():79] Applying login settings: {}
9
+ 2024-11-01 01:27:33,991 INFO MainThread:678556 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012733-4u8e027p/logs/debug.log
10
+ 2024-11-01 01:27:33,991 INFO MainThread:678556 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012733-4u8e027p/logs/debug-internal.log
11
+ 2024-11-01 01:27:33,991 INFO MainThread:678556 [wandb_init.py:init():621] calling init triggers
12
+ 2024-11-01 01:27:33,991 INFO MainThread:678556 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
13
+ config: {}
14
+ 2024-11-01 01:27:33,991 INFO MainThread:678556 [wandb_init.py:init():671] starting backend
15
+ 2024-11-01 01:27:33,991 INFO MainThread:678556 [wandb_init.py:init():675] sending inform_init request
16
+ 2024-11-01 01:27:33,993 INFO MainThread:678556 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-11-01 01:27:33,993 INFO MainThread:678556 [wandb_init.py:init():688] backend started and connected
18
+ 2024-11-01 01:27:33,996 INFO MainThread:678556 [wandb_init.py:init():783] updated telemetry
19
+ 2024-11-01 01:27:34,021 INFO MainThread:678556 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
20
+ 2024-11-01 01:27:34,320 INFO MainThread:678556 [wandb_init.py:init():867] starting run threads in backend
21
+ 2024-11-01 01:27:34,405 INFO MainThread:678556 [wandb_run.py:_console_start():2463] atexit reg
22
+ 2024-11-01 01:27:34,405 INFO MainThread:678556 [wandb_run.py:_redirect():2311] redirect: wrap_raw
23
+ 2024-11-01 01:27:34,405 INFO MainThread:678556 [wandb_run.py:_redirect():2376] Wrapping output streams.
24
+ 2024-11-01 01:27:34,405 INFO MainThread:678556 [wandb_run.py:_redirect():2401] Redirects installed.
25
+ 2024-11-01 01:27:34,407 INFO MainThread:678556 [wandb_init.py:init():911] run started, returning control to user process
26
+ 2024-11-01 01:27:34,407 INFO MainThread:678556 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_nondeterministic', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 5e-06}
wandb/run-20241101_012733-e3zsr634/files/output.log ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:32<00:00, 76.36s/it]
2
+ Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00, 3.18s/it]
3
+ generation_config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 185/185 [00:00<00:00, 47.5kB/s]
4
+ Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16425/16425 [00:51<00:00, 321.80 examples/s]
5
+ Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17013/17013 [00:49<00:00, 346.44 examples/s]
6
+ tokenized_valid: Dataset({
7
+ features: ['input_ids', 'attention_mask'],
8
+ num_rows: 600
9
+ })
10
+ /mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
11
+ warnings.warn(
12
+ [2024-11-01 01:32:23,603] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
13
+ [2024-11-01 01:32:32,774] [INFO] [comm.py:652:init_distributed] cdb=None
14
+ Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
15
+ Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
16
+ Emitting ninja build file /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja...
17
+ Building extension module cpu_adam...
18
+ Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
19
+ Loading extension module cpu_adam...
20
+ Time to load cpu_adam op: 5.578649520874023 seconds
wandb/run-20241101_012733-e3zsr634/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
3
+ "python": "3.9.19",
4
+ "startedAt": "2024-11-01T05:27:33.958355Z",
5
+ "args": [
6
+ "--perturbation",
7
+ "shuffle_nondeterministic",
8
+ "--train_set",
9
+ "10M",
10
+ "--batch_size",
11
+ "3",
12
+ "--epoch",
13
+ "6",
14
+ "--seed",
15
+ "0"
16
+ ],
17
+ "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
18
+ "codePath": "train/train_deep_wandb.py",
19
+ "git": {
20
+ "remote": "git@hf.co:Yaning1001/Impossible_llm.git",
21
+ "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
22
+ },
23
+ "email": "yaning1001@gmail.com",
24
+ "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
25
+ "host": "mms-large-2",
26
+ "username": "chunhui",
27
+ "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
28
+ "codePathLocal": "train_deep_wandb.py",
29
+ "cpu_count": 32,
30
+ "cpu_count_logical": 64,
31
+ "gpu": "NVIDIA RTX A6000",
32
+ "gpu_count": 8,
33
+ "disk": {
34
+ "/": {
35
+ "total": "1888559353856",
36
+ "used": "1753992261632"
37
+ }
38
+ },
39
+ "memory": {
40
+ "total": "202617098240"
41
+ },
42
+ "cpu": {
43
+ "count": 32,
44
+ "countLogical": 64
45
+ },
46
+ "gpu_nvidia": [
47
+ {
48
+ "name": "NVIDIA RTX A6000",
49
+ "memoryTotal": "51527024640",
50
+ "cudaCores": 10752,
51
+ "architecture": "Ampere"
52
+ },
53
+ {
54
+ "name": "NVIDIA RTX A6000",
55
+ "memoryTotal": "51527024640",
56
+ "cudaCores": 10752,
57
+ "architecture": "Ampere"
58
+ },
59
+ {
60
+ "name": "NVIDIA RTX A6000",
61
+ "memoryTotal": "51527024640",
62
+ "cudaCores": 10752,
63
+ "architecture": "Ampere"
64
+ },
65
+ {
66
+ "name": "NVIDIA RTX A6000",
67
+ "memoryTotal": "51527024640",
68
+ "cudaCores": 10752,
69
+ "architecture": "Ampere"
70
+ },
71
+ {
72
+ "name": "NVIDIA RTX A6000",
73
+ "memoryTotal": "51527024640",
74
+ "cudaCores": 10752,
75
+ "architecture": "Ampere"
76
+ },
77
+ {
78
+ "name": "NVIDIA RTX A6000",
79
+ "memoryTotal": "51527024640",
80
+ "cudaCores": 10752,
81
+ "architecture": "Ampere"
82
+ },
83
+ {
84
+ "name": "NVIDIA RTX A6000",
85
+ "memoryTotal": "51527024640",
86
+ "cudaCores": 10752,
87
+ "architecture": "Ampere"
88
+ },
89
+ {
90
+ "name": "NVIDIA RTX A6000",
91
+ "memoryTotal": "51527024640",
92
+ "cudaCores": 10752,
93
+ "architecture": "Ampere"
94
+ }
95
+ ],
96
+ "cudaVersion": "11.8"
97
+ }
wandb/run-20241101_200502-28ivel81/files/output.log ADDED
@@ -0,0 +1 @@
 
 
1
+ Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
wandb/run-20241101_200502-28ivel81/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
3
+ "python": "3.9.19",
4
+ "startedAt": "2024-11-02T00:05:02.693656Z",
5
+ "args": [
6
+ "--perturbation",
7
+ "shuffle_nondeterministic",
8
+ "--train_set",
9
+ "10M",
10
+ "--batch_size",
11
+ "3",
12
+ "--epoch",
13
+ "3",
14
+ "--seed",
15
+ "0"
16
+ ],
17
+ "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
18
+ "codePath": "train/train_deep_wandb.py",
19
+ "git": {
20
+ "remote": "git@hf.co:Yaning1001/Impossible_llm.git",
21
+ "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
22
+ },
23
+ "email": "yaning1001@gmail.com",
24
+ "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
25
+ "host": "mms-large-2",
26
+ "username": "chunhui",
27
+ "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
28
+ "codePathLocal": "train_deep_wandb.py",
29
+ "cpu_count": 32,
30
+ "cpu_count_logical": 64,
31
+ "gpu": "NVIDIA RTX A6000",
32
+ "gpu_count": 8,
33
+ "disk": {
34
+ "/": {
35
+ "total": "1888559353856",
36
+ "used": "1754801463296"
37
+ }
38
+ },
39
+ "memory": {
40
+ "total": "202617098240"
41
+ },
42
+ "cpu": {
43
+ "count": 32,
44
+ "countLogical": 64
45
+ },
46
+ "gpu_nvidia": [
47
+ {
48
+ "name": "NVIDIA RTX A6000",
49
+ "memoryTotal": "51527024640",
50
+ "cudaCores": 10752,
51
+ "architecture": "Ampere"
52
+ },
53
+ {
54
+ "name": "NVIDIA RTX A6000",
55
+ "memoryTotal": "51527024640",
56
+ "cudaCores": 10752,
57
+ "architecture": "Ampere"
58
+ },
59
+ {
60
+ "name": "NVIDIA RTX A6000",
61
+ "memoryTotal": "51527024640",
62
+ "cudaCores": 10752,
63
+ "architecture": "Ampere"
64
+ },
65
+ {
66
+ "name": "NVIDIA RTX A6000",
67
+ "memoryTotal": "51527024640",
68
+ "cudaCores": 10752,
69
+ "architecture": "Ampere"
70
+ },
71
+ {
72
+ "name": "NVIDIA RTX A6000",
73
+ "memoryTotal": "51527024640",
74
+ "cudaCores": 10752,
75
+ "architecture": "Ampere"
76
+ },
77
+ {
78
+ "name": "NVIDIA RTX A6000",
79
+ "memoryTotal": "51527024640",
80
+ "cudaCores": 10752,
81
+ "architecture": "Ampere"
82
+ },
83
+ {
84
+ "name": "NVIDIA RTX A6000",
85
+ "memoryTotal": "51527024640",
86
+ "cudaCores": 10752,
87
+ "architecture": "Ampere"
88
+ },
89
+ {
90
+ "name": "NVIDIA RTX A6000",
91
+ "memoryTotal": "51527024640",
92
+ "cudaCores": 10752,
93
+ "architecture": "Ampere"
94
+ }
95
+ ],
96
+ "cudaVersion": "11.8"
97
+ }
wandb/run-20241101_201708-b4wkk29o/files/output.log ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00, 2.56s/it]
2
+ tokenized_valid: Dataset({
3
+ features: ['input_ids', 'attention_mask'],
4
+ num_rows: 600
5
+ })
6
+ /mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
7
+ warnings.warn(
8
+ [2024-11-01 20:17:16,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
9
+ [2024-11-01 20:17:26,148] [INFO] [comm.py:652:init_distributed] cdb=None
10
+ Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
11
+ Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
12
+ Loading extension module cpu_adam...
13
+ Time to load cpu_adam op: 5.02955174446106 seconds
wandb/run-20241101_201708-b4wkk29o/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
3
+ "python": "3.9.19",
4
+ "startedAt": "2024-11-02T00:17:08.113936Z",
5
+ "args": [
6
+ "--perturbation",
7
+ "shuffle_nondeterministic",
8
+ "--train_set",
9
+ "10M",
10
+ "--batch_size",
11
+ "3",
12
+ "--epoch",
13
+ "3",
14
+ "--seed",
15
+ "0"
16
+ ],
17
+ "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
18
+ "codePath": "train/train_deep_wandb.py",
19
+ "git": {
20
+ "remote": "git@hf.co:Yaning1001/Impossible_llm.git",
21
+ "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
22
+ },
23
+ "email": "yaning1001@gmail.com",
24
+ "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
25
+ "host": "mms-large-2",
26
+ "username": "chunhui",
27
+ "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
28
+ "codePathLocal": "train_deep_wandb.py",
29
+ "cpu_count": 32,
30
+ "cpu_count_logical": 64,
31
+ "gpu": "NVIDIA RTX A6000",
32
+ "gpu_count": 8,
33
+ "disk": {
34
+ "/": {
35
+ "total": "1888559353856",
36
+ "used": "1754802659328"
37
+ }
38
+ },
39
+ "memory": {
40
+ "total": "202617098240"
41
+ },
42
+ "cpu": {
43
+ "count": 32,
44
+ "countLogical": 64
45
+ },
46
+ "gpu_nvidia": [
47
+ {
48
+ "name": "NVIDIA RTX A6000",
49
+ "memoryTotal": "51527024640",
50
+ "cudaCores": 10752,
51
+ "architecture": "Ampere"
52
+ },
53
+ {
54
+ "name": "NVIDIA RTX A6000",
55
+ "memoryTotal": "51527024640",
56
+ "cudaCores": 10752,
57
+ "architecture": "Ampere"
58
+ },
59
+ {
60
+ "name": "NVIDIA RTX A6000",
61
+ "memoryTotal": "51527024640",
62
+ "cudaCores": 10752,
63
+ "architecture": "Ampere"
64
+ },
65
+ {
66
+ "name": "NVIDIA RTX A6000",
67
+ "memoryTotal": "51527024640",
68
+ "cudaCores": 10752,
69
+ "architecture": "Ampere"
70
+ },
71
+ {
72
+ "name": "NVIDIA RTX A6000",
73
+ "memoryTotal": "51527024640",
74
+ "cudaCores": 10752,
75
+ "architecture": "Ampere"
76
+ },
77
+ {
78
+ "name": "NVIDIA RTX A6000",
79
+ "memoryTotal": "51527024640",
80
+ "cudaCores": 10752,
81
+ "architecture": "Ampere"
82
+ },
83
+ {
84
+ "name": "NVIDIA RTX A6000",
85
+ "memoryTotal": "51527024640",
86
+ "cudaCores": 10752,
87
+ "architecture": "Ampere"
88
+ },
89
+ {
90
+ "name": "NVIDIA RTX A6000",
91
+ "memoryTotal": "51527024640",
92
+ "cudaCores": 10752,
93
+ "architecture": "Ampere"
94
+ }
95
+ ],
96
+ "cudaVersion": "11.8"
97
+ }
wandb/run-20241101_201708-b4wkk29o/logs/debug.log ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2024-11-01 20:17:08,110 INFO MainThread:875622 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
2
+ 2024-11-01 20:17:08,111 INFO MainThread:875622 [wandb_setup.py:_flush():79] Configure stats pid to 875622
3
+ 2024-11-01 20:17:08,111 INFO MainThread:875622 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
4
+ 2024-11-01 20:17:08,111 INFO MainThread:875622 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
5
+ 2024-11-01 20:17:08,111 INFO MainThread:875622 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
6
+ 2024-11-01 20:17:08,111 INFO MainThread:875622 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
7
+ 2024-11-01 20:17:08,111 INFO MainThread:875622 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
8
+ 2024-11-01 20:17:08,111 INFO MainThread:875622 [wandb_setup.py:_flush():79] Applying login settings: {}
9
+ 2024-11-01 20:17:08,111 INFO MainThread:875622 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_201708-b4wkk29o/logs/debug.log
10
+ 2024-11-01 20:17:08,111 INFO MainThread:875622 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_201708-b4wkk29o/logs/debug-internal.log
11
+ 2024-11-01 20:17:08,111 INFO MainThread:875622 [wandb_init.py:init():621] calling init triggers
12
+ 2024-11-01 20:17:08,111 INFO MainThread:875622 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
13
+ config: {}
14
+ 2024-11-01 20:17:08,111 INFO MainThread:875622 [wandb_init.py:init():671] starting backend
15
+ 2024-11-01 20:17:08,111 INFO MainThread:875622 [wandb_init.py:init():675] sending inform_init request
16
+ 2024-11-01 20:17:08,113 INFO MainThread:875622 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-11-01 20:17:08,113 INFO MainThread:875622 [wandb_init.py:init():688] backend started and connected
18
+ 2024-11-01 20:17:08,116 INFO MainThread:875622 [wandb_init.py:init():783] updated telemetry
19
+ 2024-11-01 20:17:08,142 INFO MainThread:875622 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
20
+ 2024-11-01 20:17:09,975 INFO MainThread:875622 [wandb_init.py:init():867] starting run threads in backend
21
+ 2024-11-01 20:17:10,065 INFO MainThread:875622 [wandb_run.py:_console_start():2463] atexit reg
22
+ 2024-11-01 20:17:10,065 INFO MainThread:875622 [wandb_run.py:_redirect():2311] redirect: wrap_raw
23
+ 2024-11-01 20:17:10,065 INFO MainThread:875622 [wandb_run.py:_redirect():2376] Wrapping output streams.
24
+ 2024-11-01 20:17:10,065 INFO MainThread:875622 [wandb_run.py:_redirect():2401] Redirects installed.
25
+ 2024-11-01 20:17:10,067 INFO MainThread:875622 [wandb_init.py:init():911] run started, returning control to user process
26
+ 2024-11-01 20:17:10,067 INFO MainThread:875622 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_nondeterministic', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06}
wandb/run-20241101_201708-b4wkk29o/run-b4wkk29o.wandb ADDED
Binary file (32.8 kB). View file
 
wandb/run-20241101_201926-5y6ulxig/files/output.log ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00, 2.41s/it]
2
+ tokenized_valid: Dataset({
3
+ features: ['input_ids', 'attention_mask'],
4
+ num_rows: 600
5
+ })
6
+ /mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
7
+ warnings.warn(
8
+ [2024-11-01 20:19:34,030] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
9
+ [2024-11-01 20:19:43,157] [INFO] [comm.py:652:init_distributed] cdb=None
10
+ Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
11
+ Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
12
+ Loading extension module cpu_adam...
13
+ Time to load cpu_adam op: 5.544436693191528 seconds
wandb/run-20241101_201926-5y6ulxig/files/requirements.txt ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ funcsigs==1.0.2
2
+ sentry-sdk==2.17.0
3
+ multiprocess==0.70.16
4
+ numpy==1.26.2
5
+ pluralizer==1.2.0
6
+ debugpy==1.6.7
7
+ nvidia-cudnn-cu11==8.5.0.96
8
+ deepspeed==0.15.2
9
+ data==0.4
10
+ pandas==2.1.3
11
+ tomli==2.0.1
12
+ charset-normalizer==3.3.2
13
+ attrs==24.2.0
14
+ aiosignal==1.3.1
15
+ fsspec==2023.10.0
16
+ nvidia-cusparse-cu11==11.7.4.91
17
+ zipp==3.12.0
18
+ mypy-extensions==1.0.0
19
+ datasets==3.0.1
20
+ joblib==1.3.2
21
+ hjson==3.1.0
22
+ traitlets==5.7.1
23
+ stack-data==0.6.0
24
+ transformers==4.45.1
25
+ sympy==1.11.1
26
+ Pygments==2.15.0
27
+ docker-pycreds==0.4.0
28
+ dill==0.3.8
29
+ wheel==0.44.0
30
+ prompt-toolkit==3.0.30
31
+ parso==0.8.3
32
+ ipykernel==6.23.1
33
+ pyarrow==17.0.0
34
+ certifi==2023.11.17
35
+ nvidia-cufft-cu11==10.9.0.58
36
+ six==1.16.0
37
+ pydantic==2.9.2
38
+ click==8.1.7
39
+ nest-asyncio==1.5.6
40
+ gmpy2==2.1.0
41
+ matplotlib==3.8.2
42
+ scipy==1.11.4
43
+ typing_extensions==4.12.2
44
+ statsmodels==0.14.0
45
+ huggingface-hub==0.25.0
46
+ frozenlist==1.4.1
47
+ gpustat==1.1.1
48
+ nvidia-nvtx-cu11==11.7.91
49
+ safetensors==0.4.5
50
+ stanza==1.9.2
51
+ decorator==5.1.1
52
+ seaborn==0.13.0
53
+ sentencepiece==0.2.0
54
+ PyYAML==6.0.1
55
+ black==24.8.0
56
+ protobuf==4.25.1
57
+ pickleshare==0.7.5
58
+ peft==0.13.0
59
+ triton==2.0.0
60
+ nvidia-cuda-runtime-cu11==11.7.99
61
+ Jinja2==3.1.2
62
+ nvidia-cusolver-cu11==11.4.0.1
63
+ executing==1.2.0
64
+ jupyter_client==8.1.0
65
+ pluggy==1.3.0
66
+ cmake==3.30.3
67
+ pytz==2023.3.post1
68
+ aiohappyeyeballs==2.4.2
69
+ kiwisolver==1.4.5
70
+ py-cpuinfo==9.0.0
71
+ Pillow==10.1.0
72
+ ptyprocess==0.7.0
73
+ importlib_resources==6.4.5
74
+ GitPython==3.1.43
75
+ importlib-metadata==6.0.0
76
+ iniconfig==2.0.0
77
+ scikit-learn==1.3.2
78
+ exceptiongroup==1.1.0
79
+ networkx==2.8.6
80
+ accelerate==1.0.0
81
+ nltk==3.8.1
82
+ shutilwhich==1.1.0
83
+ fonttools==4.45.1
84
+ future==0.18.3
85
+ aiohttp==3.10.6
86
+ wcwidth==0.2.5
87
+ idna==3.6
88
+ filelock==3.12.2
89
+ pathspec==0.12.1
90
+ jupyter_core==5.1.0
91
+ lit==18.1.8
92
+ nvidia-curand-cu11==10.2.10.91
93
+ nvidia-cublas-cu11==11.10.3.66
94
+ nvidia-ml-py==12.560.30
95
+ msgpack==1.1.0
96
+ python-dateutil==2.8.2
97
+ blessed==1.20.0
98
+ packaging==23.0
99
+ gitdb==4.0.11
100
+ yarl==1.13.0
101
+ emoji==2.8.0
102
+ tzdata==2023.3
103
+ cycler==0.12.1
104
+ tornado==6.2
105
+ backcall==0.2.0
106
+ plotnine==0.12.4
107
+ ninja==1.11.1.1
108
+ latex==0.7.0
109
+ wandb==0.18.5
110
+ setproctitle==1.3.3
111
+ threadpoolctl==3.2.0
112
+ requests==2.32.3
113
+ pyparsing==3.1.1
114
+ smmap==5.0.1
115
+ pyzmq==23.0.0
116
+ async-timeout==4.0.3
117
+ annotated-types==0.7.0
118
+ matplotlib-inline==0.1.6
119
+ latexcodec==1.0.0
120
+ ipython==8.0.0
121
+ patsy==0.5.3
122
+ contourpy==1.2.0
123
+ multidict==6.1.0
124
+ mizani==0.9.3
125
+ urllib3==2.1.0
126
+ tokenizers==0.20.0
127
+ MarkupSafe==2.1.2
128
+ pip==24.2
129
+ pexpect==4.8.0
130
+ tqdm==4.66.5
131
+ jedi==0.18.2
132
+ pydantic_core==2.23.4
133
+ tempdir==0.7.1
134
+ mpmath==1.2.1
135
+ setuptools==72.1.0
136
+ pytest==7.4.3
137
+ pure-eval==0.2.2
138
+ psutil==5.9.1
139
+ comm==0.1.2
140
+ nvidia-cuda-cupti-cu11==11.7.101
141
+ nvidia-cuda-nvrtc-cu11==11.7.99
142
+ regex==2023.10.3
143
+ platformdirs==2.5.2
144
+ asttokens==2.2.1
145
+ torch==2.0.0
146
+ nvidia-nccl-cu11==2.14.3
147
+ xxhash==3.5.0
wandb/run-20241101_201926-5y6ulxig/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
3
+ "python": "3.9.19",
4
+ "startedAt": "2024-11-02T00:19:26.870793Z",
5
+ "args": [
6
+ "--perturbation",
7
+ "shuffle_nondeterministic",
8
+ "--train_set",
9
+ "10M",
10
+ "--batch_size",
11
+ "3",
12
+ "--epoch",
13
+ "3",
14
+ "--seed",
15
+ "0"
16
+ ],
17
+ "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
18
+ "codePath": "train/train_deep_wandb.py",
19
+ "git": {
20
+ "remote": "git@hf.co:Yaning1001/Impossible_llm.git",
21
+ "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
22
+ },
23
+ "email": "yaning1001@gmail.com",
24
+ "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
25
+ "host": "mms-large-2",
26
+ "username": "chunhui",
27
+ "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
28
+ "codePathLocal": "train_deep_wandb.py",
29
+ "cpu_count": 32,
30
+ "cpu_count_logical": 64,
31
+ "gpu": "NVIDIA RTX A6000",
32
+ "gpu_count": 8,
33
+ "disk": {
34
+ "/": {
35
+ "total": "1888559353856",
36
+ "used": "1754803675136"
37
+ }
38
+ },
39
+ "memory": {
40
+ "total": "202617098240"
41
+ },
42
+ "cpu": {
43
+ "count": 32,
44
+ "countLogical": 64
45
+ },
46
+ "gpu_nvidia": [
47
+ {
48
+ "name": "NVIDIA RTX A6000",
49
+ "memoryTotal": "51527024640",
50
+ "cudaCores": 10752,
51
+ "architecture": "Ampere"
52
+ },
53
+ {
54
+ "name": "NVIDIA RTX A6000",
55
+ "memoryTotal": "51527024640",
56
+ "cudaCores": 10752,
57
+ "architecture": "Ampere"
58
+ },
59
+ {
60
+ "name": "NVIDIA RTX A6000",
61
+ "memoryTotal": "51527024640",
62
+ "cudaCores": 10752,
63
+ "architecture": "Ampere"
64
+ },
65
+ {
66
+ "name": "NVIDIA RTX A6000",
67
+ "memoryTotal": "51527024640",
68
+ "cudaCores": 10752,
69
+ "architecture": "Ampere"
70
+ },
71
+ {
72
+ "name": "NVIDIA RTX A6000",
73
+ "memoryTotal": "51527024640",
74
+ "cudaCores": 10752,
75
+ "architecture": "Ampere"
76
+ },
77
+ {
78
+ "name": "NVIDIA RTX A6000",
79
+ "memoryTotal": "51527024640",
80
+ "cudaCores": 10752,
81
+ "architecture": "Ampere"
82
+ },
83
+ {
84
+ "name": "NVIDIA RTX A6000",
85
+ "memoryTotal": "51527024640",
86
+ "cudaCores": 10752,
87
+ "architecture": "Ampere"
88
+ },
89
+ {
90
+ "name": "NVIDIA RTX A6000",
91
+ "memoryTotal": "51527024640",
92
+ "cudaCores": 10752,
93
+ "architecture": "Ampere"
94
+ }
95
+ ],
96
+ "cudaVersion": "11.8"
97
+ }