shibbir24 committed on
Commit
7632f05
·
1 Parent(s): b756bbd

Add # app_type: streamlit to enable Streamlit

Browse files
Files changed (41) hide show
  1. .gitattributes +0 -35
  2. README.md +0 -14
  3. dataset/amazon_product_reviews.csv +0 -0
  4. evaluate_model.py +0 -113
  5. finetune_lora.py +0 -100
  6. lora_adapter/README.md +0 -207
  7. lora_adapter/adapter_config.json +0 -38
  8. lora_adapter/adapter_model.safetensors +0 -3
  9. lora_adapter/checkpoint-2500/README.md +0 -207
  10. lora_adapter/checkpoint-2500/adapter_config.json +0 -38
  11. lora_adapter/checkpoint-2500/adapter_model.safetensors +0 -3
  12. lora_adapter/checkpoint-2500/merges.txt +0 -0
  13. lora_adapter/checkpoint-2500/optimizer.pt +0 -3
  14. lora_adapter/checkpoint-2500/rng_state.pth +0 -3
  15. lora_adapter/checkpoint-2500/scheduler.pt +0 -3
  16. lora_adapter/checkpoint-2500/special_tokens_map.json +0 -6
  17. lora_adapter/checkpoint-2500/tokenizer.json +0 -0
  18. lora_adapter/checkpoint-2500/tokenizer_config.json +0 -21
  19. lora_adapter/checkpoint-2500/trainer_state.json +0 -3534
  20. lora_adapter/checkpoint-2500/training_args.bin +0 -3
  21. lora_adapter/checkpoint-2500/vocab.json +0 -0
  22. lora_adapter/checkpoint-5000/README.md +0 -207
  23. lora_adapter/checkpoint-5000/adapter_config.json +0 -38
  24. lora_adapter/checkpoint-5000/adapter_model.safetensors +0 -3
  25. lora_adapter/checkpoint-5000/merges.txt +0 -0
  26. lora_adapter/checkpoint-5000/optimizer.pt +0 -3
  27. lora_adapter/checkpoint-5000/rng_state.pth +0 -3
  28. lora_adapter/checkpoint-5000/scheduler.pt +0 -3
  29. lora_adapter/checkpoint-5000/special_tokens_map.json +0 -6
  30. lora_adapter/checkpoint-5000/tokenizer.json +0 -0
  31. lora_adapter/checkpoint-5000/tokenizer_config.json +0 -21
  32. lora_adapter/checkpoint-5000/trainer_state.json +0 -0
  33. lora_adapter/checkpoint-5000/training_args.bin +0 -3
  34. lora_adapter/checkpoint-5000/vocab.json +0 -0
  35. lora_adapter/merges.txt +0 -0
  36. lora_adapter/special_tokens_map.json +0 -6
  37. lora_adapter/tokenizer.json +0 -0
  38. lora_adapter/tokenizer_config.json +0 -21
  39. lora_adapter/vocab.json +0 -0
  40. requirements.txt +0 -0
  41. start.sh +0 -3
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md DELETED
@@ -1,14 +0,0 @@
1
- ---
2
- title: SmartReviewAI
3
- emoji: 🏃
4
- colorFrom: pink
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 5.49.1
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- short_description: AI-powered product review generation using fine-tuned LLMs
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset/amazon_product_reviews.csv DELETED
The diff for this file is too large to render. See raw diff
 
evaluate_model.py DELETED
@@ -1,113 +0,0 @@
1
- from transformers import AutoTokenizer, AutoModelForCausalLM
2
- from peft import PeftModel
3
- import torch
4
- import numpy as np
5
- import pandas as pd
6
- import re
7
- from collections import Counter
8
-
9
- # ------------------ Review Generation ------------------
10
- def generate_review(base_model, product, category, features, rating, tone, review_cache=None):
11
- """
12
- Generate a product review using LoRA fine-tuned model and apply repetition control.
13
- Optionally evaluates performance every 10 reviews.
14
- """
15
- adapter_path = "./lora_adapter"
16
- tokenizer = AutoTokenizer.from_pretrained(base_model)
17
- model = AutoModelForCausalLM.from_pretrained(base_model)
18
- model = PeftModel.from_pretrained(model, adapter_path)
19
- model.eval()
20
-
21
- prompt = (
22
- f"Product: {product}\n"
23
- f"Category: {category}\n"
24
- f"Features: {features}\n"
25
- f"Rating: {rating}\n"
26
- f"Tone: {tone}\n\nReview:"
27
- )
28
-
29
- inputs = tokenizer(prompt, return_tensors="pt")
30
-
31
- with torch.no_grad():
32
- outputs = model.generate(
33
- **inputs,
34
- max_new_tokens=180,
35
- temperature=0.8,
36
- top_p=0.9,
37
- repetition_penalty=1.8,
38
- no_repeat_ngram_size=3,
39
- do_sample=True
40
- )
41
-
42
- generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
43
-
44
- # -------- Optional: Evaluation Trigger --------
45
- if review_cache is not None:
46
- review_cache.append(generated_text)
47
- if len(review_cache) % 10 == 0:
48
- metrics = compute_metrics(review_cache, requested_tone=tone)
49
- diversity = distinct_n_score(review_cache)
50
- metrics["distinct_n"] = diversity
51
- print(f"\n📊 Auto Evaluation after {len(review_cache)} reviews:")
52
- print(metrics)
53
-
54
- return generated_text
55
-
56
-
57
- # ------------------ Evaluation Metrics ------------------
58
- def compute_metrics(reviews, requested_tone="neutral"):
59
- """
60
- Compute simple text-level metrics:
61
- - avg_length: average word count
62
- - tone_match_ratio: how often requested tone appears
63
- """
64
- avg_length = np.mean([len(r.split()) for r in reviews]) if reviews else 0
65
- tone_match = sum(1 for r in reviews if re.search(requested_tone, r, re.IGNORECASE))
66
- tone_match_ratio = tone_match / len(reviews) if reviews else 0.0
67
- return {
68
- "avg_length": round(avg_length, 2),
69
- "tone_match_ratio": round(tone_match_ratio, 3)
70
- }
71
-
72
-
73
- # ------------------ Diversity Metric ------------------
74
- def distinct_n_score(texts, n=2):
75
- """
76
- Compute Distinct-N score (uniqueness measure).
77
- High values mean less repetition.
78
- """
79
- all_ngrams = []
80
- for text in texts:
81
- tokens = text.split()
82
- all_ngrams.extend(tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1))
83
- if not all_ngrams:
84
- return 0.0
85
- unique_ngrams = len(set(all_ngrams))
86
- return round(unique_ngrams / len(all_ngrams), 3)
87
-
88
-
89
- # ------------------ Perplexity Evaluation ------------------
90
- def evaluate_perplexity(base_model, test_csv="dataset/amazon_product_reviews.csv"):
91
- """
92
- Compute perplexity on a small subset of test data.
93
- Lower perplexity = better model.
94
- """
95
- tokenizer = AutoTokenizer.from_pretrained(base_model)
96
- model = AutoModelForCausalLM.from_pretrained(base_model)
97
- model = PeftModel.from_pretrained(model, "./lora_adapter")
98
- model.eval()
99
-
100
- df = pd.read_csv(test_csv)
101
- texts = df["Review"].dropna().sample(min(50, len(df))).tolist()
102
-
103
- total_loss, total_tokens = 0, 0
104
- for text in texts:
105
- inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
106
- with torch.no_grad():
107
- outputs = model(**inputs, labels=inputs["input_ids"])
108
- loss = outputs.loss.item()
109
- total_loss += loss * inputs["input_ids"].size(1)
110
- total_tokens += inputs["input_ids"].size(1)
111
-
112
- ppl = np.exp(total_loss / total_tokens) if total_tokens > 0 else float("inf")
113
- return round(ppl, 2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
finetune_lora.py DELETED
@@ -1,100 +0,0 @@
1
- import os
2
- import torch
3
- from datasets import load_dataset
4
- from transformers import (
5
- AutoModelForCausalLM,
6
- AutoTokenizer,
7
- Trainer,
8
- TrainingArguments,
9
- DataCollatorForLanguageModeling,
10
- )
11
- from peft import LoraConfig, get_peft_model
12
- import streamlit as st
13
-
14
- def train_lora(base_model: str, epochs: int = 2, lr: float = 1e-4, train_csv: str = "dataset/amazon_product_reviews.csv"):
15
- """
16
- Fine-tune a base model using LoRA on the provided dataset and visualize progress in Streamlit.
17
- """
18
- st.write(f"### 🔧 Loading base model `{base_model}`...")
19
- tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
20
- if tokenizer.pad_token is None:
21
- tokenizer.pad_token = tokenizer.eos_token
22
- tokenizer.padding_side = "right"
23
-
24
- # Load dataset
25
- st.info("📂 Loading dataset for fine-tuning...")
26
- ds = load_dataset("csv", data_files={"train": train_csv})["train"]
27
-
28
- def preprocess(example):
29
- prompt = (
30
- f"Product: {example.get('Product','')}\n"
31
- f"Category: {example.get('Category','')}\n"
32
- f"Features: {example.get('Features','')}\n"
33
- f"Rating: {example.get('Rating','')}\n"
34
- f"Tone: {example.get('Tone','')}\n\n"
35
- f"Review: {example.get('Review','')}"
36
- )
37
- return tokenizer(prompt, truncation=True, padding="max_length", max_length=256)
38
-
39
- tokenized_ds = ds.map(preprocess, batched=False)
40
-
41
- # LoRA config
42
- lora_config = LoraConfig(
43
- r=8,
44
- lora_alpha=16,
45
- target_modules=["c_attn", "q_proj", "v_proj"],
46
- lora_dropout=0.05,
47
- bias="none",
48
- task_type="CAUSAL_LM"
49
- )
50
-
51
- # Apply LoRA to base model
52
- model = AutoModelForCausalLM.from_pretrained(base_model)
53
- model = get_peft_model(model, lora_config)
54
- data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
55
-
56
- output_dir = "./lora_adapter"
57
- os.makedirs(output_dir, exist_ok=True)
58
-
59
- # Streamlit progress UI
60
- progress_bar = st.progress(0)
61
- status_text = st.empty()
62
- loss_chart = st.empty()
63
- loss_list = []
64
-
65
- from transformers import TrainerCallback
66
- class StreamlitCallback(TrainerCallback):
67
- def on_log(self, args, state, control, logs=None, **kwargs):
68
- if logs and "loss" in logs:
69
- loss = logs["loss"]
70
- loss_list.append(loss)
71
- progress = int((state.epoch / epochs) * 100)
72
- progress_bar.progress(progress)
73
- status_text.text(f"Epoch {state.epoch:.1f}/{epochs} | Step {state.global_step} | Loss: {loss:.4f}")
74
- loss_chart.line_chart(loss_list)
75
-
76
- training_args = TrainingArguments(
77
- output_dir=output_dir,
78
- per_device_train_batch_size=2,
79
- num_train_epochs=epochs,
80
- learning_rate=lr,
81
- logging_steps=5,
82
- save_strategy="epoch",
83
- report_to="none"
84
- )
85
-
86
- trainer = Trainer(
87
- model=model,
88
- args=training_args,
89
- train_dataset=tokenized_ds,
90
- data_collator=data_collator,
91
- tokenizer=tokenizer,
92
- callbacks=[StreamlitCallback()]
93
- )
94
-
95
- trainer.train()
96
- model.save_pretrained(output_dir)
97
- tokenizer.save_pretrained(output_dir)
98
-
99
- st.success("🎉 LoRA adapter trained and saved successfully!")
100
- return {"train_loss": loss_list, "epochs": epochs, "base_model": base_model}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lora_adapter/README.md DELETED
@@ -1,207 +0,0 @@
1
- ---
2
- base_model: gpt2
3
- library_name: peft
4
- pipeline_tag: text-generation
5
- tags:
6
- - base_model:adapter:gpt2
7
- - lora
8
- - transformers
9
- ---
10
-
11
- # Model Card for Model ID
12
-
13
- <!-- Provide a quick summary of what the model is/does. -->
14
-
15
-
16
-
17
- ## Model Details
18
-
19
- ### Model Description
20
-
21
- <!-- Provide a longer summary of what this model is. -->
22
-
23
-
24
-
25
- - **Developed by:** [More Information Needed]
26
- - **Funded by [optional]:** [More Information Needed]
27
- - **Shared by [optional]:** [More Information Needed]
28
- - **Model type:** [More Information Needed]
29
- - **Language(s) (NLP):** [More Information Needed]
30
- - **License:** [More Information Needed]
31
- - **Finetuned from model [optional]:** [More Information Needed]
32
-
33
- ### Model Sources [optional]
34
-
35
- <!-- Provide the basic links for the model. -->
36
-
37
- - **Repository:** [More Information Needed]
38
- - **Paper [optional]:** [More Information Needed]
39
- - **Demo [optional]:** [More Information Needed]
40
-
41
- ## Uses
42
-
43
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
-
45
- ### Direct Use
46
-
47
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
-
49
- [More Information Needed]
50
-
51
- ### Downstream Use [optional]
52
-
53
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
-
55
- [More Information Needed]
56
-
57
- ### Out-of-Scope Use
58
-
59
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
-
61
- [More Information Needed]
62
-
63
- ## Bias, Risks, and Limitations
64
-
65
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
-
67
- [More Information Needed]
68
-
69
- ### Recommendations
70
-
71
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
-
73
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
-
75
- ## How to Get Started with the Model
76
-
77
- Use the code below to get started with the model.
78
-
79
- [More Information Needed]
80
-
81
- ## Training Details
82
-
83
- ### Training Data
84
-
85
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
-
87
- [More Information Needed]
88
-
89
- ### Training Procedure
90
-
91
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
-
93
- #### Preprocessing [optional]
94
-
95
- [More Information Needed]
96
-
97
-
98
- #### Training Hyperparameters
99
-
100
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
-
102
- #### Speeds, Sizes, Times [optional]
103
-
104
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
-
106
- [More Information Needed]
107
-
108
- ## Evaluation
109
-
110
- <!-- This section describes the evaluation protocols and provides the results. -->
111
-
112
- ### Testing Data, Factors & Metrics
113
-
114
- #### Testing Data
115
-
116
- <!-- This should link to a Dataset Card if possible. -->
117
-
118
- [More Information Needed]
119
-
120
- #### Factors
121
-
122
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
-
124
- [More Information Needed]
125
-
126
- #### Metrics
127
-
128
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
-
130
- [More Information Needed]
131
-
132
- ### Results
133
-
134
- [More Information Needed]
135
-
136
- #### Summary
137
-
138
-
139
-
140
- ## Model Examination [optional]
141
-
142
- <!-- Relevant interpretability work for the model goes here -->
143
-
144
- [More Information Needed]
145
-
146
- ## Environmental Impact
147
-
148
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
-
150
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
-
152
- - **Hardware Type:** [More Information Needed]
153
- - **Hours used:** [More Information Needed]
154
- - **Cloud Provider:** [More Information Needed]
155
- - **Compute Region:** [More Information Needed]
156
- - **Carbon Emitted:** [More Information Needed]
157
-
158
- ## Technical Specifications [optional]
159
-
160
- ### Model Architecture and Objective
161
-
162
- [More Information Needed]
163
-
164
- ### Compute Infrastructure
165
-
166
- [More Information Needed]
167
-
168
- #### Hardware
169
-
170
- [More Information Needed]
171
-
172
- #### Software
173
-
174
- [More Information Needed]
175
-
176
- ## Citation [optional]
177
-
178
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
-
180
- **BibTeX:**
181
-
182
- [More Information Needed]
183
-
184
- **APA:**
185
-
186
- [More Information Needed]
187
-
188
- ## Glossary [optional]
189
-
190
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
-
192
- [More Information Needed]
193
-
194
- ## More Information [optional]
195
-
196
- [More Information Needed]
197
-
198
- ## Model Card Authors [optional]
199
-
200
- [More Information Needed]
201
-
202
- ## Model Card Contact
203
-
204
- [More Information Needed]
205
- ### Framework versions
206
-
207
- - PEFT 0.17.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lora_adapter/adapter_config.json DELETED
@@ -1,38 +0,0 @@
1
- {
2
- "alpha_pattern": {},
3
- "auto_mapping": null,
4
- "base_model_name_or_path": "gpt2",
5
- "bias": "none",
6
- "corda_config": null,
7
- "eva_config": null,
8
- "exclude_modules": null,
9
- "fan_in_fan_out": true,
10
- "inference_mode": true,
11
- "init_lora_weights": true,
12
- "layer_replication": null,
13
- "layers_pattern": null,
14
- "layers_to_transform": null,
15
- "loftq_config": {},
16
- "lora_alpha": 16,
17
- "lora_bias": false,
18
- "lora_dropout": 0.05,
19
- "megatron_config": null,
20
- "megatron_core": "megatron.core",
21
- "modules_to_save": null,
22
- "peft_type": "LORA",
23
- "qalora_group_size": 16,
24
- "r": 8,
25
- "rank_pattern": {},
26
- "revision": null,
27
- "target_modules": [
28
- "q_proj",
29
- "c_attn",
30
- "v_proj"
31
- ],
32
- "target_parameters": null,
33
- "task_type": "CAUSAL_LM",
34
- "trainable_token_indices": null,
35
- "use_dora": false,
36
- "use_qalora": false,
37
- "use_rslora": false
38
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lora_adapter/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:56ae2a27b7624b3b0f0db362e7f072e5939af1914786000af021a132df291b1d
3
- size 1182680
 
 
 
 
lora_adapter/checkpoint-2500/README.md DELETED
@@ -1,207 +0,0 @@
1
- ---
2
- base_model: gpt2
3
- library_name: peft
4
- pipeline_tag: text-generation
5
- tags:
6
- - base_model:adapter:gpt2
7
- - lora
8
- - transformers
9
- ---
10
-
11
- # Model Card for Model ID
12
-
13
- <!-- Provide a quick summary of what the model is/does. -->
14
-
15
-
16
-
17
- ## Model Details
18
-
19
- ### Model Description
20
-
21
- <!-- Provide a longer summary of what this model is. -->
22
-
23
-
24
-
25
- - **Developed by:** [More Information Needed]
26
- - **Funded by [optional]:** [More Information Needed]
27
- - **Shared by [optional]:** [More Information Needed]
28
- - **Model type:** [More Information Needed]
29
- - **Language(s) (NLP):** [More Information Needed]
30
- - **License:** [More Information Needed]
31
- - **Finetuned from model [optional]:** [More Information Needed]
32
-
33
- ### Model Sources [optional]
34
-
35
- <!-- Provide the basic links for the model. -->
36
-
37
- - **Repository:** [More Information Needed]
38
- - **Paper [optional]:** [More Information Needed]
39
- - **Demo [optional]:** [More Information Needed]
40
-
41
- ## Uses
42
-
43
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
-
45
- ### Direct Use
46
-
47
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
-
49
- [More Information Needed]
50
-
51
- ### Downstream Use [optional]
52
-
53
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
-
55
- [More Information Needed]
56
-
57
- ### Out-of-Scope Use
58
-
59
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
-
61
- [More Information Needed]
62
-
63
- ## Bias, Risks, and Limitations
64
-
65
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
-
67
- [More Information Needed]
68
-
69
- ### Recommendations
70
-
71
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
-
73
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
-
75
- ## How to Get Started with the Model
76
-
77
- Use the code below to get started with the model.
78
-
79
- [More Information Needed]
80
-
81
- ## Training Details
82
-
83
- ### Training Data
84
-
85
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
-
87
- [More Information Needed]
88
-
89
- ### Training Procedure
90
-
91
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
-
93
- #### Preprocessing [optional]
94
-
95
- [More Information Needed]
96
-
97
-
98
- #### Training Hyperparameters
99
-
100
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
-
102
- #### Speeds, Sizes, Times [optional]
103
-
104
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
-
106
- [More Information Needed]
107
-
108
- ## Evaluation
109
-
110
- <!-- This section describes the evaluation protocols and provides the results. -->
111
-
112
- ### Testing Data, Factors & Metrics
113
-
114
- #### Testing Data
115
-
116
- <!-- This should link to a Dataset Card if possible. -->
117
-
118
- [More Information Needed]
119
-
120
- #### Factors
121
-
122
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
-
124
- [More Information Needed]
125
-
126
- #### Metrics
127
-
128
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
-
130
- [More Information Needed]
131
-
132
- ### Results
133
-
134
- [More Information Needed]
135
-
136
- #### Summary
137
-
138
-
139
-
140
- ## Model Examination [optional]
141
-
142
- <!-- Relevant interpretability work for the model goes here -->
143
-
144
- [More Information Needed]
145
-
146
- ## Environmental Impact
147
-
148
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
-
150
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
-
152
- - **Hardware Type:** [More Information Needed]
153
- - **Hours used:** [More Information Needed]
154
- - **Cloud Provider:** [More Information Needed]
155
- - **Compute Region:** [More Information Needed]
156
- - **Carbon Emitted:** [More Information Needed]
157
-
158
- ## Technical Specifications [optional]
159
-
160
- ### Model Architecture and Objective
161
-
162
- [More Information Needed]
163
-
164
- ### Compute Infrastructure
165
-
166
- [More Information Needed]
167
-
168
- #### Hardware
169
-
170
- [More Information Needed]
171
-
172
- #### Software
173
-
174
- [More Information Needed]
175
-
176
- ## Citation [optional]
177
-
178
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
-
180
- **BibTeX:**
181
-
182
- [More Information Needed]
183
-
184
- **APA:**
185
-
186
- [More Information Needed]
187
-
188
- ## Glossary [optional]
189
-
190
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
-
192
- [More Information Needed]
193
-
194
- ## More Information [optional]
195
-
196
- [More Information Needed]
197
-
198
- ## Model Card Authors [optional]
199
-
200
- [More Information Needed]
201
-
202
- ## Model Card Contact
203
-
204
- [More Information Needed]
205
- ### Framework versions
206
-
207
- - PEFT 0.17.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lora_adapter/checkpoint-2500/adapter_config.json DELETED
@@ -1,38 +0,0 @@
1
- {
2
- "alpha_pattern": {},
3
- "auto_mapping": null,
4
- "base_model_name_or_path": "gpt2",
5
- "bias": "none",
6
- "corda_config": null,
7
- "eva_config": null,
8
- "exclude_modules": null,
9
- "fan_in_fan_out": true,
10
- "inference_mode": true,
11
- "init_lora_weights": true,
12
- "layer_replication": null,
13
- "layers_pattern": null,
14
- "layers_to_transform": null,
15
- "loftq_config": {},
16
- "lora_alpha": 16,
17
- "lora_bias": false,
18
- "lora_dropout": 0.05,
19
- "megatron_config": null,
20
- "megatron_core": "megatron.core",
21
- "modules_to_save": null,
22
- "peft_type": "LORA",
23
- "qalora_group_size": 16,
24
- "r": 8,
25
- "rank_pattern": {},
26
- "revision": null,
27
- "target_modules": [
28
- "q_proj",
29
- "c_attn",
30
- "v_proj"
31
- ],
32
- "target_parameters": null,
33
- "task_type": "CAUSAL_LM",
34
- "trainable_token_indices": null,
35
- "use_dora": false,
36
- "use_qalora": false,
37
- "use_rslora": false
38
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lora_adapter/checkpoint-2500/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6d0d62a7de1f452e4ad3d74a902aff00e6cd4599100cadacdcbfced18f4c7061
3
- size 1182680
 
 
 
 
lora_adapter/checkpoint-2500/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
lora_adapter/checkpoint-2500/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:eac60d7e5da930414ed371f4f0e3ce3f14dc24a0680b36e7be1cd17f3f2a2a74
3
- size 2379751
 
 
 
 
lora_adapter/checkpoint-2500/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:590f57b9ce13d5a2899ae3c2e3f58480cd67308a0e9800d0e7183808d09f6442
3
- size 14391
 
 
 
 
lora_adapter/checkpoint-2500/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:29d95e81169b69828873fd84c06db4aed77d764c52f1965537a833a8d1bde196
3
- size 1465
 
 
 
 
lora_adapter/checkpoint-2500/special_tokens_map.json DELETED
@@ -1,6 +0,0 @@
1
- {
2
- "bos_token": "<|endoftext|>",
3
- "eos_token": "<|endoftext|>",
4
- "pad_token": "<|endoftext|>",
5
- "unk_token": "<|endoftext|>"
6
- }
 
 
 
 
 
 
 
lora_adapter/checkpoint-2500/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
lora_adapter/checkpoint-2500/tokenizer_config.json DELETED
@@ -1,21 +0,0 @@
1
- {
2
- "add_prefix_space": false,
3
- "added_tokens_decoder": {
4
- "50256": {
5
- "content": "<|endoftext|>",
6
- "lstrip": false,
7
- "normalized": true,
8
- "rstrip": false,
9
- "single_word": false,
10
- "special": true
11
- }
12
- },
13
- "bos_token": "<|endoftext|>",
14
- "clean_up_tokenization_spaces": false,
15
- "eos_token": "<|endoftext|>",
16
- "extra_special_tokens": {},
17
- "model_max_length": 1024,
18
- "pad_token": "<|endoftext|>",
19
- "tokenizer_class": "GPT2Tokenizer",
20
- "unk_token": "<|endoftext|>"
21
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lora_adapter/checkpoint-2500/trainer_state.json DELETED
@@ -1,3534 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 1.0,
6
- "eval_steps": 500,
7
- "global_step": 2500,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.002,
14
- "grad_norm": 0.5721193552017212,
15
- "learning_rate": 9.992e-05,
16
- "loss": 4.2877,
17
- "step": 5
18
- },
19
- {
20
- "epoch": 0.004,
21
- "grad_norm": 0.6026411056518555,
22
- "learning_rate": 9.982e-05,
23
- "loss": 4.6802,
24
- "step": 10
25
- },
26
- {
27
- "epoch": 0.006,
28
- "grad_norm": 0.9385420680046082,
29
- "learning_rate": 9.972e-05,
30
- "loss": 4.6201,
31
- "step": 15
32
- },
33
- {
34
- "epoch": 0.008,
35
- "grad_norm": 0.8009935021400452,
36
- "learning_rate": 9.962e-05,
37
- "loss": 4.7671,
38
- "step": 20
39
- },
40
- {
41
- "epoch": 0.01,
42
- "grad_norm": 0.9409578442573547,
43
- "learning_rate": 9.952e-05,
44
- "loss": 4.2347,
45
- "step": 25
46
- },
47
- {
48
- "epoch": 0.012,
49
- "grad_norm": 1.1376001834869385,
50
- "learning_rate": 9.942000000000001e-05,
51
- "loss": 4.4625,
52
- "step": 30
53
- },
54
- {
55
- "epoch": 0.014,
56
- "grad_norm": 0.9677644371986389,
57
- "learning_rate": 9.932e-05,
58
- "loss": 4.5317,
59
- "step": 35
60
- },
61
- {
62
- "epoch": 0.016,
63
- "grad_norm": 0.878607988357544,
64
- "learning_rate": 9.922e-05,
65
- "loss": 4.1702,
66
- "step": 40
67
- },
68
- {
69
- "epoch": 0.018,
70
- "grad_norm": 1.034571886062622,
71
- "learning_rate": 9.912e-05,
72
- "loss": 4.215,
73
- "step": 45
74
- },
75
- {
76
- "epoch": 0.02,
77
- "grad_norm": 1.0319870710372925,
78
- "learning_rate": 9.902e-05,
79
- "loss": 3.9984,
80
- "step": 50
81
- },
82
- {
83
- "epoch": 0.022,
84
- "grad_norm": 0.7936278581619263,
85
- "learning_rate": 9.892e-05,
86
- "loss": 4.1078,
87
- "step": 55
88
- },
89
- {
90
- "epoch": 0.024,
91
- "grad_norm": 1.5388593673706055,
92
- "learning_rate": 9.882e-05,
93
- "loss": 4.1454,
94
- "step": 60
95
- },
96
- {
97
- "epoch": 0.026,
98
- "grad_norm": 1.1013274192810059,
99
- "learning_rate": 9.872e-05,
100
- "loss": 4.1011,
101
- "step": 65
102
- },
103
- {
104
- "epoch": 0.028,
105
- "grad_norm": 1.3863942623138428,
106
- "learning_rate": 9.862e-05,
107
- "loss": 3.8758,
108
- "step": 70
109
- },
110
- {
111
- "epoch": 0.03,
112
- "grad_norm": 1.2699391841888428,
113
- "learning_rate": 9.852e-05,
114
- "loss": 3.8447,
115
- "step": 75
116
- },
117
- {
118
- "epoch": 0.032,
119
- "grad_norm": 0.79298996925354,
120
- "learning_rate": 9.842e-05,
121
- "loss": 3.6708,
122
- "step": 80
123
- },
124
- {
125
- "epoch": 0.034,
126
- "grad_norm": 1.3336719274520874,
127
- "learning_rate": 9.832000000000001e-05,
128
- "loss": 3.8648,
129
- "step": 85
130
- },
131
- {
132
- "epoch": 0.036,
133
- "grad_norm": 1.0719950199127197,
134
- "learning_rate": 9.822e-05,
135
- "loss": 3.7916,
136
- "step": 90
137
- },
138
- {
139
- "epoch": 0.038,
140
- "grad_norm": 1.332682490348816,
141
- "learning_rate": 9.812e-05,
142
- "loss": 3.6925,
143
- "step": 95
144
- },
145
- {
146
- "epoch": 0.04,
147
- "grad_norm": 1.3171230554580688,
148
- "learning_rate": 9.802e-05,
149
- "loss": 3.6201,
150
- "step": 100
151
- },
152
- {
153
- "epoch": 0.042,
154
- "grad_norm": 1.0597072839736938,
155
- "learning_rate": 9.792e-05,
156
- "loss": 3.484,
157
- "step": 105
158
- },
159
- {
160
- "epoch": 0.044,
161
- "grad_norm": 1.6820316314697266,
162
- "learning_rate": 9.782e-05,
163
- "loss": 3.6541,
164
- "step": 110
165
- },
166
- {
167
- "epoch": 0.046,
168
- "grad_norm": 1.7244327068328857,
169
- "learning_rate": 9.772e-05,
170
- "loss": 3.5441,
171
- "step": 115
172
- },
173
- {
174
- "epoch": 0.048,
175
- "grad_norm": 1.0304560661315918,
176
- "learning_rate": 9.762e-05,
177
- "loss": 3.5992,
178
- "step": 120
179
- },
180
- {
181
- "epoch": 0.05,
182
- "grad_norm": 1.675391435623169,
183
- "learning_rate": 9.752e-05,
184
- "loss": 3.1433,
185
- "step": 125
186
- },
187
- {
188
- "epoch": 0.052,
189
- "grad_norm": 1.9963089227676392,
190
- "learning_rate": 9.742e-05,
191
- "loss": 3.3042,
192
- "step": 130
193
- },
194
- {
195
- "epoch": 0.054,
196
- "grad_norm": 1.8973188400268555,
197
- "learning_rate": 9.732e-05,
198
- "loss": 3.3942,
199
- "step": 135
200
- },
201
- {
202
- "epoch": 0.056,
203
- "grad_norm": 1.1776793003082275,
204
- "learning_rate": 9.722e-05,
205
- "loss": 3.1565,
206
- "step": 140
207
- },
208
- {
209
- "epoch": 0.058,
210
- "grad_norm": 1.6588083505630493,
211
- "learning_rate": 9.712e-05,
212
- "loss": 3.2037,
213
- "step": 145
214
- },
215
- {
216
- "epoch": 0.06,
217
- "grad_norm": 1.866132140159607,
218
- "learning_rate": 9.702e-05,
219
- "loss": 2.921,
220
- "step": 150
221
- },
222
- {
223
- "epoch": 0.062,
224
- "grad_norm": 0.8898491263389587,
225
- "learning_rate": 9.692e-05,
226
- "loss": 3.0541,
227
- "step": 155
228
- },
229
- {
230
- "epoch": 0.064,
231
- "grad_norm": 1.8436152935028076,
232
- "learning_rate": 9.682e-05,
233
- "loss": 2.7864,
234
- "step": 160
235
- },
236
- {
237
- "epoch": 0.066,
238
- "grad_norm": 2.3928751945495605,
239
- "learning_rate": 9.672e-05,
240
- "loss": 3.0799,
241
- "step": 165
242
- },
243
- {
244
- "epoch": 0.068,
245
- "grad_norm": 1.4375264644622803,
246
- "learning_rate": 9.661999999999999e-05,
247
- "loss": 2.9754,
248
- "step": 170
249
- },
250
- {
251
- "epoch": 0.07,
252
- "grad_norm": 1.478073239326477,
253
- "learning_rate": 9.652e-05,
254
- "loss": 2.8644,
255
- "step": 175
256
- },
257
- {
258
- "epoch": 0.072,
259
- "grad_norm": 1.5689969062805176,
260
- "learning_rate": 9.642e-05,
261
- "loss": 2.9735,
262
- "step": 180
263
- },
264
- {
265
- "epoch": 0.074,
266
- "grad_norm": 1.9494465589523315,
267
- "learning_rate": 9.632e-05,
268
- "loss": 2.6551,
269
- "step": 185
270
- },
271
- {
272
- "epoch": 0.076,
273
- "grad_norm": 2.043407678604126,
274
- "learning_rate": 9.622000000000001e-05,
275
- "loss": 2.6535,
276
- "step": 190
277
- },
278
- {
279
- "epoch": 0.078,
280
- "grad_norm": 1.8407542705535889,
281
- "learning_rate": 9.612000000000001e-05,
282
- "loss": 2.7985,
283
- "step": 195
284
- },
285
- {
286
- "epoch": 0.08,
287
- "grad_norm": 1.5500164031982422,
288
- "learning_rate": 9.602e-05,
289
- "loss": 2.9799,
290
- "step": 200
291
- },
292
- {
293
- "epoch": 0.082,
294
- "grad_norm": 1.3006932735443115,
295
- "learning_rate": 9.592e-05,
296
- "loss": 2.9563,
297
- "step": 205
298
- },
299
- {
300
- "epoch": 0.084,
301
- "grad_norm": 1.2256354093551636,
302
- "learning_rate": 9.582000000000001e-05,
303
- "loss": 2.9478,
304
- "step": 210
305
- },
306
- {
307
- "epoch": 0.086,
308
- "grad_norm": 2.3953299522399902,
309
- "learning_rate": 9.572000000000001e-05,
310
- "loss": 2.8945,
311
- "step": 215
312
- },
313
- {
314
- "epoch": 0.088,
315
- "grad_norm": 2.034975051879883,
316
- "learning_rate": 9.562000000000001e-05,
317
- "loss": 2.839,
318
- "step": 220
319
- },
320
- {
321
- "epoch": 0.09,
322
- "grad_norm": 2.116765260696411,
323
- "learning_rate": 9.552000000000001e-05,
324
- "loss": 2.626,
325
- "step": 225
326
- },
327
- {
328
- "epoch": 0.092,
329
- "grad_norm": 1.7377326488494873,
330
- "learning_rate": 9.542e-05,
331
- "loss": 3.0082,
332
- "step": 230
333
- },
334
- {
335
- "epoch": 0.094,
336
- "grad_norm": 1.8839207887649536,
337
- "learning_rate": 9.532000000000002e-05,
338
- "loss": 2.7061,
339
- "step": 235
340
- },
341
- {
342
- "epoch": 0.096,
343
- "grad_norm": 1.8325484991073608,
344
- "learning_rate": 9.522000000000001e-05,
345
- "loss": 2.6903,
346
- "step": 240
347
- },
348
- {
349
- "epoch": 0.098,
350
- "grad_norm": 1.7984235286712646,
351
- "learning_rate": 9.512000000000001e-05,
352
- "loss": 2.7144,
353
- "step": 245
354
- },
355
- {
356
- "epoch": 0.1,
357
- "grad_norm": 2.731910228729248,
358
- "learning_rate": 9.502000000000001e-05,
359
- "loss": 2.6156,
360
- "step": 250
361
- },
362
- {
363
- "epoch": 0.102,
364
- "grad_norm": 2.2913668155670166,
365
- "learning_rate": 9.492e-05,
366
- "loss": 2.4733,
367
- "step": 255
368
- },
369
- {
370
- "epoch": 0.104,
371
- "grad_norm": 1.8068524599075317,
372
- "learning_rate": 9.482e-05,
373
- "loss": 2.7326,
374
- "step": 260
375
- },
376
- {
377
- "epoch": 0.106,
378
- "grad_norm": 2.2460227012634277,
379
- "learning_rate": 9.472000000000001e-05,
380
- "loss": 2.7199,
381
- "step": 265
382
- },
383
- {
384
- "epoch": 0.108,
385
- "grad_norm": 2.186492443084717,
386
- "learning_rate": 9.462000000000001e-05,
387
- "loss": 2.7873,
388
- "step": 270
389
- },
390
- {
391
- "epoch": 0.11,
392
- "grad_norm": 2.345064401626587,
393
- "learning_rate": 9.452000000000001e-05,
394
- "loss": 2.5964,
395
- "step": 275
396
- },
397
- {
398
- "epoch": 0.112,
399
- "grad_norm": 1.6393128633499146,
400
- "learning_rate": 9.442000000000001e-05,
401
- "loss": 2.7022,
402
- "step": 280
403
- },
404
- {
405
- "epoch": 0.114,
406
- "grad_norm": 1.9504517316818237,
407
- "learning_rate": 9.432e-05,
408
- "loss": 2.526,
409
- "step": 285
410
- },
411
- {
412
- "epoch": 0.116,
413
- "grad_norm": 3.769509792327881,
414
- "learning_rate": 9.422e-05,
415
- "loss": 2.4051,
416
- "step": 290
417
- },
418
- {
419
- "epoch": 0.118,
420
- "grad_norm": 2.109177589416504,
421
- "learning_rate": 9.412000000000001e-05,
422
- "loss": 2.3615,
423
- "step": 295
424
- },
425
- {
426
- "epoch": 0.12,
427
- "grad_norm": 6.674826145172119,
428
- "learning_rate": 9.402000000000001e-05,
429
- "loss": 2.5718,
430
- "step": 300
431
- },
432
- {
433
- "epoch": 0.122,
434
- "grad_norm": 2.5551745891571045,
435
- "learning_rate": 9.392000000000001e-05,
436
- "loss": 2.5388,
437
- "step": 305
438
- },
439
- {
440
- "epoch": 0.124,
441
- "grad_norm": 2.7368383407592773,
442
- "learning_rate": 9.382e-05,
443
- "loss": 2.1562,
444
- "step": 310
445
- },
446
- {
447
- "epoch": 0.126,
448
- "grad_norm": 2.9764292240142822,
449
- "learning_rate": 9.372e-05,
450
- "loss": 2.4115,
451
- "step": 315
452
- },
453
- {
454
- "epoch": 0.128,
455
- "grad_norm": 2.150486469268799,
456
- "learning_rate": 9.362e-05,
457
- "loss": 2.4289,
458
- "step": 320
459
- },
460
- {
461
- "epoch": 0.13,
462
- "grad_norm": 3.41752028465271,
463
- "learning_rate": 9.352000000000001e-05,
464
- "loss": 2.4018,
465
- "step": 325
466
- },
467
- {
468
- "epoch": 0.132,
469
- "grad_norm": 2.62450909614563,
470
- "learning_rate": 9.342000000000001e-05,
471
- "loss": 2.404,
472
- "step": 330
473
- },
474
- {
475
- "epoch": 0.134,
476
- "grad_norm": 2.1548142433166504,
477
- "learning_rate": 9.332000000000001e-05,
478
- "loss": 2.766,
479
- "step": 335
480
- },
481
- {
482
- "epoch": 0.136,
483
- "grad_norm": 2.3468611240386963,
484
- "learning_rate": 9.322e-05,
485
- "loss": 2.4288,
486
- "step": 340
487
- },
488
- {
489
- "epoch": 0.138,
490
- "grad_norm": 1.9857568740844727,
491
- "learning_rate": 9.312e-05,
492
- "loss": 2.0464,
493
- "step": 345
494
- },
495
- {
496
- "epoch": 0.14,
497
- "grad_norm": 1.7904646396636963,
498
- "learning_rate": 9.302e-05,
499
- "loss": 2.5532,
500
- "step": 350
501
- },
502
- {
503
- "epoch": 0.142,
504
- "grad_norm": 1.6434996128082275,
505
- "learning_rate": 9.292000000000001e-05,
506
- "loss": 2.2769,
507
- "step": 355
508
- },
509
- {
510
- "epoch": 0.144,
511
- "grad_norm": 2.023183584213257,
512
- "learning_rate": 9.282000000000001e-05,
513
- "loss": 2.37,
514
- "step": 360
515
- },
516
- {
517
- "epoch": 0.146,
518
- "grad_norm": 1.925668478012085,
519
- "learning_rate": 9.272e-05,
520
- "loss": 2.7774,
521
- "step": 365
522
- },
523
- {
524
- "epoch": 0.148,
525
- "grad_norm": 3.1799802780151367,
526
- "learning_rate": 9.262e-05,
527
- "loss": 2.4829,
528
- "step": 370
529
- },
530
- {
531
- "epoch": 0.15,
532
- "grad_norm": 2.7041819095611572,
533
- "learning_rate": 9.252e-05,
534
- "loss": 2.3482,
535
- "step": 375
536
- },
537
- {
538
- "epoch": 0.152,
539
- "grad_norm": 2.807724952697754,
540
- "learning_rate": 9.242000000000001e-05,
541
- "loss": 2.0214,
542
- "step": 380
543
- },
544
- {
545
- "epoch": 0.154,
546
- "grad_norm": 2.2531774044036865,
547
- "learning_rate": 9.232000000000001e-05,
548
- "loss": 2.93,
549
- "step": 385
550
- },
551
- {
552
- "epoch": 0.156,
553
- "grad_norm": 2.0609052181243896,
554
- "learning_rate": 9.222000000000001e-05,
555
- "loss": 1.9283,
556
- "step": 390
557
- },
558
- {
559
- "epoch": 0.158,
560
- "grad_norm": 2.284008502960205,
561
- "learning_rate": 9.212e-05,
562
- "loss": 2.2357,
563
- "step": 395
564
- },
565
- {
566
- "epoch": 0.16,
567
- "grad_norm": 2.8613440990448,
568
- "learning_rate": 9.202e-05,
569
- "loss": 2.1285,
570
- "step": 400
571
- },
572
- {
573
- "epoch": 0.162,
574
- "grad_norm": 2.23891544342041,
575
- "learning_rate": 9.192e-05,
576
- "loss": 2.2739,
577
- "step": 405
578
- },
579
- {
580
- "epoch": 0.164,
581
- "grad_norm": 1.527755856513977,
582
- "learning_rate": 9.182000000000001e-05,
583
- "loss": 2.4071,
584
- "step": 410
585
- },
586
- {
587
- "epoch": 0.166,
588
- "grad_norm": 1.6973111629486084,
589
- "learning_rate": 9.172000000000001e-05,
590
- "loss": 2.4015,
591
- "step": 415
592
- },
593
- {
594
- "epoch": 0.168,
595
- "grad_norm": 3.209406614303589,
596
- "learning_rate": 9.162000000000001e-05,
597
- "loss": 2.4004,
598
- "step": 420
599
- },
600
- {
601
- "epoch": 0.17,
602
- "grad_norm": 1.8819735050201416,
603
- "learning_rate": 9.152e-05,
604
- "loss": 2.2514,
605
- "step": 425
606
- },
607
- {
608
- "epoch": 0.172,
609
- "grad_norm": 2.637023448944092,
610
- "learning_rate": 9.142e-05,
611
- "loss": 2.0511,
612
- "step": 430
613
- },
614
- {
615
- "epoch": 0.174,
616
- "grad_norm": 2.4952168464660645,
617
- "learning_rate": 9.132e-05,
618
- "loss": 2.2291,
619
- "step": 435
620
- },
621
- {
622
- "epoch": 0.176,
623
- "grad_norm": 2.280730724334717,
624
- "learning_rate": 9.122000000000001e-05,
625
- "loss": 2.4591,
626
- "step": 440
627
- },
628
- {
629
- "epoch": 0.178,
630
- "grad_norm": 1.9758051633834839,
631
- "learning_rate": 9.112000000000001e-05,
632
- "loss": 2.4378,
633
- "step": 445
634
- },
635
- {
636
- "epoch": 0.18,
637
- "grad_norm": 2.1086337566375732,
638
- "learning_rate": 9.102e-05,
639
- "loss": 2.2705,
640
- "step": 450
641
- },
642
- {
643
- "epoch": 0.182,
644
- "grad_norm": 2.398313045501709,
645
- "learning_rate": 9.092e-05,
646
- "loss": 2.2926,
647
- "step": 455
648
- },
649
- {
650
- "epoch": 0.184,
651
- "grad_norm": 3.39194393157959,
652
- "learning_rate": 9.082e-05,
653
- "loss": 2.8741,
654
- "step": 460
655
- },
656
- {
657
- "epoch": 0.186,
658
- "grad_norm": 2.1371476650238037,
659
- "learning_rate": 9.072e-05,
660
- "loss": 1.9811,
661
- "step": 465
662
- },
663
- {
664
- "epoch": 0.188,
665
- "grad_norm": 2.9003446102142334,
666
- "learning_rate": 9.062000000000001e-05,
667
- "loss": 2.4993,
668
- "step": 470
669
- },
670
- {
671
- "epoch": 0.19,
672
- "grad_norm": 2.0266385078430176,
673
- "learning_rate": 9.052000000000001e-05,
674
- "loss": 2.2897,
675
- "step": 475
676
- },
677
- {
678
- "epoch": 0.192,
679
- "grad_norm": 1.8421316146850586,
680
- "learning_rate": 9.042e-05,
681
- "loss": 2.0086,
682
- "step": 480
683
- },
684
- {
685
- "epoch": 0.194,
686
- "grad_norm": 1.958868145942688,
687
- "learning_rate": 9.032e-05,
688
- "loss": 2.3263,
689
- "step": 485
690
- },
691
- {
692
- "epoch": 0.196,
693
- "grad_norm": 2.8556814193725586,
694
- "learning_rate": 9.022e-05,
695
- "loss": 2.3719,
696
- "step": 490
697
- },
698
- {
699
- "epoch": 0.198,
700
- "grad_norm": 2.265723705291748,
701
- "learning_rate": 9.012e-05,
702
- "loss": 2.2051,
703
- "step": 495
704
- },
705
- {
706
- "epoch": 0.2,
707
- "grad_norm": 1.8368626832962036,
708
- "learning_rate": 9.002000000000001e-05,
709
- "loss": 2.3211,
710
- "step": 500
711
- },
712
- {
713
- "epoch": 0.202,
714
- "grad_norm": 3.4433846473693848,
715
- "learning_rate": 8.992e-05,
716
- "loss": 2.0655,
717
- "step": 505
718
- },
719
- {
720
- "epoch": 0.204,
721
- "grad_norm": 1.8898130655288696,
722
- "learning_rate": 8.982e-05,
723
- "loss": 1.992,
724
- "step": 510
725
- },
726
- {
727
- "epoch": 0.206,
728
- "grad_norm": 3.5473153591156006,
729
- "learning_rate": 8.972e-05,
730
- "loss": 2.1858,
731
- "step": 515
732
- },
733
- {
734
- "epoch": 0.208,
735
- "grad_norm": 2.271097183227539,
736
- "learning_rate": 8.962e-05,
737
- "loss": 1.9518,
738
- "step": 520
739
- },
740
- {
741
- "epoch": 0.21,
742
- "grad_norm": 1.821327805519104,
743
- "learning_rate": 8.952000000000001e-05,
744
- "loss": 1.9524,
745
- "step": 525
746
- },
747
- {
748
- "epoch": 0.212,
749
- "grad_norm": 3.471569776535034,
750
- "learning_rate": 8.942000000000001e-05,
751
- "loss": 1.8348,
752
- "step": 530
753
- },
754
- {
755
- "epoch": 0.214,
756
- "grad_norm": 3.1918933391571045,
757
- "learning_rate": 8.932e-05,
758
- "loss": 2.2592,
759
- "step": 535
760
- },
761
- {
762
- "epoch": 0.216,
763
- "grad_norm": 2.0800018310546875,
764
- "learning_rate": 8.922e-05,
765
- "loss": 2.3358,
766
- "step": 540
767
- },
768
- {
769
- "epoch": 0.218,
770
- "grad_norm": 1.8120659589767456,
771
- "learning_rate": 8.912e-05,
772
- "loss": 2.2089,
773
- "step": 545
774
- },
775
- {
776
- "epoch": 0.22,
777
- "grad_norm": 2.169672727584839,
778
- "learning_rate": 8.902e-05,
779
- "loss": 2.3545,
780
- "step": 550
781
- },
782
- {
783
- "epoch": 0.222,
784
- "grad_norm": 1.9190467596054077,
785
- "learning_rate": 8.892000000000001e-05,
786
- "loss": 2.2975,
787
- "step": 555
788
- },
789
- {
790
- "epoch": 0.224,
791
- "grad_norm": 2.399026870727539,
792
- "learning_rate": 8.882000000000001e-05,
793
- "loss": 2.3177,
794
- "step": 560
795
- },
796
- {
797
- "epoch": 0.226,
798
- "grad_norm": 1.993609070777893,
799
- "learning_rate": 8.872e-05,
800
- "loss": 2.412,
801
- "step": 565
802
- },
803
- {
804
- "epoch": 0.228,
805
- "grad_norm": 4.1268720626831055,
806
- "learning_rate": 8.862e-05,
807
- "loss": 2.3971,
808
- "step": 570
809
- },
810
- {
811
- "epoch": 0.23,
812
- "grad_norm": 2.6726512908935547,
813
- "learning_rate": 8.852e-05,
814
- "loss": 2.294,
815
- "step": 575
816
- },
817
- {
818
- "epoch": 0.232,
819
- "grad_norm": 2.2172746658325195,
820
- "learning_rate": 8.842e-05,
821
- "loss": 2.355,
822
- "step": 580
823
- },
824
- {
825
- "epoch": 0.234,
826
- "grad_norm": 2.61527943611145,
827
- "learning_rate": 8.832000000000001e-05,
828
- "loss": 1.83,
829
- "step": 585
830
- },
831
- {
832
- "epoch": 0.236,
833
- "grad_norm": 1.6478010416030884,
834
- "learning_rate": 8.822e-05,
835
- "loss": 2.1412,
836
- "step": 590
837
- },
838
- {
839
- "epoch": 0.238,
840
- "grad_norm": 2.563441038131714,
841
- "learning_rate": 8.812e-05,
842
- "loss": 2.2381,
843
- "step": 595
844
- },
845
- {
846
- "epoch": 0.24,
847
- "grad_norm": 3.079211473464966,
848
- "learning_rate": 8.802e-05,
849
- "loss": 2.2569,
850
- "step": 600
851
- },
852
- {
853
- "epoch": 0.242,
854
- "grad_norm": 1.9616568088531494,
855
- "learning_rate": 8.792e-05,
856
- "loss": 2.2858,
857
- "step": 605
858
- },
859
- {
860
- "epoch": 0.244,
861
- "grad_norm": 2.6890292167663574,
862
- "learning_rate": 8.782e-05,
863
- "loss": 2.0128,
864
- "step": 610
865
- },
866
- {
867
- "epoch": 0.246,
868
- "grad_norm": 1.2593388557434082,
869
- "learning_rate": 8.772000000000001e-05,
870
- "loss": 2.4054,
871
- "step": 615
872
- },
873
- {
874
- "epoch": 0.248,
875
- "grad_norm": 2.716627836227417,
876
- "learning_rate": 8.762e-05,
877
- "loss": 2.5457,
878
- "step": 620
879
- },
880
- {
881
- "epoch": 0.25,
882
- "grad_norm": 2.6016945838928223,
883
- "learning_rate": 8.752e-05,
884
- "loss": 1.6912,
885
- "step": 625
886
- },
887
- {
888
- "epoch": 0.252,
889
- "grad_norm": 2.391510248184204,
890
- "learning_rate": 8.742e-05,
891
- "loss": 2.0171,
892
- "step": 630
893
- },
894
- {
895
- "epoch": 0.254,
896
- "grad_norm": 4.822355270385742,
897
- "learning_rate": 8.732e-05,
898
- "loss": 2.1439,
899
- "step": 635
900
- },
901
- {
902
- "epoch": 0.256,
903
- "grad_norm": 3.8465750217437744,
904
- "learning_rate": 8.722e-05,
905
- "loss": 2.0739,
906
- "step": 640
907
- },
908
- {
909
- "epoch": 0.258,
910
- "grad_norm": 2.866173267364502,
911
- "learning_rate": 8.712e-05,
912
- "loss": 2.0621,
913
- "step": 645
914
- },
915
- {
916
- "epoch": 0.26,
917
- "grad_norm": 2.4506778717041016,
918
- "learning_rate": 8.702e-05,
919
- "loss": 2.0337,
920
- "step": 650
921
- },
922
- {
923
- "epoch": 0.262,
924
- "grad_norm": 2.4373891353607178,
925
- "learning_rate": 8.692e-05,
926
- "loss": 1.7654,
927
- "step": 655
928
- },
929
- {
930
- "epoch": 0.264,
931
- "grad_norm": 2.212902784347534,
932
- "learning_rate": 8.682e-05,
933
- "loss": 2.1709,
934
- "step": 660
935
- },
936
- {
937
- "epoch": 0.266,
938
- "grad_norm": 2.6106960773468018,
939
- "learning_rate": 8.672e-05,
940
- "loss": 1.9015,
941
- "step": 665
942
- },
943
- {
944
- "epoch": 0.268,
945
- "grad_norm": 4.304783344268799,
946
- "learning_rate": 8.662000000000001e-05,
947
- "loss": 2.0843,
948
- "step": 670
949
- },
950
- {
951
- "epoch": 0.27,
952
- "grad_norm": 2.9099340438842773,
953
- "learning_rate": 8.652e-05,
954
- "loss": 2.2098,
955
- "step": 675
956
- },
957
- {
958
- "epoch": 0.272,
959
- "grad_norm": 2.6931354999542236,
960
- "learning_rate": 8.642e-05,
961
- "loss": 2.1349,
962
- "step": 680
963
- },
964
- {
965
- "epoch": 0.274,
966
- "grad_norm": 3.630815029144287,
967
- "learning_rate": 8.632e-05,
968
- "loss": 1.7593,
969
- "step": 685
970
- },
971
- {
972
- "epoch": 0.276,
973
- "grad_norm": 2.0120015144348145,
974
- "learning_rate": 8.622e-05,
975
- "loss": 2.1293,
976
- "step": 690
977
- },
978
- {
979
- "epoch": 0.278,
980
- "grad_norm": 3.897691249847412,
981
- "learning_rate": 8.612e-05,
982
- "loss": 2.1552,
983
- "step": 695
984
- },
985
- {
986
- "epoch": 0.28,
987
- "grad_norm": 2.266237735748291,
988
- "learning_rate": 8.602e-05,
989
- "loss": 2.2244,
990
- "step": 700
991
- },
992
- {
993
- "epoch": 0.282,
994
- "grad_norm": 2.100522994995117,
995
- "learning_rate": 8.592e-05,
996
- "loss": 2.3361,
997
- "step": 705
998
- },
999
- {
1000
- "epoch": 0.284,
1001
- "grad_norm": 2.1430091857910156,
1002
- "learning_rate": 8.582e-05,
1003
- "loss": 1.7879,
1004
- "step": 710
1005
- },
1006
- {
1007
- "epoch": 0.286,
1008
- "grad_norm": 3.2257421016693115,
1009
- "learning_rate": 8.572e-05,
1010
- "loss": 2.0216,
1011
- "step": 715
1012
- },
1013
- {
1014
- "epoch": 0.288,
1015
- "grad_norm": 0.9987928867340088,
1016
- "learning_rate": 8.562e-05,
1017
- "loss": 2.3699,
1018
- "step": 720
1019
- },
1020
- {
1021
- "epoch": 0.29,
1022
- "grad_norm": 3.250732421875,
1023
- "learning_rate": 8.552e-05,
1024
- "loss": 1.7009,
1025
- "step": 725
1026
- },
1027
- {
1028
- "epoch": 0.292,
1029
- "grad_norm": 2.7594077587127686,
1030
- "learning_rate": 8.542e-05,
1031
- "loss": 1.829,
1032
- "step": 730
1033
- },
1034
- {
1035
- "epoch": 0.294,
1036
- "grad_norm": 3.0348315238952637,
1037
- "learning_rate": 8.532e-05,
1038
- "loss": 1.4677,
1039
- "step": 735
1040
- },
1041
- {
1042
- "epoch": 0.296,
1043
- "grad_norm": 2.9564616680145264,
1044
- "learning_rate": 8.522e-05,
1045
- "loss": 1.7962,
1046
- "step": 740
1047
- },
1048
- {
1049
- "epoch": 0.298,
1050
- "grad_norm": 2.6723451614379883,
1051
- "learning_rate": 8.512e-05,
1052
- "loss": 2.4121,
1053
- "step": 745
1054
- },
1055
- {
1056
- "epoch": 0.3,
1057
- "grad_norm": 3.3210055828094482,
1058
- "learning_rate": 8.502e-05,
1059
- "loss": 2.1947,
1060
- "step": 750
1061
- },
1062
- {
1063
- "epoch": 0.302,
1064
- "grad_norm": 2.0533103942871094,
1065
- "learning_rate": 8.492e-05,
1066
- "loss": 2.1698,
1067
- "step": 755
1068
- },
1069
- {
1070
- "epoch": 0.304,
1071
- "grad_norm": 1.7164925336837769,
1072
- "learning_rate": 8.482e-05,
1073
- "loss": 2.3975,
1074
- "step": 760
1075
- },
1076
- {
1077
- "epoch": 0.306,
1078
- "grad_norm": 2.3715977668762207,
1079
- "learning_rate": 8.472e-05,
1080
- "loss": 2.0064,
1081
- "step": 765
1082
- },
1083
- {
1084
- "epoch": 0.308,
1085
- "grad_norm": 2.326876640319824,
1086
- "learning_rate": 8.462e-05,
1087
- "loss": 1.8805,
1088
- "step": 770
1089
- },
1090
- {
1091
- "epoch": 0.31,
1092
- "grad_norm": 2.4446003437042236,
1093
- "learning_rate": 8.452e-05,
1094
- "loss": 2.0861,
1095
- "step": 775
1096
- },
1097
- {
1098
- "epoch": 0.312,
1099
- "grad_norm": 3.457144021987915,
1100
- "learning_rate": 8.442e-05,
1101
- "loss": 2.3564,
1102
- "step": 780
1103
- },
1104
- {
1105
- "epoch": 0.314,
1106
- "grad_norm": 2.255930185317993,
1107
- "learning_rate": 8.431999999999999e-05,
1108
- "loss": 2.1533,
1109
- "step": 785
1110
- },
1111
- {
1112
- "epoch": 0.316,
1113
- "grad_norm": 1.9043174982070923,
1114
- "learning_rate": 8.422e-05,
1115
- "loss": 1.914,
1116
- "step": 790
1117
- },
1118
- {
1119
- "epoch": 0.318,
1120
- "grad_norm": 3.0527002811431885,
1121
- "learning_rate": 8.412e-05,
1122
- "loss": 1.9351,
1123
- "step": 795
1124
- },
1125
- {
1126
- "epoch": 0.32,
1127
- "grad_norm": 3.707892417907715,
1128
- "learning_rate": 8.402e-05,
1129
- "loss": 2.0129,
1130
- "step": 800
1131
- },
1132
- {
1133
- "epoch": 0.322,
1134
- "grad_norm": 1.6021428108215332,
1135
- "learning_rate": 8.392e-05,
1136
- "loss": 2.0383,
1137
- "step": 805
1138
- },
1139
- {
1140
- "epoch": 0.324,
1141
- "grad_norm": 2.2315077781677246,
1142
- "learning_rate": 8.382e-05,
1143
- "loss": 1.8572,
1144
- "step": 810
1145
- },
1146
- {
1147
- "epoch": 0.326,
1148
- "grad_norm": 2.0886893272399902,
1149
- "learning_rate": 8.372e-05,
1150
- "loss": 2.389,
1151
- "step": 815
1152
- },
1153
- {
1154
- "epoch": 0.328,
1155
- "grad_norm": 2.5066492557525635,
1156
- "learning_rate": 8.362000000000002e-05,
1157
- "loss": 2.3126,
1158
- "step": 820
1159
- },
1160
- {
1161
- "epoch": 0.33,
1162
- "grad_norm": 2.559074640274048,
1163
- "learning_rate": 8.352000000000001e-05,
1164
- "loss": 1.8435,
1165
- "step": 825
1166
- },
1167
- {
1168
- "epoch": 0.332,
1169
- "grad_norm": 1.2982532978057861,
1170
- "learning_rate": 8.342000000000001e-05,
1171
- "loss": 2.6958,
1172
- "step": 830
1173
- },
1174
- {
1175
- "epoch": 0.334,
1176
- "grad_norm": 2.9500558376312256,
1177
- "learning_rate": 8.332000000000001e-05,
1178
- "loss": 2.2249,
1179
- "step": 835
1180
- },
1181
- {
1182
- "epoch": 0.336,
1183
- "grad_norm": 1.1935762166976929,
1184
- "learning_rate": 8.322e-05,
1185
- "loss": 2.1226,
1186
- "step": 840
1187
- },
1188
- {
1189
- "epoch": 0.338,
1190
- "grad_norm": 2.153440237045288,
1191
- "learning_rate": 8.312e-05,
1192
- "loss": 2.1367,
1193
- "step": 845
1194
- },
1195
- {
1196
- "epoch": 0.34,
1197
- "grad_norm": 3.4815332889556885,
1198
- "learning_rate": 8.302000000000001e-05,
1199
- "loss": 1.7813,
1200
- "step": 850
1201
- },
1202
- {
1203
- "epoch": 0.342,
1204
- "grad_norm": 2.8280904293060303,
1205
- "learning_rate": 8.292000000000001e-05,
1206
- "loss": 1.9479,
1207
- "step": 855
1208
- },
1209
- {
1210
- "epoch": 0.344,
1211
- "grad_norm": 3.511687994003296,
1212
- "learning_rate": 8.282000000000001e-05,
1213
- "loss": 1.9082,
1214
- "step": 860
1215
- },
1216
- {
1217
- "epoch": 0.346,
1218
- "grad_norm": 2.669370651245117,
1219
- "learning_rate": 8.272000000000001e-05,
1220
- "loss": 1.5332,
1221
- "step": 865
1222
- },
1223
- {
1224
- "epoch": 0.348,
1225
- "grad_norm": 2.840242862701416,
1226
- "learning_rate": 8.262e-05,
1227
- "loss": 2.3229,
1228
- "step": 870
1229
- },
1230
- {
1231
- "epoch": 0.35,
1232
- "grad_norm": 3.331766128540039,
1233
- "learning_rate": 8.252e-05,
1234
- "loss": 2.0155,
1235
- "step": 875
1236
- },
1237
- {
1238
- "epoch": 0.352,
1239
- "grad_norm": 4.060706615447998,
1240
- "learning_rate": 8.242000000000001e-05,
1241
- "loss": 1.7354,
1242
- "step": 880
1243
- },
1244
- {
1245
- "epoch": 0.354,
1246
- "grad_norm": 2.9245781898498535,
1247
- "learning_rate": 8.232000000000001e-05,
1248
- "loss": 2.1195,
1249
- "step": 885
1250
- },
1251
- {
1252
- "epoch": 0.356,
1253
- "grad_norm": 2.2486793994903564,
1254
- "learning_rate": 8.222000000000001e-05,
1255
- "loss": 2.0922,
1256
- "step": 890
1257
- },
1258
- {
1259
- "epoch": 0.358,
1260
- "grad_norm": 1.3685901165008545,
1261
- "learning_rate": 8.212e-05,
1262
- "loss": 2.0711,
1263
- "step": 895
1264
- },
1265
- {
1266
- "epoch": 0.36,
1267
- "grad_norm": 3.810460090637207,
1268
- "learning_rate": 8.202e-05,
1269
- "loss": 1.6854,
1270
- "step": 900
1271
- },
1272
- {
1273
- "epoch": 0.362,
1274
- "grad_norm": 2.693786382675171,
1275
- "learning_rate": 8.192e-05,
1276
- "loss": 1.9812,
1277
- "step": 905
1278
- },
1279
- {
1280
- "epoch": 0.364,
1281
- "grad_norm": 3.220974922180176,
1282
- "learning_rate": 8.182000000000001e-05,
1283
- "loss": 2.1331,
1284
- "step": 910
1285
- },
1286
- {
1287
- "epoch": 0.366,
1288
- "grad_norm": 3.7384660243988037,
1289
- "learning_rate": 8.172000000000001e-05,
1290
- "loss": 1.713,
1291
- "step": 915
1292
- },
1293
- {
1294
- "epoch": 0.368,
1295
- "grad_norm": 2.024315118789673,
1296
- "learning_rate": 8.162000000000001e-05,
1297
- "loss": 2.2023,
1298
- "step": 920
1299
- },
1300
- {
1301
- "epoch": 0.37,
1302
- "grad_norm": 3.1162705421447754,
1303
- "learning_rate": 8.152e-05,
1304
- "loss": 1.77,
1305
- "step": 925
1306
- },
1307
- {
1308
- "epoch": 0.372,
1309
- "grad_norm": 2.4156429767608643,
1310
- "learning_rate": 8.142e-05,
1311
- "loss": 1.9556,
1312
- "step": 930
1313
- },
1314
- {
1315
- "epoch": 0.374,
1316
- "grad_norm": 1.5801384449005127,
1317
- "learning_rate": 8.132e-05,
1318
- "loss": 1.9622,
1319
- "step": 935
1320
- },
1321
- {
1322
- "epoch": 0.376,
1323
- "grad_norm": 3.660128355026245,
1324
- "learning_rate": 8.122000000000001e-05,
1325
- "loss": 2.078,
1326
- "step": 940
1327
- },
1328
- {
1329
- "epoch": 0.378,
1330
- "grad_norm": 1.9089343547821045,
1331
- "learning_rate": 8.112000000000001e-05,
1332
- "loss": 1.9397,
1333
- "step": 945
1334
- },
1335
- {
1336
- "epoch": 0.38,
1337
- "grad_norm": 2.250739812850952,
1338
- "learning_rate": 8.102000000000001e-05,
1339
- "loss": 1.6644,
1340
- "step": 950
1341
- },
1342
- {
1343
- "epoch": 0.382,
1344
- "grad_norm": 2.162501573562622,
1345
- "learning_rate": 8.092e-05,
1346
- "loss": 1.7254,
1347
- "step": 955
1348
- },
1349
- {
1350
- "epoch": 0.384,
1351
- "grad_norm": 1.6305783987045288,
1352
- "learning_rate": 8.082e-05,
1353
- "loss": 1.7052,
1354
- "step": 960
1355
- },
1356
- {
1357
- "epoch": 0.386,
1358
- "grad_norm": 3.8243024349212646,
1359
- "learning_rate": 8.072000000000001e-05,
1360
- "loss": 1.6534,
1361
- "step": 965
1362
- },
1363
- {
1364
- "epoch": 0.388,
1365
- "grad_norm": 2.9563748836517334,
1366
- "learning_rate": 8.062000000000001e-05,
1367
- "loss": 2.0002,
1368
- "step": 970
1369
- },
1370
- {
1371
- "epoch": 0.39,
1372
- "grad_norm": 2.350604772567749,
1373
- "learning_rate": 8.052000000000001e-05,
1374
- "loss": 1.8192,
1375
- "step": 975
1376
- },
1377
- {
1378
- "epoch": 0.392,
1379
- "grad_norm": 1.9382598400115967,
1380
- "learning_rate": 8.042e-05,
1381
- "loss": 2.1386,
1382
- "step": 980
1383
- },
1384
- {
1385
- "epoch": 0.394,
1386
- "grad_norm": 3.3442025184631348,
1387
- "learning_rate": 8.032e-05,
1388
- "loss": 1.7758,
1389
- "step": 985
1390
- },
1391
- {
1392
- "epoch": 0.396,
1393
- "grad_norm": 4.59849214553833,
1394
- "learning_rate": 8.022e-05,
1395
- "loss": 1.9987,
1396
- "step": 990
1397
- },
1398
- {
1399
- "epoch": 0.398,
1400
- "grad_norm": 1.7831141948699951,
1401
- "learning_rate": 8.012000000000001e-05,
1402
- "loss": 1.8504,
1403
- "step": 995
1404
- },
1405
- {
1406
- "epoch": 0.4,
1407
- "grad_norm": 3.119198799133301,
1408
- "learning_rate": 8.002000000000001e-05,
1409
- "loss": 2.1055,
1410
- "step": 1000
1411
- },
1412
- {
1413
- "epoch": 0.402,
1414
- "grad_norm": 4.341230869293213,
1415
- "learning_rate": 7.992000000000001e-05,
1416
- "loss": 1.9915,
1417
- "step": 1005
1418
- },
1419
- {
1420
- "epoch": 0.404,
1421
- "grad_norm": 3.653338670730591,
1422
- "learning_rate": 7.982e-05,
1423
- "loss": 1.9072,
1424
- "step": 1010
1425
- },
1426
- {
1427
- "epoch": 0.406,
1428
- "grad_norm": 2.365283489227295,
1429
- "learning_rate": 7.972e-05,
1430
- "loss": 2.1189,
1431
- "step": 1015
1432
- },
1433
- {
1434
- "epoch": 0.408,
1435
- "grad_norm": 2.3448755741119385,
1436
- "learning_rate": 7.962e-05,
1437
- "loss": 1.5658,
1438
- "step": 1020
1439
- },
1440
- {
1441
- "epoch": 0.41,
1442
- "grad_norm": 3.2361137866973877,
1443
- "learning_rate": 7.952000000000001e-05,
1444
- "loss": 1.5764,
1445
- "step": 1025
1446
- },
1447
- {
1448
- "epoch": 0.412,
1449
- "grad_norm": 4.448095798492432,
1450
- "learning_rate": 7.942000000000001e-05,
1451
- "loss": 1.9814,
1452
- "step": 1030
1453
- },
1454
- {
1455
- "epoch": 0.414,
1456
- "grad_norm": 1.5654709339141846,
1457
- "learning_rate": 7.932e-05,
1458
- "loss": 1.8629,
1459
- "step": 1035
1460
- },
1461
- {
1462
- "epoch": 0.416,
1463
- "grad_norm": 3.3745901584625244,
1464
- "learning_rate": 7.922e-05,
1465
- "loss": 2.1952,
1466
- "step": 1040
1467
- },
1468
- {
1469
- "epoch": 0.418,
1470
- "grad_norm": 2.3770949840545654,
1471
- "learning_rate": 7.912e-05,
1472
- "loss": 1.9977,
1473
- "step": 1045
1474
- },
1475
- {
1476
- "epoch": 0.42,
1477
- "grad_norm": 3.179367780685425,
1478
- "learning_rate": 7.902e-05,
1479
- "loss": 1.9814,
1480
- "step": 1050
1481
- },
1482
- {
1483
- "epoch": 0.422,
1484
- "grad_norm": 1.5007638931274414,
1485
- "learning_rate": 7.892000000000001e-05,
1486
- "loss": 1.8761,
1487
- "step": 1055
1488
- },
1489
- {
1490
- "epoch": 0.424,
1491
- "grad_norm": 3.5575854778289795,
1492
- "learning_rate": 7.882000000000001e-05,
1493
- "loss": 1.789,
1494
- "step": 1060
1495
- },
1496
- {
1497
- "epoch": 0.426,
1498
- "grad_norm": 1.8852957487106323,
1499
- "learning_rate": 7.872e-05,
1500
- "loss": 2.178,
1501
- "step": 1065
1502
- },
1503
- {
1504
- "epoch": 0.428,
1505
- "grad_norm": 2.534390449523926,
1506
- "learning_rate": 7.862e-05,
1507
- "loss": 1.9272,
1508
- "step": 1070
1509
- },
1510
- {
1511
- "epoch": 0.43,
1512
- "grad_norm": 3.5568392276763916,
1513
- "learning_rate": 7.852e-05,
1514
- "loss": 2.116,
1515
- "step": 1075
1516
- },
1517
- {
1518
- "epoch": 0.432,
1519
- "grad_norm": 2.170743942260742,
1520
- "learning_rate": 7.842e-05,
1521
- "loss": 1.4085,
1522
- "step": 1080
1523
- },
1524
- {
1525
- "epoch": 0.434,
1526
- "grad_norm": 2.4826807975769043,
1527
- "learning_rate": 7.832000000000001e-05,
1528
- "loss": 1.6083,
1529
- "step": 1085
1530
- },
1531
- {
1532
- "epoch": 0.436,
1533
- "grad_norm": 3.557332992553711,
1534
- "learning_rate": 7.822e-05,
1535
- "loss": 2.0262,
1536
- "step": 1090
1537
- },
1538
- {
1539
- "epoch": 0.438,
1540
- "grad_norm": 2.6044585704803467,
1541
- "learning_rate": 7.812e-05,
1542
- "loss": 1.9665,
1543
- "step": 1095
1544
- },
1545
- {
1546
- "epoch": 0.44,
1547
- "grad_norm": 2.431857109069824,
1548
- "learning_rate": 7.802e-05,
1549
- "loss": 1.9879,
1550
- "step": 1100
1551
- },
1552
- {
1553
- "epoch": 0.442,
1554
- "grad_norm": 3.814208507537842,
1555
- "learning_rate": 7.792e-05,
1556
- "loss": 1.6894,
1557
- "step": 1105
1558
- },
1559
- {
1560
- "epoch": 0.444,
1561
- "grad_norm": 2.7338225841522217,
1562
- "learning_rate": 7.782000000000001e-05,
1563
- "loss": 1.7777,
1564
- "step": 1110
1565
- },
1566
- {
1567
- "epoch": 0.446,
1568
- "grad_norm": 2.560375690460205,
1569
- "learning_rate": 7.772000000000001e-05,
1570
- "loss": 2.0086,
1571
- "step": 1115
1572
- },
1573
- {
1574
- "epoch": 0.448,
1575
- "grad_norm": 2.316746950149536,
1576
- "learning_rate": 7.762e-05,
1577
- "loss": 1.7457,
1578
- "step": 1120
1579
- },
1580
- {
1581
- "epoch": 0.45,
1582
- "grad_norm": 1.6756999492645264,
1583
- "learning_rate": 7.752e-05,
1584
- "loss": 2.0588,
1585
- "step": 1125
1586
- },
1587
- {
1588
- "epoch": 0.452,
1589
- "grad_norm": 1.4262984991073608,
1590
- "learning_rate": 7.742e-05,
1591
- "loss": 1.9309,
1592
- "step": 1130
1593
- },
1594
- {
1595
- "epoch": 0.454,
1596
- "grad_norm": 3.5977210998535156,
1597
- "learning_rate": 7.732e-05,
1598
- "loss": 1.7672,
1599
- "step": 1135
1600
- },
1601
- {
1602
- "epoch": 0.456,
1603
- "grad_norm": 2.7261245250701904,
1604
- "learning_rate": 7.722000000000001e-05,
1605
- "loss": 1.5192,
1606
- "step": 1140
1607
- },
1608
- {
1609
- "epoch": 0.458,
1610
- "grad_norm": 2.7008583545684814,
1611
- "learning_rate": 7.712000000000001e-05,
1612
- "loss": 2.0424,
1613
- "step": 1145
1614
- },
1615
- {
1616
- "epoch": 0.46,
1617
- "grad_norm": 2.377896785736084,
1618
- "learning_rate": 7.702e-05,
1619
- "loss": 2.0002,
1620
- "step": 1150
1621
- },
1622
- {
1623
- "epoch": 0.462,
1624
- "grad_norm": 4.894864082336426,
1625
- "learning_rate": 7.692e-05,
1626
- "loss": 2.1725,
1627
- "step": 1155
1628
- },
1629
- {
1630
- "epoch": 0.464,
1631
- "grad_norm": 1.4119629859924316,
1632
- "learning_rate": 7.682e-05,
1633
- "loss": 2.177,
1634
- "step": 1160
1635
- },
1636
- {
1637
- "epoch": 0.466,
1638
- "grad_norm": 2.613739013671875,
1639
- "learning_rate": 7.672e-05,
1640
- "loss": 2.093,
1641
- "step": 1165
1642
- },
1643
- {
1644
- "epoch": 0.468,
1645
- "grad_norm": 2.0441625118255615,
1646
- "learning_rate": 7.662000000000001e-05,
1647
- "loss": 1.98,
1648
- "step": 1170
1649
- },
1650
- {
1651
- "epoch": 0.47,
1652
- "grad_norm": 3.4278924465179443,
1653
- "learning_rate": 7.652e-05,
1654
- "loss": 1.7976,
1655
- "step": 1175
1656
- },
1657
- {
1658
- "epoch": 0.472,
1659
- "grad_norm": 2.316985607147217,
1660
- "learning_rate": 7.642e-05,
1661
- "loss": 2.0487,
1662
- "step": 1180
1663
- },
1664
- {
1665
- "epoch": 0.474,
1666
- "grad_norm": 2.847053050994873,
1667
- "learning_rate": 7.632e-05,
1668
- "loss": 1.8201,
1669
- "step": 1185
1670
- },
1671
- {
1672
- "epoch": 0.476,
1673
- "grad_norm": 2.258514404296875,
1674
- "learning_rate": 7.622e-05,
1675
- "loss": 1.8056,
1676
- "step": 1190
1677
- },
1678
- {
1679
- "epoch": 0.478,
1680
- "grad_norm": 1.729820728302002,
1681
- "learning_rate": 7.612e-05,
1682
- "loss": 1.7724,
1683
- "step": 1195
1684
- },
1685
- {
1686
- "epoch": 0.48,
1687
- "grad_norm": 3.0825610160827637,
1688
- "learning_rate": 7.602000000000001e-05,
1689
- "loss": 1.9275,
1690
- "step": 1200
1691
- },
1692
- {
1693
- "epoch": 0.482,
1694
- "grad_norm": 3.6028025150299072,
1695
- "learning_rate": 7.592e-05,
1696
- "loss": 1.7892,
1697
- "step": 1205
1698
- },
1699
- {
1700
- "epoch": 0.484,
1701
- "grad_norm": 3.5654330253601074,
1702
- "learning_rate": 7.582e-05,
1703
- "loss": 2.3649,
1704
- "step": 1210
1705
- },
1706
- {
1707
- "epoch": 0.486,
1708
- "grad_norm": 3.2018349170684814,
1709
- "learning_rate": 7.572e-05,
1710
- "loss": 1.7233,
1711
- "step": 1215
1712
- },
1713
- {
1714
- "epoch": 0.488,
1715
- "grad_norm": 2.509002923965454,
1716
- "learning_rate": 7.562e-05,
1717
- "loss": 1.7338,
1718
- "step": 1220
1719
- },
1720
- {
1721
- "epoch": 0.49,
1722
- "grad_norm": 3.320098876953125,
1723
- "learning_rate": 7.552e-05,
1724
- "loss": 2.0038,
1725
- "step": 1225
1726
- },
1727
- {
1728
- "epoch": 0.492,
1729
- "grad_norm": 3.109086036682129,
1730
- "learning_rate": 7.542e-05,
1731
- "loss": 1.724,
1732
- "step": 1230
1733
- },
1734
- {
1735
- "epoch": 0.494,
1736
- "grad_norm": 2.193565607070923,
1737
- "learning_rate": 7.532e-05,
1738
- "loss": 1.9984,
1739
- "step": 1235
1740
- },
1741
- {
1742
- "epoch": 0.496,
1743
- "grad_norm": 1.5994617938995361,
1744
- "learning_rate": 7.522e-05,
1745
- "loss": 1.4454,
1746
- "step": 1240
1747
- },
1748
- {
1749
- "epoch": 0.498,
1750
- "grad_norm": 4.096536159515381,
1751
- "learning_rate": 7.512e-05,
1752
- "loss": 1.9554,
1753
- "step": 1245
1754
- },
1755
- {
1756
- "epoch": 0.5,
1757
- "grad_norm": 4.227677822113037,
1758
- "learning_rate": 7.502e-05,
1759
- "loss": 2.1382,
1760
- "step": 1250
1761
- },
1762
- {
1763
- "epoch": 0.502,
1764
- "grad_norm": 3.4727842807769775,
1765
- "learning_rate": 7.492000000000001e-05,
1766
- "loss": 1.5761,
1767
- "step": 1255
1768
- },
1769
- {
1770
- "epoch": 0.504,
1771
- "grad_norm": 3.6935126781463623,
1772
- "learning_rate": 7.482e-05,
1773
- "loss": 1.845,
1774
- "step": 1260
1775
- },
1776
- {
1777
- "epoch": 0.506,
1778
- "grad_norm": 2.6635711193084717,
1779
- "learning_rate": 7.472e-05,
1780
- "loss": 1.9839,
1781
- "step": 1265
1782
- },
1783
- {
1784
- "epoch": 0.508,
1785
- "grad_norm": 3.7328500747680664,
1786
- "learning_rate": 7.462e-05,
1787
- "loss": 1.9438,
1788
- "step": 1270
1789
- },
1790
- {
1791
- "epoch": 0.51,
1792
- "grad_norm": 2.842043161392212,
1793
- "learning_rate": 7.452e-05,
1794
- "loss": 1.7112,
1795
- "step": 1275
1796
- },
1797
- {
1798
- "epoch": 0.512,
1799
- "grad_norm": 2.5873022079467773,
1800
- "learning_rate": 7.442e-05,
1801
- "loss": 1.7037,
1802
- "step": 1280
1803
- },
1804
- {
1805
- "epoch": 0.514,
1806
- "grad_norm": 2.5171470642089844,
1807
- "learning_rate": 7.432e-05,
1808
- "loss": 2.0828,
1809
- "step": 1285
1810
- },
1811
- {
1812
- "epoch": 0.516,
1813
- "grad_norm": 2.580310344696045,
1814
- "learning_rate": 7.422e-05,
1815
- "loss": 1.9703,
1816
- "step": 1290
1817
- },
1818
- {
1819
- "epoch": 0.518,
1820
- "grad_norm": 1.925465703010559,
1821
- "learning_rate": 7.412e-05,
1822
- "loss": 1.9266,
1823
- "step": 1295
1824
- },
1825
- {
1826
- "epoch": 0.52,
1827
- "grad_norm": 4.212243556976318,
1828
- "learning_rate": 7.402e-05,
1829
- "loss": 1.816,
1830
- "step": 1300
1831
- },
1832
- {
1833
- "epoch": 0.522,
1834
- "grad_norm": 2.8834757804870605,
1835
- "learning_rate": 7.392e-05,
1836
- "loss": 1.7435,
1837
- "step": 1305
1838
- },
1839
- {
1840
- "epoch": 0.524,
1841
- "grad_norm": 3.207301616668701,
1842
- "learning_rate": 7.382e-05,
1843
- "loss": 1.6266,
1844
- "step": 1310
1845
- },
1846
- {
1847
- "epoch": 0.526,
1848
- "grad_norm": 2.595672369003296,
1849
- "learning_rate": 7.372e-05,
1850
- "loss": 2.1611,
1851
- "step": 1315
1852
- },
1853
- {
1854
- "epoch": 0.528,
1855
- "grad_norm": 1.9702566862106323,
1856
- "learning_rate": 7.362e-05,
1857
- "loss": 1.874,
1858
- "step": 1320
1859
- },
1860
- {
1861
- "epoch": 0.53,
1862
- "grad_norm": 3.2945854663848877,
1863
- "learning_rate": 7.352e-05,
1864
- "loss": 2.385,
1865
- "step": 1325
1866
- },
1867
- {
1868
- "epoch": 0.532,
1869
- "grad_norm": 2.8158018589019775,
1870
- "learning_rate": 7.342e-05,
1871
- "loss": 1.8912,
1872
- "step": 1330
1873
- },
1874
- {
1875
- "epoch": 0.534,
1876
- "grad_norm": 3.153384208679199,
1877
- "learning_rate": 7.332e-05,
1878
- "loss": 1.8591,
1879
- "step": 1335
1880
- },
1881
- {
1882
- "epoch": 0.536,
1883
- "grad_norm": 2.0991859436035156,
1884
- "learning_rate": 7.322e-05,
1885
- "loss": 2.4344,
1886
- "step": 1340
1887
- },
1888
- {
1889
- "epoch": 0.538,
1890
- "grad_norm": 1.6609746217727661,
1891
- "learning_rate": 7.312e-05,
1892
- "loss": 1.6431,
1893
- "step": 1345
1894
- },
1895
- {
1896
- "epoch": 0.54,
1897
- "grad_norm": 1.7339993715286255,
1898
- "learning_rate": 7.302e-05,
1899
- "loss": 1.8644,
1900
- "step": 1350
1901
- },
1902
- {
1903
- "epoch": 0.542,
1904
- "grad_norm": 2.7158915996551514,
1905
- "learning_rate": 7.292e-05,
1906
- "loss": 1.7384,
1907
- "step": 1355
1908
- },
1909
- {
1910
- "epoch": 0.544,
1911
- "grad_norm": 3.752121925354004,
1912
- "learning_rate": 7.282e-05,
1913
- "loss": 1.6989,
1914
- "step": 1360
1915
- },
1916
- {
1917
- "epoch": 0.546,
1918
- "grad_norm": 0.895588755607605,
1919
- "learning_rate": 7.272e-05,
1920
- "loss": 1.99,
1921
- "step": 1365
1922
- },
1923
- {
1924
- "epoch": 0.548,
1925
- "grad_norm": 3.2313334941864014,
1926
- "learning_rate": 7.261999999999999e-05,
1927
- "loss": 1.7486,
1928
- "step": 1370
1929
- },
1930
- {
1931
- "epoch": 0.55,
1932
- "grad_norm": 3.4713807106018066,
1933
- "learning_rate": 7.252e-05,
1934
- "loss": 1.6347,
1935
- "step": 1375
1936
- },
1937
- {
1938
- "epoch": 0.552,
1939
- "grad_norm": 2.7429184913635254,
1940
- "learning_rate": 7.242e-05,
1941
- "loss": 1.8079,
1942
- "step": 1380
1943
- },
1944
- {
1945
- "epoch": 0.554,
1946
- "grad_norm": 1.5747346878051758,
1947
- "learning_rate": 7.232e-05,
1948
- "loss": 1.5241,
1949
- "step": 1385
1950
- },
1951
- {
1952
- "epoch": 0.556,
1953
- "grad_norm": 2.867905855178833,
1954
- "learning_rate": 7.222e-05,
1955
- "loss": 1.8958,
1956
- "step": 1390
1957
- },
1958
- {
1959
- "epoch": 0.558,
1960
- "grad_norm": 2.3015518188476562,
1961
- "learning_rate": 7.212e-05,
1962
- "loss": 1.7197,
1963
- "step": 1395
1964
- },
1965
- {
1966
- "epoch": 0.56,
1967
- "grad_norm": 1.6140376329421997,
1968
- "learning_rate": 7.202e-05,
1969
- "loss": 1.8053,
1970
- "step": 1400
1971
- },
1972
- {
1973
- "epoch": 0.562,
1974
- "grad_norm": 3.653310537338257,
1975
- "learning_rate": 7.192e-05,
1976
- "loss": 1.739,
1977
- "step": 1405
1978
- },
1979
- {
1980
- "epoch": 0.564,
1981
- "grad_norm": 2.1771411895751953,
1982
- "learning_rate": 7.182e-05,
1983
- "loss": 1.8199,
1984
- "step": 1410
1985
- },
1986
- {
1987
- "epoch": 0.566,
1988
- "grad_norm": 3.141714096069336,
1989
- "learning_rate": 7.172e-05,
1990
- "loss": 1.782,
1991
- "step": 1415
1992
- },
1993
- {
1994
- "epoch": 0.568,
1995
- "grad_norm": 3.9781055450439453,
1996
- "learning_rate": 7.162e-05,
1997
- "loss": 1.9008,
1998
- "step": 1420
1999
- },
2000
- {
2001
- "epoch": 0.57,
2002
- "grad_norm": 2.663086175918579,
2003
- "learning_rate": 7.151999999999999e-05,
2004
- "loss": 1.787,
2005
- "step": 1425
2006
- },
2007
- {
2008
- "epoch": 0.572,
2009
- "grad_norm": 2.78171443939209,
2010
- "learning_rate": 7.142e-05,
2011
- "loss": 1.676,
2012
- "step": 1430
2013
- },
2014
- {
2015
- "epoch": 0.574,
2016
- "grad_norm": 1.9540828466415405,
2017
- "learning_rate": 7.132e-05,
2018
- "loss": 2.553,
2019
- "step": 1435
2020
- },
2021
- {
2022
- "epoch": 0.576,
2023
- "grad_norm": 3.7563962936401367,
2024
- "learning_rate": 7.122000000000001e-05,
2025
- "loss": 1.614,
2026
- "step": 1440
2027
- },
2028
- {
2029
- "epoch": 0.578,
2030
- "grad_norm": 3.0696017742156982,
2031
- "learning_rate": 7.112000000000001e-05,
2032
- "loss": 1.6421,
2033
- "step": 1445
2034
- },
2035
- {
2036
- "epoch": 0.58,
2037
- "grad_norm": 2.7918848991394043,
2038
- "learning_rate": 7.102000000000001e-05,
2039
- "loss": 1.576,
2040
- "step": 1450
2041
- },
2042
- {
2043
- "epoch": 0.582,
2044
- "grad_norm": 2.9208178520202637,
2045
- "learning_rate": 7.092e-05,
2046
- "loss": 1.7068,
2047
- "step": 1455
2048
- },
2049
- {
2050
- "epoch": 0.584,
2051
- "grad_norm": 2.821730375289917,
2052
- "learning_rate": 7.082e-05,
2053
- "loss": 1.9337,
2054
- "step": 1460
2055
- },
2056
- {
2057
- "epoch": 0.586,
2058
- "grad_norm": 3.104081392288208,
2059
- "learning_rate": 7.072000000000001e-05,
2060
- "loss": 1.6916,
2061
- "step": 1465
2062
- },
2063
- {
2064
- "epoch": 0.588,
2065
- "grad_norm": 4.225072860717773,
2066
- "learning_rate": 7.062000000000001e-05,
2067
- "loss": 1.489,
2068
- "step": 1470
2069
- },
2070
- {
2071
- "epoch": 0.59,
2072
- "grad_norm": 1.777544379234314,
2073
- "learning_rate": 7.052000000000001e-05,
2074
- "loss": 2.5044,
2075
- "step": 1475
2076
- },
2077
- {
2078
- "epoch": 0.592,
2079
- "grad_norm": 3.047288179397583,
2080
- "learning_rate": 7.042000000000001e-05,
2081
- "loss": 1.7485,
2082
- "step": 1480
2083
- },
2084
- {
2085
- "epoch": 0.594,
2086
- "grad_norm": 2.2908759117126465,
2087
- "learning_rate": 7.032e-05,
2088
- "loss": 1.5557,
2089
- "step": 1485
2090
- },
2091
- {
2092
- "epoch": 0.596,
2093
- "grad_norm": 3.3206658363342285,
2094
- "learning_rate": 7.022e-05,
2095
- "loss": 1.707,
2096
- "step": 1490
2097
- },
2098
- {
2099
- "epoch": 0.598,
2100
- "grad_norm": 6.7620649337768555,
2101
- "learning_rate": 7.012000000000001e-05,
2102
- "loss": 1.7839,
2103
- "step": 1495
2104
- },
2105
- {
2106
- "epoch": 0.6,
2107
- "grad_norm": 2.4363317489624023,
2108
- "learning_rate": 7.002000000000001e-05,
2109
- "loss": 2.006,
2110
- "step": 1500
2111
- },
2112
- {
2113
- "epoch": 0.602,
2114
- "grad_norm": 1.6987566947937012,
2115
- "learning_rate": 6.992000000000001e-05,
2116
- "loss": 1.701,
2117
- "step": 1505
2118
- },
2119
- {
2120
- "epoch": 0.604,
2121
- "grad_norm": 1.0138988494873047,
2122
- "learning_rate": 6.982e-05,
2123
- "loss": 2.0307,
2124
- "step": 1510
2125
- },
2126
- {
2127
- "epoch": 0.606,
2128
- "grad_norm": 3.704721689224243,
2129
- "learning_rate": 6.972e-05,
2130
- "loss": 1.9313,
2131
- "step": 1515
2132
- },
2133
- {
2134
- "epoch": 0.608,
2135
- "grad_norm": 2.189314126968384,
2136
- "learning_rate": 6.962e-05,
2137
- "loss": 2.195,
2138
- "step": 1520
2139
- },
2140
- {
2141
- "epoch": 0.61,
2142
- "grad_norm": 2.160581111907959,
2143
- "learning_rate": 6.952000000000001e-05,
2144
- "loss": 1.8127,
2145
- "step": 1525
2146
- },
2147
- {
2148
- "epoch": 0.612,
2149
- "grad_norm": 2.969454288482666,
2150
- "learning_rate": 6.942000000000001e-05,
2151
- "loss": 1.8863,
2152
- "step": 1530
2153
- },
2154
- {
2155
- "epoch": 0.614,
2156
- "grad_norm": 3.452462673187256,
2157
- "learning_rate": 6.932000000000001e-05,
2158
- "loss": 1.8243,
2159
- "step": 1535
2160
- },
2161
- {
2162
- "epoch": 0.616,
2163
- "grad_norm": 4.208456039428711,
2164
- "learning_rate": 6.922e-05,
2165
- "loss": 1.72,
2166
- "step": 1540
2167
- },
2168
- {
2169
- "epoch": 0.618,
2170
- "grad_norm": 2.2857871055603027,
2171
- "learning_rate": 6.912e-05,
2172
- "loss": 1.886,
2173
- "step": 1545
2174
- },
2175
- {
2176
- "epoch": 0.62,
2177
- "grad_norm": 2.4010958671569824,
2178
- "learning_rate": 6.902000000000001e-05,
2179
- "loss": 2.0313,
2180
- "step": 1550
2181
- },
2182
- {
2183
- "epoch": 0.622,
2184
- "grad_norm": 3.4712297916412354,
2185
- "learning_rate": 6.892000000000001e-05,
2186
- "loss": 1.5378,
2187
- "step": 1555
2188
- },
2189
- {
2190
- "epoch": 0.624,
2191
- "grad_norm": 2.614377975463867,
2192
- "learning_rate": 6.882000000000001e-05,
2193
- "loss": 1.5747,
2194
- "step": 1560
2195
- },
2196
- {
2197
- "epoch": 0.626,
2198
- "grad_norm": 1.621139407157898,
2199
- "learning_rate": 6.872e-05,
2200
- "loss": 2.2916,
2201
- "step": 1565
2202
- },
2203
- {
2204
- "epoch": 0.628,
2205
- "grad_norm": 2.306574821472168,
2206
- "learning_rate": 6.862e-05,
2207
- "loss": 1.7473,
2208
- "step": 1570
2209
- },
2210
- {
2211
- "epoch": 0.63,
2212
- "grad_norm": 2.851588010787964,
2213
- "learning_rate": 6.852e-05,
2214
- "loss": 1.5369,
2215
- "step": 1575
2216
- },
2217
- {
2218
- "epoch": 0.632,
2219
- "grad_norm": 3.665318489074707,
2220
- "learning_rate": 6.842000000000001e-05,
2221
- "loss": 1.7895,
2222
- "step": 1580
2223
- },
2224
- {
2225
- "epoch": 0.634,
2226
- "grad_norm": 1.9340227842330933,
2227
- "learning_rate": 6.832000000000001e-05,
2228
- "loss": 1.9506,
2229
- "step": 1585
2230
- },
2231
- {
2232
- "epoch": 0.636,
2233
- "grad_norm": 4.726400375366211,
2234
- "learning_rate": 6.822000000000001e-05,
2235
- "loss": 1.8055,
2236
- "step": 1590
2237
- },
2238
- {
2239
- "epoch": 0.638,
2240
- "grad_norm": 3.3782994747161865,
2241
- "learning_rate": 6.812e-05,
2242
- "loss": 1.9607,
2243
- "step": 1595
2244
- },
2245
- {
2246
- "epoch": 0.64,
2247
- "grad_norm": 2.157594680786133,
2248
- "learning_rate": 6.802e-05,
2249
- "loss": 1.9568,
2250
- "step": 1600
2251
- },
2252
- {
2253
- "epoch": 0.642,
2254
- "grad_norm": 2.580761671066284,
2255
- "learning_rate": 6.792e-05,
2256
- "loss": 1.8217,
2257
- "step": 1605
2258
- },
2259
- {
2260
- "epoch": 0.644,
2261
- "grad_norm": 2.2638015747070312,
2262
- "learning_rate": 6.782000000000001e-05,
2263
- "loss": 1.6837,
2264
- "step": 1610
2265
- },
2266
- {
2267
- "epoch": 0.646,
2268
- "grad_norm": 4.926771640777588,
2269
- "learning_rate": 6.772000000000001e-05,
2270
- "loss": 1.8462,
2271
- "step": 1615
2272
- },
2273
- {
2274
- "epoch": 0.648,
2275
- "grad_norm": 2.017150640487671,
2276
- "learning_rate": 6.762e-05,
2277
- "loss": 2.0979,
2278
- "step": 1620
2279
- },
2280
- {
2281
- "epoch": 0.65,
2282
- "grad_norm": 1.7009762525558472,
2283
- "learning_rate": 6.752e-05,
2284
- "loss": 1.9508,
2285
- "step": 1625
2286
- },
2287
- {
2288
- "epoch": 0.652,
2289
- "grad_norm": 1.5154443979263306,
2290
- "learning_rate": 6.742e-05,
2291
- "loss": 1.8678,
2292
- "step": 1630
2293
- },
2294
- {
2295
- "epoch": 0.654,
2296
- "grad_norm": 2.348085403442383,
2297
- "learning_rate": 6.732e-05,
2298
- "loss": 2.0632,
2299
- "step": 1635
2300
- },
2301
- {
2302
- "epoch": 0.656,
2303
- "grad_norm": 3.450380802154541,
2304
- "learning_rate": 6.722000000000001e-05,
2305
- "loss": 1.8161,
2306
- "step": 1640
2307
- },
2308
- {
2309
- "epoch": 0.658,
2310
- "grad_norm": 1.0829286575317383,
2311
- "learning_rate": 6.712000000000001e-05,
2312
- "loss": 1.9894,
2313
- "step": 1645
2314
- },
2315
- {
2316
- "epoch": 0.66,
2317
- "grad_norm": 2.454120397567749,
2318
- "learning_rate": 6.702e-05,
2319
- "loss": 1.4593,
2320
- "step": 1650
2321
- },
2322
- {
2323
- "epoch": 0.662,
2324
- "grad_norm": 1.4079653024673462,
2325
- "learning_rate": 6.692e-05,
2326
- "loss": 1.6048,
2327
- "step": 1655
2328
- },
2329
- {
2330
- "epoch": 0.664,
2331
- "grad_norm": 2.143089771270752,
2332
- "learning_rate": 6.682e-05,
2333
- "loss": 1.8546,
2334
- "step": 1660
2335
- },
2336
- {
2337
- "epoch": 0.666,
2338
- "grad_norm": 1.7809556722640991,
2339
- "learning_rate": 6.672e-05,
2340
- "loss": 1.8759,
2341
- "step": 1665
2342
- },
2343
- {
2344
- "epoch": 0.668,
2345
- "grad_norm": 2.6478631496429443,
2346
- "learning_rate": 6.662000000000001e-05,
2347
- "loss": 2.2062,
2348
- "step": 1670
2349
- },
2350
- {
2351
- "epoch": 0.67,
2352
- "grad_norm": 3.3029139041900635,
2353
- "learning_rate": 6.652000000000001e-05,
2354
- "loss": 1.6157,
2355
- "step": 1675
2356
- },
2357
- {
2358
- "epoch": 0.672,
2359
- "grad_norm": 2.268291473388672,
2360
- "learning_rate": 6.642e-05,
2361
- "loss": 1.7665,
2362
- "step": 1680
2363
- },
2364
- {
2365
- "epoch": 0.674,
2366
- "grad_norm": 2.053265333175659,
2367
- "learning_rate": 6.632e-05,
2368
- "loss": 2.01,
2369
- "step": 1685
2370
- },
2371
- {
2372
- "epoch": 0.676,
2373
- "grad_norm": 2.9823215007781982,
2374
- "learning_rate": 6.622e-05,
2375
- "loss": 2.2441,
2376
- "step": 1690
2377
- },
2378
- {
2379
- "epoch": 0.678,
2380
- "grad_norm": 2.4951868057250977,
2381
- "learning_rate": 6.612000000000001e-05,
2382
- "loss": 1.7005,
2383
- "step": 1695
2384
- },
2385
- {
2386
- "epoch": 0.68,
2387
- "grad_norm": 3.276228666305542,
2388
- "learning_rate": 6.602000000000001e-05,
2389
- "loss": 1.7218,
2390
- "step": 1700
2391
- },
2392
- {
2393
- "epoch": 0.682,
2394
- "grad_norm": 1.6981475353240967,
2395
- "learning_rate": 6.592e-05,
2396
- "loss": 1.8756,
2397
- "step": 1705
2398
- },
2399
- {
2400
- "epoch": 0.684,
2401
- "grad_norm": 2.3083853721618652,
2402
- "learning_rate": 6.582e-05,
2403
- "loss": 1.7134,
2404
- "step": 1710
2405
- },
2406
- {
2407
- "epoch": 0.686,
2408
- "grad_norm": 1.466787576675415,
2409
- "learning_rate": 6.572e-05,
2410
- "loss": 1.758,
2411
- "step": 1715
2412
- },
2413
- {
2414
- "epoch": 0.688,
2415
- "grad_norm": 3.2987775802612305,
2416
- "learning_rate": 6.562e-05,
2417
- "loss": 1.8357,
2418
- "step": 1720
2419
- },
2420
- {
2421
- "epoch": 0.69,
2422
- "grad_norm": 2.7337427139282227,
2423
- "learning_rate": 6.552000000000001e-05,
2424
- "loss": 1.9261,
2425
- "step": 1725
2426
- },
2427
- {
2428
- "epoch": 0.692,
2429
- "grad_norm": 3.676628828048706,
2430
- "learning_rate": 6.542000000000001e-05,
2431
- "loss": 2.2404,
2432
- "step": 1730
2433
- },
2434
- {
2435
- "epoch": 0.694,
2436
- "grad_norm": 1.8547945022583008,
2437
- "learning_rate": 6.532e-05,
2438
- "loss": 1.5531,
2439
- "step": 1735
2440
- },
2441
- {
2442
- "epoch": 0.696,
2443
- "grad_norm": 1.6941248178482056,
2444
- "learning_rate": 6.522e-05,
2445
- "loss": 1.7762,
2446
- "step": 1740
2447
- },
2448
- {
2449
- "epoch": 0.698,
2450
- "grad_norm": 1.8873628377914429,
2451
- "learning_rate": 6.512e-05,
2452
- "loss": 1.8979,
2453
- "step": 1745
2454
- },
2455
- {
2456
- "epoch": 0.7,
2457
- "grad_norm": 2.069035768508911,
2458
- "learning_rate": 6.502e-05,
2459
- "loss": 1.6585,
2460
- "step": 1750
2461
- },
2462
- {
2463
- "epoch": 0.702,
2464
- "grad_norm": 2.0181164741516113,
2465
- "learning_rate": 6.492000000000001e-05,
2466
- "loss": 1.5298,
2467
- "step": 1755
2468
- },
2469
- {
2470
- "epoch": 0.704,
2471
- "grad_norm": 3.213226795196533,
2472
- "learning_rate": 6.482e-05,
2473
- "loss": 1.8443,
2474
- "step": 1760
2475
- },
2476
- {
2477
- "epoch": 0.706,
2478
- "grad_norm": 1.1691619157791138,
2479
- "learning_rate": 6.472e-05,
2480
- "loss": 2.0895,
2481
- "step": 1765
2482
- },
2483
- {
2484
- "epoch": 0.708,
2485
- "grad_norm": 2.166172504425049,
2486
- "learning_rate": 6.462e-05,
2487
- "loss": 2.0047,
2488
- "step": 1770
2489
- },
2490
- {
2491
- "epoch": 0.71,
2492
- "grad_norm": 3.0072996616363525,
2493
- "learning_rate": 6.452e-05,
2494
- "loss": 1.7831,
2495
- "step": 1775
2496
- },
2497
- {
2498
- "epoch": 0.712,
2499
- "grad_norm": 2.720421552658081,
2500
- "learning_rate": 6.442e-05,
2501
- "loss": 1.8452,
2502
- "step": 1780
2503
- },
2504
- {
2505
- "epoch": 0.714,
2506
- "grad_norm": 2.536058187484741,
2507
- "learning_rate": 6.432000000000001e-05,
2508
- "loss": 1.7563,
2509
- "step": 1785
2510
- },
2511
- {
2512
- "epoch": 0.716,
2513
- "grad_norm": 3.408418893814087,
2514
- "learning_rate": 6.422e-05,
2515
- "loss": 1.6771,
2516
- "step": 1790
2517
- },
2518
- {
2519
- "epoch": 0.718,
2520
- "grad_norm": 2.075005531311035,
2521
- "learning_rate": 6.412e-05,
2522
- "loss": 2.1428,
2523
- "step": 1795
2524
- },
2525
- {
2526
- "epoch": 0.72,
2527
- "grad_norm": 2.7794342041015625,
2528
- "learning_rate": 6.402e-05,
2529
- "loss": 1.7375,
2530
- "step": 1800
2531
- },
2532
- {
2533
- "epoch": 0.722,
2534
- "grad_norm": 3.188624382019043,
2535
- "learning_rate": 6.392e-05,
2536
- "loss": 1.5951,
2537
- "step": 1805
2538
- },
2539
- {
2540
- "epoch": 0.724,
2541
- "grad_norm": 2.1974058151245117,
2542
- "learning_rate": 6.382e-05,
2543
- "loss": 1.9184,
2544
- "step": 1810
2545
- },
2546
- {
2547
- "epoch": 0.726,
2548
- "grad_norm": 2.495058298110962,
2549
- "learning_rate": 6.372e-05,
2550
- "loss": 1.7634,
2551
- "step": 1815
2552
- },
2553
- {
2554
- "epoch": 0.728,
2555
- "grad_norm": 3.094088077545166,
2556
- "learning_rate": 6.362e-05,
2557
- "loss": 1.8355,
2558
- "step": 1820
2559
- },
2560
- {
2561
- "epoch": 0.73,
2562
- "grad_norm": 2.500934600830078,
2563
- "learning_rate": 6.352e-05,
2564
- "loss": 1.4541,
2565
- "step": 1825
2566
- },
2567
- {
2568
- "epoch": 0.732,
2569
- "grad_norm": 2.872494697570801,
2570
- "learning_rate": 6.342e-05,
2571
- "loss": 1.7752,
2572
- "step": 1830
2573
- },
2574
- {
2575
- "epoch": 0.734,
2576
- "grad_norm": 1.8021352291107178,
2577
- "learning_rate": 6.332e-05,
2578
- "loss": 1.8278,
2579
- "step": 1835
2580
- },
2581
- {
2582
- "epoch": 0.736,
2583
- "grad_norm": 2.14013409614563,
2584
- "learning_rate": 6.322000000000001e-05,
2585
- "loss": 1.728,
2586
- "step": 1840
2587
- },
2588
- {
2589
- "epoch": 0.738,
2590
- "grad_norm": 1.6599818468093872,
2591
- "learning_rate": 6.312e-05,
2592
- "loss": 2.1892,
2593
- "step": 1845
2594
- },
2595
- {
2596
- "epoch": 0.74,
2597
- "grad_norm": 4.102724552154541,
2598
- "learning_rate": 6.302e-05,
2599
- "loss": 2.011,
2600
- "step": 1850
2601
- },
2602
- {
2603
- "epoch": 0.742,
2604
- "grad_norm": 1.7305388450622559,
2605
- "learning_rate": 6.292e-05,
2606
- "loss": 1.7146,
2607
- "step": 1855
2608
- },
2609
- {
2610
- "epoch": 0.744,
2611
- "grad_norm": 2.732679843902588,
2612
- "learning_rate": 6.282e-05,
2613
- "loss": 2.1723,
2614
- "step": 1860
2615
- },
2616
- {
2617
- "epoch": 0.746,
2618
- "grad_norm": 2.7860026359558105,
2619
- "learning_rate": 6.272e-05,
2620
- "loss": 1.3846,
2621
- "step": 1865
2622
- },
2623
- {
2624
- "epoch": 0.748,
2625
- "grad_norm": 2.3102917671203613,
2626
- "learning_rate": 6.262000000000001e-05,
2627
- "loss": 2.3062,
2628
- "step": 1870
2629
- },
2630
- {
2631
- "epoch": 0.75,
2632
- "grad_norm": 2.2898411750793457,
2633
- "learning_rate": 6.252e-05,
2634
- "loss": 1.8194,
2635
- "step": 1875
2636
- },
2637
- {
2638
- "epoch": 0.752,
2639
- "grad_norm": 2.242110252380371,
2640
- "learning_rate": 6.242e-05,
2641
- "loss": 1.3548,
2642
- "step": 1880
2643
- },
2644
- {
2645
- "epoch": 0.754,
2646
- "grad_norm": 2.670325994491577,
2647
- "learning_rate": 6.232e-05,
2648
- "loss": 1.7741,
2649
- "step": 1885
2650
- },
2651
- {
2652
- "epoch": 0.756,
2653
- "grad_norm": 2.8892014026641846,
2654
- "learning_rate": 6.222e-05,
2655
- "loss": 1.8173,
2656
- "step": 1890
2657
- },
2658
- {
2659
- "epoch": 0.758,
2660
- "grad_norm": 2.0819385051727295,
2661
- "learning_rate": 6.212e-05,
2662
- "loss": 1.8424,
2663
- "step": 1895
2664
- },
2665
- {
2666
- "epoch": 0.76,
2667
- "grad_norm": 3.9723422527313232,
2668
- "learning_rate": 6.202e-05,
2669
- "loss": 1.6035,
2670
- "step": 1900
2671
- },
2672
- {
2673
- "epoch": 0.762,
2674
- "grad_norm": 2.007082939147949,
2675
- "learning_rate": 6.192e-05,
2676
- "loss": 2.0778,
2677
- "step": 1905
2678
- },
2679
- {
2680
- "epoch": 0.764,
2681
- "grad_norm": 3.79123854637146,
2682
- "learning_rate": 6.182e-05,
2683
- "loss": 1.9806,
2684
- "step": 1910
2685
- },
2686
- {
2687
- "epoch": 0.766,
2688
- "grad_norm": 3.2290866374969482,
2689
- "learning_rate": 6.172e-05,
2690
- "loss": 1.8257,
2691
- "step": 1915
2692
- },
2693
- {
2694
- "epoch": 0.768,
2695
- "grad_norm": 1.8563956022262573,
2696
- "learning_rate": 6.162e-05,
2697
- "loss": 1.8678,
2698
- "step": 1920
2699
- },
2700
- {
2701
- "epoch": 0.77,
2702
- "grad_norm": 2.831134080886841,
2703
- "learning_rate": 6.152e-05,
2704
- "loss": 2.0049,
2705
- "step": 1925
2706
- },
2707
- {
2708
- "epoch": 0.772,
2709
- "grad_norm": 3.1902923583984375,
2710
- "learning_rate": 6.142e-05,
2711
- "loss": 1.5629,
2712
- "step": 1930
2713
- },
2714
- {
2715
- "epoch": 0.774,
2716
- "grad_norm": 2.6706533432006836,
2717
- "learning_rate": 6.132e-05,
2718
- "loss": 1.7534,
2719
- "step": 1935
2720
- },
2721
- {
2722
- "epoch": 0.776,
2723
- "grad_norm": 1.5922584533691406,
2724
- "learning_rate": 6.122e-05,
2725
- "loss": 1.6197,
2726
- "step": 1940
2727
- },
2728
- {
2729
- "epoch": 0.778,
2730
- "grad_norm": 3.367527723312378,
2731
- "learning_rate": 6.112e-05,
2732
- "loss": 1.7022,
2733
- "step": 1945
2734
- },
2735
- {
2736
- "epoch": 0.78,
2737
- "grad_norm": 2.544776678085327,
2738
- "learning_rate": 6.102e-05,
2739
- "loss": 2.0928,
2740
- "step": 1950
2741
- },
2742
- {
2743
- "epoch": 0.782,
2744
- "grad_norm": 1.8083670139312744,
2745
- "learning_rate": 6.092e-05,
2746
- "loss": 1.8053,
2747
- "step": 1955
2748
- },
2749
- {
2750
- "epoch": 0.784,
2751
- "grad_norm": 5.398744583129883,
2752
- "learning_rate": 6.082e-05,
2753
- "loss": 1.8233,
2754
- "step": 1960
2755
- },
2756
- {
2757
- "epoch": 0.786,
2758
- "grad_norm": 2.380007743835449,
2759
- "learning_rate": 6.072e-05,
2760
- "loss": 1.3794,
2761
- "step": 1965
2762
- },
2763
- {
2764
- "epoch": 0.788,
2765
- "grad_norm": 2.977511405944824,
2766
- "learning_rate": 6.062e-05,
2767
- "loss": 1.8151,
2768
- "step": 1970
2769
- },
2770
- {
2771
- "epoch": 0.79,
2772
- "grad_norm": 1.6027389764785767,
2773
- "learning_rate": 6.0519999999999997e-05,
2774
- "loss": 1.4474,
2775
- "step": 1975
2776
- },
2777
- {
2778
- "epoch": 0.792,
2779
- "grad_norm": 1.7922685146331787,
2780
- "learning_rate": 6.042e-05,
2781
- "loss": 1.4798,
2782
- "step": 1980
2783
- },
2784
- {
2785
- "epoch": 0.794,
2786
- "grad_norm": 4.0504984855651855,
2787
- "learning_rate": 6.032e-05,
2788
- "loss": 2.069,
2789
- "step": 1985
2790
- },
2791
- {
2792
- "epoch": 0.796,
2793
- "grad_norm": 1.401548147201538,
2794
- "learning_rate": 6.0219999999999996e-05,
2795
- "loss": 1.8933,
2796
- "step": 1990
2797
- },
2798
- {
2799
- "epoch": 0.798,
2800
- "grad_norm": 1.408260464668274,
2801
- "learning_rate": 6.012e-05,
2802
- "loss": 1.9556,
2803
- "step": 1995
2804
- },
2805
- {
2806
- "epoch": 0.8,
2807
- "grad_norm": 2.128838062286377,
2808
- "learning_rate": 6.002e-05,
2809
- "loss": 1.6432,
2810
- "step": 2000
2811
- },
2812
- {
2813
- "epoch": 0.802,
2814
- "grad_norm": 7.282062530517578,
2815
- "learning_rate": 5.9919999999999996e-05,
2816
- "loss": 2.1569,
2817
- "step": 2005
2818
- },
2819
- {
2820
- "epoch": 0.804,
2821
- "grad_norm": 2.412156343460083,
2822
- "learning_rate": 5.982e-05,
2823
- "loss": 1.4548,
2824
- "step": 2010
2825
- },
2826
- {
2827
- "epoch": 0.806,
2828
- "grad_norm": 2.9918742179870605,
2829
- "learning_rate": 5.972e-05,
2830
- "loss": 1.5009,
2831
- "step": 2015
2832
- },
2833
- {
2834
- "epoch": 0.808,
2835
- "grad_norm": 5.301854610443115,
2836
- "learning_rate": 5.9619999999999995e-05,
2837
- "loss": 1.5879,
2838
- "step": 2020
2839
- },
2840
- {
2841
- "epoch": 0.81,
2842
- "grad_norm": 3.3276255130767822,
2843
- "learning_rate": 5.952e-05,
2844
- "loss": 1.5994,
2845
- "step": 2025
2846
- },
2847
- {
2848
- "epoch": 0.812,
2849
- "grad_norm": 2.128038167953491,
2850
- "learning_rate": 5.942e-05,
2851
- "loss": 1.8374,
2852
- "step": 2030
2853
- },
2854
- {
2855
- "epoch": 0.814,
2856
- "grad_norm": 3.896848201751709,
2857
- "learning_rate": 5.9319999999999994e-05,
2858
- "loss": 1.5896,
2859
- "step": 2035
2860
- },
2861
- {
2862
- "epoch": 0.816,
2863
- "grad_norm": 2.371381998062134,
2864
- "learning_rate": 5.922e-05,
2865
- "loss": 1.7849,
2866
- "step": 2040
2867
- },
2868
- {
2869
- "epoch": 0.818,
2870
- "grad_norm": 1.7761462926864624,
2871
- "learning_rate": 5.9119999999999996e-05,
2872
- "loss": 2.2341,
2873
- "step": 2045
2874
- },
2875
- {
2876
- "epoch": 0.82,
2877
- "grad_norm": 2.826425552368164,
2878
- "learning_rate": 5.902e-05,
2879
- "loss": 2.1281,
2880
- "step": 2050
2881
- },
2882
- {
2883
- "epoch": 0.822,
2884
- "grad_norm": 3.5838959217071533,
2885
- "learning_rate": 5.892e-05,
2886
- "loss": 1.8984,
2887
- "step": 2055
2888
- },
2889
- {
2890
- "epoch": 0.824,
2891
- "grad_norm": 3.9069666862487793,
2892
- "learning_rate": 5.8819999999999996e-05,
2893
- "loss": 1.8578,
2894
- "step": 2060
2895
- },
2896
- {
2897
- "epoch": 0.826,
2898
- "grad_norm": 4.064440727233887,
2899
- "learning_rate": 5.872000000000001e-05,
2900
- "loss": 2.0205,
2901
- "step": 2065
2902
- },
2903
- {
2904
- "epoch": 0.828,
2905
- "grad_norm": 1.290831208229065,
2906
- "learning_rate": 5.862000000000001e-05,
2907
- "loss": 1.8112,
2908
- "step": 2070
2909
- },
2910
- {
2911
- "epoch": 0.83,
2912
- "grad_norm": 2.8391001224517822,
2913
- "learning_rate": 5.852000000000001e-05,
2914
- "loss": 1.3297,
2915
- "step": 2075
2916
- },
2917
- {
2918
- "epoch": 0.832,
2919
- "grad_norm": 2.2486915588378906,
2920
- "learning_rate": 5.8420000000000006e-05,
2921
- "loss": 1.5082,
2922
- "step": 2080
2923
- },
2924
- {
2925
- "epoch": 0.834,
2926
- "grad_norm": 2.228530168533325,
2927
- "learning_rate": 5.832000000000001e-05,
2928
- "loss": 2.0064,
2929
- "step": 2085
2930
- },
2931
- {
2932
- "epoch": 0.836,
2933
- "grad_norm": 2.0774176120758057,
2934
- "learning_rate": 5.822000000000001e-05,
2935
- "loss": 1.5593,
2936
- "step": 2090
2937
- },
2938
- {
2939
- "epoch": 0.838,
2940
- "grad_norm": 3.9520459175109863,
2941
- "learning_rate": 5.8120000000000006e-05,
2942
- "loss": 1.3591,
2943
- "step": 2095
2944
- },
2945
- {
2946
- "epoch": 0.84,
2947
- "grad_norm": 2.112677574157715,
2948
- "learning_rate": 5.802000000000001e-05,
2949
- "loss": 2.1816,
2950
- "step": 2100
2951
- },
2952
- {
2953
- "epoch": 0.842,
2954
- "grad_norm": 2.870356798171997,
2955
- "learning_rate": 5.792000000000001e-05,
2956
- "loss": 1.9012,
2957
- "step": 2105
2958
- },
2959
- {
2960
- "epoch": 0.844,
2961
- "grad_norm": 2.8879733085632324,
2962
- "learning_rate": 5.7820000000000005e-05,
2963
- "loss": 1.604,
2964
- "step": 2110
2965
- },
2966
- {
2967
- "epoch": 0.846,
2968
- "grad_norm": 2.116102933883667,
2969
- "learning_rate": 5.772000000000001e-05,
2970
- "loss": 1.5525,
2971
- "step": 2115
2972
- },
2973
- {
2974
- "epoch": 0.848,
2975
- "grad_norm": 4.587926387786865,
2976
- "learning_rate": 5.762000000000001e-05,
2977
- "loss": 2.0804,
2978
- "step": 2120
2979
- },
2980
- {
2981
- "epoch": 0.85,
2982
- "grad_norm": 1.983154058456421,
2983
- "learning_rate": 5.7520000000000005e-05,
2984
- "loss": 1.4631,
2985
- "step": 2125
2986
- },
2987
- {
2988
- "epoch": 0.852,
2989
- "grad_norm": 1.5361416339874268,
2990
- "learning_rate": 5.742000000000001e-05,
2991
- "loss": 2.3421,
2992
- "step": 2130
2993
- },
2994
- {
2995
- "epoch": 0.854,
2996
- "grad_norm": 1.5888581275939941,
2997
- "learning_rate": 5.732000000000001e-05,
2998
- "loss": 1.5937,
2999
- "step": 2135
3000
- },
3001
- {
3002
- "epoch": 0.856,
3003
- "grad_norm": 2.2069616317749023,
3004
- "learning_rate": 5.7220000000000004e-05,
3005
- "loss": 1.7698,
3006
- "step": 2140
3007
- },
3008
- {
3009
- "epoch": 0.858,
3010
- "grad_norm": 3.34380841255188,
3011
- "learning_rate": 5.712000000000001e-05,
3012
- "loss": 2.0116,
3013
- "step": 2145
3014
- },
3015
- {
3016
- "epoch": 0.86,
3017
- "grad_norm": 2.184051513671875,
3018
- "learning_rate": 5.7020000000000006e-05,
3019
- "loss": 1.8469,
3020
- "step": 2150
3021
- },
3022
- {
3023
- "epoch": 0.862,
3024
- "grad_norm": 4.115564823150635,
3025
- "learning_rate": 5.6920000000000004e-05,
3026
- "loss": 1.6461,
3027
- "step": 2155
3028
- },
3029
- {
3030
- "epoch": 0.864,
3031
- "grad_norm": 3.084815263748169,
3032
- "learning_rate": 5.682000000000001e-05,
3033
- "loss": 1.5599,
3034
- "step": 2160
3035
- },
3036
- {
3037
- "epoch": 0.866,
3038
- "grad_norm": 2.8951117992401123,
3039
- "learning_rate": 5.6720000000000006e-05,
3040
- "loss": 2.0385,
3041
- "step": 2165
3042
- },
3043
- {
3044
- "epoch": 0.868,
3045
- "grad_norm": 2.4090707302093506,
3046
- "learning_rate": 5.6620000000000003e-05,
3047
- "loss": 1.74,
3048
- "step": 2170
3049
- },
3050
- {
3051
- "epoch": 0.87,
3052
- "grad_norm": 2.6545732021331787,
3053
- "learning_rate": 5.652000000000001e-05,
3054
- "loss": 2.3722,
3055
- "step": 2175
3056
- },
3057
- {
3058
- "epoch": 0.872,
3059
- "grad_norm": 2.1310207843780518,
3060
- "learning_rate": 5.6420000000000005e-05,
3061
- "loss": 2.0919,
3062
- "step": 2180
3063
- },
3064
- {
3065
- "epoch": 0.874,
3066
- "grad_norm": 1.826372504234314,
3067
- "learning_rate": 5.632e-05,
3068
- "loss": 1.8353,
3069
- "step": 2185
3070
- },
3071
- {
3072
- "epoch": 0.876,
3073
- "grad_norm": 3.4520180225372314,
3074
- "learning_rate": 5.622000000000001e-05,
3075
- "loss": 1.8989,
3076
- "step": 2190
3077
- },
3078
- {
3079
- "epoch": 0.878,
3080
- "grad_norm": 3.487771511077881,
3081
- "learning_rate": 5.6120000000000005e-05,
3082
- "loss": 2.0489,
3083
- "step": 2195
3084
- },
3085
- {
3086
- "epoch": 0.88,
3087
- "grad_norm": 2.4317750930786133,
3088
- "learning_rate": 5.602000000000001e-05,
3089
- "loss": 1.5238,
3090
- "step": 2200
3091
- },
3092
- {
3093
- "epoch": 0.882,
3094
- "grad_norm": 4.03161096572876,
3095
- "learning_rate": 5.592000000000001e-05,
3096
- "loss": 2.0312,
3097
- "step": 2205
3098
- },
3099
- {
3100
- "epoch": 0.884,
3101
- "grad_norm": 1.701350450515747,
3102
- "learning_rate": 5.5820000000000004e-05,
3103
- "loss": 1.6582,
3104
- "step": 2210
3105
- },
3106
- {
3107
- "epoch": 0.886,
3108
- "grad_norm": 2.434293746948242,
3109
- "learning_rate": 5.572000000000001e-05,
3110
- "loss": 2.1474,
3111
- "step": 2215
3112
- },
3113
- {
3114
- "epoch": 0.888,
3115
- "grad_norm": 2.668346405029297,
3116
- "learning_rate": 5.5620000000000006e-05,
3117
- "loss": 1.7028,
3118
- "step": 2220
3119
- },
3120
- {
3121
- "epoch": 0.89,
3122
- "grad_norm": 2.782132148742676,
3123
- "learning_rate": 5.5520000000000004e-05,
3124
- "loss": 1.5188,
3125
- "step": 2225
3126
- },
3127
- {
3128
- "epoch": 0.892,
3129
- "grad_norm": 3.1809840202331543,
3130
- "learning_rate": 5.542000000000001e-05,
3131
- "loss": 1.5867,
3132
- "step": 2230
3133
- },
3134
- {
3135
- "epoch": 0.894,
3136
- "grad_norm": 3.710517644882202,
3137
- "learning_rate": 5.5320000000000006e-05,
3138
- "loss": 1.6012,
3139
- "step": 2235
3140
- },
3141
- {
3142
- "epoch": 0.896,
3143
- "grad_norm": 2.689161539077759,
3144
- "learning_rate": 5.522e-05,
3145
- "loss": 1.6461,
3146
- "step": 2240
3147
- },
3148
- {
3149
- "epoch": 0.898,
3150
- "grad_norm": 3.879901647567749,
3151
- "learning_rate": 5.512000000000001e-05,
3152
- "loss": 1.8078,
3153
- "step": 2245
3154
- },
3155
- {
3156
- "epoch": 0.9,
3157
- "grad_norm": 3.5880234241485596,
3158
- "learning_rate": 5.5020000000000005e-05,
3159
- "loss": 1.862,
3160
- "step": 2250
3161
- },
3162
- {
3163
- "epoch": 0.902,
3164
- "grad_norm": 2.162250518798828,
3165
- "learning_rate": 5.492e-05,
3166
- "loss": 1.7578,
3167
- "step": 2255
3168
- },
3169
- {
3170
- "epoch": 0.904,
3171
- "grad_norm": 2.5121278762817383,
3172
- "learning_rate": 5.482000000000001e-05,
3173
- "loss": 1.9823,
3174
- "step": 2260
3175
- },
3176
- {
3177
- "epoch": 0.906,
3178
- "grad_norm": 2.9544060230255127,
3179
- "learning_rate": 5.4720000000000005e-05,
3180
- "loss": 1.6525,
3181
- "step": 2265
3182
- },
3183
- {
3184
- "epoch": 0.908,
3185
- "grad_norm": 3.3571219444274902,
3186
- "learning_rate": 5.462e-05,
3187
- "loss": 1.5033,
3188
- "step": 2270
3189
- },
3190
- {
3191
- "epoch": 0.91,
3192
- "grad_norm": 2.5898938179016113,
3193
- "learning_rate": 5.4520000000000007e-05,
3194
- "loss": 1.7722,
3195
- "step": 2275
3196
- },
3197
- {
3198
- "epoch": 0.912,
3199
- "grad_norm": 3.3335447311401367,
3200
- "learning_rate": 5.4420000000000004e-05,
3201
- "loss": 1.6362,
3202
- "step": 2280
3203
- },
3204
- {
3205
- "epoch": 0.914,
3206
- "grad_norm": 2.584991455078125,
3207
- "learning_rate": 5.432e-05,
3208
- "loss": 1.4556,
3209
- "step": 2285
3210
- },
3211
- {
3212
- "epoch": 0.916,
3213
- "grad_norm": 2.4838953018188477,
3214
- "learning_rate": 5.4220000000000006e-05,
3215
- "loss": 1.4268,
3216
- "step": 2290
3217
- },
3218
- {
3219
- "epoch": 0.918,
3220
- "grad_norm": 2.082561492919922,
3221
- "learning_rate": 5.4120000000000004e-05,
3222
- "loss": 1.6695,
3223
- "step": 2295
3224
- },
3225
- {
3226
- "epoch": 0.92,
3227
- "grad_norm": 3.49015474319458,
3228
- "learning_rate": 5.402e-05,
3229
- "loss": 1.8325,
3230
- "step": 2300
3231
- },
3232
- {
3233
- "epoch": 0.922,
3234
- "grad_norm": 4.535400867462158,
3235
- "learning_rate": 5.3920000000000006e-05,
3236
- "loss": 1.7432,
3237
- "step": 2305
3238
- },
3239
- {
3240
- "epoch": 0.924,
3241
- "grad_norm": 1.199286699295044,
3242
- "learning_rate": 5.382e-05,
3243
- "loss": 2.2751,
3244
- "step": 2310
3245
- },
3246
- {
3247
- "epoch": 0.926,
3248
- "grad_norm": 3.7484588623046875,
3249
- "learning_rate": 5.372e-05,
3250
- "loss": 2.0561,
3251
- "step": 2315
3252
- },
3253
- {
3254
- "epoch": 0.928,
3255
- "grad_norm": 2.494021415710449,
3256
- "learning_rate": 5.3620000000000005e-05,
3257
- "loss": 1.7586,
3258
- "step": 2320
3259
- },
3260
- {
3261
- "epoch": 0.93,
3262
- "grad_norm": 1.4161405563354492,
3263
- "learning_rate": 5.352e-05,
3264
- "loss": 1.8513,
3265
- "step": 2325
3266
- },
3267
- {
3268
- "epoch": 0.932,
3269
- "grad_norm": 3.006577253341675,
3270
- "learning_rate": 5.342e-05,
3271
- "loss": 1.9067,
3272
- "step": 2330
3273
- },
3274
- {
3275
- "epoch": 0.934,
3276
- "grad_norm": 2.625708818435669,
3277
- "learning_rate": 5.3320000000000004e-05,
3278
- "loss": 1.4276,
3279
- "step": 2335
3280
- },
3281
- {
3282
- "epoch": 0.936,
3283
- "grad_norm": 2.3370842933654785,
3284
- "learning_rate": 5.322e-05,
3285
- "loss": 2.1078,
3286
- "step": 2340
3287
- },
3288
- {
3289
- "epoch": 0.938,
3290
- "grad_norm": 2.641144275665283,
3291
- "learning_rate": 5.3120000000000006e-05,
3292
- "loss": 1.3618,
3293
- "step": 2345
3294
- },
3295
- {
3296
- "epoch": 0.94,
3297
- "grad_norm": 1.7367668151855469,
3298
- "learning_rate": 5.3020000000000004e-05,
3299
- "loss": 2.147,
3300
- "step": 2350
3301
- },
3302
- {
3303
- "epoch": 0.942,
3304
- "grad_norm": 2.7725813388824463,
3305
- "learning_rate": 5.292e-05,
3306
- "loss": 1.437,
3307
- "step": 2355
3308
- },
3309
- {
3310
- "epoch": 0.944,
3311
- "grad_norm": 4.516371250152588,
3312
- "learning_rate": 5.2820000000000006e-05,
3313
- "loss": 1.7548,
3314
- "step": 2360
3315
- },
3316
- {
3317
- "epoch": 0.946,
3318
- "grad_norm": 3.1467254161834717,
3319
- "learning_rate": 5.2720000000000003e-05,
3320
- "loss": 1.5239,
3321
- "step": 2365
3322
- },
3323
- {
3324
- "epoch": 0.948,
3325
- "grad_norm": 3.392289638519287,
3326
- "learning_rate": 5.262e-05,
3327
- "loss": 1.6646,
3328
- "step": 2370
3329
- },
3330
- {
3331
- "epoch": 0.95,
3332
- "grad_norm": 2.6524746417999268,
3333
- "learning_rate": 5.2520000000000005e-05,
3334
- "loss": 1.6977,
3335
- "step": 2375
3336
- },
3337
- {
3338
- "epoch": 0.952,
3339
- "grad_norm": 1.8809561729431152,
3340
- "learning_rate": 5.242e-05,
3341
- "loss": 1.4686,
3342
- "step": 2380
3343
- },
3344
- {
3345
- "epoch": 0.954,
3346
- "grad_norm": 2.859346866607666,
3347
- "learning_rate": 5.232e-05,
3348
- "loss": 1.9559,
3349
- "step": 2385
3350
- },
3351
- {
3352
- "epoch": 0.956,
3353
- "grad_norm": 2.9633779525756836,
3354
- "learning_rate": 5.2220000000000005e-05,
3355
- "loss": 1.907,
3356
- "step": 2390
3357
- },
3358
- {
3359
- "epoch": 0.958,
3360
- "grad_norm": 2.6979637145996094,
3361
- "learning_rate": 5.212e-05,
3362
- "loss": 1.3605,
3363
- "step": 2395
3364
- },
3365
- {
3366
- "epoch": 0.96,
3367
- "grad_norm": 3.2229700088500977,
3368
- "learning_rate": 5.202e-05,
3369
- "loss": 1.4891,
3370
- "step": 2400
3371
- },
3372
- {
3373
- "epoch": 0.962,
3374
- "grad_norm": 2.6224522590637207,
3375
- "learning_rate": 5.1920000000000004e-05,
3376
- "loss": 1.6005,
3377
- "step": 2405
3378
- },
3379
- {
3380
- "epoch": 0.964,
3381
- "grad_norm": 2.480083703994751,
3382
- "learning_rate": 5.182e-05,
3383
- "loss": 1.596,
3384
- "step": 2410
3385
- },
3386
- {
3387
- "epoch": 0.966,
3388
- "grad_norm": 2.6120476722717285,
3389
- "learning_rate": 5.172e-05,
3390
- "loss": 2.1357,
3391
- "step": 2415
3392
- },
3393
- {
3394
- "epoch": 0.968,
3395
- "grad_norm": 1.8930892944335938,
3396
- "learning_rate": 5.1620000000000004e-05,
3397
- "loss": 1.8591,
3398
- "step": 2420
3399
- },
3400
- {
3401
- "epoch": 0.97,
3402
- "grad_norm": 2.999755382537842,
3403
- "learning_rate": 5.152e-05,
3404
- "loss": 1.46,
3405
- "step": 2425
3406
- },
3407
- {
3408
- "epoch": 0.972,
3409
- "grad_norm": 3.370266914367676,
3410
- "learning_rate": 5.142e-05,
3411
- "loss": 1.7493,
3412
- "step": 2430
3413
- },
3414
- {
3415
- "epoch": 0.974,
3416
- "grad_norm": 1.9898550510406494,
3417
- "learning_rate": 5.132e-05,
3418
- "loss": 1.7027,
3419
- "step": 2435
3420
- },
3421
- {
3422
- "epoch": 0.976,
3423
- "grad_norm": 1.545696496963501,
3424
- "learning_rate": 5.122e-05,
3425
- "loss": 1.6076,
3426
- "step": 2440
3427
- },
3428
- {
3429
- "epoch": 0.978,
3430
- "grad_norm": 2.1743006706237793,
3431
- "learning_rate": 5.112e-05,
3432
- "loss": 1.6397,
3433
- "step": 2445
3434
- },
3435
- {
3436
- "epoch": 0.98,
3437
- "grad_norm": 3.9286975860595703,
3438
- "learning_rate": 5.102e-05,
3439
- "loss": 1.9747,
3440
- "step": 2450
3441
- },
3442
- {
3443
- "epoch": 0.982,
3444
- "grad_norm": 3.640699863433838,
3445
- "learning_rate": 5.092e-05,
3446
- "loss": 2.0213,
3447
- "step": 2455
3448
- },
3449
- {
3450
- "epoch": 0.984,
3451
- "grad_norm": 2.4696404933929443,
3452
- "learning_rate": 5.082e-05,
3453
- "loss": 1.677,
3454
- "step": 2460
3455
- },
3456
- {
3457
- "epoch": 0.986,
3458
- "grad_norm": 3.111293077468872,
3459
- "learning_rate": 5.072e-05,
3460
- "loss": 1.9945,
3461
- "step": 2465
3462
- },
3463
- {
3464
- "epoch": 0.988,
3465
- "grad_norm": 2.899752616882324,
3466
- "learning_rate": 5.062e-05,
3467
- "loss": 1.8826,
3468
- "step": 2470
3469
- },
3470
- {
3471
- "epoch": 0.99,
3472
- "grad_norm": 1.4491517543792725,
3473
- "learning_rate": 5.052e-05,
3474
- "loss": 1.765,
3475
- "step": 2475
3476
- },
3477
- {
3478
- "epoch": 0.992,
3479
- "grad_norm": 1.7043366432189941,
3480
- "learning_rate": 5.042e-05,
3481
- "loss": 1.8315,
3482
- "step": 2480
3483
- },
3484
- {
3485
- "epoch": 0.994,
3486
- "grad_norm": 1.644760251045227,
3487
- "learning_rate": 5.032e-05,
3488
- "loss": 1.7612,
3489
- "step": 2485
3490
- },
3491
- {
3492
- "epoch": 0.996,
3493
- "grad_norm": 2.3809268474578857,
3494
- "learning_rate": 5.0220000000000004e-05,
3495
- "loss": 1.6422,
3496
- "step": 2490
3497
- },
3498
- {
3499
- "epoch": 0.998,
3500
- "grad_norm": 1.5746747255325317,
3501
- "learning_rate": 5.012e-05,
3502
- "loss": 1.7717,
3503
- "step": 2495
3504
- },
3505
- {
3506
- "epoch": 1.0,
3507
- "grad_norm": 1.9237031936645508,
3508
- "learning_rate": 5.002e-05,
3509
- "loss": 1.6089,
3510
- "step": 2500
3511
- }
3512
- ],
3513
- "logging_steps": 5,
3514
- "max_steps": 5000,
3515
- "num_input_tokens_seen": 0,
3516
- "num_train_epochs": 2,
3517
- "save_steps": 500,
3518
- "stateful_callbacks": {
3519
- "TrainerControl": {
3520
- "args": {
3521
- "should_epoch_stop": false,
3522
- "should_evaluate": false,
3523
- "should_log": false,
3524
- "should_save": true,
3525
- "should_training_stop": false
3526
- },
3527
- "attributes": {}
3528
- }
3529
- },
3530
- "total_flos": 655363905159168.0,
3531
- "train_batch_size": 2,
3532
- "trial_name": null,
3533
- "trial_params": null
3534
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lora_adapter/checkpoint-2500/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:cdf460e30f035dc780e74a98b5e123b6e4fec4e4ec35945405eb78d3ee53442f
3
- size 5777
 
 
 
 
lora_adapter/checkpoint-2500/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
lora_adapter/checkpoint-5000/README.md DELETED
@@ -1,207 +0,0 @@
1
- ---
2
- base_model: gpt2
3
- library_name: peft
4
- pipeline_tag: text-generation
5
- tags:
6
- - base_model:adapter:gpt2
7
- - lora
8
- - transformers
9
- ---
10
-
11
- # Model Card for Model ID
12
-
13
- <!-- Provide a quick summary of what the model is/does. -->
14
-
15
-
16
-
17
- ## Model Details
18
-
19
- ### Model Description
20
-
21
- <!-- Provide a longer summary of what this model is. -->
22
-
23
-
24
-
25
- - **Developed by:** [More Information Needed]
26
- - **Funded by [optional]:** [More Information Needed]
27
- - **Shared by [optional]:** [More Information Needed]
28
- - **Model type:** [More Information Needed]
29
- - **Language(s) (NLP):** [More Information Needed]
30
- - **License:** [More Information Needed]
31
- - **Finetuned from model [optional]:** [More Information Needed]
32
-
33
- ### Model Sources [optional]
34
-
35
- <!-- Provide the basic links for the model. -->
36
-
37
- - **Repository:** [More Information Needed]
38
- - **Paper [optional]:** [More Information Needed]
39
- - **Demo [optional]:** [More Information Needed]
40
-
41
- ## Uses
42
-
43
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
-
45
- ### Direct Use
46
-
47
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
-
49
- [More Information Needed]
50
-
51
- ### Downstream Use [optional]
52
-
53
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
-
55
- [More Information Needed]
56
-
57
- ### Out-of-Scope Use
58
-
59
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
-
61
- [More Information Needed]
62
-
63
- ## Bias, Risks, and Limitations
64
-
65
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
-
67
- [More Information Needed]
68
-
69
- ### Recommendations
70
-
71
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
-
73
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
-
75
- ## How to Get Started with the Model
76
-
77
- Use the code below to get started with the model.
78
-
79
- [More Information Needed]
80
-
81
- ## Training Details
82
-
83
- ### Training Data
84
-
85
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
-
87
- [More Information Needed]
88
-
89
- ### Training Procedure
90
-
91
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
-
93
- #### Preprocessing [optional]
94
-
95
- [More Information Needed]
96
-
97
-
98
- #### Training Hyperparameters
99
-
100
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
-
102
- #### Speeds, Sizes, Times [optional]
103
-
104
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
-
106
- [More Information Needed]
107
-
108
- ## Evaluation
109
-
110
- <!-- This section describes the evaluation protocols and provides the results. -->
111
-
112
- ### Testing Data, Factors & Metrics
113
-
114
- #### Testing Data
115
-
116
- <!-- This should link to a Dataset Card if possible. -->
117
-
118
- [More Information Needed]
119
-
120
- #### Factors
121
-
122
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
-
124
- [More Information Needed]
125
-
126
- #### Metrics
127
-
128
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
-
130
- [More Information Needed]
131
-
132
- ### Results
133
-
134
- [More Information Needed]
135
-
136
- #### Summary
137
-
138
-
139
-
140
- ## Model Examination [optional]
141
-
142
- <!-- Relevant interpretability work for the model goes here -->
143
-
144
- [More Information Needed]
145
-
146
- ## Environmental Impact
147
-
148
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
-
150
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
-
152
- - **Hardware Type:** [More Information Needed]
153
- - **Hours used:** [More Information Needed]
154
- - **Cloud Provider:** [More Information Needed]
155
- - **Compute Region:** [More Information Needed]
156
- - **Carbon Emitted:** [More Information Needed]
157
-
158
- ## Technical Specifications [optional]
159
-
160
- ### Model Architecture and Objective
161
-
162
- [More Information Needed]
163
-
164
- ### Compute Infrastructure
165
-
166
- [More Information Needed]
167
-
168
- #### Hardware
169
-
170
- [More Information Needed]
171
-
172
- #### Software
173
-
174
- [More Information Needed]
175
-
176
- ## Citation [optional]
177
-
178
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
-
180
- **BibTeX:**
181
-
182
- [More Information Needed]
183
-
184
- **APA:**
185
-
186
- [More Information Needed]
187
-
188
- ## Glossary [optional]
189
-
190
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
-
192
- [More Information Needed]
193
-
194
- ## More Information [optional]
195
-
196
- [More Information Needed]
197
-
198
- ## Model Card Authors [optional]
199
-
200
- [More Information Needed]
201
-
202
- ## Model Card Contact
203
-
204
- [More Information Needed]
205
- ### Framework versions
206
-
207
- - PEFT 0.17.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lora_adapter/checkpoint-5000/adapter_config.json DELETED
@@ -1,38 +0,0 @@
1
- {
2
- "alpha_pattern": {},
3
- "auto_mapping": null,
4
- "base_model_name_or_path": "gpt2",
5
- "bias": "none",
6
- "corda_config": null,
7
- "eva_config": null,
8
- "exclude_modules": null,
9
- "fan_in_fan_out": true,
10
- "inference_mode": true,
11
- "init_lora_weights": true,
12
- "layer_replication": null,
13
- "layers_pattern": null,
14
- "layers_to_transform": null,
15
- "loftq_config": {},
16
- "lora_alpha": 16,
17
- "lora_bias": false,
18
- "lora_dropout": 0.05,
19
- "megatron_config": null,
20
- "megatron_core": "megatron.core",
21
- "modules_to_save": null,
22
- "peft_type": "LORA",
23
- "qalora_group_size": 16,
24
- "r": 8,
25
- "rank_pattern": {},
26
- "revision": null,
27
- "target_modules": [
28
- "q_proj",
29
- "c_attn",
30
- "v_proj"
31
- ],
32
- "target_parameters": null,
33
- "task_type": "CAUSAL_LM",
34
- "trainable_token_indices": null,
35
- "use_dora": false,
36
- "use_qalora": false,
37
- "use_rslora": false
38
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lora_adapter/checkpoint-5000/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:56ae2a27b7624b3b0f0db362e7f072e5939af1914786000af021a132df291b1d
3
- size 1182680
 
 
 
 
lora_adapter/checkpoint-5000/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
lora_adapter/checkpoint-5000/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:72a6899875f8be148e78ef65de13d53a401eddc965115681f0e839bef801fb06
3
- size 2379751
 
 
 
 
lora_adapter/checkpoint-5000/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:81168f3f4bfdb61985c0e4f0ecf8e7e86bcc2f63593071d0095228b71484f497
3
- size 14391
 
 
 
 
lora_adapter/checkpoint-5000/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0813b2d58892d4590c709f9864743445ef237d767b1dd50acbf0834264225280
3
- size 1465
 
 
 
 
lora_adapter/checkpoint-5000/special_tokens_map.json DELETED
@@ -1,6 +0,0 @@
1
- {
2
- "bos_token": "<|endoftext|>",
3
- "eos_token": "<|endoftext|>",
4
- "pad_token": "<|endoftext|>",
5
- "unk_token": "<|endoftext|>"
6
- }
 
 
 
 
 
 
 
lora_adapter/checkpoint-5000/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
lora_adapter/checkpoint-5000/tokenizer_config.json DELETED
@@ -1,21 +0,0 @@
1
- {
2
- "add_prefix_space": false,
3
- "added_tokens_decoder": {
4
- "50256": {
5
- "content": "<|endoftext|>",
6
- "lstrip": false,
7
- "normalized": true,
8
- "rstrip": false,
9
- "single_word": false,
10
- "special": true
11
- }
12
- },
13
- "bos_token": "<|endoftext|>",
14
- "clean_up_tokenization_spaces": false,
15
- "eos_token": "<|endoftext|>",
16
- "extra_special_tokens": {},
17
- "model_max_length": 1024,
18
- "pad_token": "<|endoftext|>",
19
- "tokenizer_class": "GPT2Tokenizer",
20
- "unk_token": "<|endoftext|>"
21
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lora_adapter/checkpoint-5000/trainer_state.json DELETED
The diff for this file is too large to render. See raw diff
 
lora_adapter/checkpoint-5000/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:cdf460e30f035dc780e74a98b5e123b6e4fec4e4ec35945405eb78d3ee53442f
3
- size 5777
 
 
 
 
lora_adapter/checkpoint-5000/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
lora_adapter/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
lora_adapter/special_tokens_map.json DELETED
@@ -1,6 +0,0 @@
1
- {
2
- "bos_token": "<|endoftext|>",
3
- "eos_token": "<|endoftext|>",
4
- "pad_token": "<|endoftext|>",
5
- "unk_token": "<|endoftext|>"
6
- }
 
 
 
 
 
 
 
lora_adapter/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
lora_adapter/tokenizer_config.json DELETED
@@ -1,21 +0,0 @@
1
- {
2
- "add_prefix_space": false,
3
- "added_tokens_decoder": {
4
- "50256": {
5
- "content": "<|endoftext|>",
6
- "lstrip": false,
7
- "normalized": true,
8
- "rstrip": false,
9
- "single_word": false,
10
- "special": true
11
- }
12
- },
13
- "bos_token": "<|endoftext|>",
14
- "clean_up_tokenization_spaces": false,
15
- "eos_token": "<|endoftext|>",
16
- "extra_special_tokens": {},
17
- "model_max_length": 1024,
18
- "pad_token": "<|endoftext|>",
19
- "tokenizer_class": "GPT2Tokenizer",
20
- "unk_token": "<|endoftext|>"
21
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lora_adapter/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
requirements.txt DELETED
Binary file (3.45 kB)
 
start.sh DELETED
@@ -1,3 +0,0 @@
1
- #!/bin/bash
2
-
3
- streamlit run app.py --server.port=$PORT --server.address=0.0.0.0