upload the Embedding model: MiA-Emb-8B

Browse files

Files changed (5) hide show

README.md +202 -3
adapter_config.json +36 -0
adapter_model.safetensors +3 -0
trainer_state.json +923 -0
training_args.bin +3 -0

README.md CHANGED Viewed

@@ -1,3 +1,202 @@
----
-license: apache-2.0
----

+---
+base_model: Qwen/Qwen3-Embedding-8B
+library_name: peft
+---
+# Model Card for MiA-Emb-8B
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
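+- **Model type:** LoRA adapter (PEFT) for feature extraction (text embeddings); rank 128, alpha 256, applied to the `q_proj`, `k_proj`, `v_proj`, and `o_proj` attention projections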
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen3-Embedding-8B
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.15.2

adapter_config.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen3-Embedding-8B",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 128,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "o_proj",
+    "k_proj",
+    "v_proj"
+  ],
+  "task_type": "FEATURE_EXTRACTION",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:68e3f319f3b58cd3de93b59090e8fc23e03431cd6e50f67638950218ecd39a27
+size 245404784

trainer_state.json ADDED Viewed

	@@ -0,0 +1,923 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 1272,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02358490566037736,
+      "grad_norm": 111.64971904040075,
+      "learning_rate": 4.745611564124804e-05,
+      "loss": 53.4188,
+      "step": 10
+    },
+    {
+      "epoch": 0.04716981132075472,
+      "grad_norm": 36.50653706838016,
+      "learning_rate": 6.174182992696232e-05,
+      "loss": 19.9875,
+      "step": 20
+    },
+    {
+      "epoch": 0.07075471698113207,
+      "grad_norm": 22.585478402078195,
+      "learning_rate": 7.009843708012171e-05,
+      "loss": 13.9594,
+      "step": 30
+    },
+    {
+      "epoch": 0.09433962264150944,
+      "grad_norm": 26.901765162938684,
+      "learning_rate": 7.602754421267661e-05,
+      "loss": 12.9,
+      "step": 40
+    },
+    {
+      "epoch": 0.1179245283018868,
+      "grad_norm": 15.203549791437116,
+      "learning_rate": 8.062651699678178e-05,
+      "loss": 11.9906,
+      "step": 50
+    },
+    {
+      "epoch": 0.14150943396226415,
+      "grad_norm": 19.669683180170903,
+      "learning_rate": 8.438415136583599e-05,
+      "loss": 12.1125,
+      "step": 60
+    },
+    {
+      "epoch": 0.1650943396226415,
+      "grad_norm": 20.617026952721357,
+      "learning_rate": 8.756118595635669e-05,
+      "loss": 11.7344,
+      "step": 70
+    },
+    {
+      "epoch": 0.18867924528301888,
+      "grad_norm": 14.56388559298774,
+      "learning_rate": 9.031325849839089e-05,
+      "loss": 11.1172,
+      "step": 80
+    },
+    {
+      "epoch": 0.21226415094339623,
+      "grad_norm": 17.076659337879114,
+      "learning_rate": 9.274075851899536e-05,
+      "loss": 11.9187,
+      "step": 90
+    },
+    {
+      "epoch": 0.2358490566037736,
+      "grad_norm": 32.07124156790194,
+      "learning_rate": 9.491223128249609e-05,
+      "loss": 11.6969,
+      "step": 100
+    },
+    {
+      "epoch": 0.25943396226415094,
+      "grad_norm": 17.8530210729174,
+      "learning_rate": 9.687656733606659e-05,
+      "loss": 11.8938,
+      "step": 110
+    },
+    {
+      "epoch": 0.2830188679245283,
+      "grad_norm": 27.003004374393104,
+      "learning_rate": 9.866986565155028e-05,
+      "loss": 12.5906,
+      "step": 120
+    },
+    {
+      "epoch": 0.30660377358490565,
+      "grad_norm": 19.002162024515545,
+      "learning_rate": 9.991258741258742e-05,
+      "loss": 11.8063,
+      "step": 130
+    },
+    {
+      "epoch": 0.330188679245283,
+      "grad_norm": 14.220172545505502,
+      "learning_rate": 9.903846153846155e-05,
+      "loss": 10.5687,
+      "step": 140
+    },
+    {
+      "epoch": 0.35377358490566035,
+      "grad_norm": 14.994374088898928,
+      "learning_rate": 9.816433566433567e-05,
+      "loss": 11.4469,
+      "step": 150
+    },
+    {
+      "epoch": 0.37735849056603776,
+      "grad_norm": 28.610439819975692,
+      "learning_rate": 9.729020979020979e-05,
+      "loss": 11.6031,
+      "step": 160
+    },
+    {
+      "epoch": 0.4009433962264151,
+      "grad_norm": 13.755122715578342,
+      "learning_rate": 9.641608391608393e-05,
+      "loss": 11.7562,
+      "step": 170
+    },
+    {
+      "epoch": 0.42452830188679247,
+      "grad_norm": 13.235152354159759,
+      "learning_rate": 9.554195804195805e-05,
+      "loss": 11.5156,
+      "step": 180
+    },
+    {
+      "epoch": 0.4481132075471698,
+      "grad_norm": 12.55770261202009,
+      "learning_rate": 9.466783216783217e-05,
+      "loss": 11.2953,
+      "step": 190
+    },
+    {
+      "epoch": 0.4716981132075472,
+      "grad_norm": 13.582038438276099,
+      "learning_rate": 9.37937062937063e-05,
+      "loss": 11.1062,
+      "step": 200
+    },
+    {
+      "epoch": 0.49528301886792453,
+      "grad_norm": 11.770352793407033,
+      "learning_rate": 9.291958041958042e-05,
+      "loss": 10.7781,
+      "step": 210
+    },
+    {
+      "epoch": 0.5188679245283019,
+      "grad_norm": 14.820093896670828,
+      "learning_rate": 9.204545454545454e-05,
+      "loss": 11.4578,
+      "step": 220
+    },
+    {
+      "epoch": 0.5424528301886793,
+      "grad_norm": 16.03397508198143,
+      "learning_rate": 9.117132867132868e-05,
+      "loss": 11.3906,
+      "step": 230
+    },
+    {
+      "epoch": 0.5660377358490566,
+      "grad_norm": 12.665474151765666,
+      "learning_rate": 9.029720279720281e-05,
+      "loss": 11.2812,
+      "step": 240
+    },
+    {
+      "epoch": 0.589622641509434,
+      "grad_norm": 12.380296703659218,
+      "learning_rate": 8.942307692307693e-05,
+      "loss": 10.925,
+      "step": 250
+    },
+    {
+      "epoch": 0.6132075471698113,
+      "grad_norm": 11.609348406547522,
+      "learning_rate": 8.854895104895105e-05,
+      "loss": 10.9625,
+      "step": 260
+    },
+    {
+      "epoch": 0.6367924528301887,
+      "grad_norm": 41.38762916621647,
+      "learning_rate": 8.767482517482519e-05,
+      "loss": 11.3031,
+      "step": 270
+    },
+    {
+      "epoch": 0.660377358490566,
+      "grad_norm": 11.71078545247518,
+      "learning_rate": 8.68006993006993e-05,
+      "loss": 10.5437,
+      "step": 280
+    },
+    {
+      "epoch": 0.6839622641509434,
+      "grad_norm": 14.740681537649765,
+      "learning_rate": 8.592657342657343e-05,
+      "loss": 11.0656,
+      "step": 290
+    },
+    {
+      "epoch": 0.7075471698113207,
+      "grad_norm": 13.407050612496178,
+      "learning_rate": 8.505244755244756e-05,
+      "loss": 10.275,
+      "step": 300
+    },
+    {
+      "epoch": 0.7311320754716981,
+      "grad_norm": 17.285671485791312,
+      "learning_rate": 8.417832167832168e-05,
+      "loss": 10.9094,
+      "step": 310
+    },
+    {
+      "epoch": 0.7547169811320755,
+      "grad_norm": 12.234499943045662,
+      "learning_rate": 8.33041958041958e-05,
+      "loss": 10.3812,
+      "step": 320
+    },
+    {
+      "epoch": 0.7783018867924528,
+      "grad_norm": 13.844957482436712,
+      "learning_rate": 8.243006993006994e-05,
+      "loss": 10.6016,
+      "step": 330
+    },
+    {
+      "epoch": 0.8018867924528302,
+      "grad_norm": 14.359721900450984,
+      "learning_rate": 8.155594405594406e-05,
+      "loss": 10.7719,
+      "step": 340
+    },
+    {
+      "epoch": 0.8254716981132075,
+      "grad_norm": 13.685695871660895,
+      "learning_rate": 8.068181818181818e-05,
+      "loss": 10.9281,
+      "step": 350
+    },
+    {
+      "epoch": 0.8490566037735849,
+      "grad_norm": 11.817457808585134,
+      "learning_rate": 7.980769230769231e-05,
+      "loss": 10.5906,
+      "step": 360
+    },
+    {
+      "epoch": 0.8726415094339622,
+      "grad_norm": 14.157887783746466,
+      "learning_rate": 7.893356643356644e-05,
+      "loss": 10.4375,
+      "step": 370
+    },
+    {
+      "epoch": 0.8962264150943396,
+      "grad_norm": 16.666070129232146,
+      "learning_rate": 7.805944055944057e-05,
+      "loss": 10.2781,
+      "step": 380
+    },
+    {
+      "epoch": 0.9198113207547169,
+      "grad_norm": 13.667548285529008,
+      "learning_rate": 7.718531468531469e-05,
+      "loss": 10.2438,
+      "step": 390
+    },
+    {
+      "epoch": 0.9433962264150944,
+      "grad_norm": 12.066751708653879,
+      "learning_rate": 7.631118881118882e-05,
+      "loss": 10.0094,
+      "step": 400
+    },
+    {
+      "epoch": 0.9669811320754716,
+      "grad_norm": 12.248491316508403,
+      "learning_rate": 7.543706293706294e-05,
+      "loss": 10.2328,
+      "step": 410
+    },
+    {
+      "epoch": 0.9905660377358491,
+      "grad_norm": 18.024158226856255,
+      "learning_rate": 7.456293706293706e-05,
+      "loss": 9.9656,
+      "step": 420
+    },
+    {
+      "epoch": 1.0141509433962264,
+      "grad_norm": 11.870175278410656,
+      "learning_rate": 7.36888111888112e-05,
+      "loss": 9.6953,
+      "step": 430
+    },
+    {
+      "epoch": 1.0377358490566038,
+      "grad_norm": 11.950018950483969,
+      "learning_rate": 7.281468531468531e-05,
+      "loss": 10.1656,
+      "step": 440
+    },
+    {
+      "epoch": 1.0613207547169812,
+      "grad_norm": 16.767199122744902,
+      "learning_rate": 7.194055944055944e-05,
+      "loss": 10.1594,
+      "step": 450
+    },
+    {
+      "epoch": 1.0849056603773586,
+      "grad_norm": 15.290808887183745,
+      "learning_rate": 7.106643356643357e-05,
+      "loss": 9.6875,
+      "step": 460
+    },
+    {
+      "epoch": 1.1084905660377358,
+      "grad_norm": 10.022244420640183,
+      "learning_rate": 7.019230769230769e-05,
+      "loss": 10.3281,
+      "step": 470
+    },
+    {
+      "epoch": 1.1320754716981132,
+      "grad_norm": 12.951298055638265,
+      "learning_rate": 6.931818181818182e-05,
+      "loss": 9.6766,
+      "step": 480
+    },
+    {
+      "epoch": 1.1556603773584906,
+      "grad_norm": 13.127907045424376,
+      "learning_rate": 6.844405594405596e-05,
+      "loss": 10.1594,
+      "step": 490
+    },
+    {
+      "epoch": 1.179245283018868,
+      "grad_norm": 11.892546949840662,
+      "learning_rate": 6.756993006993008e-05,
+      "loss": 9.9844,
+      "step": 500
+    },
+    {
+      "epoch": 1.2028301886792452,
+      "grad_norm": 12.563312310051254,
+      "learning_rate": 6.66958041958042e-05,
+      "loss": 9.8906,
+      "step": 510
+    },
+    {
+      "epoch": 1.2264150943396226,
+      "grad_norm": 12.048069831318715,
+      "learning_rate": 6.582167832167833e-05,
+      "loss": 9.9938,
+      "step": 520
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 21.579035414990887,
+      "learning_rate": 6.494755244755245e-05,
+      "loss": 9.4234,
+      "step": 530
+    },
+    {
+      "epoch": 1.2735849056603774,
+      "grad_norm": 12.065467497517545,
+      "learning_rate": 6.407342657342657e-05,
+      "loss": 10.0563,
+      "step": 540
+    },
+    {
+      "epoch": 1.2971698113207548,
+      "grad_norm": 11.549132237202269,
+      "learning_rate": 6.31993006993007e-05,
+      "loss": 9.8984,
+      "step": 550
+    },
+    {
+      "epoch": 1.320754716981132,
+      "grad_norm": 11.938528591591838,
+      "learning_rate": 6.232517482517483e-05,
+      "loss": 9.9969,
+      "step": 560
+    },
+    {
+      "epoch": 1.3443396226415094,
+      "grad_norm": 14.929825222343986,
+      "learning_rate": 6.145104895104895e-05,
+      "loss": 9.7594,
+      "step": 570
+    },
+    {
+      "epoch": 1.3679245283018868,
+      "grad_norm": 12.842065570056926,
+      "learning_rate": 6.0576923076923076e-05,
+      "loss": 9.5531,
+      "step": 580
+    },
+    {
+      "epoch": 1.3915094339622642,
+      "grad_norm": 15.540618048359113,
+      "learning_rate": 5.97027972027972e-05,
+      "loss": 10.1094,
+      "step": 590
+    },
+    {
+      "epoch": 1.4150943396226414,
+      "grad_norm": 17.595607626825647,
+      "learning_rate": 5.882867132867134e-05,
+      "loss": 9.4891,
+      "step": 600
+    },
+    {
+      "epoch": 1.4386792452830188,
+      "grad_norm": 10.493212987412385,
+      "learning_rate": 5.7954545454545464e-05,
+      "loss": 9.6406,
+      "step": 610
+    },
+    {
+      "epoch": 1.4622641509433962,
+      "grad_norm": 24.84577511132654,
+      "learning_rate": 5.7080419580419585e-05,
+      "loss": 9.7812,
+      "step": 620
+    },
+    {
+      "epoch": 1.4858490566037736,
+      "grad_norm": 13.697240248862046,
+      "learning_rate": 5.620629370629371e-05,
+      "loss": 9.8063,
+      "step": 630
+    },
+    {
+      "epoch": 1.509433962264151,
+      "grad_norm": 11.714963877097121,
+      "learning_rate": 5.533216783216784e-05,
+      "loss": 9.85,
+      "step": 640
+    },
+    {
+      "epoch": 1.5330188679245285,
+      "grad_norm": 15.925762627169787,
+      "learning_rate": 5.445804195804196e-05,
+      "loss": 9.5437,
+      "step": 650
+    },
+    {
+      "epoch": 1.5566037735849056,
+      "grad_norm": 10.21229978672529,
+      "learning_rate": 5.358391608391609e-05,
+      "loss": 9.7359,
+      "step": 660
+    },
+    {
+      "epoch": 1.580188679245283,
+      "grad_norm": 12.299504271971967,
+      "learning_rate": 5.2709790209790214e-05,
+      "loss": 10.0203,
+      "step": 670
+    },
+    {
+      "epoch": 1.6037735849056602,
+      "grad_norm": 10.756878837227157,
+      "learning_rate": 5.1835664335664335e-05,
+      "loss": 8.9719,
+      "step": 680
+    },
+    {
+      "epoch": 1.6273584905660377,
+      "grad_norm": 47.89124644047696,
+      "learning_rate": 5.096153846153846e-05,
+      "loss": 9.8281,
+      "step": 690
+    },
+    {
+      "epoch": 1.650943396226415,
+      "grad_norm": 13.321980781383072,
+      "learning_rate": 5.008741258741258e-05,
+      "loss": 9.5188,
+      "step": 700
+    },
+    {
+      "epoch": 1.6745283018867925,
+      "grad_norm": 12.375904788348723,
+      "learning_rate": 4.9213286713286716e-05,
+      "loss": 10.3156,
+      "step": 710
+    },
+    {
+      "epoch": 1.6981132075471699,
+      "grad_norm": 13.04878290460268,
+      "learning_rate": 4.8339160839160843e-05,
+      "loss": 9.3875,
+      "step": 720
+    },
+    {
+      "epoch": 1.7216981132075473,
+      "grad_norm": 13.835045116571802,
+      "learning_rate": 4.7465034965034964e-05,
+      "loss": 10.0781,
+      "step": 730
+    },
+    {
+      "epoch": 1.7452830188679245,
+      "grad_norm": 13.127688060476284,
+      "learning_rate": 4.659090909090909e-05,
+      "loss": 9.5703,
+      "step": 740
+    },
+    {
+      "epoch": 1.7688679245283019,
+      "grad_norm": 11.61972720292625,
+      "learning_rate": 4.571678321678322e-05,
+      "loss": 9.3,
+      "step": 750
+    },
+    {
+      "epoch": 1.7924528301886793,
+      "grad_norm": 11.880032085014191,
+      "learning_rate": 4.4842657342657346e-05,
+      "loss": 9.2141,
+      "step": 760
+    },
+    {
+      "epoch": 1.8160377358490565,
+      "grad_norm": 12.679739149762932,
+      "learning_rate": 4.396853146853147e-05,
+      "loss": 10.0031,
+      "step": 770
+    },
+    {
+      "epoch": 1.8396226415094339,
+      "grad_norm": 12.587125834021709,
+      "learning_rate": 4.309440559440559e-05,
+      "loss": 10.0437,
+      "step": 780
+    },
+    {
+      "epoch": 1.8632075471698113,
+      "grad_norm": 9.743520340164396,
+      "learning_rate": 4.222027972027972e-05,
+      "loss": 9.6719,
+      "step": 790
+    },
+    {
+      "epoch": 1.8867924528301887,
+      "grad_norm": 13.971700558144239,
+      "learning_rate": 4.134615384615385e-05,
+      "loss": 9.3938,
+      "step": 800
+    },
+    {
+      "epoch": 1.9103773584905661,
+      "grad_norm": 11.607356019588352,
+      "learning_rate": 4.0472027972027975e-05,
+      "loss": 9.3859,
+      "step": 810
+    },
+    {
+      "epoch": 1.9339622641509435,
+      "grad_norm": 12.35296887661871,
+      "learning_rate": 3.95979020979021e-05,
+      "loss": 9.3297,
+      "step": 820
+    },
+    {
+      "epoch": 1.9575471698113207,
+      "grad_norm": 10.899779163585386,
+      "learning_rate": 3.872377622377622e-05,
+      "loss": 9.7781,
+      "step": 830
+    },
+    {
+      "epoch": 1.9811320754716981,
+      "grad_norm": 11.971338017749327,
+      "learning_rate": 3.784965034965035e-05,
+      "loss": 10.1312,
+      "step": 840
+    },
+    {
+      "epoch": 2.0047169811320753,
+      "grad_norm": 10.878937392831533,
+      "learning_rate": 3.697552447552448e-05,
+      "loss": 9.0563,
+      "step": 850
+    },
+    {
+      "epoch": 2.0283018867924527,
+      "grad_norm": 15.806159570601048,
+      "learning_rate": 3.61013986013986e-05,
+      "loss": 8.5625,
+      "step": 860
+    },
+    {
+      "epoch": 2.05188679245283,
+      "grad_norm": 13.812473930051553,
+      "learning_rate": 3.522727272727273e-05,
+      "loss": 9.2078,
+      "step": 870
+    },
+    {
+      "epoch": 2.0754716981132075,
+      "grad_norm": 9.891081116732627,
+      "learning_rate": 3.435314685314686e-05,
+      "loss": 8.7234,
+      "step": 880
+    },
+    {
+      "epoch": 2.099056603773585,
+      "grad_norm": 10.673891805568328,
+      "learning_rate": 3.347902097902098e-05,
+      "loss": 9.0563,
+      "step": 890
+    },
+    {
+      "epoch": 2.1226415094339623,
+      "grad_norm": 10.205081198781729,
+      "learning_rate": 3.2604895104895106e-05,
+      "loss": 8.9688,
+      "step": 900
+    },
+    {
+      "epoch": 2.1462264150943398,
+      "grad_norm": 11.566662730230892,
+      "learning_rate": 3.1730769230769234e-05,
+      "loss": 9.0531,
+      "step": 910
+    },
+    {
+      "epoch": 2.169811320754717,
+      "grad_norm": 11.119994413592556,
+      "learning_rate": 3.0856643356643354e-05,
+      "loss": 9.0516,
+      "step": 920
+    },
+    {
+      "epoch": 2.1933962264150946,
+      "grad_norm": 10.765841854970871,
+      "learning_rate": 2.9982517482517485e-05,
+      "loss": 8.7312,
+      "step": 930
+    },
+    {
+      "epoch": 2.2169811320754715,
+      "grad_norm": 14.277150964403473,
+      "learning_rate": 2.9108391608391612e-05,
+      "loss": 8.9609,
+      "step": 940
+    },
+    {
+      "epoch": 2.240566037735849,
+      "grad_norm": 13.46535727014516,
+      "learning_rate": 2.8234265734265736e-05,
+      "loss": 9.0531,
+      "step": 950
+    },
+    {
+      "epoch": 2.2641509433962264,
+      "grad_norm": 13.360257816479617,
+      "learning_rate": 2.736013986013986e-05,
+      "loss": 9.4016,
+      "step": 960
+    },
+    {
+      "epoch": 2.2877358490566038,
+      "grad_norm": 10.644001178785137,
+      "learning_rate": 2.6486013986013987e-05,
+      "loss": 9.4281,
+      "step": 970
+    },
+    {
+      "epoch": 2.311320754716981,
+      "grad_norm": 13.041471877077432,
+      "learning_rate": 2.561188811188811e-05,
+      "loss": 8.4094,
+      "step": 980
+    },
+    {
+      "epoch": 2.3349056603773586,
+      "grad_norm": 11.380539833099244,
+      "learning_rate": 2.4737762237762238e-05,
+      "loss": 8.5703,
+      "step": 990
+    },
+    {
+      "epoch": 2.358490566037736,
+      "grad_norm": 15.24184551745,
+      "learning_rate": 2.3863636363636365e-05,
+      "loss": 8.7469,
+      "step": 1000
+    },
+    {
+      "epoch": 2.3820754716981134,
+      "grad_norm": 13.296204794285742,
+      "learning_rate": 2.298951048951049e-05,
+      "loss": 8.6531,
+      "step": 1010
+    },
+    {
+      "epoch": 2.4056603773584904,
+      "grad_norm": 14.553056169204998,
+      "learning_rate": 2.2115384615384616e-05,
+      "loss": 9.4719,
+      "step": 1020
+    },
+    {
+      "epoch": 2.4292452830188678,
+      "grad_norm": 12.864094699757313,
+      "learning_rate": 2.1241258741258744e-05,
+      "loss": 8.5609,
+      "step": 1030
+    },
+    {
+      "epoch": 2.452830188679245,
+      "grad_norm": 12.84578923442427,
+      "learning_rate": 2.0367132867132867e-05,
+      "loss": 9.2219,
+      "step": 1040
+    },
+    {
+      "epoch": 2.4764150943396226,
+      "grad_norm": 13.709126969591345,
+      "learning_rate": 1.9493006993006995e-05,
+      "loss": 8.8297,
+      "step": 1050
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 12.83886757229034,
+      "learning_rate": 1.8618881118881122e-05,
+      "loss": 9.2188,
+      "step": 1060
+    },
+    {
+      "epoch": 2.5235849056603774,
+      "grad_norm": 12.383754458241203,
+      "learning_rate": 1.7744755244755246e-05,
+      "loss": 8.3781,
+      "step": 1070
+    },
+    {
+      "epoch": 2.547169811320755,
+      "grad_norm": 15.566078599420996,
+      "learning_rate": 1.687062937062937e-05,
+      "loss": 8.9109,
+      "step": 1080
+    },
+    {
+      "epoch": 2.5707547169811322,
+      "grad_norm": 12.227977636887342,
+      "learning_rate": 1.5996503496503497e-05,
+      "loss": 8.5984,
+      "step": 1090
+    },
+    {
+      "epoch": 2.5943396226415096,
+      "grad_norm": 11.392465117491689,
+      "learning_rate": 1.5122377622377622e-05,
+      "loss": 8.0125,
+      "step": 1100
+    },
+    {
+      "epoch": 2.617924528301887,
+      "grad_norm": 11.211860612831199,
+      "learning_rate": 1.4248251748251748e-05,
+      "loss": 8.6391,
+      "step": 1110
+    },
+    {
+      "epoch": 2.641509433962264,
+      "grad_norm": 12.379882956164083,
+      "learning_rate": 1.3374125874125875e-05,
+      "loss": 8.5687,
+      "step": 1120
+    },
+    {
+      "epoch": 2.6650943396226414,
+      "grad_norm": 13.914701836881665,
+      "learning_rate": 1.25e-05,
+      "loss": 7.9281,
+      "step": 1130
+    },
+    {
+      "epoch": 2.688679245283019,
+      "grad_norm": 14.688732375132988,
+      "learning_rate": 1.1625874125874126e-05,
+      "loss": 8.3812,
+      "step": 1140
+    },
+    {
+      "epoch": 2.7122641509433962,
+      "grad_norm": 15.688619890127729,
+      "learning_rate": 1.0751748251748252e-05,
+      "loss": 8.7547,
+      "step": 1150
+    },
+    {
+      "epoch": 2.7358490566037736,
+      "grad_norm": 11.089768426746776,
+      "learning_rate": 9.877622377622379e-06,
+      "loss": 8.8172,
+      "step": 1160
+    },
+    {
+      "epoch": 2.759433962264151,
+      "grad_norm": 23.92616633558165,
+      "learning_rate": 9.003496503496504e-06,
+      "loss": 8.0312,
+      "step": 1170
+    },
+    {
+      "epoch": 2.7830188679245285,
+      "grad_norm": 15.521482664204598,
+      "learning_rate": 8.12937062937063e-06,
+      "loss": 8.9219,
+      "step": 1180
+    },
+    {
+      "epoch": 2.8066037735849054,
+      "grad_norm": 11.858360531048184,
+      "learning_rate": 7.2552447552447555e-06,
+      "loss": 8.2797,
+      "step": 1190
+    },
+    {
+      "epoch": 2.830188679245283,
+      "grad_norm": 11.546554540604026,
+      "learning_rate": 6.381118881118882e-06,
+      "loss": 8.8203,
+      "step": 1200
+    },
+    {
+      "epoch": 2.8537735849056602,
+      "grad_norm": 13.017973974001212,
+      "learning_rate": 5.5069930069930074e-06,
+      "loss": 8.5516,
+      "step": 1210
+    },
+    {
+      "epoch": 2.8773584905660377,
+      "grad_norm": 28.505581742223395,
+      "learning_rate": 4.632867132867133e-06,
+      "loss": 8.5891,
+      "step": 1220
+    },
+    {
+      "epoch": 2.900943396226415,
+      "grad_norm": 13.282491850039023,
+      "learning_rate": 3.7587412587412585e-06,
+      "loss": 8.8891,
+      "step": 1230
+    },
+    {
+      "epoch": 2.9245283018867925,
+      "grad_norm": 12.713216988295962,
+      "learning_rate": 2.884615384615385e-06,
+      "loss": 8.55,
+      "step": 1240
+    },
+    {
+      "epoch": 2.94811320754717,
+      "grad_norm": 13.2259063179825,
+      "learning_rate": 2.0104895104895104e-06,
+      "loss": 8.4469,
+      "step": 1250
+    },
+    {
+      "epoch": 2.9716981132075473,
+      "grad_norm": 12.767435596869587,
+      "learning_rate": 1.1363636363636364e-06,
+      "loss": 8.4953,
+      "step": 1260
+    },
+    {
+      "epoch": 2.9952830188679247,
+      "grad_norm": 12.197730584858347,
+      "learning_rate": 2.622377622377623e-07,
+      "loss": 8.2344,
+      "step": 1270
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1272,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 235,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:98ae3a9c55ceeb219ed2c62c991fab203e199638f444860986e9b55af449fe6c
+size 8593