Shreshth2002 commited on
Commit
9cbb56b
·
verified ·
1 Parent(s): 0115c6d

Upload folder using huggingface_hub

Browse files
Files changed (44) hide show
  1. .gitattributes +2 -0
  2. .gradio/certificate.pem +31 -0
  3. .gradio/flagged/dataset1.csv +2 -0
  4. README.md +3 -9
  5. __pycache__/infer.cpython-313.pyc +0 -0
  6. __pycache__/train.cpython-313.pyc +0 -0
  7. __pycache__/utils.cpython-313.pyc +0 -0
  8. app.py +36 -0
  9. infer.py +105 -0
  10. main.py +74 -0
  11. model/config.json +24 -0
  12. model/model.safetensors +3 -0
  13. model/special_tokens_map.json +7 -0
  14. model/tokenizer.json +0 -0
  15. model/tokenizer_config.json +56 -0
  16. model/training_args.bin +3 -0
  17. model/vocab.txt +0 -0
  18. requirements.txt +7 -0
  19. train.py +143 -0
  20. utils.py +32 -0
  21. wandb/run-20250720_144411-9kwggmdj/files/config.yaml +493 -0
  22. wandb/run-20250720_144411-9kwggmdj/files/output.log +148 -0
  23. wandb/run-20250720_144411-9kwggmdj/files/requirements.txt +139 -0
  24. wandb/run-20250720_144411-9kwggmdj/files/wandb-metadata.json +36 -0
  25. wandb/run-20250720_144411-9kwggmdj/files/wandb-summary.json +1 -0
  26. wandb/run-20250720_144411-9kwggmdj/logs/debug-internal.log +12 -0
  27. wandb/run-20250720_144411-9kwggmdj/logs/debug.log +25 -0
  28. wandb/run-20250720_144411-9kwggmdj/run-9kwggmdj.wandb +3 -0
  29. wandb/run-20250720_154435-9xqrzjdo/files/config.yaml +493 -0
  30. wandb/run-20250720_154435-9xqrzjdo/files/output.log +39 -0
  31. wandb/run-20250720_154435-9xqrzjdo/files/requirements.txt +139 -0
  32. wandb/run-20250720_154435-9xqrzjdo/files/wandb-metadata.json +36 -0
  33. wandb/run-20250720_154435-9xqrzjdo/files/wandb-summary.json +1 -0
  34. wandb/run-20250720_154435-9xqrzjdo/logs/debug-internal.log +12 -0
  35. wandb/run-20250720_154435-9xqrzjdo/logs/debug.log +25 -0
  36. wandb/run-20250720_154435-9xqrzjdo/run-9xqrzjdo.wandb +0 -0
  37. wandb/run-20250720_155338-0h3fksuy/files/config.yaml +494 -0
  38. wandb/run-20250720_155338-0h3fksuy/files/output.log +398 -0
  39. wandb/run-20250720_155338-0h3fksuy/files/requirements.txt +139 -0
  40. wandb/run-20250720_155338-0h3fksuy/files/wandb-metadata.json +36 -0
  41. wandb/run-20250720_155338-0h3fksuy/files/wandb-summary.json +1 -0
  42. wandb/run-20250720_155338-0h3fksuy/logs/debug-internal.log +16 -0
  43. wandb/run-20250720_155338-0h3fksuy/logs/debug.log +25 -0
  44. wandb/run-20250720_155338-0h3fksuy/run-0h3fksuy.wandb +3 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ wandb/run-20250720_144411-9kwggmdj/run-9kwggmdj.wandb filter=lfs diff=lfs merge=lfs -text
37
+ wandb/run-20250720_155338-0h3fksuy/run-0h3fksuy.wandb filter=lfs diff=lfs merge=lfs -text
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
.gradio/flagged/dataset1.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Movie Review,Sentiment Prediction,timestamp
2
+ The movie gave me chills it was soo scary.,"{""label"": ""Negative"", ""confidences"": null}",2025-07-21 01:18:41.890282
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: Distilbert Sentiment
3
- emoji: 🌍
4
- colorFrom: green
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 5.38.0
8
  app_file: app.py
9
- pinned: false
 
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: distilbert-sentiment
 
 
 
 
 
3
  app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 5.37.0
6
  ---
 
 
__pycache__/infer.cpython-313.pyc ADDED
Binary file (3.68 kB). View file
 
__pycache__/train.cpython-313.pyc ADDED
Binary file (4.73 kB). View file
 
__pycache__/utils.cpython-313.pyc ADDED
Binary file (975 Bytes). View file
 
app.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio frontend for DistilBERT sentiment analysis
3
+ File: app.py
4
+ """
5
+
6
+ import gradio as gr
7
+ from infer import predict
8
+
9
def sentiment_analyzer(text):
    """Gradio callback: classify a review and return 'Positive'/'Negative'.

    Blank input yields a user-facing prompt instead of a prediction.
    Note that ``predict`` itself returns an error-message string on
    failure, which is shown to the user as-is.
    """
    if not text.strip():
        return "Please enter some text"

    return predict(text).capitalize()
16
+
17
+ # Create Gradio interface
18
# Example reviews shown beneath the input box.
_EXAMPLE_REVIEWS = [
    "This movie was absolutely fantastic! Great acting and storyline.",
    "Terrible film, worst movie I've ever seen. Complete waste of time.",
    "The movie was okay, not great but not terrible either.",
]

# Build the Gradio UI: one free-text input, one label output.
interface = gr.Interface(
    fn=sentiment_analyzer,
    inputs=gr.Textbox(
        label="Movie Review",
        placeholder="Enter your movie review here...",
        lines=3,
    ),
    outputs=gr.Label(label="Sentiment Prediction"),
    title="🎬 Movie Review Sentiment Analysis",
    description="Fine-tuned DistilBERT model for movie review sentiment classification",
    examples=_EXAMPLE_REVIEWS,
)

if __name__ == "__main__":
    # share=True requests a public gradio.live tunnel when run locally.
    interface.launch(share=True)
infer.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Inference pipeline for DistilBERT sentiment analysis
3
+ File: infer.py (improved version)
4
+ """
5
+
6
+ import torch
7
+ import os
8
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
9
+
10
+ # Global variables to cache model and tokenizer
11
+ _model = None
12
+ _tokenizer = None
13
+
14
def load_trained_model(model_path="./model"):
    """Load (and memoize) the fine-tuned model and tokenizer.

    The pair is cached in module globals so repeated predictions do not
    re-read the weights from disk.

    Args:
        model_path: Directory containing the saved model and tokenizer.

    Returns:
        Tuple of (model, tokenizer).

    Raises:
        FileNotFoundError: If nothing is cached and ``model_path`` does
            not exist on disk.
    """
    global _model, _tokenizer

    # Serve from the cache first: this avoids a redundant filesystem stat
    # on every prediction, and keeps working even if the directory is
    # removed after the initial load.
    # NOTE(review): the cache does not key on model_path, so a later call
    # with a *different* path still returns the first model loaded —
    # acceptable here because the app only ever uses the default path.
    if _model is not None and _tokenizer is not None:
        return _model, _tokenizer

    if not os.path.exists(model_path):
        raise FileNotFoundError(f"No model found at {model_path}. Please train the model first.")

    print(f"Loading model from {model_path}...")

    _tokenizer = AutoTokenizer.from_pretrained(model_path)
    _model = AutoModelForSequenceClassification.from_pretrained(model_path)

    print("Model loaded successfully!")
    return _model, _tokenizer
33
+
34
def predict_sentiment(text, model, tokenizer, max_length=256):
    """Classify a single text with an already-loaded model.

    Args:
        text: Raw input string.
        model: Loaded sequence-classification model (class 1 = positive).
        tokenizer: Matching tokenizer.
        max_length: Maximum token length; longer inputs are truncated.

    Returns:
        Tuple of (label, confidence) where label is "Positive" or
        "Negative" and confidence is the softmax probability of the
        predicted class.
    """
    # Encode to a fixed-length tensor batch of size 1.
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    # Inference only: eval mode, no gradient tracking.
    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits
        probs = torch.nn.functional.softmax(logits, dim=-1)
        winner = torch.argmax(probs, dim=-1).item()
        confidence = probs[0][winner].item()

    label = "Positive" if winner == 1 else "Negative"
    return label, confidence
68
+
69
def predict(text, model_path="./model", max_length=256):
    """Predict sentiment for ``text``, loading the model on first use.

    Args:
        text: Input text string.
        model_path: Path to the saved model directory.
        max_length: Maximum token length; longer inputs are truncated.

    Returns:
        "positive" or "negative" on success, otherwise a human-readable
        error string (the Gradio frontend displays whatever is returned).
    """
    try:
        # Load model and tokenizer (cached after the first call).
        model, tokenizer = load_trained_model(model_path)

        # Delegate to the shared single-text path instead of duplicating
        # the tokenize/forward logic here.
        label, _confidence = predict_sentiment(text, model, tokenizer, max_length)
        return label.lower()

    except FileNotFoundError as e:
        return f"Error: {str(e)}"
    except Exception as e:
        # Broad catch is deliberate: the UI expects a string, not a traceback.
        return f"Prediction error: {str(e)}"
main.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Main pipeline for DistilBERT sentiment analysis project
3
+ File: main.py
4
+ """
5
+
6
+ import os
7
+ import argparse
8
+ from train import (
9
+ load_imdb_data,
10
+ preprocess_data,
11
+ load_model,
12
+ setup_trainer,
13
+ train_model,
14
+ evaluate_model,
15
+ save_model
16
+ )
17
+ # Remove app import since we'll run it separately
18
+
19
def train_pipeline(subset_size=None):
    """Run the full train → evaluate → save workflow.

    Args:
        subset_size: Optional cap on the number of training examples
            (useful for quick smoke tests).

    Returns:
        The metrics dict produced by ``evaluate_model``.
    """
    print("=== Starting Training Pipeline ===")

    # Data: load the IMDB splits, then tokenize both.
    dataset = load_imdb_data(subset_size=subset_size)
    tokenized_dataset, tokenizer = preprocess_data(dataset)

    # Model + Trainer wiring.
    model = load_model()
    trainer = setup_trainer(
        model,
        tokenizer,
        tokenized_dataset["train"],
        tokenized_dataset["test"],
    )

    # Train, evaluate, then persist model + tokenizer to ./model.
    train_model(trainer)
    results = evaluate_model(trainer)
    save_model(trainer, tokenizer)

    print("=== Training Pipeline Completed ===")
    return results
51
+
52
def main():
    """CLI entry point: optionally (re)train the sentiment model."""
    parser = argparse.ArgumentParser(description="DistilBERT Sentiment Analysis - Training Only")
    parser.add_argument("--subset", type=int, default=None,
                        help="Use subset of data for training (for testing)")

    args = parser.parse_args()

    # Skip retraining if a saved model is already present, unless the
    # user explicitly confirms.
    if os.path.exists("./model") and os.path.exists("./model/config.json"):
        response = input("Model already exists. Retrain? (y/n): ")
        # strip() so a stray trailing space/newline in "y " doesn't get
        # misread and accidentally skip (or force) a retrain.
        if response.strip().lower() != 'y':
            print("Skipping training...")
            print("To run the app: python app.py")
            return

    # Train the model
    train_pipeline(subset_size=args.subset)

    print("\n🎉 Training completed!")
    print("To run the app: python app.py")

if __name__ == "__main__":
    main()
model/config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation": "gelu",
3
+ "architectures": [
4
+ "DistilBertForSequenceClassification"
5
+ ],
6
+ "attention_dropout": 0.1,
7
+ "dim": 768,
8
+ "dropout": 0.1,
9
+ "hidden_dim": 3072,
10
+ "initializer_range": 0.02,
11
+ "max_position_embeddings": 512,
12
+ "model_type": "distilbert",
13
+ "n_heads": 12,
14
+ "n_layers": 6,
15
+ "pad_token_id": 0,
16
+ "problem_type": "single_label_classification",
17
+ "qa_dropout": 0.1,
18
+ "seq_classif_dropout": 0.2,
19
+ "sinusoidal_pos_embds": false,
20
+ "tie_weights_": true,
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.53.2",
23
+ "vocab_size": 30522
24
+ }
model/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65e5980bd38406f43fad7a937fbfd69641552cd0bbcf0ba62ca73f7318eb3f9f
3
+ size 267832560
model/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
model/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "DistilBertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
model/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01f6ab2453c3b34039132e185e58b0fa0c07ed65cf292dae165c993dcdab7683
3
+ size 5713
model/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+
2
+ torch>=2.1.0,<3.0.0
3
+ transformers>=4.45.0,<5.0.0
4
+ datasets>=2.21.0,<3.0.0
5
+ gradio>=5.0.0,<6.0.0
6
+ scikit-learn>=1.5.0,<2.0.0
7
+ numpy>=1.24.0,<2.0.0
train.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Training and evaluation logic for DistilBERT sentiment analysis
3
+ File: train.py
4
+ """
5
+
6
+ # Hugging Face imports
7
+ from transformers import (
8
+ AutoTokenizer,
9
+ AutoModelForSequenceClassification,
10
+ Trainer,
11
+ TrainingArguments,
12
+ logging
13
+ )
14
+
15
+ # Local imports
16
+ from utils import compute_metrics
17
+ from datasets import load_dataset
18
+
19
+ # Standard library imports
20
+ import torch
21
+ import numpy as np
22
+ import pandas as pd
23
+
24
+ # Sklearn metrics
25
+ from sklearn.metrics import (
26
+ precision_recall_fscore_support,
27
+ accuracy_score,
28
+ confusion_matrix
29
+ )
30
+
31
+ # Suppress HF log spam
32
+ logging.set_verbosity_error()
33
+
34
+ # ===== DATASET LOADING =====
35
+
36
def load_imdb_data(subset_size=None):
    """Load the IMDB dataset, optionally subsampled.

    Args:
        subset_size: If given, keep only the first ``subset_size``
            training examples and roughly a quarter as many test
            examples.

    Returns:
        The (possibly subsampled) DatasetDict with "train"/"test" splits.
    """
    dataset = load_dataset("imdb")

    # Optional subsetting for memory constraints
    if subset_size:
        dataset["train"] = dataset["train"].select(range(subset_size))
        # max(..., 1) guards against an empty eval split when
        # subset_size < 4, which would otherwise break evaluation.
        test_size = min(max(subset_size // 4, 1), len(dataset["test"]))
        dataset["test"] = dataset["test"].select(range(test_size))

    print(f"Dataset loaded - Train: {len(dataset['train'])}, Test: {len(dataset['test'])}")
    return dataset
47
+
48
+ # ===== PREPROCESSING =====
49
+
50
def preprocess_data(dataset, max_length=256):
    """Tokenize both splits and format them for PyTorch training.

    Args:
        dataset: DatasetDict with "text" and "label" columns.
        max_length: Maximum token length; longer reviews are truncated.

    Returns:
        Tuple of (tokenized DatasetDict, tokenizer).
    """
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    def _tokenize(batch):
        # Fixed-length padding keeps every batch the same shape.
        return tokenizer(
            batch["text"],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

    tokenized = dataset.map(_tokenize, batched=True)

    # Trainer expects a "labels" column of torch tensors.
    tokenized = tokenized.rename_column("label", "labels")
    tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

    return tokenized, tokenizer
70
+
71
+ # ===== MODEL LOADING =====
72
+
73
def load_model():
    """Return a pre-trained DistilBERT with a fresh 2-label classification head."""
    return AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=2,
        return_dict=True,
    )
81
+
82
+ # ===== TRAINING SETUP =====
83
+
84
def get_training_args():
    """Build the TrainingArguments for fine-tuning.

    Small per-device batches plus gradient accumulation keep the
    effective train batch size at 4 while fitting limited GPU memory.
    """
    args = TrainingArguments(
        output_dir="./model",
        # Tiny batches + accumulation to fit a small GPU.
        per_device_train_batch_size=2,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=2,
        num_train_epochs=3,
        # "eval_strategy" is the current name for "evaluation_strategy".
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_dir="./logs",
        logging_steps=50,
        # Restore the checkpoint with the best F1 after training.
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        seed=42,
    )
    return args
101
+
102
def setup_trainer(model, tokenizer, train_dataset, eval_dataset):
    """Wire model, data, and metrics into a Trainer instance.

    Args:
        model: Sequence-classification model to fine-tune.
        tokenizer: Tokenizer (saved alongside checkpoints by Trainer).
        train_dataset: Tokenized training split.
        eval_dataset: Tokenized evaluation split.

    Returns:
        A configured ``Trainer``.
    """
    return Trainer(
        model=model,
        args=get_training_args(),
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
116
+
117
+ # ===== TRAIN & EVALUATE =====
118
+
119
def train_model(trainer):
    """Run the Trainer's training loop, bracketed by status messages."""
    print("Starting training...")
    trainer.train()  # blocks until all epochs complete
    print("Training completed!")
124
+
125
def evaluate_model(trainer):
    """Evaluate the trained model and print each metric to 4 decimals.

    Args:
        trainer: Trainer whose ``evaluate()`` returns a metrics dict.

    Returns:
        The metrics dict from ``trainer.evaluate()``, unmodified.
    """
    print("Evaluating model...")
    results = trainer.evaluate()

    print("=== Evaluation Results ===")
    for name in results:
        print(f"{name}: {results[name]:.4f}")

    return results
135
+
136
+ # ===== SAVE MODEL =====
137
+
138
def save_model(trainer, tokenizer, save_path="./model"):
    """Persist the trained model and its tokenizer side by side.

    Args:
        trainer: Trainer holding the trained model.
        tokenizer: Tokenizer to save alongside the weights.
        save_path: Destination directory (created by the save calls).
    """
    print(f"Saving model to {save_path}...")
    trainer.save_model(save_path)         # weights + model config
    tokenizer.save_pretrained(save_path)  # vocab + tokenizer config
    print("Model saved successfully!")
utils.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility functions for DistilBERT sentiment analysis
3
+ File: utils.py
4
+ """
5
+
6
+ import numpy as np
7
+ from sklearn.metrics import precision_recall_fscore_support, accuracy_score
8
+
9
def compute_metrics(eval_pred):
    """Compute accuracy/precision/recall/F1 for binary classification.

    Args:
        eval_pred: Tuple of (logits, labels) as supplied by the HF Trainer.

    Returns:
        Dict with 'accuracy', 'f1', 'precision', 'recall'.
    """
    logits, labels = eval_pred
    # argmax over the class dimension turns logits into hard predictions.
    preds = np.argmax(logits, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary'
    )

    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }
wandb/run-20250720_144411-9kwggmdj/files/config.yaml ADDED
@@ -0,0 +1,493 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _name_or_path:
2
+ value: distilbert-base-uncased
3
+ _wandb:
4
+ value:
5
+ cli_version: 0.21.0
6
+ e:
7
+ qd7dze61nxdy0n83hyx7lap6a5tql6xc:
8
+ codePath: main.py
9
+ codePathLocal: main.py
10
+ cpu_count: 4
11
+ cpu_count_logical: 8
12
+ cudaVersion: "12.7"
13
+ disk:
14
+ /:
15
+ total: "255230791680"
16
+ used: "208595525632"
17
+ email: shreshthkapai@gmail.com
18
+ executable: C:\Users\Legion\Miniconda3\envs\ML\python.exe
19
+ gpu: NVIDIA GeForce GTX 1650
20
+ gpu_count: 1
21
+ gpu_nvidia:
22
+ - architecture: Turing
23
+ cudaCores: 1024
24
+ memoryTotal: "4294967296"
25
+ name: NVIDIA GeForce GTX 1650
26
+ uuid: GPU-fbcd7647-fb67-66f5-b8c7-1a4198b7e4fa
27
+ host: DESKTOP-EIHJJJL
28
+ memory:
29
+ total: "8506298368"
30
+ os: Windows-11-10.0.26100-SP0
31
+ program: C:\Users\Legion\desktop\distilbert-sentiment\main.py
32
+ python: CPython 3.13.5
33
+ root: C:\Users\Legion\desktop\distilbert-sentiment
34
+ startedAt: "2025-07-20T09:14:11.312224Z"
35
+ writerId: qd7dze61nxdy0n83hyx7lap6a5tql6xc
36
+ m:
37
+ - "1": train/global_step
38
+ "6":
39
+ - 3
40
+ "7": []
41
+ - "2": '*'
42
+ "5": 1
43
+ "6":
44
+ - 1
45
+ "7": []
46
+ python_version: 3.13.5
47
+ t:
48
+ "1":
49
+ - 1
50
+ - 5
51
+ - 11
52
+ - 41
53
+ - 49
54
+ - 51
55
+ - 53
56
+ - 71
57
+ - 105
58
+ "2":
59
+ - 1
60
+ - 5
61
+ - 11
62
+ - 41
63
+ - 49
64
+ - 51
65
+ - 53
66
+ - 71
67
+ - 105
68
+ "3":
69
+ - 7
70
+ - 13
71
+ - 19
72
+ - 66
73
+ "4": 3.13.5
74
+ "5": 0.21.0
75
+ "6": 4.53.2
76
+ "8":
77
+ - 3
78
+ "9":
79
+ "1": transformers_trainer
80
+ "12": 0.21.0
81
+ "13": windows-amd64
82
+ accelerator_config:
83
+ value:
84
+ dispatch_batches: null
85
+ even_batches: true
86
+ gradient_accumulation_kwargs: null
87
+ non_blocking: false
88
+ split_batches: false
89
+ use_seedable_sampler: true
90
+ activation:
91
+ value: gelu
92
+ adafactor:
93
+ value: false
94
+ adam_beta1:
95
+ value: 0.9
96
+ adam_beta2:
97
+ value: 0.999
98
+ adam_epsilon:
99
+ value: 1e-08
100
+ add_cross_attention:
101
+ value: false
102
+ architectures:
103
+ value:
104
+ - DistilBertForMaskedLM
105
+ attention_dropout:
106
+ value: 0.1
107
+ auto_find_batch_size:
108
+ value: false
109
+ average_tokens_across_devices:
110
+ value: false
111
+ bad_words_ids:
112
+ value: null
113
+ batch_eval_metrics:
114
+ value: false
115
+ begin_suppress_tokens:
116
+ value: null
117
+ bf16:
118
+ value: false
119
+ bf16_full_eval:
120
+ value: false
121
+ bos_token_id:
122
+ value: null
123
+ chunk_size_feed_forward:
124
+ value: 0
125
+ cross_attention_hidden_size:
126
+ value: null
127
+ data_seed:
128
+ value: null
129
+ dataloader_drop_last:
130
+ value: false
131
+ dataloader_num_workers:
132
+ value: 0
133
+ dataloader_persistent_workers:
134
+ value: false
135
+ dataloader_pin_memory:
136
+ value: true
137
+ dataloader_prefetch_factor:
138
+ value: null
139
+ ddp_backend:
140
+ value: null
141
+ ddp_broadcast_buffers:
142
+ value: null
143
+ ddp_bucket_cap_mb:
144
+ value: null
145
+ ddp_find_unused_parameters:
146
+ value: null
147
+ ddp_timeout:
148
+ value: 1800
149
+ debug:
150
+ value: []
151
+ decoder_start_token_id:
152
+ value: null
153
+ deepspeed:
154
+ value: null
155
+ dim:
156
+ value: 768
157
+ disable_tqdm:
158
+ value: true
159
+ diversity_penalty:
160
+ value: 0
161
+ do_eval:
162
+ value: true
163
+ do_predict:
164
+ value: false
165
+ do_sample:
166
+ value: false
167
+ do_train:
168
+ value: false
169
+ dropout:
170
+ value: 0.1
171
+ early_stopping:
172
+ value: false
173
+ encoder_no_repeat_ngram_size:
174
+ value: 0
175
+ eos_token_id:
176
+ value: null
177
+ eval_accumulation_steps:
178
+ value: null
179
+ eval_delay:
180
+ value: 0
181
+ eval_do_concat_batches:
182
+ value: true
183
+ eval_on_start:
184
+ value: false
185
+ eval_steps:
186
+ value: null
187
+ eval_strategy:
188
+ value: epoch
189
+ eval_use_gather_object:
190
+ value: false
191
+ exponential_decay_length_penalty:
192
+ value: null
193
+ finetuning_task:
194
+ value: null
195
+ forced_bos_token_id:
196
+ value: null
197
+ forced_eos_token_id:
198
+ value: null
199
+ fp16:
200
+ value: false
201
+ fp16_backend:
202
+ value: auto
203
+ fp16_full_eval:
204
+ value: false
205
+ fp16_opt_level:
206
+ value: O1
207
+ fsdp:
208
+ value: []
209
+ fsdp_config:
210
+ value:
211
+ min_num_params: 0
212
+ xla: false
213
+ xla_fsdp_grad_ckpt: false
214
+ xla_fsdp_v2: false
215
+ fsdp_min_num_params:
216
+ value: 0
217
+ fsdp_transformer_layer_cls_to_wrap:
218
+ value: null
219
+ full_determinism:
220
+ value: false
221
+ gradient_accumulation_steps:
222
+ value: 1
223
+ gradient_checkpointing:
224
+ value: false
225
+ gradient_checkpointing_kwargs:
226
+ value: null
227
+ greater_is_better:
228
+ value: true
229
+ group_by_length:
230
+ value: false
231
+ half_precision_backend:
232
+ value: auto
233
+ hidden_dim:
234
+ value: 3072
235
+ hub_always_push:
236
+ value: false
237
+ hub_model_id:
238
+ value: null
239
+ hub_private_repo:
240
+ value: null
241
+ hub_revision:
242
+ value: null
243
+ hub_strategy:
244
+ value: every_save
245
+ hub_token:
246
+ value: <HUB_TOKEN>
247
+ id2label:
248
+ value:
249
+ "0": LABEL_0
250
+ "1": LABEL_1
251
+ ignore_data_skip:
252
+ value: false
253
+ include_for_metrics:
254
+ value: []
255
+ include_inputs_for_metrics:
256
+ value: false
257
+ include_num_input_tokens_seen:
258
+ value: false
259
+ include_tokens_per_second:
260
+ value: false
261
+ initializer_range:
262
+ value: 0.02
263
+ is_decoder:
264
+ value: false
265
+ is_encoder_decoder:
266
+ value: false
267
+ jit_mode_eval:
268
+ value: false
269
+ label_names:
270
+ value: null
271
+ label_smoothing_factor:
272
+ value: 0
273
+ label2id:
274
+ value:
275
+ LABEL_0: 0
276
+ LABEL_1: 1
277
+ learning_rate:
278
+ value: 5e-05
279
+ length_column_name:
280
+ value: length
281
+ length_penalty:
282
+ value: 1
283
+ liger_kernel_config:
284
+ value: null
285
+ load_best_model_at_end:
286
+ value: true
287
+ local_rank:
288
+ value: 0
289
+ log_level:
290
+ value: passive
291
+ log_level_replica:
292
+ value: warning
293
+ log_on_each_node:
294
+ value: true
295
+ logging_dir:
296
+ value: ./logs
297
+ logging_first_step:
298
+ value: false
299
+ logging_nan_inf_filter:
300
+ value: true
301
+ logging_steps:
302
+ value: 50
303
+ logging_strategy:
304
+ value: steps
305
+ lr_scheduler_type:
306
+ value: linear
307
+ max_grad_norm:
308
+ value: 1
309
+ max_length:
310
+ value: 20
311
+ max_position_embeddings:
312
+ value: 512
313
+ max_steps:
314
+ value: -1
315
+ metric_for_best_model:
316
+ value: f1
317
+ min_length:
318
+ value: 0
319
+ model/num_parameters:
320
+ value: 66955010
321
+ model_type:
322
+ value: distilbert
323
+ mp_parameters:
324
+ value: ""
325
+ n_heads:
326
+ value: 12
327
+ n_layers:
328
+ value: 6
329
+ neftune_noise_alpha:
330
+ value: null
331
+ no_cuda:
332
+ value: false
333
+ no_repeat_ngram_size:
334
+ value: 0
335
+ num_beam_groups:
336
+ value: 1
337
+ num_beams:
338
+ value: 1
339
+ num_return_sequences:
340
+ value: 1
341
+ num_train_epochs:
342
+ value: 3
343
+ optim:
344
+ value: adamw_torch
345
+ optim_args:
346
+ value: null
347
+ optim_target_modules:
348
+ value: null
349
+ output_attentions:
350
+ value: false
351
+ output_dir:
352
+ value: ./model
353
+ output_hidden_states:
354
+ value: false
355
+ output_scores:
356
+ value: false
357
+ overwrite_output_dir:
358
+ value: false
359
+ pad_token_id:
360
+ value: 0
361
+ past_index:
362
+ value: -1
363
+ per_device_eval_batch_size:
364
+ value: 16
365
+ per_device_train_batch_size:
366
+ value: 8
367
+ per_gpu_eval_batch_size:
368
+ value: null
369
+ per_gpu_train_batch_size:
370
+ value: null
371
+ prediction_loss_only:
372
+ value: false
373
+ prefix:
374
+ value: null
375
+ problem_type:
376
+ value: null
377
+ push_to_hub:
378
+ value: false
379
+ push_to_hub_model_id:
380
+ value: null
381
+ push_to_hub_organization:
382
+ value: null
383
+ push_to_hub_token:
384
+ value: <PUSH_TO_HUB_TOKEN>
385
+ qa_dropout:
386
+ value: 0.1
387
+ ray_scope:
388
+ value: last
389
+ remove_invalid_values:
390
+ value: false
391
+ remove_unused_columns:
392
+ value: true
393
+ repetition_penalty:
394
+ value: 1
395
+ report_to:
396
+ value:
397
+ - wandb
398
+ restore_callback_states_from_checkpoint:
399
+ value: false
400
+ resume_from_checkpoint:
401
+ value: null
402
+ return_dict:
403
+ value: true
404
+ return_dict_in_generate:
405
+ value: false
406
+ run_name:
407
+ value: ./model
408
+ save_on_each_node:
409
+ value: false
410
+ save_only_model:
411
+ value: false
412
+ save_safetensors:
413
+ value: true
414
+ save_steps:
415
+ value: 500
416
+ save_strategy:
417
+ value: epoch
418
+ save_total_limit:
419
+ value: null
420
+ seed:
421
+ value: 42
422
+ sep_token_id:
423
+ value: null
424
+ seq_classif_dropout:
425
+ value: 0.2
426
+ sinusoidal_pos_embds:
427
+ value: false
428
+ skip_memory_metrics:
429
+ value: true
430
+ suppress_tokens:
431
+ value: null
432
+ task_specific_params:
433
+ value: null
434
+ temperature:
435
+ value: 1
436
+ tf_legacy_loss:
437
+ value: false
438
+ tf32:
439
+ value: null
440
+ tie_encoder_decoder:
441
+ value: false
442
+ tie_weights_:
443
+ value: true
444
+ tie_word_embeddings:
445
+ value: true
446
+ tokenizer_class:
447
+ value: null
448
+ top_k:
449
+ value: 50
450
+ top_p:
451
+ value: 1
452
+ torch_compile:
453
+ value: false
454
+ torch_compile_backend:
455
+ value: null
456
+ torch_compile_mode:
457
+ value: null
458
+ torch_dtype:
459
+ value: float32
460
+ torch_empty_cache_steps:
461
+ value: null
462
+ torchdynamo:
463
+ value: null
464
+ torchscript:
465
+ value: false
466
+ tpu_metrics_debug:
467
+ value: false
468
+ tpu_num_cores:
469
+ value: null
470
+ transformers_version:
471
+ value: 4.53.2
472
+ typical_p:
473
+ value: 1
474
+ use_bfloat16:
475
+ value: false
476
+ use_cpu:
477
+ value: false
478
+ use_ipex:
479
+ value: false
480
+ use_legacy_prediction_loop:
481
+ value: false
482
+ use_liger_kernel:
483
+ value: false
484
+ use_mps_device:
485
+ value: false
486
+ vocab_size:
487
+ value: 30522
488
+ warmup_ratio:
489
+ value: 0
490
+ warmup_steps:
491
+ value: 0
492
+ weight_decay:
493
+ value: 0
wandb/run-20250720_144411-9kwggmdj/files/output.log ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {'loss': 0.5564, 'grad_norm': 3.3684804439544678, 'learning_rate': 4.9738666666666665e-05, 'epoch': 0.016}
2
+ {'loss': 0.527, 'grad_norm': 12.206518173217773, 'learning_rate': 4.9472e-05, 'epoch': 0.032}
3
+ {'loss': 0.4263, 'grad_norm': 23.95633316040039, 'learning_rate': 4.9205333333333335e-05, 'epoch': 0.048}
4
+ {'loss': 0.3658, 'grad_norm': 10.685762405395508, 'learning_rate': 4.893866666666667e-05, 'epoch': 0.064}
5
+ {'loss': 0.3694, 'grad_norm': 18.01938819885254, 'learning_rate': 4.8672000000000004e-05, 'epoch': 0.08}
6
+ {'loss': 0.3841, 'grad_norm': 6.812848091125488, 'learning_rate': 4.8405333333333336e-05, 'epoch': 0.096}
7
+ {'loss': 0.3934, 'grad_norm': 4.043306827545166, 'learning_rate': 4.8138666666666674e-05, 'epoch': 0.112}
8
+ {'loss': 0.3622, 'grad_norm': 21.34123992919922, 'learning_rate': 4.7872e-05, 'epoch': 0.128}
9
+ {'loss': 0.4146, 'grad_norm': 27.191320419311523, 'learning_rate': 4.7605333333333337e-05, 'epoch': 0.144}
10
+ {'loss': 0.4305, 'grad_norm': 16.240070343017578, 'learning_rate': 4.733866666666667e-05, 'epoch': 0.16}
11
+ {'loss': 0.403, 'grad_norm': 26.02972984313965, 'learning_rate': 4.7072000000000006e-05, 'epoch': 0.176}
12
+ {'loss': 0.3811, 'grad_norm': 11.078995704650879, 'learning_rate': 4.680533333333334e-05, 'epoch': 0.192}
13
+ {'loss': 0.3766, 'grad_norm': 9.874316215515137, 'learning_rate': 4.653866666666667e-05, 'epoch': 0.208}
14
+ {'loss': 0.3032, 'grad_norm': 18.219112396240234, 'learning_rate': 4.627200000000001e-05, 'epoch': 0.224}
15
+ {'loss': 0.3812, 'grad_norm': 14.96966552734375, 'learning_rate': 4.600533333333333e-05, 'epoch': 0.24}
16
+ {'loss': 0.3765, 'grad_norm': 25.871795654296875, 'learning_rate': 4.573866666666667e-05, 'epoch': 0.256}
17
+ {'loss': 0.3693, 'grad_norm': 3.639224052429199, 'learning_rate': 4.5472e-05, 'epoch': 0.272}
18
+ {'loss': 0.2971, 'grad_norm': 15.468314170837402, 'learning_rate': 4.520533333333333e-05, 'epoch': 0.288}
19
+ {'loss': 0.3572, 'grad_norm': 3.6710922718048096, 'learning_rate': 4.493866666666667e-05, 'epoch': 0.304}
20
+ {'loss': 0.3409, 'grad_norm': 7.864389896392822, 'learning_rate': 4.4672e-05, 'epoch': 0.32}
21
+ {'loss': 0.3285, 'grad_norm': 10.038674354553223, 'learning_rate': 4.440533333333334e-05, 'epoch': 0.336}
22
+ {'loss': 0.3317, 'grad_norm': 13.171808242797852, 'learning_rate': 4.4138666666666665e-05, 'epoch': 0.352}
23
+ {'loss': 0.3674, 'grad_norm': 4.481576919555664, 'learning_rate': 4.3872e-05, 'epoch': 0.368}
24
+ {'loss': 0.3642, 'grad_norm': 6.312211513519287, 'learning_rate': 4.3605333333333334e-05, 'epoch': 0.384}
25
+ {'loss': 0.3386, 'grad_norm': 4.072713851928711, 'learning_rate': 4.3338666666666666e-05, 'epoch': 0.4}
26
+ {'loss': 0.3776, 'grad_norm': 4.920267581939697, 'learning_rate': 4.3072000000000004e-05, 'epoch': 0.416}
27
+ {'loss': 0.3519, 'grad_norm': 13.408978462219238, 'learning_rate': 4.2805333333333335e-05, 'epoch': 0.432}
28
+ {'loss': 0.343, 'grad_norm': 8.910168647766113, 'learning_rate': 4.253866666666667e-05, 'epoch': 0.448}
29
+ {'loss': 0.345, 'grad_norm': 6.50616979598999, 'learning_rate': 4.2272e-05, 'epoch': 0.464}
30
+ {'loss': 0.2931, 'grad_norm': 6.88561487197876, 'learning_rate': 4.2005333333333336e-05, 'epoch': 0.48}
31
+ {'loss': 0.3541, 'grad_norm': 2.813678026199341, 'learning_rate': 4.173866666666667e-05, 'epoch': 0.496}
32
+ {'loss': 0.3005, 'grad_norm': 18.764328002929688, 'learning_rate': 4.1472e-05, 'epoch': 0.512}
33
+ {'loss': 0.3404, 'grad_norm': 13.757184028625488, 'learning_rate': 4.120533333333334e-05, 'epoch': 0.528}
34
+ {'loss': 0.3112, 'grad_norm': 11.426987648010254, 'learning_rate': 4.093866666666667e-05, 'epoch': 0.544}
35
+ {'loss': 0.285, 'grad_norm': 0.7347564697265625, 'learning_rate': 4.0672000000000006e-05, 'epoch': 0.56}
36
+ {'loss': 0.2978, 'grad_norm': 3.315498113632202, 'learning_rate': 4.040533333333333e-05, 'epoch': 0.576}
37
+ {'loss': 0.3928, 'grad_norm': 4.304668426513672, 'learning_rate': 4.013866666666667e-05, 'epoch': 0.592}
38
+ {'loss': 0.2773, 'grad_norm': 0.5143654942512512, 'learning_rate': 3.9872e-05, 'epoch': 0.608}
39
+ {'loss': 0.3937, 'grad_norm': 10.765504837036133, 'learning_rate': 3.960533333333333e-05, 'epoch': 0.624}
40
+ {'loss': 0.2931, 'grad_norm': 3.0576841831207275, 'learning_rate': 3.933866666666667e-05, 'epoch': 0.64}
41
+ {'loss': 0.2899, 'grad_norm': 1.09218430519104, 'learning_rate': 3.9072e-05, 'epoch': 0.656}
42
+ {'loss': 0.3039, 'grad_norm': 9.193467140197754, 'learning_rate': 3.880533333333333e-05, 'epoch': 0.672}
43
+ {'loss': 0.3191, 'grad_norm': 5.1164469718933105, 'learning_rate': 3.8538666666666664e-05, 'epoch': 0.688}
44
+ {'loss': 0.3206, 'grad_norm': 10.537883758544922, 'learning_rate': 3.8272e-05, 'epoch': 0.704}
45
+ {'loss': 0.3196, 'grad_norm': 10.457417488098145, 'learning_rate': 3.800533333333334e-05, 'epoch': 0.72}
46
+ {'loss': 0.3056, 'grad_norm': 2.776677370071411, 'learning_rate': 3.7738666666666665e-05, 'epoch': 0.736}
47
+ {'loss': 0.3273, 'grad_norm': 3.808607578277588, 'learning_rate': 3.7472e-05, 'epoch': 0.752}
48
+ {'loss': 0.3754, 'grad_norm': 8.255670547485352, 'learning_rate': 3.7205333333333334e-05, 'epoch': 0.768}
49
+ {'loss': 0.2756, 'grad_norm': 8.847413063049316, 'learning_rate': 3.6938666666666666e-05, 'epoch': 0.784}
50
+ {'loss': 0.2828, 'grad_norm': 9.775912284851074, 'learning_rate': 3.6672000000000004e-05, 'epoch': 0.8}
51
+ {'loss': 0.363, 'grad_norm': 3.9166083335876465, 'learning_rate': 3.6405333333333335e-05, 'epoch': 0.816}
52
+ {'loss': 0.295, 'grad_norm': 0.43537598848342896, 'learning_rate': 3.6138666666666673e-05, 'epoch': 0.832}
53
+ {'loss': 0.2519, 'grad_norm': 4.3010735511779785, 'learning_rate': 3.5872e-05, 'epoch': 0.848}
54
+ {'loss': 0.3011, 'grad_norm': 3.7882602214813232, 'learning_rate': 3.5605333333333336e-05, 'epoch': 0.864}
55
+ {'loss': 0.3489, 'grad_norm': 5.9410400390625, 'learning_rate': 3.533866666666667e-05, 'epoch': 0.88}
56
+ {'loss': 0.2948, 'grad_norm': 6.711633205413818, 'learning_rate': 3.5072e-05, 'epoch': 0.896}
57
+ {'loss': 0.3465, 'grad_norm': 12.11922836303711, 'learning_rate': 3.480533333333334e-05, 'epoch': 0.912}
58
+ {'loss': 0.3492, 'grad_norm': 5.701395511627197, 'learning_rate': 3.453866666666667e-05, 'epoch': 0.928}
59
+ {'loss': 0.2607, 'grad_norm': 15.726317405700684, 'learning_rate': 3.427200000000001e-05, 'epoch': 0.944}
60
+ {'loss': 0.2862, 'grad_norm': 11.121344566345215, 'learning_rate': 3.400533333333333e-05, 'epoch': 0.96}
61
+ {'loss': 0.2981, 'grad_norm': 4.980706214904785, 'learning_rate': 3.373866666666667e-05, 'epoch': 0.976}
62
+ {'loss': 0.284, 'grad_norm': 6.423090934753418, 'learning_rate': 3.3472e-05, 'epoch': 0.992}
63
+ {'eval_loss': 0.2969476878643036, 'eval_accuracy': 0.89448, 'eval_f1': 0.8900558472951571, 'eval_precision': 0.9290064381416391, 'eval_recall': 0.85424, 'eval_runtime': 511.6142, 'eval_samples_per_second': 48.865, 'eval_steps_per_second': 3.055, 'epoch': 1.0}
64
+ {'loss': 0.2406, 'grad_norm': 3.1205193996429443, 'learning_rate': 3.320533333333333e-05, 'epoch': 1.008}
65
+ {'loss': 0.2386, 'grad_norm': 11.420886039733887, 'learning_rate': 3.293866666666667e-05, 'epoch': 1.024}
66
+ {'loss': 0.2133, 'grad_norm': 0.3266797661781311, 'learning_rate': 3.2672e-05, 'epoch': 1.04}
67
+ {'loss': 0.2388, 'grad_norm': 20.907642364501953, 'learning_rate': 3.240533333333334e-05, 'epoch': 1.056}
68
+ {'loss': 0.2207, 'grad_norm': 34.85378646850586, 'learning_rate': 3.2138666666666664e-05, 'epoch': 1.072}
69
+ {'loss': 0.1863, 'grad_norm': 0.08423929661512375, 'learning_rate': 3.1872e-05, 'epoch': 1.088}
70
+ {'loss': 0.2122, 'grad_norm': 2.1192731857299805, 'learning_rate': 3.1605333333333334e-05, 'epoch': 1.104}
71
+ {'loss': 0.2274, 'grad_norm': 1.2625190019607544, 'learning_rate': 3.1338666666666665e-05, 'epoch': 1.12}
72
+ {'loss': 0.146, 'grad_norm': 0.3231733441352844, 'learning_rate': 3.1072e-05, 'epoch': 1.1360000000000001}
73
+ {'loss': 0.2008, 'grad_norm': 0.6839350461959839, 'learning_rate': 3.0805333333333335e-05, 'epoch': 1.152}
74
+ {'loss': 0.2068, 'grad_norm': 3.0773186683654785, 'learning_rate': 3.0538666666666666e-05, 'epoch': 1.168}
75
+ {'loss': 0.2084, 'grad_norm': 0.05034258961677551, 'learning_rate': 3.0272e-05, 'epoch': 1.184}
76
+ {'loss': 0.2462, 'grad_norm': 11.455129623413086, 'learning_rate': 3.0005333333333336e-05, 'epoch': 1.2}
77
+ {'loss': 0.1906, 'grad_norm': 0.09377483278512955, 'learning_rate': 2.973866666666667e-05, 'epoch': 1.216}
78
+ {'loss': 0.2032, 'grad_norm': 17.590801239013672, 'learning_rate': 2.9472e-05, 'epoch': 1.232}
79
+ {'loss': 0.24, 'grad_norm': 28.78790855407715, 'learning_rate': 2.9205333333333333e-05, 'epoch': 1.248}
80
+ {'loss': 0.1331, 'grad_norm': 1.1610554456710815, 'learning_rate': 2.8938666666666668e-05, 'epoch': 1.264}
81
+ {'loss': 0.2127, 'grad_norm': 0.30296802520751953, 'learning_rate': 2.8672e-05, 'epoch': 1.28}
82
+ {'loss': 0.1867, 'grad_norm': 0.15345898270606995, 'learning_rate': 2.8405333333333334e-05, 'epoch': 1.296}
83
+ {'loss': 0.24, 'grad_norm': 8.489642143249512, 'learning_rate': 2.813866666666667e-05, 'epoch': 1.312}
84
+ {'loss': 0.1471, 'grad_norm': 0.7609522342681885, 'learning_rate': 2.7872000000000004e-05, 'epoch': 1.328}
85
+ {'loss': 0.1787, 'grad_norm': 0.15069647133350372, 'learning_rate': 2.760533333333333e-05, 'epoch': 1.3439999999999999}
86
+ {'loss': 0.2256, 'grad_norm': 0.13076968491077423, 'learning_rate': 2.733866666666667e-05, 'epoch': 1.3599999999999999}
87
+ {'loss': 0.198, 'grad_norm': 0.29645389318466187, 'learning_rate': 2.7072000000000004e-05, 'epoch': 1.376}
88
+ {'loss': 0.2099, 'grad_norm': 9.831048011779785, 'learning_rate': 2.6805333333333332e-05, 'epoch': 1.392}
89
+ {'loss': 0.2126, 'grad_norm': 0.037026241421699524, 'learning_rate': 2.6538666666666667e-05, 'epoch': 1.408}
90
+ {'loss': 0.1393, 'grad_norm': 0.2884507179260254, 'learning_rate': 2.6272000000000002e-05, 'epoch': 1.424}
91
+ {'loss': 0.1837, 'grad_norm': 0.05694892257452011, 'learning_rate': 2.6005333333333337e-05, 'epoch': 1.44}
92
+ {'loss': 0.2323, 'grad_norm': 46.10319137573242, 'learning_rate': 2.5738666666666668e-05, 'epoch': 1.456}
93
+ {'loss': 0.1858, 'grad_norm': 26.698631286621094, 'learning_rate': 2.5472000000000003e-05, 'epoch': 1.472}
94
+ Traceback (most recent call last):
95
+ File "C:\Users\Legion\desktop\distilbert-sentiment\main.py", line 74, in <module>
96
+ main()
97
+ ~~~~^^
98
+ File "C:\Users\Legion\desktop\distilbert-sentiment\main.py", line 68, in main
99
+ train_pipeline(subset_size=args.subset)
100
+ ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
101
+ File "C:\Users\Legion\desktop\distilbert-sentiment\main.py", line 41, in train_pipeline
102
+ train_model(trainer)
103
+ ~~~~~~~~~~~^^^^^^^^^
104
+ File "C:\Users\Legion\desktop\distilbert-sentiment\train.py", line 121, in train_model
105
+ trainer.train()
106
+ ~~~~~~~~~~~~~^^
107
+ File "C:\Users\Legion\Miniconda3\envs\ML\Lib\site-packages\transformers\trainer.py", line 2206, in train
108
+ return inner_training_loop(
109
+ args=args,
110
+ ...<2 lines>...
111
+ ignore_keys_for_eval=ignore_keys_for_eval,
112
+ )
113
+ File "C:\Users\Legion\Miniconda3\envs\ML\Lib\site-packages\transformers\trainer.py", line 2548, in _inner_training_loop
114
+ tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
115
+ File "C:\Users\Legion\Miniconda3\envs\ML\Lib\site-packages\transformers\trainer.py", line 3797, in training_step
116
+ self.accelerator.backward(loss, **kwargs)
117
+ ~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^
118
+ File "C:\Users\Legion\Miniconda3\envs\ML\Lib\site-packages\accelerate\accelerator.py", line 2578, in backward
119
+ loss.backward(**kwargs)
120
+ ~~~~~~~~~~~~~^^^^^^^^^^
121
+ File "C:\Users\Legion\Miniconda3\envs\ML\Lib\site-packages\torch\_tensor.py", line 648, in backward
122
+ torch.autograd.backward(
123
+ ~~~~~~~~~~~~~~~~~~~~~~~^
124
+ self, gradient, retain_graph, create_graph, inputs=inputs
125
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
126
+ )
127
+ ^
128
+ File "C:\Users\Legion\Miniconda3\envs\ML\Lib\site-packages\torch\autograd\__init__.py", line 353, in backward
129
+ _engine_run_backward(
130
+ ~~~~~~~~~~~~~~~~~~~~^
131
+ tensors,
132
+ ^^^^^^^^
133
+ ...<5 lines>...
134
+ accumulate_grad=True,
135
+ ^^^^^^^^^^^^^^^^^^^^^
136
+ )
137
+ ^
138
+ File "C:\Users\Legion\Miniconda3\envs\ML\Lib\site-packages\torch\autograd\graph.py", line 824, in _engine_run_backward
139
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
140
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
141
+ t_outputs, *args, **kwargs
142
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^
143
+ ) # Calls into the C++ engine to run the backward pass
144
+ ^
145
+ RuntimeError: CUDA error: out of memory
146
+ CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
147
+ For debugging consider passing CUDA_LAUNCH_BLOCKING=1
148
+ Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
wandb/run-20250720_144411-9kwggmdj/files/requirements.txt ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==1.9.0
2
+ aiofiles==24.1.0
3
+ aiohappyeyeballs==2.6.1
4
+ aiohttp==3.12.14
5
+ aiosignal==1.4.0
6
+ alembic==1.16.2
7
+ altair==5.5.0
8
+ annotated-types==0.7.0
9
+ anyio==4.9.0
10
+ attrs==25.3.0
11
+ audioop-lts==0.2.1
12
+ blinker==1.9.0
13
+ Bottleneck==1.4.2
14
+ Brotli==1.1.0
15
+ cachetools==6.1.0
16
+ certifi==2025.6.15
17
+ charset-normalizer==3.4.2
18
+ click==8.2.1
19
+ cloudpickle==3.1.1
20
+ colorama==0.4.6
21
+ colorlog==6.9.0
22
+ contourpy==1.3.1
23
+ cycler==0.11.0
24
+ datasets==4.0.0
25
+ dill==0.3.8
26
+ fastapi==0.116.1
27
+ ffmpy==0.6.0
28
+ filelock==3.18.0
29
+ fonttools==4.55.3
30
+ frozenlist==1.7.0
31
+ fsspec==2025.3.0
32
+ gitdb==4.0.12
33
+ GitPython==3.1.44
34
+ gradio==5.37.0
35
+ gradio_client==1.10.4
36
+ greenlet==3.2.3
37
+ groovy==0.1.2
38
+ h11==0.16.0
39
+ httpcore==1.0.9
40
+ httpx==0.28.1
41
+ huggingface-hub==0.33.4
42
+ idna==3.10
43
+ imbalanced-learn==0.13.0
44
+ imblearn==0.0
45
+ Jinja2==3.1.6
46
+ joblib==1.4.2
47
+ jsonschema==4.24.0
48
+ jsonschema-specifications==2025.4.1
49
+ kiwisolver==1.4.8
50
+ llvmlite==0.44.0
51
+ Mako==1.3.10
52
+ markdown-it-py==3.0.0
53
+ MarkupSafe==3.0.2
54
+ matplotlib==3.9.2
55
+ mdurl==0.1.2
56
+ mpmath==1.3.0
57
+ multidict==6.6.3
58
+ multiprocess==0.70.16
59
+ narwhals==1.44.0
60
+ networkx==3.5
61
+ ninja==1.11.1.4
62
+ numba==0.61.2
63
+ numexpr==2.10.2
64
+ numpy==2.1.1
65
+ optuna==4.4.0
66
+ orjson==3.11.0
67
+ packaging==24.2
68
+ pandas==2.2.3
69
+ pillow==11.1.0
70
+ pip==25.1
71
+ platformdirs==4.3.8
72
+ plotly==6.2.0
73
+ propcache==0.3.2
74
+ protobuf==6.31.1
75
+ psutil==7.0.0
76
+ pyarrow==20.0.0
77
+ pybind11==3.0.0
78
+ pydantic==2.11.7
79
+ pydantic_core==2.33.2
80
+ pydeck==0.9.1
81
+ pydub==0.25.1
82
+ Pygments==2.19.2
83
+ pyparsing==3.2.0
84
+ PyQt6==6.7.1
85
+ PyQt6_sip==13.9.1
86
+ python-dateutil==2.9.0.post0
87
+ python-multipart==0.0.20
88
+ pytz==2024.1
89
+ PyYAML==6.0.2
90
+ referencing==0.36.2
91
+ regex==2024.11.6
92
+ requests==2.32.4
93
+ rich==14.0.0
94
+ rpds-py==0.26.0
95
+ ruff==0.12.3
96
+ safehttpx==0.1.6
97
+ safetensors==0.5.3
98
+ scikit-learn==1.5.2
99
+ scipy==1.15.2
100
+ seaborn==0.13.2
101
+ semantic-version==2.10.0
102
+ sentry-sdk==2.33.0
103
+ setuptools==78.1.1
104
+ shap==0.48.0
105
+ shellingham==1.5.4
106
+ sip==6.10.0
107
+ six==1.17.0
108
+ sklearn-compat==0.1.3
109
+ slicer==0.0.8
110
+ smmap==5.0.2
111
+ sniffio==1.3.1
112
+ SQLAlchemy==2.0.41
113
+ starlette==0.47.1
114
+ streamlit==1.46.1
115
+ sympy==1.14.0
116
+ tenacity==9.1.2
117
+ threadpoolctl==3.5.0
118
+ tokenizers==0.21.2
119
+ toml==0.10.2
120
+ tomlkit==0.13.3
121
+ torch==2.7.1+cu118
122
+ torchaudio==2.7.1+cu118
123
+ torchvision==0.22.1
124
+ tornado==6.5.1
125
+ tqdm==4.67.1
126
+ transformers==4.53.2
127
+ typer==0.16.0
128
+ typing_extensions==4.14.0
129
+ typing-inspection==0.4.1
130
+ tzdata==2025.2
131
+ urllib3==2.5.0
132
+ uvicorn==0.35.0
133
+ wandb==0.21.0
134
+ watchdog==6.0.0
135
+ websockets==15.0.1
136
+ wheel==0.45.1
137
+ xgboost==3.0.2
138
+ xxhash==3.5.0
139
+ yarl==1.20.1
wandb/run-20250720_144411-9kwggmdj/files/wandb-metadata.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Windows-11-10.0.26100-SP0",
3
+ "python": "CPython 3.13.5",
4
+ "startedAt": "2025-07-20T09:14:11.312224Z",
5
+ "program": "C:\\Users\\Legion\\desktop\\distilbert-sentiment\\main.py",
6
+ "codePath": "main.py",
7
+ "codePathLocal": "main.py",
8
+ "email": "shreshthkapai@gmail.com",
9
+ "root": "C:\\Users\\Legion\\desktop\\distilbert-sentiment",
10
+ "host": "DESKTOP-EIHJJJL",
11
+ "executable": "C:\\Users\\Legion\\Miniconda3\\envs\\ML\\python.exe",
12
+ "cpu_count": 4,
13
+ "cpu_count_logical": 8,
14
+ "gpu": "NVIDIA GeForce GTX 1650",
15
+ "gpu_count": 1,
16
+ "disk": {
17
+ "/": {
18
+ "total": "255230791680",
19
+ "used": "208595525632"
20
+ }
21
+ },
22
+ "memory": {
23
+ "total": "8506298368"
24
+ },
25
+ "gpu_nvidia": [
26
+ {
27
+ "name": "NVIDIA GeForce GTX 1650",
28
+ "memoryTotal": "4294967296",
29
+ "cudaCores": 1024,
30
+ "architecture": "Turing",
31
+ "uuid": "GPU-fbcd7647-fb67-66f5-b8c7-1a4198b7e4fa"
32
+ }
33
+ ],
34
+ "cudaVersion": "12.7",
35
+ "writerId": "qd7dze61nxdy0n83hyx7lap6a5tql6xc"
36
+ }
wandb/run-20250720_144411-9kwggmdj/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train/grad_norm":26.698631286621094,"train/learning_rate":2.5472000000000003e-05,"eval/samples_per_second":48.865,"_runtime":2637,"eval/runtime":511.6142,"eval/f1":0.8900558472951571,"eval/accuracy":0.89448,"eval/steps_per_second":3.055,"_timestamp":1.7530054751176744e+09,"train/loss":0.1858,"train/global_step":4600,"train/epoch":1.472,"eval/loss":0.2969476878643036,"eval/recall":0.85424,"_wandb":{"runtime":2637},"_step":92,"eval/precision":0.9290064381416391}
wandb/run-20250720_144411-9kwggmdj/logs/debug-internal.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-07-20T14:44:13.6837247+05:30","level":"INFO","msg":"stream: starting","core version":"0.21.0"}
2
+ {"time":"2025-07-20T14:44:14.9093201+05:30","level":"INFO","msg":"stream: created new stream","id":"9kwggmdj"}
3
+ {"time":"2025-07-20T14:44:14.9093201+05:30","level":"INFO","msg":"stream: started","id":"9kwggmdj"}
4
+ {"time":"2025-07-20T14:44:14.9093201+05:30","level":"INFO","msg":"handler: started","stream_id":"9kwggmdj"}
5
+ {"time":"2025-07-20T14:44:14.9093201+05:30","level":"INFO","msg":"sender: started","stream_id":"9kwggmdj"}
6
+ {"time":"2025-07-20T14:44:14.9093201+05:30","level":"INFO","msg":"writer: Do: started","stream_id":"9kwggmdj"}
7
+ {"time":"2025-07-20T15:28:13.4157038+05:30","level":"INFO","msg":"stream: closing","id":"9kwggmdj"}
8
+ {"time":"2025-07-20T15:28:16.7459113+05:30","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-07-20T15:28:17.5720383+05:30","level":"INFO","msg":"sender: closed","stream_id":"9kwggmdj"}
10
+ {"time":"2025-07-20T15:28:17.5720383+05:30","level":"INFO","msg":"handler: closed","stream_id":"9kwggmdj"}
11
+ {"time":"2025-07-20T15:28:17.5720383+05:30","level":"INFO","msg":"writer: Close: closed","stream_id":"9kwggmdj"}
12
+ {"time":"2025-07-20T15:28:17.5820507+05:30","level":"INFO","msg":"stream: closed","id":"9kwggmdj"}
wandb/run-20250720_144411-9kwggmdj/logs/debug.log ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-20 14:44:11,319 INFO MainThread:4228 [wandb_setup.py:_flush():80] Current SDK version is 0.21.0
2
+ 2025-07-20 14:44:11,319 INFO MainThread:4228 [wandb_setup.py:_flush():80] Configure stats pid to 4228
3
+ 2025-07-20 14:44:11,320 INFO MainThread:4228 [wandb_setup.py:_flush():80] Loading settings from C:\Users\Legion\.config\wandb\settings
4
+ 2025-07-20 14:44:11,320 INFO MainThread:4228 [wandb_setup.py:_flush():80] Loading settings from C:\Users\Legion\desktop\distilbert-sentiment\wandb\settings
5
+ 2025-07-20 14:44:11,320 INFO MainThread:4228 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-07-20 14:44:11,320 INFO MainThread:4228 [wandb_init.py:setup_run_log_directory():703] Logging user logs to C:\Users\Legion\desktop\distilbert-sentiment\wandb\run-20250720_144411-9kwggmdj\logs\debug.log
7
+ 2025-07-20 14:44:11,321 INFO MainThread:4228 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to C:\Users\Legion\desktop\distilbert-sentiment\wandb\run-20250720_144411-9kwggmdj\logs\debug-internal.log
8
+ 2025-07-20 14:44:11,321 INFO MainThread:4228 [wandb_init.py:init():830] calling init triggers
9
+ 2025-07-20 14:44:11,321 INFO MainThread:4228 [wandb_init.py:init():835] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-07-20 14:44:11,321 INFO MainThread:4228 [wandb_init.py:init():871] starting backend
12
+ 2025-07-20 14:44:12,739 INFO MainThread:4228 [wandb_init.py:init():874] sending inform_init request
13
+ 2025-07-20 14:44:13,671 INFO MainThread:4228 [wandb_init.py:init():882] backend started and connected
14
+ 2025-07-20 14:44:13,676 INFO MainThread:4228 [wandb_init.py:init():953] updated telemetry
15
+ 2025-07-20 14:44:13,680 INFO MainThread:4228 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout
16
+ 2025-07-20 14:44:15,444 INFO MainThread:4228 [wandb_init.py:init():1029] starting run threads in backend
17
+ 2025-07-20 14:44:15,878 INFO MainThread:4228 [wandb_run.py:_console_start():2458] atexit reg
18
+ 2025-07-20 14:44:15,879 INFO MainThread:4228 [wandb_run.py:_redirect():2306] redirect: wrap_raw
19
+ 2025-07-20 14:44:15,879 INFO MainThread:4228 [wandb_run.py:_redirect():2375] Wrapping output streams.
20
+ 2025-07-20 14:44:15,879 INFO MainThread:4228 [wandb_run.py:_redirect():2398] Redirects installed.
21
+ 2025-07-20 14:44:15,885 INFO MainThread:4228 [wandb_init.py:init():1075] run started, returning control to user process
22
+ 2025-07-20 14:44:15,889 INFO MainThread:4228 [wandb_run.py:_config_callback():1363] config_cb None None {'vocab_size': 30522, 'max_position_embeddings': 512, 'sinusoidal_pos_embds': False, 'n_layers': 6, 'n_heads': 12, 'dim': 768, 'hidden_dim': 3072, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation': 'gelu', 'initializer_range': 0.02, 'qa_dropout': 0.1, 'seq_classif_dropout': 0.2, 'return_dict': True, 'output_hidden_states': False, 'torchscript': False, 'torch_dtype': 'float32', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['DistilBertForMaskedLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': 0, 'eos_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'distilbert-base-uncased', 'transformers_version': '4.53.2', 'model_type': 'distilbert', 'tie_weights_': True, 'output_attentions': False, 'output_dir': './model', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 
'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 16, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './logs', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 50, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './model', 'disable_tqdm': True, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'f1', 'greater_is_better': True, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 
'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': False}
23
+ 2025-07-20 14:44:15,894 INFO MainThread:4228 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 66955010 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x0000015891724590>>
24
+ 2025-07-20 14:44:15,894 INFO MainThread:4228 [wandb_run.py:_config_callback():1363] config_cb model/num_parameters 66955010 None
25
+ 2025-07-20 15:28:12,363 INFO MsgRouterThr:4228 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
wandb/run-20250720_144411-9kwggmdj/run-9kwggmdj.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de1b57c0d7b948fcdacf3f80d9fa12fd8f80b6888eea1c8acc8593a8aa7b62d1
3
+ size 231840
wandb/run-20250720_154435-9xqrzjdo/files/config.yaml ADDED
@@ -0,0 +1,493 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _name_or_path:
2
+ value: distilbert-base-uncased
3
+ _wandb:
4
+ value:
5
+ cli_version: 0.21.0
6
+ e:
7
+ 0ygkgjf4tjw3nzhcstc0bi4ropv1pahk:
8
+ codePath: main.py
9
+ codePathLocal: main.py
10
+ cpu_count: 4
11
+ cpu_count_logical: 8
12
+ cudaVersion: "12.7"
13
+ disk:
14
+ /:
15
+ total: "255230791680"
16
+ used: "225197662208"
17
+ email: shreshthkapai@gmail.com
18
+ executable: C:\Users\Legion\Miniconda3\envs\ML\python.exe
19
+ gpu: NVIDIA GeForce GTX 1650
20
+ gpu_count: 1
21
+ gpu_nvidia:
22
+ - architecture: Turing
23
+ cudaCores: 1024
24
+ memoryTotal: "4294967296"
25
+ name: NVIDIA GeForce GTX 1650
26
+ uuid: GPU-fbcd7647-fb67-66f5-b8c7-1a4198b7e4fa
27
+ host: DESKTOP-EIHJJJL
28
+ memory:
29
+ total: "8506298368"
30
+ os: Windows-11-10.0.26100-SP0
31
+ program: C:\Users\Legion\desktop\distilbert-sentiment\main.py
32
+ python: CPython 3.13.5
33
+ root: C:\Users\Legion\desktop\distilbert-sentiment
34
+ startedAt: "2025-07-20T10:14:35.345095Z"
35
+ writerId: 0ygkgjf4tjw3nzhcstc0bi4ropv1pahk
36
+ m:
37
+ - "1": train/global_step
38
+ "6":
39
+ - 3
40
+ "7": []
41
+ - "2": '*'
42
+ "5": 1
43
+ "6":
44
+ - 1
45
+ "7": []
46
+ python_version: 3.13.5
47
+ t:
48
+ "1":
49
+ - 1
50
+ - 5
51
+ - 11
52
+ - 41
53
+ - 49
54
+ - 51
55
+ - 53
56
+ - 71
57
+ - 105
58
+ "2":
59
+ - 1
60
+ - 5
61
+ - 11
62
+ - 41
63
+ - 49
64
+ - 51
65
+ - 53
66
+ - 71
67
+ - 105
68
+ "3":
69
+ - 7
70
+ - 13
71
+ - 19
72
+ - 66
73
+ "4": 3.13.5
74
+ "5": 0.21.0
75
+ "6": 4.53.2
76
+ "8":
77
+ - 3
78
+ "9":
79
+ "1": transformers_trainer
80
+ "12": 0.21.0
81
+ "13": windows-amd64
82
+ accelerator_config:
83
+ value:
84
+ dispatch_batches: null
85
+ even_batches: true
86
+ gradient_accumulation_kwargs: null
87
+ non_blocking: false
88
+ split_batches: false
89
+ use_seedable_sampler: true
90
+ activation:
91
+ value: gelu
92
+ adafactor:
93
+ value: false
94
+ adam_beta1:
95
+ value: 0.9
96
+ adam_beta2:
97
+ value: 0.999
98
+ adam_epsilon:
99
+ value: 1e-08
100
+ add_cross_attention:
101
+ value: false
102
+ architectures:
103
+ value:
104
+ - DistilBertForMaskedLM
105
+ attention_dropout:
106
+ value: 0.1
107
+ auto_find_batch_size:
108
+ value: false
109
+ average_tokens_across_devices:
110
+ value: false
111
+ bad_words_ids:
112
+ value: null
113
+ batch_eval_metrics:
114
+ value: false
115
+ begin_suppress_tokens:
116
+ value: null
117
+ bf16:
118
+ value: false
119
+ bf16_full_eval:
120
+ value: false
121
+ bos_token_id:
122
+ value: null
123
+ chunk_size_feed_forward:
124
+ value: 0
125
+ cross_attention_hidden_size:
126
+ value: null
127
+ data_seed:
128
+ value: null
129
+ dataloader_drop_last:
130
+ value: false
131
+ dataloader_num_workers:
132
+ value: 0
133
+ dataloader_persistent_workers:
134
+ value: false
135
+ dataloader_pin_memory:
136
+ value: true
137
+ dataloader_prefetch_factor:
138
+ value: null
139
+ ddp_backend:
140
+ value: null
141
+ ddp_broadcast_buffers:
142
+ value: null
143
+ ddp_bucket_cap_mb:
144
+ value: null
145
+ ddp_find_unused_parameters:
146
+ value: null
147
+ ddp_timeout:
148
+ value: 1800
149
+ debug:
150
+ value: []
151
+ decoder_start_token_id:
152
+ value: null
153
+ deepspeed:
154
+ value: null
155
+ dim:
156
+ value: 768
157
+ disable_tqdm:
158
+ value: true
159
+ diversity_penalty:
160
+ value: 0
161
+ do_eval:
162
+ value: true
163
+ do_predict:
164
+ value: false
165
+ do_sample:
166
+ value: false
167
+ do_train:
168
+ value: false
169
+ dropout:
170
+ value: 0.1
171
+ early_stopping:
172
+ value: false
173
+ encoder_no_repeat_ngram_size:
174
+ value: 0
175
+ eos_token_id:
176
+ value: null
177
+ eval_accumulation_steps:
178
+ value: null
179
+ eval_delay:
180
+ value: 0
181
+ eval_do_concat_batches:
182
+ value: true
183
+ eval_on_start:
184
+ value: false
185
+ eval_steps:
186
+ value: null
187
+ eval_strategy:
188
+ value: epoch
189
+ eval_use_gather_object:
190
+ value: false
191
+ exponential_decay_length_penalty:
192
+ value: null
193
+ finetuning_task:
194
+ value: null
195
+ forced_bos_token_id:
196
+ value: null
197
+ forced_eos_token_id:
198
+ value: null
199
+ fp16:
200
+ value: false
201
+ fp16_backend:
202
+ value: auto
203
+ fp16_full_eval:
204
+ value: false
205
+ fp16_opt_level:
206
+ value: O1
207
+ fsdp:
208
+ value: []
209
+ fsdp_config:
210
+ value:
211
+ min_num_params: 0
212
+ xla: false
213
+ xla_fsdp_grad_ckpt: false
214
+ xla_fsdp_v2: false
215
+ fsdp_min_num_params:
216
+ value: 0
217
+ fsdp_transformer_layer_cls_to_wrap:
218
+ value: null
219
+ full_determinism:
220
+ value: false
221
+ gradient_accumulation_steps:
222
+ value: 1
223
+ gradient_checkpointing:
224
+ value: false
225
+ gradient_checkpointing_kwargs:
226
+ value: null
227
+ greater_is_better:
228
+ value: true
229
+ group_by_length:
230
+ value: false
231
+ half_precision_backend:
232
+ value: auto
233
+ hidden_dim:
234
+ value: 3072
235
+ hub_always_push:
236
+ value: false
237
+ hub_model_id:
238
+ value: null
239
+ hub_private_repo:
240
+ value: null
241
+ hub_revision:
242
+ value: null
243
+ hub_strategy:
244
+ value: every_save
245
+ hub_token:
246
+ value: <HUB_TOKEN>
247
+ id2label:
248
+ value:
249
+ "0": LABEL_0
250
+ "1": LABEL_1
251
+ ignore_data_skip:
252
+ value: false
253
+ include_for_metrics:
254
+ value: []
255
+ include_inputs_for_metrics:
256
+ value: false
257
+ include_num_input_tokens_seen:
258
+ value: false
259
+ include_tokens_per_second:
260
+ value: false
261
+ initializer_range:
262
+ value: 0.02
263
+ is_decoder:
264
+ value: false
265
+ is_encoder_decoder:
266
+ value: false
267
+ jit_mode_eval:
268
+ value: false
269
+ label_names:
270
+ value: null
271
+ label_smoothing_factor:
272
+ value: 0
273
+ label2id:
274
+ value:
275
+ LABEL_0: 0
276
+ LABEL_1: 1
277
+ learning_rate:
278
+ value: 5e-05
279
+ length_column_name:
280
+ value: length
281
+ length_penalty:
282
+ value: 1
283
+ liger_kernel_config:
284
+ value: null
285
+ load_best_model_at_end:
286
+ value: true
287
+ local_rank:
288
+ value: 0
289
+ log_level:
290
+ value: passive
291
+ log_level_replica:
292
+ value: warning
293
+ log_on_each_node:
294
+ value: true
295
+ logging_dir:
296
+ value: ./logs
297
+ logging_first_step:
298
+ value: false
299
+ logging_nan_inf_filter:
300
+ value: true
301
+ logging_steps:
302
+ value: 50
303
+ logging_strategy:
304
+ value: steps
305
+ lr_scheduler_type:
306
+ value: linear
307
+ max_grad_norm:
308
+ value: 1
309
+ max_length:
310
+ value: 20
311
+ max_position_embeddings:
312
+ value: 512
313
+ max_steps:
314
+ value: -1
315
+ metric_for_best_model:
316
+ value: f1
317
+ min_length:
318
+ value: 0
319
+ model/num_parameters:
320
+ value: 66955010
321
+ model_type:
322
+ value: distilbert
323
+ mp_parameters:
324
+ value: ""
325
+ n_heads:
326
+ value: 12
327
+ n_layers:
328
+ value: 6
329
+ neftune_noise_alpha:
330
+ value: null
331
+ no_cuda:
332
+ value: false
333
+ no_repeat_ngram_size:
334
+ value: 0
335
+ num_beam_groups:
336
+ value: 1
337
+ num_beams:
338
+ value: 1
339
+ num_return_sequences:
340
+ value: 1
341
+ num_train_epochs:
342
+ value: 3
343
+ optim:
344
+ value: adamw_torch
345
+ optim_args:
346
+ value: null
347
+ optim_target_modules:
348
+ value: null
349
+ output_attentions:
350
+ value: false
351
+ output_dir:
352
+ value: ./model
353
+ output_hidden_states:
354
+ value: false
355
+ output_scores:
356
+ value: false
357
+ overwrite_output_dir:
358
+ value: false
359
+ pad_token_id:
360
+ value: 0
361
+ past_index:
362
+ value: -1
363
+ per_device_eval_batch_size:
364
+ value: 16
365
+ per_device_train_batch_size:
366
+ value: 8
367
+ per_gpu_eval_batch_size:
368
+ value: null
369
+ per_gpu_train_batch_size:
370
+ value: null
371
+ prediction_loss_only:
372
+ value: false
373
+ prefix:
374
+ value: null
375
+ problem_type:
376
+ value: null
377
+ push_to_hub:
378
+ value: false
379
+ push_to_hub_model_id:
380
+ value: null
381
+ push_to_hub_organization:
382
+ value: null
383
+ push_to_hub_token:
384
+ value: <PUSH_TO_HUB_TOKEN>
385
+ qa_dropout:
386
+ value: 0.1
387
+ ray_scope:
388
+ value: last
389
+ remove_invalid_values:
390
+ value: false
391
+ remove_unused_columns:
392
+ value: true
393
+ repetition_penalty:
394
+ value: 1
395
+ report_to:
396
+ value:
397
+ - wandb
398
+ restore_callback_states_from_checkpoint:
399
+ value: false
400
+ resume_from_checkpoint:
401
+ value: null
402
+ return_dict:
403
+ value: true
404
+ return_dict_in_generate:
405
+ value: false
406
+ run_name:
407
+ value: ./model
408
+ save_on_each_node:
409
+ value: false
410
+ save_only_model:
411
+ value: false
412
+ save_safetensors:
413
+ value: true
414
+ save_steps:
415
+ value: 500
416
+ save_strategy:
417
+ value: epoch
418
+ save_total_limit:
419
+ value: null
420
+ seed:
421
+ value: 42
422
+ sep_token_id:
423
+ value: null
424
+ seq_classif_dropout:
425
+ value: 0.2
426
+ sinusoidal_pos_embds:
427
+ value: false
428
+ skip_memory_metrics:
429
+ value: true
430
+ suppress_tokens:
431
+ value: null
432
+ task_specific_params:
433
+ value: null
434
+ temperature:
435
+ value: 1
436
+ tf_legacy_loss:
437
+ value: false
438
+ tf32:
439
+ value: null
440
+ tie_encoder_decoder:
441
+ value: false
442
+ tie_weights_:
443
+ value: true
444
+ tie_word_embeddings:
445
+ value: true
446
+ tokenizer_class:
447
+ value: null
448
+ top_k:
449
+ value: 50
450
+ top_p:
451
+ value: 1
452
+ torch_compile:
453
+ value: false
454
+ torch_compile_backend:
455
+ value: null
456
+ torch_compile_mode:
457
+ value: null
458
+ torch_dtype:
459
+ value: float32
460
+ torch_empty_cache_steps:
461
+ value: null
462
+ torchdynamo:
463
+ value: null
464
+ torchscript:
465
+ value: false
466
+ tpu_metrics_debug:
467
+ value: false
468
+ tpu_num_cores:
469
+ value: null
470
+ transformers_version:
471
+ value: 4.53.2
472
+ typical_p:
473
+ value: 1
474
+ use_bfloat16:
475
+ value: false
476
+ use_cpu:
477
+ value: false
478
+ use_ipex:
479
+ value: false
480
+ use_legacy_prediction_loop:
481
+ value: false
482
+ use_liger_kernel:
483
+ value: false
484
+ use_mps_device:
485
+ value: false
486
+ vocab_size:
487
+ value: 30522
488
+ warmup_ratio:
489
+ value: 0
490
+ warmup_steps:
491
+ value: 0
492
+ weight_decay:
493
+ value: 0
wandb/run-20250720_154435-9xqrzjdo/files/output.log ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {'loss': 0.6215, 'grad_norm': 4.55625057220459, 'learning_rate': 4.9738666666666665e-05, 'epoch': 0.016}
2
+ {'loss': 0.5086, 'grad_norm': 13.619754791259766, 'learning_rate': 4.9472e-05, 'epoch': 0.032}
3
+ {'loss': 0.4128, 'grad_norm': 10.843639373779297, 'learning_rate': 4.9205333333333335e-05, 'epoch': 0.048}
4
+ {'loss': 0.3603, 'grad_norm': 7.094396114349365, 'learning_rate': 4.893866666666667e-05, 'epoch': 0.064}
5
+ {'loss': 0.3572, 'grad_norm': 32.03938674926758, 'learning_rate': 4.8672000000000004e-05, 'epoch': 0.08}
6
+ {'loss': 0.4255, 'grad_norm': 2.2694833278656006, 'learning_rate': 4.8405333333333336e-05, 'epoch': 0.096}
7
+ {'loss': 0.3592, 'grad_norm': 1.1852556467056274, 'learning_rate': 4.8138666666666674e-05, 'epoch': 0.112}
8
+ {'loss': 0.3759, 'grad_norm': 8.895912170410156, 'learning_rate': 4.7872e-05, 'epoch': 0.128}
9
+ {'loss': 0.4246, 'grad_norm': 16.175556182861328, 'learning_rate': 4.7605333333333337e-05, 'epoch': 0.144}
10
+ {'loss': 0.3949, 'grad_norm': 13.036661148071289, 'learning_rate': 4.733866666666667e-05, 'epoch': 0.16}
11
+ {'loss': 0.3442, 'grad_norm': 8.27635669708252, 'learning_rate': 4.7072000000000006e-05, 'epoch': 0.176}
12
+ {'loss': 0.4416, 'grad_norm': 19.103059768676758, 'learning_rate': 4.680533333333334e-05, 'epoch': 0.192}
13
+ {'loss': 0.3638, 'grad_norm': 22.78896713256836, 'learning_rate': 4.653866666666667e-05, 'epoch': 0.208}
14
+ {'loss': 0.2995, 'grad_norm': 21.197683334350586, 'learning_rate': 4.627200000000001e-05, 'epoch': 0.224}
15
+ {'loss': 0.3702, 'grad_norm': 6.810858726501465, 'learning_rate': 4.600533333333333e-05, 'epoch': 0.24}
16
+ {'loss': 0.3149, 'grad_norm': 16.848161697387695, 'learning_rate': 4.573866666666667e-05, 'epoch': 0.256}
17
+ Traceback (most recent call last):
18
+ File "C:\Users\Legion\desktop\distilbert-sentiment\main.py", line 74, in <module>
19
+ main()
20
+ ~~~~^^
21
+ File "C:\Users\Legion\desktop\distilbert-sentiment\main.py", line 68, in main
22
+ train_pipeline(subset_size=args.subset)
23
+ ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
24
+ File "C:\Users\Legion\desktop\distilbert-sentiment\main.py", line 41, in train_pipeline
25
+ train_model(trainer)
26
+ ~~~~~~~~~~~^^^^^^^^^
27
+ File "C:\Users\Legion\desktop\distilbert-sentiment\train.py", line 121, in train_model
28
+ print("Starting training...")
29
+ ^^^^^^^^^^^^^^^
30
+ File "C:\Users\Legion\Miniconda3\envs\ML\Lib\site-packages\transformers\trainer.py", line 2206, in train
31
+ return inner_training_loop(
32
+ args=args,
33
+ ...<2 lines>...
34
+ ignore_keys_for_eval=ignore_keys_for_eval,
35
+ )
36
+ File "C:\Users\Legion\Miniconda3\envs\ML\Lib\site-packages\transformers\trainer.py", line 2553, in _inner_training_loop
37
+ and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
38
+ ~~~~~~~~~~~^^^^^^^^^^^^^^
39
+ KeyboardInterrupt
wandb/run-20250720_154435-9xqrzjdo/files/requirements.txt ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==1.9.0
2
+ aiofiles==24.1.0
3
+ aiohappyeyeballs==2.6.1
4
+ aiohttp==3.12.14
5
+ aiosignal==1.4.0
6
+ alembic==1.16.2
7
+ altair==5.5.0
8
+ annotated-types==0.7.0
9
+ anyio==4.9.0
10
+ attrs==25.3.0
11
+ audioop-lts==0.2.1
12
+ blinker==1.9.0
13
+ Bottleneck==1.4.2
14
+ Brotli==1.1.0
15
+ cachetools==6.1.0
16
+ certifi==2025.6.15
17
+ charset-normalizer==3.4.2
18
+ click==8.2.1
19
+ cloudpickle==3.1.1
20
+ colorama==0.4.6
21
+ colorlog==6.9.0
22
+ contourpy==1.3.1
23
+ cycler==0.11.0
24
+ datasets==4.0.0
25
+ dill==0.3.8
26
+ fastapi==0.116.1
27
+ ffmpy==0.6.0
28
+ filelock==3.18.0
29
+ fonttools==4.55.3
30
+ frozenlist==1.7.0
31
+ fsspec==2025.3.0
32
+ gitdb==4.0.12
33
+ GitPython==3.1.44
34
+ gradio==5.37.0
35
+ gradio_client==1.10.4
36
+ greenlet==3.2.3
37
+ groovy==0.1.2
38
+ h11==0.16.0
39
+ httpcore==1.0.9
40
+ httpx==0.28.1
41
+ huggingface-hub==0.33.4
42
+ idna==3.10
43
+ imbalanced-learn==0.13.0
44
+ imblearn==0.0
45
+ Jinja2==3.1.6
46
+ joblib==1.4.2
47
+ jsonschema==4.24.0
48
+ jsonschema-specifications==2025.4.1
49
+ kiwisolver==1.4.8
50
+ llvmlite==0.44.0
51
+ Mako==1.3.10
52
+ markdown-it-py==3.0.0
53
+ MarkupSafe==3.0.2
54
+ matplotlib==3.9.2
55
+ mdurl==0.1.2
56
+ mpmath==1.3.0
57
+ multidict==6.6.3
58
+ multiprocess==0.70.16
59
+ narwhals==1.44.0
60
+ networkx==3.5
61
+ ninja==1.11.1.4
62
+ numba==0.61.2
63
+ numexpr==2.10.2
64
+ numpy==2.1.1
65
+ optuna==4.4.0
66
+ orjson==3.11.0
67
+ packaging==24.2
68
+ pandas==2.2.3
69
+ pillow==11.1.0
70
+ pip==25.1
71
+ platformdirs==4.3.8
72
+ plotly==6.2.0
73
+ propcache==0.3.2
74
+ protobuf==6.31.1
75
+ psutil==7.0.0
76
+ pyarrow==20.0.0
77
+ pybind11==3.0.0
78
+ pydantic==2.11.7
79
+ pydantic_core==2.33.2
80
+ pydeck==0.9.1
81
+ pydub==0.25.1
82
+ Pygments==2.19.2
83
+ pyparsing==3.2.0
84
+ PyQt6==6.7.1
85
+ PyQt6_sip==13.9.1
86
+ python-dateutil==2.9.0.post0
87
+ python-multipart==0.0.20
88
+ pytz==2024.1
89
+ PyYAML==6.0.2
90
+ referencing==0.36.2
91
+ regex==2024.11.6
92
+ requests==2.32.4
93
+ rich==14.0.0
94
+ rpds-py==0.26.0
95
+ ruff==0.12.3
96
+ safehttpx==0.1.6
97
+ safetensors==0.5.3
98
+ scikit-learn==1.5.2
99
+ scipy==1.15.2
100
+ seaborn==0.13.2
101
+ semantic-version==2.10.0
102
+ sentry-sdk==2.33.0
103
+ setuptools==78.1.1
104
+ shap==0.48.0
105
+ shellingham==1.5.4
106
+ sip==6.10.0
107
+ six==1.17.0
108
+ sklearn-compat==0.1.3
109
+ slicer==0.0.8
110
+ smmap==5.0.2
111
+ sniffio==1.3.1
112
+ SQLAlchemy==2.0.41
113
+ starlette==0.47.1
114
+ streamlit==1.46.1
115
+ sympy==1.14.0
116
+ tenacity==9.1.2
117
+ threadpoolctl==3.5.0
118
+ tokenizers==0.21.2
119
+ toml==0.10.2
120
+ tomlkit==0.13.3
121
+ torch==2.7.1+cu118
122
+ torchaudio==2.7.1+cu118
123
+ torchvision==0.22.1
124
+ tornado==6.5.1
125
+ tqdm==4.67.1
126
+ transformers==4.53.2
127
+ typer==0.16.0
128
+ typing_extensions==4.14.0
129
+ typing-inspection==0.4.1
130
+ tzdata==2025.2
131
+ urllib3==2.5.0
132
+ uvicorn==0.35.0
133
+ wandb==0.21.0
134
+ watchdog==6.0.0
135
+ websockets==15.0.1
136
+ wheel==0.45.1
137
+ xgboost==3.0.2
138
+ xxhash==3.5.0
139
+ yarl==1.20.1
wandb/run-20250720_154435-9xqrzjdo/files/wandb-metadata.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Windows-11-10.0.26100-SP0",
3
+ "python": "CPython 3.13.5",
4
+ "startedAt": "2025-07-20T10:14:35.345095Z",
5
+ "program": "C:\\Users\\Legion\\desktop\\distilbert-sentiment\\main.py",
6
+ "codePath": "main.py",
7
+ "codePathLocal": "main.py",
8
+ "email": "shreshthkapai@gmail.com",
9
+ "root": "C:\\Users\\Legion\\desktop\\distilbert-sentiment",
10
+ "host": "DESKTOP-EIHJJJL",
11
+ "executable": "C:\\Users\\Legion\\Miniconda3\\envs\\ML\\python.exe",
12
+ "cpu_count": 4,
13
+ "cpu_count_logical": 8,
14
+ "gpu": "NVIDIA GeForce GTX 1650",
15
+ "gpu_count": 1,
16
+ "disk": {
17
+ "/": {
18
+ "total": "255230791680",
19
+ "used": "225197662208"
20
+ }
21
+ },
22
+ "memory": {
23
+ "total": "8506298368"
24
+ },
25
+ "gpu_nvidia": [
26
+ {
27
+ "name": "NVIDIA GeForce GTX 1650",
28
+ "memoryTotal": "4294967296",
29
+ "cudaCores": 1024,
30
+ "architecture": "Turing",
31
+ "uuid": "GPU-fbcd7647-fb67-66f5-b8c7-1a4198b7e4fa"
32
+ }
33
+ ],
34
+ "cudaVersion": "12.7",
35
+ "writerId": "0ygkgjf4tjw3nzhcstc0bi4ropv1pahk"
36
+ }
wandb/run-20250720_154435-9xqrzjdo/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train/grad_norm":16.848161697387695,"_runtime":388,"_wandb":{"runtime":388},"train/epoch":0.256,"train/global_step":800,"_timestamp":1.753006850924072e+09,"train/learning_rate":4.573866666666667e-05,"_step":15,"train/loss":0.3149}
wandb/run-20250720_154435-9xqrzjdo/logs/debug-internal.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-07-20T15:44:35.9771205+05:30","level":"INFO","msg":"stream: starting","core version":"0.21.0"}
2
+ {"time":"2025-07-20T15:44:36.9333877+05:30","level":"INFO","msg":"stream: created new stream","id":"9xqrzjdo"}
3
+ {"time":"2025-07-20T15:44:36.9333877+05:30","level":"INFO","msg":"stream: started","id":"9xqrzjdo"}
4
+ {"time":"2025-07-20T15:44:36.9333877+05:30","level":"INFO","msg":"handler: started","stream_id":"9xqrzjdo"}
5
+ {"time":"2025-07-20T15:44:36.9333877+05:30","level":"INFO","msg":"sender: started","stream_id":"9xqrzjdo"}
6
+ {"time":"2025-07-20T15:44:36.9333877+05:30","level":"INFO","msg":"writer: Do: started","stream_id":"9xqrzjdo"}
7
+ {"time":"2025-07-20T15:51:06.1959407+05:30","level":"INFO","msg":"stream: closing","id":"9xqrzjdo"}
8
+ {"time":"2025-07-20T15:51:08.7071239+05:30","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-07-20T15:51:09.1729659+05:30","level":"INFO","msg":"sender: closed","stream_id":"9xqrzjdo"}
10
+ {"time":"2025-07-20T15:51:09.1735011+05:30","level":"INFO","msg":"handler: closed","stream_id":"9xqrzjdo"}
11
+ {"time":"2025-07-20T15:51:09.1735011+05:30","level":"INFO","msg":"writer: Close: closed","stream_id":"9xqrzjdo"}
12
+ {"time":"2025-07-20T15:51:09.1740459+05:30","level":"INFO","msg":"stream: closed","id":"9xqrzjdo"}
wandb/run-20250720_154435-9xqrzjdo/logs/debug.log ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-20 15:44:35,349 INFO MainThread:2896 [wandb_setup.py:_flush():80] Current SDK version is 0.21.0
2
+ 2025-07-20 15:44:35,350 INFO MainThread:2896 [wandb_setup.py:_flush():80] Configure stats pid to 2896
3
+ 2025-07-20 15:44:35,350 INFO MainThread:2896 [wandb_setup.py:_flush():80] Loading settings from C:\Users\Legion\.config\wandb\settings
4
+ 2025-07-20 15:44:35,350 INFO MainThread:2896 [wandb_setup.py:_flush():80] Loading settings from C:\Users\Legion\desktop\distilbert-sentiment\wandb\settings
5
+ 2025-07-20 15:44:35,350 INFO MainThread:2896 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-07-20 15:44:35,350 INFO MainThread:2896 [wandb_init.py:setup_run_log_directory():703] Logging user logs to C:\Users\Legion\desktop\distilbert-sentiment\wandb\run-20250720_154435-9xqrzjdo\logs\debug.log
7
+ 2025-07-20 15:44:35,351 INFO MainThread:2896 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to C:\Users\Legion\desktop\distilbert-sentiment\wandb\run-20250720_154435-9xqrzjdo\logs\debug-internal.log
8
+ 2025-07-20 15:44:35,351 INFO MainThread:2896 [wandb_init.py:init():830] calling init triggers
9
+ 2025-07-20 15:44:35,351 INFO MainThread:2896 [wandb_init.py:init():835] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-07-20 15:44:35,351 INFO MainThread:2896 [wandb_init.py:init():871] starting backend
12
+ 2025-07-20 15:44:35,877 INFO MainThread:2896 [wandb_init.py:init():874] sending inform_init request
13
+ 2025-07-20 15:44:35,966 INFO MainThread:2896 [wandb_init.py:init():882] backend started and connected
14
+ 2025-07-20 15:44:35,969 INFO MainThread:2896 [wandb_init.py:init():953] updated telemetry
15
+ 2025-07-20 15:44:35,973 INFO MainThread:2896 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout
16
+ 2025-07-20 15:44:37,312 INFO MainThread:2896 [wandb_init.py:init():1029] starting run threads in backend
17
+ 2025-07-20 15:44:37,744 INFO MainThread:2896 [wandb_run.py:_console_start():2458] atexit reg
18
+ 2025-07-20 15:44:37,744 INFO MainThread:2896 [wandb_run.py:_redirect():2306] redirect: wrap_raw
19
+ 2025-07-20 15:44:37,744 INFO MainThread:2896 [wandb_run.py:_redirect():2375] Wrapping output streams.
20
+ 2025-07-20 15:44:37,744 INFO MainThread:2896 [wandb_run.py:_redirect():2398] Redirects installed.
21
+ 2025-07-20 15:44:37,750 INFO MainThread:2896 [wandb_init.py:init():1075] run started, returning control to user process
22
+ 2025-07-20 15:44:37,752 INFO MainThread:2896 [wandb_run.py:_config_callback():1363] config_cb None None {'vocab_size': 30522, 'max_position_embeddings': 512, 'sinusoidal_pos_embds': False, 'n_layers': 6, 'n_heads': 12, 'dim': 768, 'hidden_dim': 3072, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation': 'gelu', 'initializer_range': 0.02, 'qa_dropout': 0.1, 'seq_classif_dropout': 0.2, 'return_dict': True, 'output_hidden_states': False, 'torchscript': False, 'torch_dtype': 'float32', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['DistilBertForMaskedLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': 0, 'eos_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'distilbert-base-uncased', 'transformers_version': '4.53.2', 'model_type': 'distilbert', 'tie_weights_': True, 'output_attentions': False, 'output_dir': './model', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 
'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 16, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './logs', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 50, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './model', 'disable_tqdm': True, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'f1', 'greater_is_better': True, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 
'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': False}
23
+ 2025-07-20 15:44:37,756 INFO MainThread:2896 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 66955010 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x0000017E91750440>>
24
+ 2025-07-20 15:44:37,756 INFO MainThread:2896 [wandb_run.py:_config_callback():1363] config_cb model/num_parameters 66955010 None
25
+ 2025-07-20 15:51:06,119 INFO MsgRouterThr:2896 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
wandb/run-20250720_154435-9xqrzjdo/run-9xqrzjdo.wandb ADDED
Binary file (42.9 kB). View file
 
wandb/run-20250720_155338-0h3fksuy/files/config.yaml ADDED
@@ -0,0 +1,494 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _name_or_path:
2
+ value: distilbert-base-uncased
3
+ _wandb:
4
+ value:
5
+ cli_version: 0.21.0
6
+ e:
7
+ fshn6fq4d357dfamunx9x96y44pdzcc6:
8
+ codePath: main.py
9
+ codePathLocal: main.py
10
+ cpu_count: 4
11
+ cpu_count_logical: 8
12
+ cudaVersion: "12.7"
13
+ disk:
14
+ /:
15
+ total: "255230791680"
16
+ used: "233129451520"
17
+ email: shreshthkapai@gmail.com
18
+ executable: C:\Users\Legion\Miniconda3\envs\ML\python.exe
19
+ gpu: NVIDIA GeForce GTX 1650
20
+ gpu_count: 1
21
+ gpu_nvidia:
22
+ - architecture: Turing
23
+ cudaCores: 1024
24
+ memoryTotal: "4294967296"
25
+ name: NVIDIA GeForce GTX 1650
26
+ uuid: GPU-fbcd7647-fb67-66f5-b8c7-1a4198b7e4fa
27
+ host: DESKTOP-EIHJJJL
28
+ memory:
29
+ total: "8506298368"
30
+ os: Windows-11-10.0.26100-SP0
31
+ program: C:\Users\Legion\desktop\distilbert-sentiment\main.py
32
+ python: CPython 3.13.5
33
+ root: C:\Users\Legion\desktop\distilbert-sentiment
34
+ startedAt: "2025-07-20T10:23:38.923772Z"
35
+ writerId: fshn6fq4d357dfamunx9x96y44pdzcc6
36
+ m:
37
+ - "1": train/global_step
38
+ "6":
39
+ - 3
40
+ "7": []
41
+ - "2": '*'
42
+ "5": 1
43
+ "6":
44
+ - 1
45
+ "7": []
46
+ python_version: 3.13.5
47
+ t:
48
+ "1":
49
+ - 1
50
+ - 5
51
+ - 11
52
+ - 41
53
+ - 49
54
+ - 51
55
+ - 53
56
+ - 71
57
+ - 105
58
+ "2":
59
+ - 1
60
+ - 5
61
+ - 11
62
+ - 41
63
+ - 49
64
+ - 51
65
+ - 53
66
+ - 71
67
+ - 105
68
+ "3":
69
+ - 7
70
+ - 13
71
+ - 19
72
+ - 62
73
+ - 66
74
+ "4": 3.13.5
75
+ "5": 0.21.0
76
+ "6": 4.53.2
77
+ "8":
78
+ - 3
79
+ "9":
80
+ "1": transformers_trainer
81
+ "12": 0.21.0
82
+ "13": windows-amd64
83
+ accelerator_config:
84
+ value:
85
+ dispatch_batches: null
86
+ even_batches: true
87
+ gradient_accumulation_kwargs: null
88
+ non_blocking: false
89
+ split_batches: false
90
+ use_seedable_sampler: true
91
+ activation:
92
+ value: gelu
93
+ adafactor:
94
+ value: false
95
+ adam_beta1:
96
+ value: 0.9
97
+ adam_beta2:
98
+ value: 0.999
99
+ adam_epsilon:
100
+ value: 1e-08
101
+ add_cross_attention:
102
+ value: false
103
+ architectures:
104
+ value:
105
+ - DistilBertForMaskedLM
106
+ attention_dropout:
107
+ value: 0.1
108
+ auto_find_batch_size:
109
+ value: false
110
+ average_tokens_across_devices:
111
+ value: false
112
+ bad_words_ids:
113
+ value: null
114
+ batch_eval_metrics:
115
+ value: false
116
+ begin_suppress_tokens:
117
+ value: null
118
+ bf16:
119
+ value: false
120
+ bf16_full_eval:
121
+ value: false
122
+ bos_token_id:
123
+ value: null
124
+ chunk_size_feed_forward:
125
+ value: 0
126
+ cross_attention_hidden_size:
127
+ value: null
128
+ data_seed:
129
+ value: null
130
+ dataloader_drop_last:
131
+ value: false
132
+ dataloader_num_workers:
133
+ value: 0
134
+ dataloader_persistent_workers:
135
+ value: false
136
+ dataloader_pin_memory:
137
+ value: true
138
+ dataloader_prefetch_factor:
139
+ value: null
140
+ ddp_backend:
141
+ value: null
142
+ ddp_broadcast_buffers:
143
+ value: null
144
+ ddp_bucket_cap_mb:
145
+ value: null
146
+ ddp_find_unused_parameters:
147
+ value: null
148
+ ddp_timeout:
149
+ value: 1800
150
+ debug:
151
+ value: []
152
+ decoder_start_token_id:
153
+ value: null
154
+ deepspeed:
155
+ value: null
156
+ dim:
157
+ value: 768
158
+ disable_tqdm:
159
+ value: true
160
+ diversity_penalty:
161
+ value: 0
162
+ do_eval:
163
+ value: true
164
+ do_predict:
165
+ value: false
166
+ do_sample:
167
+ value: false
168
+ do_train:
169
+ value: false
170
+ dropout:
171
+ value: 0.1
172
+ early_stopping:
173
+ value: false
174
+ encoder_no_repeat_ngram_size:
175
+ value: 0
176
+ eos_token_id:
177
+ value: null
178
+ eval_accumulation_steps:
179
+ value: null
180
+ eval_delay:
181
+ value: 0
182
+ eval_do_concat_batches:
183
+ value: true
184
+ eval_on_start:
185
+ value: false
186
+ eval_steps:
187
+ value: null
188
+ eval_strategy:
189
+ value: epoch
190
+ eval_use_gather_object:
191
+ value: false
192
+ exponential_decay_length_penalty:
193
+ value: null
194
+ finetuning_task:
195
+ value: null
196
+ forced_bos_token_id:
197
+ value: null
198
+ forced_eos_token_id:
199
+ value: null
200
+ fp16:
201
+ value: false
202
+ fp16_backend:
203
+ value: auto
204
+ fp16_full_eval:
205
+ value: false
206
+ fp16_opt_level:
207
+ value: O1
208
+ fsdp:
209
+ value: []
210
+ fsdp_config:
211
+ value:
212
+ min_num_params: 0
213
+ xla: false
214
+ xla_fsdp_grad_ckpt: false
215
+ xla_fsdp_v2: false
216
+ fsdp_min_num_params:
217
+ value: 0
218
+ fsdp_transformer_layer_cls_to_wrap:
219
+ value: null
220
+ full_determinism:
221
+ value: false
222
+ gradient_accumulation_steps:
223
+ value: 2
224
+ gradient_checkpointing:
225
+ value: false
226
+ gradient_checkpointing_kwargs:
227
+ value: null
228
+ greater_is_better:
229
+ value: true
230
+ group_by_length:
231
+ value: false
232
+ half_precision_backend:
233
+ value: auto
234
+ hidden_dim:
235
+ value: 3072
236
+ hub_always_push:
237
+ value: false
238
+ hub_model_id:
239
+ value: null
240
+ hub_private_repo:
241
+ value: null
242
+ hub_revision:
243
+ value: null
244
+ hub_strategy:
245
+ value: every_save
246
+ hub_token:
247
+ value: <HUB_TOKEN>
248
+ id2label:
249
+ value:
250
+ "0": LABEL_0
251
+ "1": LABEL_1
252
+ ignore_data_skip:
253
+ value: false
254
+ include_for_metrics:
255
+ value: []
256
+ include_inputs_for_metrics:
257
+ value: false
258
+ include_num_input_tokens_seen:
259
+ value: false
260
+ include_tokens_per_second:
261
+ value: false
262
+ initializer_range:
263
+ value: 0.02
264
+ is_decoder:
265
+ value: false
266
+ is_encoder_decoder:
267
+ value: false
268
+ jit_mode_eval:
269
+ value: false
270
+ label_names:
271
+ value: null
272
+ label_smoothing_factor:
273
+ value: 0
274
+ label2id:
275
+ value:
276
+ LABEL_0: 0
277
+ LABEL_1: 1
278
+ learning_rate:
279
+ value: 5e-05
280
+ length_column_name:
281
+ value: length
282
+ length_penalty:
283
+ value: 1
284
+ liger_kernel_config:
285
+ value: null
286
+ load_best_model_at_end:
287
+ value: true
288
+ local_rank:
289
+ value: 0
290
+ log_level:
291
+ value: passive
292
+ log_level_replica:
293
+ value: warning
294
+ log_on_each_node:
295
+ value: true
296
+ logging_dir:
297
+ value: ./logs
298
+ logging_first_step:
299
+ value: false
300
+ logging_nan_inf_filter:
301
+ value: true
302
+ logging_steps:
303
+ value: 50
304
+ logging_strategy:
305
+ value: steps
306
+ lr_scheduler_type:
307
+ value: linear
308
+ max_grad_norm:
309
+ value: 1
310
+ max_length:
311
+ value: 20
312
+ max_position_embeddings:
313
+ value: 512
314
+ max_steps:
315
+ value: -1
316
+ metric_for_best_model:
317
+ value: f1
318
+ min_length:
319
+ value: 0
320
+ model/num_parameters:
321
+ value: 66955010
322
+ model_type:
323
+ value: distilbert
324
+ mp_parameters:
325
+ value: ""
326
+ n_heads:
327
+ value: 12
328
+ n_layers:
329
+ value: 6
330
+ neftune_noise_alpha:
331
+ value: null
332
+ no_cuda:
333
+ value: false
334
+ no_repeat_ngram_size:
335
+ value: 0
336
+ num_beam_groups:
337
+ value: 1
338
+ num_beams:
339
+ value: 1
340
+ num_return_sequences:
341
+ value: 1
342
+ num_train_epochs:
343
+ value: 3
344
+ optim:
345
+ value: adamw_torch
346
+ optim_args:
347
+ value: null
348
+ optim_target_modules:
349
+ value: null
350
+ output_attentions:
351
+ value: false
352
+ output_dir:
353
+ value: ./model
354
+ output_hidden_states:
355
+ value: false
356
+ output_scores:
357
+ value: false
358
+ overwrite_output_dir:
359
+ value: false
360
+ pad_token_id:
361
+ value: 0
362
+ past_index:
363
+ value: -1
364
+ per_device_eval_batch_size:
365
+ value: 4
366
+ per_device_train_batch_size:
367
+ value: 2
368
+ per_gpu_eval_batch_size:
369
+ value: null
370
+ per_gpu_train_batch_size:
371
+ value: null
372
+ prediction_loss_only:
373
+ value: false
374
+ prefix:
375
+ value: null
376
+ problem_type:
377
+ value: null
378
+ push_to_hub:
379
+ value: false
380
+ push_to_hub_model_id:
381
+ value: null
382
+ push_to_hub_organization:
383
+ value: null
384
+ push_to_hub_token:
385
+ value: <PUSH_TO_HUB_TOKEN>
386
+ qa_dropout:
387
+ value: 0.1
388
+ ray_scope:
389
+ value: last
390
+ remove_invalid_values:
391
+ value: false
392
+ remove_unused_columns:
393
+ value: true
394
+ repetition_penalty:
395
+ value: 1
396
+ report_to:
397
+ value:
398
+ - wandb
399
+ restore_callback_states_from_checkpoint:
400
+ value: false
401
+ resume_from_checkpoint:
402
+ value: null
403
+ return_dict:
404
+ value: true
405
+ return_dict_in_generate:
406
+ value: false
407
+ run_name:
408
+ value: ./model
409
+ save_on_each_node:
410
+ value: false
411
+ save_only_model:
412
+ value: false
413
+ save_safetensors:
414
+ value: true
415
+ save_steps:
416
+ value: 500
417
+ save_strategy:
418
+ value: epoch
419
+ save_total_limit:
420
+ value: null
421
+ seed:
422
+ value: 42
423
+ sep_token_id:
424
+ value: null
425
+ seq_classif_dropout:
426
+ value: 0.2
427
+ sinusoidal_pos_embds:
428
+ value: false
429
+ skip_memory_metrics:
430
+ value: true
431
+ suppress_tokens:
432
+ value: null
433
+ task_specific_params:
434
+ value: null
435
+ temperature:
436
+ value: 1
437
+ tf_legacy_loss:
438
+ value: false
439
+ tf32:
440
+ value: null
441
+ tie_encoder_decoder:
442
+ value: false
443
+ tie_weights_:
444
+ value: true
445
+ tie_word_embeddings:
446
+ value: true
447
+ tokenizer_class:
448
+ value: null
449
+ top_k:
450
+ value: 50
451
+ top_p:
452
+ value: 1
453
+ torch_compile:
454
+ value: false
455
+ torch_compile_backend:
456
+ value: null
457
+ torch_compile_mode:
458
+ value: null
459
+ torch_dtype:
460
+ value: float32
461
+ torch_empty_cache_steps:
462
+ value: null
463
+ torchdynamo:
464
+ value: null
465
+ torchscript:
466
+ value: false
467
+ tpu_metrics_debug:
468
+ value: false
469
+ tpu_num_cores:
470
+ value: null
471
+ transformers_version:
472
+ value: 4.53.2
473
+ typical_p:
474
+ value: 1
475
+ use_bfloat16:
476
+ value: false
477
+ use_cpu:
478
+ value: false
479
+ use_ipex:
480
+ value: false
481
+ use_legacy_prediction_loop:
482
+ value: false
483
+ use_liger_kernel:
484
+ value: false
485
+ use_mps_device:
486
+ value: false
487
+ vocab_size:
488
+ value: 30522
489
+ warmup_ratio:
490
+ value: 0
491
+ warmup_steps:
492
+ value: 0
493
+ weight_decay:
494
+ value: 0
wandb/run-20250720_155338-0h3fksuy/files/output.log ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {'loss': 0.6669, 'grad_norm': 2.2732439041137695, 'learning_rate': 4.9869333333333334e-05, 'epoch': 0.008}
2
+ {'loss': 0.5064, 'grad_norm': 1.3875774145126343, 'learning_rate': 4.9736000000000006e-05, 'epoch': 0.016}
3
+ {'loss': 0.5648, 'grad_norm': 8.71848201751709, 'learning_rate': 4.960266666666667e-05, 'epoch': 0.024}
4
+ {'loss': 0.5271, 'grad_norm': 12.96942138671875, 'learning_rate': 4.946933333333333e-05, 'epoch': 0.032}
5
+ {'loss': 0.5786, 'grad_norm': 10.856029510498047, 'learning_rate': 4.9336e-05, 'epoch': 0.04}
6
+ {'loss': 0.5966, 'grad_norm': 27.62953758239746, 'learning_rate': 4.920266666666667e-05, 'epoch': 0.048}
7
+ {'loss': 0.4653, 'grad_norm': 23.872642517089844, 'learning_rate': 4.9069333333333335e-05, 'epoch': 0.056}
8
+ {'loss': 0.4805, 'grad_norm': 0.23732933402061462, 'learning_rate': 4.893600000000001e-05, 'epoch': 0.064}
9
+ {'loss': 0.5354, 'grad_norm': 20.000877380371094, 'learning_rate': 4.8802666666666666e-05, 'epoch': 0.072}
10
+ {'loss': 0.5105, 'grad_norm': 53.1633415222168, 'learning_rate': 4.866933333333333e-05, 'epoch': 0.08}
11
+ {'loss': 0.6534, 'grad_norm': 13.990724563598633, 'learning_rate': 4.8536000000000004e-05, 'epoch': 0.088}
12
+ {'loss': 0.4999, 'grad_norm': 4.82359504699707, 'learning_rate': 4.840266666666667e-05, 'epoch': 0.096}
13
+ {'loss': 0.5484, 'grad_norm': 0.24129296839237213, 'learning_rate': 4.8269333333333336e-05, 'epoch': 0.104}
14
+ {'loss': 0.4312, 'grad_norm': 0.23947849869728088, 'learning_rate': 4.8136e-05, 'epoch': 0.112}
15
+ {'loss': 0.6295, 'grad_norm': 9.900238037109375, 'learning_rate': 4.800266666666667e-05, 'epoch': 0.12}
16
+ {'loss': 0.5561, 'grad_norm': 52.11263656616211, 'learning_rate': 4.786933333333334e-05, 'epoch': 0.128}
17
+ {'loss': 0.5402, 'grad_norm': 0.6357279419898987, 'learning_rate': 4.7736000000000005e-05, 'epoch': 0.136}
18
+ {'loss': 0.4374, 'grad_norm': 0.25493714213371277, 'learning_rate': 4.760266666666667e-05, 'epoch': 0.144}
19
+ {'loss': 0.693, 'grad_norm': 0.580426812171936, 'learning_rate': 4.7469333333333336e-05, 'epoch': 0.152}
20
+ {'loss': 0.4743, 'grad_norm': 26.836181640625, 'learning_rate': 4.7336e-05, 'epoch': 0.16}
21
+ {'loss': 0.5578, 'grad_norm': 66.35722351074219, 'learning_rate': 4.720266666666667e-05, 'epoch': 0.168}
22
+ {'loss': 0.4348, 'grad_norm': 29.878002166748047, 'learning_rate': 4.706933333333334e-05, 'epoch': 0.176}
23
+ {'loss': 0.5593, 'grad_norm': 5.273430824279785, 'learning_rate': 4.6936e-05, 'epoch': 0.184}
24
+ {'loss': 0.446, 'grad_norm': 113.90290069580078, 'learning_rate': 4.6802666666666665e-05, 'epoch': 0.192}
25
+ {'loss': 0.481, 'grad_norm': 6.638715744018555, 'learning_rate': 4.666933333333334e-05, 'epoch': 0.2}
26
+ {'loss': 0.5115, 'grad_norm': 46.855735778808594, 'learning_rate': 4.6536e-05, 'epoch': 0.208}
27
+ {'loss': 0.5586, 'grad_norm': 18.6956729888916, 'learning_rate': 4.640266666666667e-05, 'epoch': 0.216}
28
+ {'loss': 0.3987, 'grad_norm': 7.595647811889648, 'learning_rate': 4.6269333333333334e-05, 'epoch': 0.224}
29
+ {'loss': 0.482, 'grad_norm': 8.240407943725586, 'learning_rate': 4.6136e-05, 'epoch': 0.232}
30
+ {'loss': 0.3789, 'grad_norm': 0.290462851524353, 'learning_rate': 4.6002666666666666e-05, 'epoch': 0.24}
31
+ {'loss': 0.5113, 'grad_norm': 11.75820541381836, 'learning_rate': 4.586933333333334e-05, 'epoch': 0.248}
32
+ {'loss': 0.4607, 'grad_norm': 11.622576713562012, 'learning_rate': 4.5736000000000004e-05, 'epoch': 0.256}
33
+ {'loss': 0.4786, 'grad_norm': 9.230450630187988, 'learning_rate': 4.560266666666667e-05, 'epoch': 0.264}
34
+ {'loss': 0.4143, 'grad_norm': 0.15386007726192474, 'learning_rate': 4.5469333333333335e-05, 'epoch': 0.272}
35
+ {'loss': 0.4111, 'grad_norm': 5.873915672302246, 'learning_rate': 4.5336e-05, 'epoch': 0.28}
36
+ {'loss': 0.3835, 'grad_norm': 1.5295137166976929, 'learning_rate': 4.5202666666666673e-05, 'epoch': 0.288}
37
+ {'loss': 0.4735, 'grad_norm': 3.8919050693511963, 'learning_rate': 4.506933333333333e-05, 'epoch': 0.296}
38
+ {'loss': 0.4442, 'grad_norm': 1.7330166101455688, 'learning_rate': 4.4936e-05, 'epoch': 0.304}
39
+ {'loss': 0.3383, 'grad_norm': 4.891812324523926, 'learning_rate': 4.480266666666667e-05, 'epoch': 0.312}
40
+ {'loss': 0.5515, 'grad_norm': 91.70783233642578, 'learning_rate': 4.4669333333333336e-05, 'epoch': 0.32}
41
+ {'loss': 0.498, 'grad_norm': 19.019271850585938, 'learning_rate': 4.4536e-05, 'epoch': 0.328}
42
+ {'loss': 0.4775, 'grad_norm': 1.9273958206176758, 'learning_rate': 4.440266666666667e-05, 'epoch': 0.336}
43
+ {'loss': 0.5587, 'grad_norm': 0.28679159283638, 'learning_rate': 4.426933333333333e-05, 'epoch': 0.344}
44
+ {'loss': 0.2665, 'grad_norm': 30.908130645751953, 'learning_rate': 4.4136e-05, 'epoch': 0.352}
45
+ {'loss': 0.3657, 'grad_norm': 28.822193145751953, 'learning_rate': 4.400266666666667e-05, 'epoch': 0.36}
46
+ {'loss': 0.5237, 'grad_norm': 17.60547637939453, 'learning_rate': 4.386933333333334e-05, 'epoch': 0.368}
47
+ {'loss': 0.5005, 'grad_norm': 67.82170104980469, 'learning_rate': 4.3736e-05, 'epoch': 0.376}
48
+ {'loss': 0.5195, 'grad_norm': 0.14335760474205017, 'learning_rate': 4.360266666666667e-05, 'epoch': 0.384}
49
+ {'loss': 0.3884, 'grad_norm': 0.36686429381370544, 'learning_rate': 4.3469333333333334e-05, 'epoch': 0.392}
50
+ {'loss': 0.4424, 'grad_norm': 5.366738796234131, 'learning_rate': 4.3336000000000007e-05, 'epoch': 0.4}
51
+ {'loss': 0.5222, 'grad_norm': 20.56273651123047, 'learning_rate': 4.320266666666667e-05, 'epoch': 0.408}
52
+ {'loss': 0.6078, 'grad_norm': 5.502252578735352, 'learning_rate': 4.306933333333333e-05, 'epoch': 0.416}
53
+ {'loss': 0.5062, 'grad_norm': 16.406768798828125, 'learning_rate': 4.2936000000000004e-05, 'epoch': 0.424}
54
+ {'loss': 0.3554, 'grad_norm': 0.1537816971540451, 'learning_rate': 4.280266666666667e-05, 'epoch': 0.432}
55
+ {'loss': 0.4329, 'grad_norm': 35.78837966918945, 'learning_rate': 4.2669333333333335e-05, 'epoch': 0.44}
56
+ {'loss': 0.4557, 'grad_norm': 8.288016319274902, 'learning_rate': 4.2536e-05, 'epoch': 0.448}
57
+ {'loss': 0.4099, 'grad_norm': 0.16398730874061584, 'learning_rate': 4.2402666666666666e-05, 'epoch': 0.456}
58
+ {'loss': 0.5485, 'grad_norm': 1.4127204418182373, 'learning_rate': 4.226933333333333e-05, 'epoch': 0.464}
59
+ {'loss': 0.4307, 'grad_norm': 0.23355980217456818, 'learning_rate': 4.2136000000000005e-05, 'epoch': 0.472}
60
+ {'loss': 0.422, 'grad_norm': 22.04464340209961, 'learning_rate': 4.200266666666667e-05, 'epoch': 0.48}
61
+ {'loss': 0.3782, 'grad_norm': 0.1723032295703888, 'learning_rate': 4.1869333333333336e-05, 'epoch': 0.488}
62
+ {'loss': 0.5829, 'grad_norm': 8.341532707214355, 'learning_rate': 4.1736e-05, 'epoch': 0.496}
63
+ {'loss': 0.3045, 'grad_norm': 8.966949462890625, 'learning_rate': 4.160266666666667e-05, 'epoch': 0.504}
64
+ {'loss': 0.5763, 'grad_norm': 0.5718376636505127, 'learning_rate': 4.146933333333334e-05, 'epoch': 0.512}
65
+ {'loss': 0.4403, 'grad_norm': 26.9694881439209, 'learning_rate': 4.1336000000000005e-05, 'epoch': 0.52}
66
+ {'loss': 0.4884, 'grad_norm': 49.227210998535156, 'learning_rate': 4.1202666666666664e-05, 'epoch': 0.528}
67
+ {'loss': 0.4022, 'grad_norm': 11.67745304107666, 'learning_rate': 4.106933333333334e-05, 'epoch': 0.536}
68
+ {'loss': 0.5127, 'grad_norm': 35.11846160888672, 'learning_rate': 4.0936e-05, 'epoch': 0.544}
69
+ {'loss': 0.3214, 'grad_norm': 0.11048085242509842, 'learning_rate': 4.080266666666667e-05, 'epoch': 0.552}
70
+ {'loss': 0.4918, 'grad_norm': 0.13820067048072815, 'learning_rate': 4.0669333333333334e-05, 'epoch': 0.56}
71
+ {'loss': 0.4171, 'grad_norm': 0.25762712955474854, 'learning_rate': 4.0536e-05, 'epoch': 0.568}
72
+ {'loss': 0.3955, 'grad_norm': 7.20747709274292, 'learning_rate': 4.0402666666666665e-05, 'epoch': 0.576}
73
+ {'loss': 0.4939, 'grad_norm': 9.815940856933594, 'learning_rate': 4.026933333333334e-05, 'epoch': 0.584}
74
+ {'loss': 0.4533, 'grad_norm': 1.6333082914352417, 'learning_rate': 4.0136e-05, 'epoch': 0.592}
75
+ {'loss': 0.5392, 'grad_norm': 17.70346450805664, 'learning_rate': 4.000266666666667e-05, 'epoch': 0.6}
76
+ {'loss': 0.3454, 'grad_norm': 0.13321377336978912, 'learning_rate': 3.9869333333333335e-05, 'epoch': 0.608}
77
+ {'loss': 0.5585, 'grad_norm': 14.663485527038574, 'learning_rate': 3.9736e-05, 'epoch': 0.616}
78
+ {'loss': 0.4151, 'grad_norm': 19.313182830810547, 'learning_rate': 3.960266666666667e-05, 'epoch': 0.624}
79
+ {'loss': 0.4268, 'grad_norm': 1.0081754922866821, 'learning_rate': 3.946933333333334e-05, 'epoch': 0.632}
80
+ {'loss': 0.4791, 'grad_norm': 16.18073081970215, 'learning_rate': 3.9336e-05, 'epoch': 0.64}
81
+ {'loss': 0.3551, 'grad_norm': 13.099615097045898, 'learning_rate': 3.920266666666667e-05, 'epoch': 0.648}
82
+ {'loss': 0.4028, 'grad_norm': 0.2873060703277588, 'learning_rate': 3.9069333333333336e-05, 'epoch': 0.656}
83
+ {'loss': 0.4578, 'grad_norm': 6.123228073120117, 'learning_rate': 3.8936e-05, 'epoch': 0.664}
84
+ {'loss': 0.3384, 'grad_norm': 15.485557556152344, 'learning_rate': 3.8802666666666674e-05, 'epoch': 0.672}
85
+ {'loss': 0.4599, 'grad_norm': 0.3142613470554352, 'learning_rate': 3.866933333333333e-05, 'epoch': 0.68}
86
+ {'loss': 0.5153, 'grad_norm': 0.08679840713739395, 'learning_rate': 3.8536e-05, 'epoch': 0.688}
87
+ {'loss': 0.4358, 'grad_norm': 4.982065677642822, 'learning_rate': 3.840266666666667e-05, 'epoch': 0.696}
88
+ {'loss': 0.3591, 'grad_norm': 0.03635261580348015, 'learning_rate': 3.8269333333333336e-05, 'epoch': 0.704}
89
+ {'loss': 0.3954, 'grad_norm': 60.536617279052734, 'learning_rate': 3.8136e-05, 'epoch': 0.712}
90
+ {'loss': 0.5115, 'grad_norm': 8.195839881896973, 'learning_rate': 3.800266666666667e-05, 'epoch': 0.72}
91
+ {'loss': 0.5059, 'grad_norm': 6.579557418823242, 'learning_rate': 3.7869333333333334e-05, 'epoch': 0.728}
92
+ {'loss': 0.4229, 'grad_norm': 0.11510950326919556, 'learning_rate': 3.7736e-05, 'epoch': 0.736}
93
+ {'loss': 0.3961, 'grad_norm': 77.99903869628906, 'learning_rate': 3.760266666666667e-05, 'epoch': 0.744}
94
+ {'loss': 0.4849, 'grad_norm': 0.16879667341709137, 'learning_rate': 3.746933333333334e-05, 'epoch': 0.752}
95
+ {'loss': 0.5883, 'grad_norm': 0.2887319326400757, 'learning_rate': 3.7336e-05, 'epoch': 0.76}
96
+ {'loss': 0.5342, 'grad_norm': 7.673031330108643, 'learning_rate': 3.720266666666667e-05, 'epoch': 0.768}
97
+ {'loss': 0.3816, 'grad_norm': 0.26005497574806213, 'learning_rate': 3.7069333333333334e-05, 'epoch': 0.776}
98
+ {'loss': 0.4011, 'grad_norm': 0.15825381875038147, 'learning_rate': 3.693600000000001e-05, 'epoch': 0.784}
99
+ {'loss': 0.4041, 'grad_norm': 9.251302719116211, 'learning_rate': 3.6802666666666666e-05, 'epoch': 0.792}
100
+ {'loss': 0.4039, 'grad_norm': 0.3029502332210541, 'learning_rate': 3.666933333333333e-05, 'epoch': 0.8}
101
+ {'loss': 0.4208, 'grad_norm': 12.424095153808594, 'learning_rate': 3.6536000000000004e-05, 'epoch': 0.808}
102
+ {'loss': 0.3997, 'grad_norm': 0.30793556571006775, 'learning_rate': 3.640266666666667e-05, 'epoch': 0.816}
103
+ {'loss': 0.3738, 'grad_norm': 35.160221099853516, 'learning_rate': 3.6269333333333335e-05, 'epoch': 0.824}
104
+ {'loss': 0.3587, 'grad_norm': 7.636879920959473, 'learning_rate': 3.6136e-05, 'epoch': 0.832}
105
+ {'loss': 0.3555, 'grad_norm': 25.151350021362305, 'learning_rate': 3.600266666666667e-05, 'epoch': 0.84}
106
+ {'loss': 0.3044, 'grad_norm': 0.08254121243953705, 'learning_rate': 3.586933333333333e-05, 'epoch': 0.848}
107
+ {'loss': 0.4428, 'grad_norm': 0.3885553777217865, 'learning_rate': 3.5736000000000005e-05, 'epoch': 0.856}
108
+ {'loss': 0.4558, 'grad_norm': 7.154929161071777, 'learning_rate': 3.560266666666667e-05, 'epoch': 0.864}
109
+ {'loss': 0.4543, 'grad_norm': 21.201276779174805, 'learning_rate': 3.5469333333333336e-05, 'epoch': 0.872}
110
+ {'loss': 0.4821, 'grad_norm': 24.018428802490234, 'learning_rate': 3.5336e-05, 'epoch': 0.88}
111
+ {'loss': 0.2944, 'grad_norm': 0.04061014950275421, 'learning_rate': 3.520266666666667e-05, 'epoch': 0.888}
112
+ {'loss': 0.5307, 'grad_norm': 0.3313358426094055, 'learning_rate': 3.506933333333334e-05, 'epoch': 0.896}
113
+ {'loss': 0.4825, 'grad_norm': 0.49799197912216187, 'learning_rate': 3.4936e-05, 'epoch': 0.904}
114
+ {'loss': 0.3783, 'grad_norm': 7.0047712326049805, 'learning_rate': 3.4802666666666665e-05, 'epoch': 0.912}
115
+ {'loss': 0.3574, 'grad_norm': 0.4024333953857422, 'learning_rate': 3.466933333333334e-05, 'epoch': 0.92}
116
+ {'loss': 0.4618, 'grad_norm': 0.1282418966293335, 'learning_rate': 3.4536e-05, 'epoch': 0.928}
117
+ {'loss': 0.2572, 'grad_norm': 7.137022018432617, 'learning_rate': 3.440266666666667e-05, 'epoch': 0.936}
118
+ {'loss': 0.3708, 'grad_norm': 14.057695388793945, 'learning_rate': 3.4269333333333334e-05, 'epoch': 0.944}
119
+ {'loss': 0.3674, 'grad_norm': 0.06444621086120605, 'learning_rate': 3.4136e-05, 'epoch': 0.952}
120
+ {'loss': 0.3021, 'grad_norm': 6.360774993896484, 'learning_rate': 3.4002666666666665e-05, 'epoch': 0.96}
121
+ {'loss': 0.5396, 'grad_norm': 6.526275157928467, 'learning_rate': 3.386933333333334e-05, 'epoch': 0.968}
122
+ {'loss': 0.3749, 'grad_norm': 1.6904962062835693, 'learning_rate': 3.3736000000000004e-05, 'epoch': 0.976}
123
+ {'loss': 0.4539, 'grad_norm': 0.31892621517181396, 'learning_rate': 3.360266666666666e-05, 'epoch': 0.984}
124
+ {'loss': 0.4297, 'grad_norm': 0.15581363439559937, 'learning_rate': 3.3469333333333335e-05, 'epoch': 0.992}
125
+ {'loss': 0.347, 'grad_norm': 23.677379608154297, 'learning_rate': 3.3336e-05, 'epoch': 1.0}
126
+ {'eval_loss': 0.4047098457813263, 'eval_accuracy': 0.8872, 'eval_f1': 0.8824412206103052, 'eval_precision': 0.9213091922005571, 'eval_recall': 0.84672, 'eval_runtime': 387.6575, 'eval_samples_per_second': 64.49, 'eval_steps_per_second': 16.122, 'epoch': 1.0}
127
+ {'loss': 0.2086, 'grad_norm': 6.972434043884277, 'learning_rate': 3.320266666666667e-05, 'epoch': 1.008}
128
+ {'loss': 0.2871, 'grad_norm': 19.224170684814453, 'learning_rate': 3.306933333333334e-05, 'epoch': 1.016}
129
+ {'loss': 0.3594, 'grad_norm': 18.384265899658203, 'learning_rate': 3.2936e-05, 'epoch': 1.024}
130
+ {'loss': 0.2588, 'grad_norm': 0.04717381298542023, 'learning_rate': 3.280266666666667e-05, 'epoch': 1.032}
131
+ {'loss': 0.2305, 'grad_norm': 0.06328094005584717, 'learning_rate': 3.2669333333333336e-05, 'epoch': 1.04}
132
+ {'loss': 0.2759, 'grad_norm': 19.435848236083984, 'learning_rate': 3.2536e-05, 'epoch': 1.048}
133
+ {'loss': 0.31, 'grad_norm': 23.59215545654297, 'learning_rate': 3.240266666666667e-05, 'epoch': 1.056}
134
+ {'loss': 0.2949, 'grad_norm': 0.08341117948293686, 'learning_rate': 3.226933333333333e-05, 'epoch': 1.064}
135
+ {'loss': 0.3302, 'grad_norm': 6.222505569458008, 'learning_rate': 3.2136e-05, 'epoch': 1.072}
136
+ {'loss': 0.2555, 'grad_norm': 0.12312066555023193, 'learning_rate': 3.200266666666667e-05, 'epoch': 1.08}
137
+ {'loss': 0.2143, 'grad_norm': 0.026080487295985222, 'learning_rate': 3.186933333333334e-05, 'epoch': 1.088}
138
+ {'loss': 0.2562, 'grad_norm': 0.078591488301754, 'learning_rate': 3.1736e-05, 'epoch': 1.096}
139
+ {'loss': 0.2386, 'grad_norm': 4.485652923583984, 'learning_rate': 3.160266666666667e-05, 'epoch': 1.104}
140
+ {'loss': 0.2672, 'grad_norm': 0.03333387151360512, 'learning_rate': 3.1469333333333334e-05, 'epoch': 1.112}
141
+ {'loss': 0.3072, 'grad_norm': 6.5600905418396, 'learning_rate': 3.1336000000000006e-05, 'epoch': 1.12}
142
+ {'loss': 0.1519, 'grad_norm': 0.8171893954277039, 'learning_rate': 3.120266666666667e-05, 'epoch': 1.1280000000000001}
143
+ {'loss': 0.201, 'grad_norm': 0.036586660891771317, 'learning_rate': 3.106933333333333e-05, 'epoch': 1.1360000000000001}
144
+ {'loss': 0.1602, 'grad_norm': 0.021485593169927597, 'learning_rate': 3.0936e-05, 'epoch': 1.144}
145
+ {'loss': 0.3508, 'grad_norm': 0.039208535104990005, 'learning_rate': 3.080266666666667e-05, 'epoch': 1.152}
146
+ {'loss': 0.2754, 'grad_norm': 0.05171596258878708, 'learning_rate': 3.0669333333333335e-05, 'epoch': 1.16}
147
+ {'loss': 0.2923, 'grad_norm': 0.48278653621673584, 'learning_rate': 3.0536e-05, 'epoch': 1.168}
148
+ {'loss': 0.2007, 'grad_norm': 0.03526414930820465, 'learning_rate': 3.040266666666667e-05, 'epoch': 1.176}
149
+ {'loss': 0.2515, 'grad_norm': 0.06429073214530945, 'learning_rate': 3.0269333333333332e-05, 'epoch': 1.184}
150
+ {'loss': 0.4943, 'grad_norm': 0.37491822242736816, 'learning_rate': 3.0136000000000004e-05, 'epoch': 1.192}
151
+ {'loss': 0.3227, 'grad_norm': 0.2309693992137909, 'learning_rate': 3.0002666666666666e-05, 'epoch': 1.2}
152
+ {'loss': 0.2738, 'grad_norm': 0.12286447733640671, 'learning_rate': 2.9869333333333332e-05, 'epoch': 1.208}
153
+ {'loss': 0.2563, 'grad_norm': 0.04936458542943001, 'learning_rate': 2.9736e-05, 'epoch': 1.216}
154
+ {'loss': 0.3207, 'grad_norm': 0.4713517725467682, 'learning_rate': 2.9602666666666667e-05, 'epoch': 1.224}
155
+ {'loss': 0.2795, 'grad_norm': 5.336559295654297, 'learning_rate': 2.9469333333333333e-05, 'epoch': 1.232}
156
+ {'loss': 0.3249, 'grad_norm': 1.1011492013931274, 'learning_rate': 2.9336000000000002e-05, 'epoch': 1.24}
157
+ {'loss': 0.3201, 'grad_norm': 8.649012565612793, 'learning_rate': 2.9202666666666667e-05, 'epoch': 1.248}
158
+ {'loss': 0.2676, 'grad_norm': 0.059582602232694626, 'learning_rate': 2.9069333333333336e-05, 'epoch': 1.256}
159
+ {'loss': 0.1626, 'grad_norm': 0.08991962671279907, 'learning_rate': 2.8936000000000002e-05, 'epoch': 1.264}
160
+ {'loss': 0.1377, 'grad_norm': 0.03133632242679596, 'learning_rate': 2.8802666666666668e-05, 'epoch': 1.272}
161
+ {'loss': 0.3352, 'grad_norm': 0.08367053419351578, 'learning_rate': 2.8669333333333337e-05, 'epoch': 1.28}
162
+ {'loss': 0.2317, 'grad_norm': 0.022324958816170692, 'learning_rate': 2.8536000000000003e-05, 'epoch': 1.288}
163
+ {'loss': 0.2879, 'grad_norm': 0.0481320321559906, 'learning_rate': 2.8402666666666665e-05, 'epoch': 1.296}
164
+ {'loss': 0.3309, 'grad_norm': 0.05768590420484543, 'learning_rate': 2.8269333333333337e-05, 'epoch': 1.304}
165
+ {'loss': 0.3335, 'grad_norm': 0.3378739356994629, 'learning_rate': 2.8136e-05, 'epoch': 1.312}
166
+ {'loss': 0.1505, 'grad_norm': 0.04841599985957146, 'learning_rate': 2.8002666666666665e-05, 'epoch': 1.32}
167
+ {'loss': 0.3089, 'grad_norm': 0.0761469379067421, 'learning_rate': 2.7869333333333338e-05, 'epoch': 1.328}
168
+ {'loss': 0.3643, 'grad_norm': 0.7006823420524597, 'learning_rate': 2.7736e-05, 'epoch': 1.336}
169
+ {'loss': 0.2356, 'grad_norm': 12.694981575012207, 'learning_rate': 2.7602666666666666e-05, 'epoch': 1.3439999999999999}
170
+ {'loss': 0.3755, 'grad_norm': 8.449514389038086, 'learning_rate': 2.7469333333333335e-05, 'epoch': 1.3519999999999999}
171
+ {'loss': 0.1797, 'grad_norm': 0.21266134083271027, 'learning_rate': 2.7336e-05, 'epoch': 1.3599999999999999}
172
+ {'loss': 0.2732, 'grad_norm': 0.2098928540945053, 'learning_rate': 2.720266666666667e-05, 'epoch': 1.3679999999999999}
173
+ {'loss': 0.2037, 'grad_norm': 0.09150354564189911, 'learning_rate': 2.7069333333333335e-05, 'epoch': 1.376}
174
+ {'loss': 0.2829, 'grad_norm': 0.0541178435087204, 'learning_rate': 2.6936e-05, 'epoch': 1.384}
175
+ {'loss': 0.3053, 'grad_norm': 0.387103408575058, 'learning_rate': 2.680266666666667e-05, 'epoch': 1.392}
176
+ {'loss': 0.3136, 'grad_norm': 0.058676812797784805, 'learning_rate': 2.6669333333333336e-05, 'epoch': 1.4}
177
+ {'loss': 0.2385, 'grad_norm': 0.05689304694533348, 'learning_rate': 2.6536e-05, 'epoch': 1.408}
178
+ {'loss': 0.2023, 'grad_norm': 0.05833113566040993, 'learning_rate': 2.640266666666667e-05, 'epoch': 1.416}
179
+ {'loss': 0.1863, 'grad_norm': 0.5199909806251526, 'learning_rate': 2.6269333333333336e-05, 'epoch': 1.424}
180
+ {'loss': 0.1431, 'grad_norm': 0.0910777747631073, 'learning_rate': 2.6136e-05, 'epoch': 1.432}
181
+ {'loss': 0.2813, 'grad_norm': 0.19901159405708313, 'learning_rate': 2.600266666666667e-05, 'epoch': 1.44}
182
+ {'loss': 0.2673, 'grad_norm': 20.136131286621094, 'learning_rate': 2.5869333333333333e-05, 'epoch': 1.448}
183
+ {'loss': 0.2245, 'grad_norm': 78.39840698242188, 'learning_rate': 2.5736e-05, 'epoch': 1.456}
184
+ {'loss': 0.3301, 'grad_norm': 27.54892921447754, 'learning_rate': 2.5602666666666668e-05, 'epoch': 1.464}
185
+ {'loss': 0.255, 'grad_norm': 0.4987935423851013, 'learning_rate': 2.5469333333333334e-05, 'epoch': 1.472}
186
+ {'loss': 0.2217, 'grad_norm': 0.08215348422527313, 'learning_rate': 2.5336e-05, 'epoch': 1.48}
187
+ {'loss': 0.3774, 'grad_norm': 6.633873462677002, 'learning_rate': 2.520266666666667e-05, 'epoch': 1.488}
188
+ {'loss': 0.2809, 'grad_norm': 0.473483145236969, 'learning_rate': 2.5069333333333334e-05, 'epoch': 1.496}
189
+ {'loss': 0.1802, 'grad_norm': 0.1088651642203331, 'learning_rate': 2.4936e-05, 'epoch': 1.504}
190
+ {'loss': 0.3397, 'grad_norm': 0.15446412563323975, 'learning_rate': 2.480266666666667e-05, 'epoch': 1.512}
191
+ {'loss': 0.2506, 'grad_norm': 0.13606055080890656, 'learning_rate': 2.4669333333333335e-05, 'epoch': 1.52}
192
+ {'loss': 0.2989, 'grad_norm': 0.12229656428098679, 'learning_rate': 2.4536e-05, 'epoch': 1.528}
193
+ {'loss': 0.175, 'grad_norm': 0.09148360043764114, 'learning_rate': 2.440266666666667e-05, 'epoch': 1.536}
194
+ {'loss': 0.3552, 'grad_norm': 0.07437633723020554, 'learning_rate': 2.4269333333333335e-05, 'epoch': 1.544}
195
+ {'loss': 0.2242, 'grad_norm': 98.57760620117188, 'learning_rate': 2.4136e-05, 'epoch': 1.552}
196
+ {'loss': 0.2344, 'grad_norm': 0.24384742975234985, 'learning_rate': 2.4002666666666666e-05, 'epoch': 1.56}
197
+ {'loss': 0.2868, 'grad_norm': 0.06279865652322769, 'learning_rate': 2.3869333333333335e-05, 'epoch': 1.568}
198
+ {'loss': 0.2874, 'grad_norm': 0.1516159474849701, 'learning_rate': 2.3736e-05, 'epoch': 1.576}
199
+ {'loss': 0.1706, 'grad_norm': 0.02717330865561962, 'learning_rate': 2.3602666666666667e-05, 'epoch': 1.584}
200
+ {'loss': 0.3318, 'grad_norm': 2.2730720043182373, 'learning_rate': 2.3469333333333336e-05, 'epoch': 1.592}
201
+ {'loss': 0.2772, 'grad_norm': 0.027159368619322777, 'learning_rate': 2.3336e-05, 'epoch': 1.6}
202
+ {'loss': 0.2545, 'grad_norm': 0.44568705558776855, 'learning_rate': 2.3202666666666667e-05, 'epoch': 1.608}
203
+ {'loss': 0.3444, 'grad_norm': 17.193021774291992, 'learning_rate': 2.3069333333333333e-05, 'epoch': 1.616}
204
+ {'loss': 0.1768, 'grad_norm': 0.15403099358081818, 'learning_rate': 2.2936000000000002e-05, 'epoch': 1.624}
205
+ {'loss': 0.1226, 'grad_norm': 251.0621337890625, 'learning_rate': 2.2802666666666668e-05, 'epoch': 1.6320000000000001}
206
+ {'loss': 0.2795, 'grad_norm': 25.017301559448242, 'learning_rate': 2.2669333333333333e-05, 'epoch': 1.6400000000000001}
207
+ {'loss': 0.3253, 'grad_norm': 49.36235427856445, 'learning_rate': 2.2536000000000002e-05, 'epoch': 1.6480000000000001}
208
+ {'loss': 0.3206, 'grad_norm': 0.045104943215847015, 'learning_rate': 2.2402666666666668e-05, 'epoch': 1.6560000000000001}
209
+ {'loss': 0.2719, 'grad_norm': 0.21639679372310638, 'learning_rate': 2.2269333333333334e-05, 'epoch': 1.6640000000000001}
210
+ {'loss': 0.3777, 'grad_norm': 0.08187518268823624, 'learning_rate': 2.2136000000000003e-05, 'epoch': 1.6720000000000002}
211
+ {'loss': 0.2435, 'grad_norm': 0.08419207483530045, 'learning_rate': 2.200266666666667e-05, 'epoch': 1.6800000000000002}
212
+ {'loss': 0.2798, 'grad_norm': 32.25635528564453, 'learning_rate': 2.1869333333333334e-05, 'epoch': 1.688}
213
+ {'loss': 0.2435, 'grad_norm': 0.03352827951312065, 'learning_rate': 2.1736e-05, 'epoch': 1.696}
214
+ {'loss': 0.2896, 'grad_norm': 0.11488524079322815, 'learning_rate': 2.160266666666667e-05, 'epoch': 1.704}
215
+ {'loss': 0.137, 'grad_norm': 0.9820640087127686, 'learning_rate': 2.1469333333333335e-05, 'epoch': 1.712}
216
+ {'loss': 0.2503, 'grad_norm': 0.0872233659029007, 'learning_rate': 2.1336e-05, 'epoch': 1.72}
217
+ {'loss': 0.331, 'grad_norm': 0.07821047306060791, 'learning_rate': 2.120266666666667e-05, 'epoch': 1.728}
218
+ {'loss': 0.2292, 'grad_norm': 13.81276798248291, 'learning_rate': 2.1069333333333335e-05, 'epoch': 1.736}
219
+ {'loss': 0.2239, 'grad_norm': 14.37901782989502, 'learning_rate': 2.0936e-05, 'epoch': 1.744}
220
+ {'loss': 0.2351, 'grad_norm': 0.09311486035585403, 'learning_rate': 2.0802666666666666e-05, 'epoch': 1.752}
221
+ {'loss': 0.2493, 'grad_norm': 0.04642907530069351, 'learning_rate': 2.0669333333333336e-05, 'epoch': 1.76}
222
+ {'loss': 0.2468, 'grad_norm': 187.65907287597656, 'learning_rate': 2.0536e-05, 'epoch': 1.768}
223
+ {'loss': 0.2195, 'grad_norm': 0.3666624128818512, 'learning_rate': 2.0402666666666667e-05, 'epoch': 1.776}
224
+ {'loss': 0.2567, 'grad_norm': 28.931724548339844, 'learning_rate': 2.0269333333333336e-05, 'epoch': 1.784}
225
+ {'loss': 0.2707, 'grad_norm': 110.09719848632812, 'learning_rate': 2.0136e-05, 'epoch': 1.792}
226
+ {'loss': 0.2216, 'grad_norm': 0.025822747498750687, 'learning_rate': 2.0002666666666667e-05, 'epoch': 1.8}
227
+ {'loss': 0.165, 'grad_norm': 21.93601417541504, 'learning_rate': 1.9869333333333333e-05, 'epoch': 1.808}
228
+ {'loss': 0.2316, 'grad_norm': 0.23445022106170654, 'learning_rate': 1.9736000000000002e-05, 'epoch': 1.8159999999999998}
229
+ {'loss': 0.3018, 'grad_norm': 24.560941696166992, 'learning_rate': 1.9602666666666668e-05, 'epoch': 1.8239999999999998}
230
+ {'loss': 0.1176, 'grad_norm': 0.01924316957592964, 'learning_rate': 1.9469333333333333e-05, 'epoch': 1.8319999999999999}
231
+ {'loss': 0.3031, 'grad_norm': 0.3726535439491272, 'learning_rate': 1.9336000000000003e-05, 'epoch': 1.8399999999999999}
232
+ {'loss': 0.2523, 'grad_norm': 0.10653215646743774, 'learning_rate': 1.9202666666666668e-05, 'epoch': 1.8479999999999999}
233
+ {'loss': 0.243, 'grad_norm': 0.07101219147443771, 'learning_rate': 1.9069333333333334e-05, 'epoch': 1.8559999999999999}
234
+ {'loss': 0.2008, 'grad_norm': 0.12322711199522018, 'learning_rate': 1.8936e-05, 'epoch': 1.8639999999999999}
235
+ {'loss': 0.2249, 'grad_norm': 0.09139817208051682, 'learning_rate': 1.880266666666667e-05, 'epoch': 1.8719999999999999}
236
+ {'loss': 0.2285, 'grad_norm': 31.605588912963867, 'learning_rate': 1.8669333333333334e-05, 'epoch': 1.88}
237
+ {'loss': 0.308, 'grad_norm': 0.2888055145740509, 'learning_rate': 1.8536e-05, 'epoch': 1.888}
238
+ {'loss': 0.2119, 'grad_norm': 0.16984781622886658, 'learning_rate': 1.840266666666667e-05, 'epoch': 1.896}
239
+ {'loss': 0.1807, 'grad_norm': 0.018442299216985703, 'learning_rate': 1.8269333333333335e-05, 'epoch': 1.904}
240
+ {'loss': 0.2367, 'grad_norm': 0.05777069553732872, 'learning_rate': 1.8136e-05, 'epoch': 1.912}
241
+ {'loss': 0.1747, 'grad_norm': 0.06527545303106308, 'learning_rate': 1.8002666666666666e-05, 'epoch': 1.92}
242
+ {'loss': 0.3092, 'grad_norm': 0.0599406436085701, 'learning_rate': 1.7869333333333335e-05, 'epoch': 1.928}
243
+ {'loss': 0.3103, 'grad_norm': 113.66268157958984, 'learning_rate': 1.7736e-05, 'epoch': 1.936}
244
+ {'loss': 0.2114, 'grad_norm': 0.2484273612499237, 'learning_rate': 1.7602666666666667e-05, 'epoch': 1.944}
245
+ {'loss': 0.2138, 'grad_norm': 0.0685097873210907, 'learning_rate': 1.7469333333333336e-05, 'epoch': 1.952}
246
+ {'loss': 0.178, 'grad_norm': 0.08626335859298706, 'learning_rate': 1.7335999999999998e-05, 'epoch': 1.96}
247
+ {'loss': 0.3075, 'grad_norm': 0.18472443521022797, 'learning_rate': 1.7202666666666667e-05, 'epoch': 1.968}
248
+ {'loss': 0.2595, 'grad_norm': 0.09902197122573853, 'learning_rate': 1.7069333333333333e-05, 'epoch': 1.976}
249
+ {'loss': 0.3426, 'grad_norm': 0.10281559079885483, 'learning_rate': 1.6936000000000002e-05, 'epoch': 1.984}
250
+ {'loss': 0.3031, 'grad_norm': 90.46196746826172, 'learning_rate': 1.6802666666666668e-05, 'epoch': 1.992}
251
+ {'loss': 0.2931, 'grad_norm': 0.13644857704639435, 'learning_rate': 1.6669333333333333e-05, 'epoch': 2.0}
252
+ {'eval_loss': 0.4506886303424835, 'eval_accuracy': 0.89908, 'eval_f1': 0.8966195451751691, 'eval_precision': 0.9190256194876103, 'eval_recall': 0.87528, 'eval_runtime': 393.988, 'eval_samples_per_second': 63.454, 'eval_steps_per_second': 15.863, 'epoch': 2.0}
253
+ {'loss': 0.1133, 'grad_norm': 0.19643454253673553, 'learning_rate': 1.6536000000000002e-05, 'epoch': 2.008}
254
+ {'loss': 0.0441, 'grad_norm': 0.020006030797958374, 'learning_rate': 1.6402666666666665e-05, 'epoch': 2.016}
255
+ {'loss': 0.0669, 'grad_norm': 0.017264680936932564, 'learning_rate': 1.6269333333333334e-05, 'epoch': 2.024}
256
+ {'loss': 0.0532, 'grad_norm': 0.061523064970970154, 'learning_rate': 1.6136000000000003e-05, 'epoch': 2.032}
257
+ {'loss': 0.0882, 'grad_norm': 0.009066939353942871, 'learning_rate': 1.600266666666667e-05, 'epoch': 2.04}
258
+ {'loss': 0.1001, 'grad_norm': 0.03136083111166954, 'learning_rate': 1.5869333333333334e-05, 'epoch': 2.048}
259
+ {'loss': 0.138, 'grad_norm': 0.008202377706766129, 'learning_rate': 1.5736e-05, 'epoch': 2.056}
260
+ {'loss': 0.0569, 'grad_norm': 0.07132015377283096, 'learning_rate': 1.560266666666667e-05, 'epoch': 2.064}
261
+ {'loss': 0.1, 'grad_norm': 0.18235626816749573, 'learning_rate': 1.5469333333333335e-05, 'epoch': 2.072}
262
+ {'loss': 0.0579, 'grad_norm': 0.008501987904310226, 'learning_rate': 1.5336e-05, 'epoch': 2.08}
263
+ {'loss': 0.1893, 'grad_norm': 0.017202647402882576, 'learning_rate': 1.5202666666666668e-05, 'epoch': 2.088}
264
+ {'loss': 0.1071, 'grad_norm': 0.04670681431889534, 'learning_rate': 1.5069333333333335e-05, 'epoch': 2.096}
265
+ {'loss': 0.0846, 'grad_norm': 0.013939165510237217, 'learning_rate': 1.4936e-05, 'epoch': 2.104}
266
+ {'loss': 0.0508, 'grad_norm': 4.487010478973389, 'learning_rate': 1.4802666666666668e-05, 'epoch': 2.112}
267
+ {'loss': 0.166, 'grad_norm': 0.014982378110289574, 'learning_rate': 1.4669333333333335e-05, 'epoch': 2.12}
268
+ {'loss': 0.0941, 'grad_norm': 0.03977168723940849, 'learning_rate': 1.4536e-05, 'epoch': 2.128}
269
+ {'loss': 0.138, 'grad_norm': 0.01852828450500965, 'learning_rate': 1.4402666666666667e-05, 'epoch': 2.136}
270
+ {'loss': 0.0893, 'grad_norm': 0.018985146656632423, 'learning_rate': 1.4269333333333334e-05, 'epoch': 2.144}
271
+ {'loss': 0.0016, 'grad_norm': 0.010966133326292038, 'learning_rate': 1.4136000000000002e-05, 'epoch': 2.152}
272
+ {'loss': 0.026, 'grad_norm': 0.02055787853896618, 'learning_rate': 1.4002666666666667e-05, 'epoch': 2.16}
273
+ {'loss': 0.1055, 'grad_norm': 0.021019885316491127, 'learning_rate': 1.3869333333333335e-05, 'epoch': 2.168}
274
+ {'loss': 0.1479, 'grad_norm': 0.06946071982383728, 'learning_rate': 1.3736000000000002e-05, 'epoch': 2.176}
275
+ {'loss': 0.0808, 'grad_norm': 0.014382677152752876, 'learning_rate': 1.3602666666666666e-05, 'epoch': 2.184}
276
+ {'loss': 0.1624, 'grad_norm': 0.02976427599787712, 'learning_rate': 1.3469333333333333e-05, 'epoch': 2.192}
277
+ {'loss': 0.1299, 'grad_norm': 0.11172953248023987, 'learning_rate': 1.3336e-05, 'epoch': 2.2}
278
+ {'loss': 0.0482, 'grad_norm': 0.08020364493131638, 'learning_rate': 1.3202666666666666e-05, 'epoch': 2.208}
279
+ {'loss': 0.0694, 'grad_norm': 0.013661920092999935, 'learning_rate': 1.3069333333333334e-05, 'epoch': 2.216}
280
+ {'loss': 0.1619, 'grad_norm': 0.02413208782672882, 'learning_rate': 1.2936000000000001e-05, 'epoch': 2.224}
281
+ {'loss': 0.1237, 'grad_norm': 0.007472939323633909, 'learning_rate': 1.2802666666666669e-05, 'epoch': 2.232}
282
+ {'loss': 0.0676, 'grad_norm': 0.02983078546822071, 'learning_rate': 1.2669333333333333e-05, 'epoch': 2.24}
283
+ {'loss': 0.0983, 'grad_norm': 0.04998508095741272, 'learning_rate': 1.2536e-05, 'epoch': 2.248}
284
+ {'loss': 0.1647, 'grad_norm': 13.296645164489746, 'learning_rate': 1.2402666666666667e-05, 'epoch': 2.2560000000000002}
285
+ {'loss': 0.0834, 'grad_norm': 0.016014471650123596, 'learning_rate': 1.2269333333333335e-05, 'epoch': 2.2640000000000002}
286
+ {'loss': 0.1467, 'grad_norm': 0.14326101541519165, 'learning_rate': 1.2136e-05, 'epoch': 2.2720000000000002}
287
+ {'loss': 0.0136, 'grad_norm': 0.014358256943523884, 'learning_rate': 1.2002666666666668e-05, 'epoch': 2.2800000000000002}
288
+ {'loss': 0.2312, 'grad_norm': 0.03325853496789932, 'learning_rate': 1.1869333333333333e-05, 'epoch': 2.288}
289
+ {'loss': 0.0823, 'grad_norm': 0.054809004068374634, 'learning_rate': 1.1736e-05, 'epoch': 2.296}
290
+ {'loss': 0.2533, 'grad_norm': 0.02338593825697899, 'learning_rate': 1.1602666666666666e-05, 'epoch': 2.304}
291
+ {'loss': 0.0905, 'grad_norm': 0.024055376648902893, 'learning_rate': 1.1469333333333334e-05, 'epoch': 2.312}
292
+ {'loss': 0.1688, 'grad_norm': 26.65433120727539, 'learning_rate': 1.1336000000000001e-05, 'epoch': 2.32}
293
+ {'loss': 0.1274, 'grad_norm': 0.05946606397628784, 'learning_rate': 1.1202666666666667e-05, 'epoch': 2.328}
294
+ {'loss': 0.0922, 'grad_norm': 0.018317028880119324, 'learning_rate': 1.1069333333333334e-05, 'epoch': 2.336}
295
+ {'loss': 0.1224, 'grad_norm': 0.014432383701205254, 'learning_rate': 1.0936e-05, 'epoch': 2.344}
296
+ {'loss': 0.0685, 'grad_norm': 0.013095813803374767, 'learning_rate': 1.0802666666666666e-05, 'epoch': 2.352}
297
+ {'loss': 0.0257, 'grad_norm': 0.028074130415916443, 'learning_rate': 1.0669333333333335e-05, 'epoch': 2.36}
298
+ {'loss': 0.1292, 'grad_norm': 0.02423202060163021, 'learning_rate': 1.0536e-05, 'epoch': 2.368}
299
+ {'loss': 0.1137, 'grad_norm': 0.013635743409395218, 'learning_rate': 1.0402666666666668e-05, 'epoch': 2.376}
300
+ {'loss': 0.1745, 'grad_norm': 0.016421562060713768, 'learning_rate': 1.0269333333333333e-05, 'epoch': 2.384}
301
+ {'loss': 0.1689, 'grad_norm': 0.01975177228450775, 'learning_rate': 1.0136000000000001e-05, 'epoch': 2.392}
302
+ {'loss': 0.1267, 'grad_norm': 0.05990523472428322, 'learning_rate': 1.0002666666666667e-05, 'epoch': 2.4}
303
+ {'loss': 0.0714, 'grad_norm': 0.023030275478959084, 'learning_rate': 9.869333333333334e-06, 'epoch': 2.408}
304
+ {'loss': 0.0303, 'grad_norm': 0.17459280788898468, 'learning_rate': 9.736000000000001e-06, 'epoch': 2.416}
305
+ {'loss': 0.0207, 'grad_norm': 0.024825584143400192, 'learning_rate': 9.602666666666667e-06, 'epoch': 2.424}
306
+ {'loss': 0.1338, 'grad_norm': 0.00718740513548255, 'learning_rate': 9.469333333333334e-06, 'epoch': 2.432}
307
+ {'loss': 0.001, 'grad_norm': 0.006329901050776243, 'learning_rate': 9.336e-06, 'epoch': 2.44}
308
+ {'loss': 0.1752, 'grad_norm': 0.016103368252515793, 'learning_rate': 9.202666666666667e-06, 'epoch': 2.448}
309
+ {'loss': 0.1168, 'grad_norm': 0.11804729700088501, 'learning_rate': 9.069333333333333e-06, 'epoch': 2.456}
310
+ {'loss': 0.2117, 'grad_norm': 35.67884826660156, 'learning_rate': 8.936e-06, 'epoch': 2.464}
311
+ {'loss': 0.1755, 'grad_norm': 0.016014249995350838, 'learning_rate': 8.802666666666668e-06, 'epoch': 2.472}
312
+ {'loss': 0.1497, 'grad_norm': 0.22153107821941376, 'learning_rate': 8.669333333333334e-06, 'epoch': 2.48}
313
+ {'loss': 0.1113, 'grad_norm': 0.01318784523755312, 'learning_rate': 8.536000000000001e-06, 'epoch': 2.488}
314
+ {'loss': 0.1143, 'grad_norm': 0.1176510900259018, 'learning_rate': 8.402666666666667e-06, 'epoch': 2.496}
315
+ {'loss': 0.1492, 'grad_norm': 0.06879352778196335, 'learning_rate': 8.269333333333332e-06, 'epoch': 2.504}
316
+ {'loss': 0.1984, 'grad_norm': 0.021879026666283607, 'learning_rate': 8.136000000000001e-06, 'epoch': 2.512}
317
+ {'loss': 0.0812, 'grad_norm': 0.03925799950957298, 'learning_rate': 8.002666666666667e-06, 'epoch': 2.52}
318
+ {'loss': 0.1615, 'grad_norm': 0.0319889560341835, 'learning_rate': 7.869333333333334e-06, 'epoch': 2.528}
319
+ {'loss': 0.0291, 'grad_norm': 0.015960585325956345, 'learning_rate': 7.736e-06, 'epoch': 2.536}
320
+ {'loss': 0.135, 'grad_norm': 0.020564408972859383, 'learning_rate': 7.6026666666666675e-06, 'epoch': 2.544}
321
+ {'loss': 0.1479, 'grad_norm': 0.03615148738026619, 'learning_rate': 7.469333333333334e-06, 'epoch': 2.552}
322
+ {'loss': 0.0368, 'grad_norm': 0.016910186037421227, 'learning_rate': 7.336e-06, 'epoch': 2.56}
323
+ {'loss': 0.2437, 'grad_norm': 8.867321968078613, 'learning_rate': 7.202666666666667e-06, 'epoch': 2.568}
324
+ {'loss': 0.049, 'grad_norm': 10.037091255187988, 'learning_rate': 7.069333333333334e-06, 'epoch': 2.576}
325
+ {'loss': 0.0398, 'grad_norm': 0.25611355900764465, 'learning_rate': 6.936000000000001e-06, 'epoch': 2.584}
326
+ {'loss': 0.0257, 'grad_norm': 0.05507563799619675, 'learning_rate': 6.802666666666667e-06, 'epoch': 2.592}
327
+ {'loss': 0.1173, 'grad_norm': 0.09031017869710922, 'learning_rate': 6.669333333333333e-06, 'epoch': 2.6}
328
+ {'loss': 0.1151, 'grad_norm': 0.013525927439332008, 'learning_rate': 6.536000000000001e-06, 'epoch': 2.608}
329
+ {'loss': 0.0917, 'grad_norm': 0.031039560213685036, 'learning_rate': 6.402666666666666e-06, 'epoch': 2.616}
330
+ {'loss': 0.1611, 'grad_norm': 0.02152109332382679, 'learning_rate': 6.269333333333334e-06, 'epoch': 2.624}
331
+ {'loss': 0.1082, 'grad_norm': 0.02339756488800049, 'learning_rate': 6.136e-06, 'epoch': 2.632}
332
+ {'loss': 0.0367, 'grad_norm': 0.012301336042582989, 'learning_rate': 6.002666666666667e-06, 'epoch': 2.64}
333
+ {'loss': 0.0914, 'grad_norm': 215.3618621826172, 'learning_rate': 5.869333333333333e-06, 'epoch': 2.648}
334
+ {'loss': 0.1905, 'grad_norm': 20.581954956054688, 'learning_rate': 5.736000000000001e-06, 'epoch': 2.656}
335
+ {'loss': 0.0908, 'grad_norm': 0.013410776853561401, 'learning_rate': 5.602666666666667e-06, 'epoch': 2.664}
336
+ {'loss': 0.0603, 'grad_norm': 0.13063132762908936, 'learning_rate': 5.469333333333333e-06, 'epoch': 2.672}
337
+ {'loss': 0.1159, 'grad_norm': 0.05968919396400452, 'learning_rate': 5.336e-06, 'epoch': 2.68}
338
+ {'loss': 0.178, 'grad_norm': 0.07835003733634949, 'learning_rate': 5.202666666666667e-06, 'epoch': 2.6879999999999997}
339
+ {'loss': 0.1182, 'grad_norm': 38.63554000854492, 'learning_rate': 5.069333333333333e-06, 'epoch': 2.6959999999999997}
340
+ {'loss': 0.065, 'grad_norm': 1.430072546005249, 'learning_rate': 4.936000000000001e-06, 'epoch': 2.7039999999999997}
341
+ {'loss': 0.0336, 'grad_norm': 0.009959719143807888, 'learning_rate': 4.802666666666667e-06, 'epoch': 2.7119999999999997}
342
+ {'loss': 0.0495, 'grad_norm': 0.6715738773345947, 'learning_rate': 4.669333333333334e-06, 'epoch': 2.7199999999999998}
343
+ {'loss': 0.0314, 'grad_norm': 0.010251459665596485, 'learning_rate': 4.536e-06, 'epoch': 2.7279999999999998}
344
+ {'loss': 0.0264, 'grad_norm': 0.12389620393514633, 'learning_rate': 4.402666666666667e-06, 'epoch': 2.7359999999999998}
345
+ {'loss': 0.0462, 'grad_norm': 0.008583267219364643, 'learning_rate': 4.269333333333333e-06, 'epoch': 2.7439999999999998}
346
+ {'loss': 0.0661, 'grad_norm': 0.007292643189430237, 'learning_rate': 4.136e-06, 'epoch': 2.752}
347
+ {'loss': 0.1706, 'grad_norm': 0.047004811465740204, 'learning_rate': 4.002666666666667e-06, 'epoch': 2.76}
348
+ {'loss': 0.0777, 'grad_norm': 0.020715517923235893, 'learning_rate': 3.869333333333334e-06, 'epoch': 2.768}
349
+ {'loss': 0.031, 'grad_norm': 0.008295822888612747, 'learning_rate': 3.736e-06, 'epoch': 2.776}
350
+ {'loss': 0.1627, 'grad_norm': 0.015728944912552834, 'learning_rate': 3.602666666666667e-06, 'epoch': 2.784}
351
+ {'loss': 0.0297, 'grad_norm': 0.1496945321559906, 'learning_rate': 3.4693333333333334e-06, 'epoch': 2.792}
352
+ {'loss': 0.1484, 'grad_norm': 0.02585836499929428, 'learning_rate': 3.3360000000000003e-06, 'epoch': 2.8}
353
+ {'loss': 0.0878, 'grad_norm': 0.0086339320987463, 'learning_rate': 3.202666666666667e-06, 'epoch': 2.808}
354
+ {'loss': 0.2442, 'grad_norm': 8.712865829467773, 'learning_rate': 3.0693333333333334e-06, 'epoch': 2.816}
355
+ {'loss': 0.0921, 'grad_norm': 0.02131008356809616, 'learning_rate': 2.9360000000000003e-06, 'epoch': 2.824}
356
+ {'loss': 0.2405, 'grad_norm': 0.00918051227927208, 'learning_rate': 2.8026666666666665e-06, 'epoch': 2.832}
357
+ {'loss': 0.0815, 'grad_norm': 0.020189447328448296, 'learning_rate': 2.6693333333333334e-06, 'epoch': 2.84}
358
+ {'loss': 0.0638, 'grad_norm': 0.18166711926460266, 'learning_rate': 2.5360000000000004e-06, 'epoch': 2.848}
359
+ {'loss': 0.0522, 'grad_norm': 0.00875813141465187, 'learning_rate': 2.402666666666667e-06, 'epoch': 2.856}
360
+ {'loss': 0.0009, 'grad_norm': 0.0431634895503521, 'learning_rate': 2.2693333333333334e-06, 'epoch': 2.864}
361
+ {'loss': 0.1156, 'grad_norm': 0.023334724828600883, 'learning_rate': 2.136e-06, 'epoch': 2.872}
362
+ {'loss': 0.1775, 'grad_norm': 36.20563507080078, 'learning_rate': 2.002666666666667e-06, 'epoch': 2.88}
363
+ {'loss': 0.2143, 'grad_norm': 25.47490882873535, 'learning_rate': 1.8693333333333334e-06, 'epoch': 2.888}
364
+ {'loss': 0.0035, 'grad_norm': 0.013057650066912174, 'learning_rate': 1.7360000000000002e-06, 'epoch': 2.896}
365
+ {'loss': 0.0009, 'grad_norm': 0.01746312901377678, 'learning_rate': 1.602666666666667e-06, 'epoch': 2.904}
366
+ {'loss': 0.1258, 'grad_norm': 0.012785250321030617, 'learning_rate': 1.4693333333333333e-06, 'epoch': 2.912}
367
+ {'loss': 0.1393, 'grad_norm': 0.026742149144411087, 'learning_rate': 1.336e-06, 'epoch': 2.92}
368
+ {'loss': 0.1293, 'grad_norm': 31.66493797302246, 'learning_rate': 1.2026666666666667e-06, 'epoch': 2.928}
369
+ {'loss': 0.0443, 'grad_norm': 0.12351831048727036, 'learning_rate': 1.0693333333333333e-06, 'epoch': 2.936}
370
+ {'loss': 0.0358, 'grad_norm': 0.01323748379945755, 'learning_rate': 9.360000000000001e-07, 'epoch': 2.944}
371
+ {'loss': 0.0679, 'grad_norm': 0.010095755569636822, 'learning_rate': 8.026666666666667e-07, 'epoch': 2.952}
372
+ {'loss': 0.075, 'grad_norm': 0.09313926100730896, 'learning_rate': 6.693333333333334e-07, 'epoch': 2.96}
373
+ {'loss': 0.1417, 'grad_norm': 0.014779884368181229, 'learning_rate': 5.36e-07, 'epoch': 2.968}
374
+ {'loss': 0.0665, 'grad_norm': 0.011904980055987835, 'learning_rate': 4.026666666666666e-07, 'epoch': 2.976}
375
+ {'loss': 0.1183, 'grad_norm': 0.03782917186617851, 'learning_rate': 2.693333333333333e-07, 'epoch': 2.984}
376
+ {'loss': 0.167, 'grad_norm': 8.537976264953613, 'learning_rate': 1.3600000000000003e-07, 'epoch': 2.992}
377
+ {'loss': 0.033, 'grad_norm': 0.010841709561645985, 'learning_rate': 2.666666666666667e-09, 'epoch': 3.0}
378
+ {'eval_loss': 0.5613933801651001, 'eval_accuracy': 0.90404, 'eval_f1': 0.903332393117621, 'eval_precision': 0.9100430299585938, 'eval_recall': 0.89672, 'eval_runtime': 403.0149, 'eval_samples_per_second': 62.032, 'eval_steps_per_second': 15.508, 'epoch': 3.0}
379
+ {'train_runtime': 17654.2392, 'train_samples_per_second': 4.248, 'train_steps_per_second': 1.062, 'train_loss': 0.2737119940789541, 'epoch': 3.0}
380
+ Training completed!
381
+ Evaluating model...
382
+ {'eval_loss': 0.5613933801651001, 'eval_accuracy': 0.90404, 'eval_f1': 0.903332393117621, 'eval_precision': 0.9100430299585938, 'eval_recall': 0.89672, 'eval_runtime': 383.8561, 'eval_samples_per_second': 65.129, 'eval_steps_per_second': 16.282, 'epoch': 3.0}
383
+ === Evaluation Results ===
384
+ eval_loss: 0.5614
385
+ eval_accuracy: 0.9040
386
+ eval_f1: 0.9033
387
+ eval_precision: 0.9100
388
+ eval_recall: 0.8967
389
+ eval_runtime: 383.8561
390
+ eval_samples_per_second: 65.1290
391
+ eval_steps_per_second: 16.2820
392
+ epoch: 3.0000
393
+ Saving model to ./model...
394
+ Model saved successfully!
395
+ === Training Pipeline Completed ===
396
+
397
+ 🎉 Training completed!
398
+ To run the app: python app.py
wandb/run-20250720_155338-0h3fksuy/files/requirements.txt ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==1.9.0
2
+ aiofiles==24.1.0
3
+ aiohappyeyeballs==2.6.1
4
+ aiohttp==3.12.14
5
+ aiosignal==1.4.0
6
+ alembic==1.16.2
7
+ altair==5.5.0
8
+ annotated-types==0.7.0
9
+ anyio==4.9.0
10
+ attrs==25.3.0
11
+ audioop-lts==0.2.1
12
+ blinker==1.9.0
13
+ Bottleneck==1.4.2
14
+ Brotli==1.1.0
15
+ cachetools==6.1.0
16
+ certifi==2025.6.15
17
+ charset-normalizer==3.4.2
18
+ click==8.2.1
19
+ cloudpickle==3.1.1
20
+ colorama==0.4.6
21
+ colorlog==6.9.0
22
+ contourpy==1.3.1
23
+ cycler==0.11.0
24
+ datasets==4.0.0
25
+ dill==0.3.8
26
+ fastapi==0.116.1
27
+ ffmpy==0.6.0
28
+ filelock==3.18.0
29
+ fonttools==4.55.3
30
+ frozenlist==1.7.0
31
+ fsspec==2025.3.0
32
+ gitdb==4.0.12
33
+ GitPython==3.1.44
34
+ gradio==5.37.0
35
+ gradio_client==1.10.4
36
+ greenlet==3.2.3
37
+ groovy==0.1.2
38
+ h11==0.16.0
39
+ httpcore==1.0.9
40
+ httpx==0.28.1
41
+ huggingface-hub==0.33.4
42
+ idna==3.10
43
+ imbalanced-learn==0.13.0
44
+ imblearn==0.0
45
+ Jinja2==3.1.6
46
+ joblib==1.4.2
47
+ jsonschema==4.24.0
48
+ jsonschema-specifications==2025.4.1
49
+ kiwisolver==1.4.8
50
+ llvmlite==0.44.0
51
+ Mako==1.3.10
52
+ markdown-it-py==3.0.0
53
+ MarkupSafe==3.0.2
54
+ matplotlib==3.9.2
55
+ mdurl==0.1.2
56
+ mpmath==1.3.0
57
+ multidict==6.6.3
58
+ multiprocess==0.70.16
59
+ narwhals==1.44.0
60
+ networkx==3.5
61
+ ninja==1.11.1.4
62
+ numba==0.61.2
63
+ numexpr==2.10.2
64
+ numpy==2.1.1
65
+ optuna==4.4.0
66
+ orjson==3.11.0
67
+ packaging==24.2
68
+ pandas==2.2.3
69
+ pillow==11.1.0
70
+ pip==25.1
71
+ platformdirs==4.3.8
72
+ plotly==6.2.0
73
+ propcache==0.3.2
74
+ protobuf==6.31.1
75
+ psutil==7.0.0
76
+ pyarrow==20.0.0
77
+ pybind11==3.0.0
78
+ pydantic==2.11.7
79
+ pydantic_core==2.33.2
80
+ pydeck==0.9.1
81
+ pydub==0.25.1
82
+ Pygments==2.19.2
83
+ pyparsing==3.2.0
84
+ PyQt6==6.7.1
85
+ PyQt6_sip==13.9.1
86
+ python-dateutil==2.9.0.post0
87
+ python-multipart==0.0.20
88
+ pytz==2024.1
89
+ PyYAML==6.0.2
90
+ referencing==0.36.2
91
+ regex==2024.11.6
92
+ requests==2.32.4
93
+ rich==14.0.0
94
+ rpds-py==0.26.0
95
+ ruff==0.12.3
96
+ safehttpx==0.1.6
97
+ safetensors==0.5.3
98
+ scikit-learn==1.5.2
99
+ scipy==1.15.2
100
+ seaborn==0.13.2
101
+ semantic-version==2.10.0
102
+ sentry-sdk==2.33.0
103
+ setuptools==78.1.1
104
+ shap==0.48.0
105
+ shellingham==1.5.4
106
+ sip==6.10.0
107
+ six==1.17.0
108
+ sklearn-compat==0.1.3
109
+ slicer==0.0.8
110
+ smmap==5.0.2
111
+ sniffio==1.3.1
112
+ SQLAlchemy==2.0.41
113
+ starlette==0.47.1
114
+ streamlit==1.46.1
115
+ sympy==1.14.0
116
+ tenacity==9.1.2
117
+ threadpoolctl==3.5.0
118
+ tokenizers==0.21.2
119
+ toml==0.10.2
120
+ tomlkit==0.13.3
121
+ torch==2.7.1+cu118
122
+ torchaudio==2.7.1+cu118
123
+ torchvision==0.22.1
124
+ tornado==6.5.1
125
+ tqdm==4.67.1
126
+ transformers==4.53.2
127
+ typer==0.16.0
128
+ typing_extensions==4.14.0
129
+ typing-inspection==0.4.1
130
+ tzdata==2025.2
131
+ urllib3==2.5.0
132
+ uvicorn==0.35.0
133
+ wandb==0.21.0
134
+ watchdog==6.0.0
135
+ websockets==15.0.1
136
+ wheel==0.45.1
137
+ xgboost==3.0.2
138
+ xxhash==3.5.0
139
+ yarl==1.20.1
wandb/run-20250720_155338-0h3fksuy/files/wandb-metadata.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Windows-11-10.0.26100-SP0",
3
+ "python": "CPython 3.13.5",
4
+ "startedAt": "2025-07-20T10:23:38.923772Z",
5
+ "program": "C:\\Users\\Legion\\desktop\\distilbert-sentiment\\main.py",
6
+ "codePath": "main.py",
7
+ "codePathLocal": "main.py",
8
+ "email": "shreshthkapai@gmail.com",
9
+ "root": "C:\\Users\\Legion\\desktop\\distilbert-sentiment",
10
+ "host": "DESKTOP-EIHJJJL",
11
+ "executable": "C:\\Users\\Legion\\Miniconda3\\envs\\ML\\python.exe",
12
+ "cpu_count": 4,
13
+ "cpu_count_logical": 8,
14
+ "gpu": "NVIDIA GeForce GTX 1650",
15
+ "gpu_count": 1,
16
+ "disk": {
17
+ "/": {
18
+ "total": "255230791680",
19
+ "used": "233129451520"
20
+ }
21
+ },
22
+ "memory": {
23
+ "total": "8506298368"
24
+ },
25
+ "gpu_nvidia": [
26
+ {
27
+ "name": "NVIDIA GeForce GTX 1650",
28
+ "memoryTotal": "4294967296",
29
+ "cudaCores": 1024,
30
+ "architecture": "Turing",
31
+ "uuid": "GPU-fbcd7647-fb67-66f5-b8c7-1a4198b7e4fa"
32
+ }
33
+ ],
34
+ "cudaVersion": "12.7",
35
+ "writerId": "fshn6fq4d357dfamunx9x96y44pdzcc6"
36
+ }
wandb/run-20250720_155338-0h3fksuy/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train/global_step":18750,"train/epoch":3,"_timestamp":1.75302505616767e+09,"eval/steps_per_second":16.282,"train_samples_per_second":4.248,"eval/recall":0.89672,"train_loss":0.2737119940789541,"train_runtime":17654.2392,"total_flos":4.9675274496e+15,"eval/loss":0.5613933801651001,"_wandb":{"runtime":18036},"eval/runtime":383.8561,"train_steps_per_second":1.062,"eval/accuracy":0.90404,"eval/samples_per_second":65.129,"train/loss":0.033,"train/learning_rate":2.666666666666667e-09,"_step":379,"_runtime":18036,"train/grad_norm":0.010841709561645985,"eval/precision":0.9100430299585938,"eval/f1":0.903332393117621}
wandb/run-20250720_155338-0h3fksuy/logs/debug-internal.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-07-20T15:53:39.5114812+05:30","level":"INFO","msg":"stream: starting","core version":"0.21.0"}
2
+ {"time":"2025-07-20T15:53:41.0961508+05:30","level":"INFO","msg":"stream: created new stream","id":"0h3fksuy"}
3
+ {"time":"2025-07-20T15:53:41.0967346+05:30","level":"INFO","msg":"stream: started","id":"0h3fksuy"}
4
+ {"time":"2025-07-20T15:53:41.0967346+05:30","level":"INFO","msg":"handler: started","stream_id":"0h3fksuy"}
5
+ {"time":"2025-07-20T15:53:41.0967346+05:30","level":"INFO","msg":"sender: started","stream_id":"0h3fksuy"}
6
+ {"time":"2025-07-20T15:53:41.0967346+05:30","level":"INFO","msg":"writer: Do: started","stream_id":"0h3fksuy"}
7
+ {"time":"2025-07-20T20:27:20.3681207+05:30","level":"WARN","msg":"sender: taking a long time","seconds":11118.8689693,"work":"WorkRecord(*service_go_proto.Request_StopStatus); Control(local:true mailbox_slot:\"qx9z56z7vy8w\" connection_id:\"1(127.0.0.1:59166)\")"}
8
+ {"time":"2025-07-20T20:27:20.6988531+05:30","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
9
+ {"time":"2025-07-20T20:27:20.856851+05:30","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/shreshth/huggingface/0h3fksuy/file_stream\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
10
+ {"time":"2025-07-20T20:27:24.1042083+05:30","level":"INFO","msg":"sender: succeeded after taking longer than expected","seconds":11124.0731302,"work":"WorkRecord(*service_go_proto.Request_StopStatus); Control(local:true mailbox_slot:\"qx9z56z7vy8w\" connection_id:\"1(127.0.0.1:59166)\")"}
11
+ {"time":"2025-07-20T20:54:18.4689135+05:30","level":"INFO","msg":"stream: closing","id":"0h3fksuy"}
12
+ {"time":"2025-07-20T20:54:19.7177233+05:30","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
13
+ {"time":"2025-07-20T20:54:20.2321569+05:30","level":"INFO","msg":"handler: closed","stream_id":"0h3fksuy"}
14
+ {"time":"2025-07-20T20:54:20.2321569+05:30","level":"INFO","msg":"sender: closed","stream_id":"0h3fksuy"}
15
+ {"time":"2025-07-20T20:54:20.2321569+05:30","level":"INFO","msg":"writer: Close: closed","stream_id":"0h3fksuy"}
16
+ {"time":"2025-07-20T20:54:20.2327206+05:30","level":"INFO","msg":"stream: closed","id":"0h3fksuy"}
wandb/run-20250720_155338-0h3fksuy/logs/debug.log ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-20 15:53:38,929 INFO MainThread:1648 [wandb_setup.py:_flush():80] Current SDK version is 0.21.0
2
+ 2025-07-20 15:53:38,930 INFO MainThread:1648 [wandb_setup.py:_flush():80] Configure stats pid to 1648
3
+ 2025-07-20 15:53:38,930 INFO MainThread:1648 [wandb_setup.py:_flush():80] Loading settings from C:\Users\Legion\.config\wandb\settings
4
+ 2025-07-20 15:53:38,930 INFO MainThread:1648 [wandb_setup.py:_flush():80] Loading settings from C:\Users\Legion\desktop\distilbert-sentiment\wandb\settings
5
+ 2025-07-20 15:53:38,930 INFO MainThread:1648 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-07-20 15:53:38,930 INFO MainThread:1648 [wandb_init.py:setup_run_log_directory():703] Logging user logs to C:\Users\Legion\desktop\distilbert-sentiment\wandb\run-20250720_155338-0h3fksuy\logs\debug.log
7
+ 2025-07-20 15:53:38,931 INFO MainThread:1648 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to C:\Users\Legion\desktop\distilbert-sentiment\wandb\run-20250720_155338-0h3fksuy\logs\debug-internal.log
8
+ 2025-07-20 15:53:38,931 INFO MainThread:1648 [wandb_init.py:init():830] calling init triggers
9
+ 2025-07-20 15:53:38,931 INFO MainThread:1648 [wandb_init.py:init():835] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-07-20 15:53:38,931 INFO MainThread:1648 [wandb_init.py:init():871] starting backend
12
+ 2025-07-20 15:53:39,433 INFO MainThread:1648 [wandb_init.py:init():874] sending inform_init request
13
+ 2025-07-20 15:53:39,505 INFO MainThread:1648 [wandb_init.py:init():882] backend started and connected
14
+ 2025-07-20 15:53:39,507 INFO MainThread:1648 [wandb_init.py:init():953] updated telemetry
15
+ 2025-07-20 15:53:39,511 INFO MainThread:1648 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout
16
+ 2025-07-20 15:53:41,579 INFO MainThread:1648 [wandb_init.py:init():1029] starting run threads in backend
17
+ 2025-07-20 15:53:42,183 INFO MainThread:1648 [wandb_run.py:_console_start():2458] atexit reg
18
+ 2025-07-20 15:53:42,183 INFO MainThread:1648 [wandb_run.py:_redirect():2306] redirect: wrap_raw
19
+ 2025-07-20 15:53:42,184 INFO MainThread:1648 [wandb_run.py:_redirect():2375] Wrapping output streams.
20
+ 2025-07-20 15:53:42,184 INFO MainThread:1648 [wandb_run.py:_redirect():2398] Redirects installed.
21
+ 2025-07-20 15:53:42,191 INFO MainThread:1648 [wandb_init.py:init():1075] run started, returning control to user process
22
+ 2025-07-20 15:53:42,195 INFO MainThread:1648 [wandb_run.py:_config_callback():1363] config_cb None None {'vocab_size': 30522, 'max_position_embeddings': 512, 'sinusoidal_pos_embds': False, 'n_layers': 6, 'n_heads': 12, 'dim': 768, 'hidden_dim': 3072, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation': 'gelu', 'initializer_range': 0.02, 'qa_dropout': 0.1, 'seq_classif_dropout': 0.2, 'return_dict': True, 'output_hidden_states': False, 'torchscript': False, 'torch_dtype': 'float32', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['DistilBertForMaskedLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': 0, 'eos_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'distilbert-base-uncased', 'transformers_version': '4.53.2', 'model_type': 'distilbert', 'tie_weights_': True, 'output_attentions': False, 'output_dir': './model', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 
'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 2, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 2, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './logs', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 50, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './model', 'disable_tqdm': True, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'f1', 'greater_is_better': True, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 
'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': False}
23
+ 2025-07-20 15:53:42,202 INFO MainThread:1648 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 66955010 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x0000024ABE770590>>
24
+ 2025-07-20 15:53:42,202 INFO MainThread:1648 [wandb_run.py:_config_callback():1363] config_cb model/num_parameters 66955010 None
25
+ 2025-07-20 20:54:18,327 INFO MsgRouterThr:1648 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
wandb/run-20250720_155338-0h3fksuy/run-0h3fksuy.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ee10c6e19f6dde4c416e8ee5e2f7791dacbb667462dbe42de372ff1eaca5b68
3
+ size 703284