Shreshth2002 commited on
Commit
9cbb56b
·
verified ·
1 Parent(s): 0115c6d

Upload folder using huggingface_hub

Browse files
Files changed (44) hide show
  1. .gitattributes +2 -0
  2. .gradio/certificate.pem +31 -0
  3. .gradio/flagged/dataset1.csv +2 -0
  4. README.md +3 -9
  5. __pycache__/infer.cpython-313.pyc +0 -0
  6. __pycache__/train.cpython-313.pyc +0 -0
  7. __pycache__/utils.cpython-313.pyc +0 -0
  8. app.py +36 -0
  9. infer.py +105 -0
  10. main.py +74 -0
  11. model/config.json +24 -0
  12. model/model.safetensors +3 -0
  13. model/special_tokens_map.json +7 -0
  14. model/tokenizer.json +0 -0
  15. model/tokenizer_config.json +56 -0
  16. model/training_args.bin +3 -0
  17. model/vocab.txt +0 -0
  18. requirements.txt +7 -0
  19. train.py +143 -0
  20. utils.py +32 -0
  21. wandb/run-20250720_144411-9kwggmdj/files/config.yaml +493 -0
  22. wandb/run-20250720_144411-9kwggmdj/files/output.log +148 -0
  23. wandb/run-20250720_144411-9kwggmdj/files/requirements.txt +139 -0
  24. wandb/run-20250720_144411-9kwggmdj/files/wandb-metadata.json +36 -0
  25. wandb/run-20250720_144411-9kwggmdj/files/wandb-summary.json +1 -0
  26. wandb/run-20250720_144411-9kwggmdj/logs/debug-internal.log +12 -0
  27. wandb/run-20250720_144411-9kwggmdj/logs/debug.log +25 -0
  28. wandb/run-20250720_144411-9kwggmdj/run-9kwggmdj.wandb +3 -0
  29. wandb/run-20250720_154435-9xqrzjdo/files/config.yaml +493 -0
  30. wandb/run-20250720_154435-9xqrzjdo/files/output.log +39 -0
  31. wandb/run-20250720_154435-9xqrzjdo/files/requirements.txt +139 -0
  32. wandb/run-20250720_154435-9xqrzjdo/files/wandb-metadata.json +36 -0
  33. wandb/run-20250720_154435-9xqrzjdo/files/wandb-summary.json +1 -0
  34. wandb/run-20250720_154435-9xqrzjdo/logs/debug-internal.log +12 -0
  35. wandb/run-20250720_154435-9xqrzjdo/logs/debug.log +25 -0
  36. wandb/run-20250720_154435-9xqrzjdo/run-9xqrzjdo.wandb +0 -0
  37. wandb/run-20250720_155338-0h3fksuy/files/config.yaml +494 -0
  38. wandb/run-20250720_155338-0h3fksuy/files/output.log +398 -0
  39. wandb/run-20250720_155338-0h3fksuy/files/requirements.txt +139 -0
  40. wandb/run-20250720_155338-0h3fksuy/files/wandb-metadata.json +36 -0
  41. wandb/run-20250720_155338-0h3fksuy/files/wandb-summary.json +1 -0
  42. wandb/run-20250720_155338-0h3fksuy/logs/debug-internal.log +16 -0
  43. wandb/run-20250720_155338-0h3fksuy/logs/debug.log +25 -0
  44. wandb/run-20250720_155338-0h3fksuy/run-0h3fksuy.wandb +3 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ wandb/run-20250720_144411-9kwggmdj/run-9kwggmdj.wandb filter=lfs diff=lfs merge=lfs -text
37
+ wandb/run-20250720_155338-0h3fksuy/run-0h3fksuy.wandb filter=lfs diff=lfs merge=lfs -text
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
.gradio/flagged/dataset1.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Movie Review,Sentiment Prediction,timestamp
2
+ The movie gave me chills it was soo scary.,"{""label"": ""Negative"", ""confidences"": null}",2025-07-21 01:18:41.890282
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: Distilbert Sentiment
3
- emoji: 🌍
4
- colorFrom: green
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 5.38.0
8
  app_file: app.py
9
- pinned: false
 
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: distilbert-sentiment
 
 
 
 
 
3
  app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 5.37.0
6
  ---
 
 
__pycache__/infer.cpython-313.pyc ADDED
Binary file (3.68 kB). View file
 
__pycache__/train.cpython-313.pyc ADDED
Binary file (4.73 kB). View file
 
__pycache__/utils.cpython-313.pyc ADDED
Binary file (975 Bytes). View file
 
app.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio frontend for DistilBERT sentiment analysis
3
+ File: app.py
4
+ """
5
+
6
+ import gradio as gr
7
+ from infer import predict
8
+
9
def sentiment_analyzer(text):
    """Gradio callback: classify a review and return 'Positive'/'Negative'.

    Blank input yields a user-facing prompt instead of a prediction.
    Note that ``predict`` itself returns an error-message string on
    failure, which is shown to the user as-is.
    """
    if not text.strip():
        return "Please enter some text"

    return predict(text).capitalize()
16
+
17
+ # Create Gradio interface
18
# Example reviews shown beneath the input box.
_EXAMPLE_REVIEWS = [
    "This movie was absolutely fantastic! Great acting and storyline.",
    "Terrible film, worst movie I've ever seen. Complete waste of time.",
    "The movie was okay, not great but not terrible either.",
]

# Build the Gradio UI: one free-text input, one label output.
interface = gr.Interface(
    fn=sentiment_analyzer,
    inputs=gr.Textbox(
        label="Movie Review",
        placeholder="Enter your movie review here...",
        lines=3,
    ),
    outputs=gr.Label(label="Sentiment Prediction"),
    title="🎬 Movie Review Sentiment Analysis",
    description="Fine-tuned DistilBERT model for movie review sentiment classification",
    examples=_EXAMPLE_REVIEWS,
)

if __name__ == "__main__":
    # share=True requests a public gradio.live tunnel when run locally.
    interface.launch(share=True)
infer.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Inference pipeline for DistilBERT sentiment analysis
3
+ File: infer.py (improved version)
4
+ """
5
+
6
+ import torch
7
+ import os
8
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
9
+
10
+ # Global variables to cache model and tokenizer
11
+ _model = None
12
+ _tokenizer = None
13
+
14
def load_trained_model(model_path="./model"):
    """Load (and memoize) the fine-tuned model and tokenizer.

    The pair is cached in module globals so repeated predictions do not
    re-read the weights from disk.

    Args:
        model_path: Directory containing the saved model and tokenizer.

    Returns:
        Tuple of (model, tokenizer).

    Raises:
        FileNotFoundError: If nothing is cached and ``model_path`` does
            not exist on disk.
    """
    global _model, _tokenizer

    # Serve from the cache first: this avoids a redundant filesystem stat
    # on every prediction, and keeps working even if the directory is
    # removed after the initial load.
    # NOTE(review): the cache does not key on model_path, so a later call
    # with a *different* path still returns the first model loaded —
    # acceptable here because the app only ever uses the default path.
    if _model is not None and _tokenizer is not None:
        return _model, _tokenizer

    if not os.path.exists(model_path):
        raise FileNotFoundError(f"No model found at {model_path}. Please train the model first.")

    print(f"Loading model from {model_path}...")

    _tokenizer = AutoTokenizer.from_pretrained(model_path)
    _model = AutoModelForSequenceClassification.from_pretrained(model_path)

    print("Model loaded successfully!")
    return _model, _tokenizer
33
+
34
def predict_sentiment(text, model, tokenizer, max_length=256):
    """Classify a single text with an already-loaded model.

    Args:
        text: Raw input string.
        model: Loaded sequence-classification model (class 1 = positive).
        tokenizer: Matching tokenizer.
        max_length: Maximum token length; longer inputs are truncated.

    Returns:
        Tuple of (label, confidence) where label is "Positive" or
        "Negative" and confidence is the softmax probability of the
        predicted class.
    """
    # Encode to a fixed-length tensor batch of size 1.
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    # Inference only: eval mode, no gradient tracking.
    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits
        probs = torch.nn.functional.softmax(logits, dim=-1)
        winner = torch.argmax(probs, dim=-1).item()
        confidence = probs[0][winner].item()

    label = "Positive" if winner == 1 else "Negative"
    return label, confidence
68
+
69
def predict(text, model_path="./model", max_length=256):
    """Predict sentiment for ``text``, loading the model on first use.

    Args:
        text: Input text string.
        model_path: Path to the saved model directory.
        max_length: Maximum token length; longer inputs are truncated.

    Returns:
        "positive" or "negative" on success, otherwise a human-readable
        error string (the Gradio frontend displays whatever is returned).
    """
    try:
        # Load model and tokenizer (cached after the first call).
        model, tokenizer = load_trained_model(model_path)

        # Delegate to the shared single-text path instead of duplicating
        # the tokenize/forward logic here.
        label, _confidence = predict_sentiment(text, model, tokenizer, max_length)
        return label.lower()

    except FileNotFoundError as e:
        return f"Error: {str(e)}"
    except Exception as e:
        # Broad catch is deliberate: the UI expects a string, not a traceback.
        return f"Prediction error: {str(e)}"
main.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Main pipeline for DistilBERT sentiment analysis project
3
+ File: main.py
4
+ """
5
+
6
+ import os
7
+ import argparse
8
+ from train import (
9
+ load_imdb_data,
10
+ preprocess_data,
11
+ load_model,
12
+ setup_trainer,
13
+ train_model,
14
+ evaluate_model,
15
+ save_model
16
+ )
17
+ # Remove app import since we'll run it separately
18
+
19
def train_pipeline(subset_size=None):
    """Run the full train → evaluate → save workflow.

    Args:
        subset_size: Optional cap on the number of training examples
            (useful for quick smoke tests).

    Returns:
        The metrics dict produced by ``evaluate_model``.
    """
    print("=== Starting Training Pipeline ===")

    # Data: load the IMDB splits, then tokenize both.
    dataset = load_imdb_data(subset_size=subset_size)
    tokenized_dataset, tokenizer = preprocess_data(dataset)

    # Model + Trainer wiring.
    model = load_model()
    trainer = setup_trainer(
        model,
        tokenizer,
        tokenized_dataset["train"],
        tokenized_dataset["test"],
    )

    # Train, evaluate, then persist model + tokenizer to ./model.
    train_model(trainer)
    results = evaluate_model(trainer)
    save_model(trainer, tokenizer)

    print("=== Training Pipeline Completed ===")
    return results
51
+
52
def main():
    """CLI entry point: optionally (re)train the sentiment model."""
    parser = argparse.ArgumentParser(description="DistilBERT Sentiment Analysis - Training Only")
    parser.add_argument("--subset", type=int, default=None,
                        help="Use subset of data for training (for testing)")

    args = parser.parse_args()

    # Skip retraining if a saved model is already present, unless the
    # user explicitly confirms.
    if os.path.exists("./model") and os.path.exists("./model/config.json"):
        response = input("Model already exists. Retrain? (y/n): ")
        # strip() so a stray trailing space/newline in "y " doesn't get
        # misread and accidentally skip (or force) a retrain.
        if response.strip().lower() != 'y':
            print("Skipping training...")
            print("To run the app: python app.py")
            return

    # Train the model
    train_pipeline(subset_size=args.subset)

    print("\n🎉 Training completed!")
    print("To run the app: python app.py")

if __name__ == "__main__":
    main()
model/config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation": "gelu",
3
+ "architectures": [
4
+ "DistilBertForSequenceClassification"
5
+ ],
6
+ "attention_dropout": 0.1,
7
+ "dim": 768,
8
+ "dropout": 0.1,
9
+ "hidden_dim": 3072,
10
+ "initializer_range": 0.02,
11
+ "max_position_embeddings": 512,
12
+ "model_type": "distilbert",
13
+ "n_heads": 12,
14
+ "n_layers": 6,
15
+ "pad_token_id": 0,
16
+ "problem_type": "single_label_classification",
17
+ "qa_dropout": 0.1,
18
+ "seq_classif_dropout": 0.2,
19
+ "sinusoidal_pos_embds": false,
20
+ "tie_weights_": true,
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.53.2",
23
+ "vocab_size": 30522
24
+ }
model/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65e5980bd38406f43fad7a937fbfd69641552cd0bbcf0ba62ca73f7318eb3f9f
3
+ size 267832560
model/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
model/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "DistilBertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
model/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01f6ab2453c3b34039132e185e58b0fa0c07ed65cf292dae165c993dcdab7683
3
+ size 5713
model/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+
2
+ torch>=2.1.0,<3.0.0
3
+ transformers>=4.45.0,<5.0.0
4
+ datasets>=2.21.0,<3.0.0
5
+ gradio>=5.0.0,<6.0.0
6
+ scikit-learn>=1.5.0,<2.0.0
7
+ numpy>=1.24.0,<2.0.0
train.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Training and evaluation logic for DistilBERT sentiment analysis
3
+ File: train.py
4
+ """
5
+
6
+ # Hugging Face imports
7
+ from transformers import (
8
+ AutoTokenizer,
9
+ AutoModelForSequenceClassification,
10
+ Trainer,
11
+ TrainingArguments,
12
+ logging
13
+ )
14
+
15
+ # Local imports
16
+ from utils import compute_metrics
17
+ from datasets import load_dataset
18
+
19
+ # Standard library imports
20
+ import torch
21
+ import numpy as np
22
+ import pandas as pd
23
+
24
+ # Sklearn metrics
25
+ from sklearn.metrics import (
26
+ precision_recall_fscore_support,
27
+ accuracy_score,
28
+ confusion_matrix
29
+ )
30
+
31
+ # Suppress HF log spam
32
+ logging.set_verbosity_error()
33
+
34
+ # ===== DATASET LOADING =====
35
+
36
def load_imdb_data(subset_size=None):
    """Load the IMDB dataset, optionally subsampled.

    Args:
        subset_size: If given, keep only the first ``subset_size``
            training examples and roughly a quarter as many test
            examples.

    Returns:
        The (possibly subsampled) DatasetDict with "train"/"test" splits.
    """
    dataset = load_dataset("imdb")

    # Optional subsetting for memory constraints
    if subset_size:
        dataset["train"] = dataset["train"].select(range(subset_size))
        # max(..., 1) guards against an empty eval split when
        # subset_size < 4, which would otherwise break evaluation.
        test_size = min(max(subset_size // 4, 1), len(dataset["test"]))
        dataset["test"] = dataset["test"].select(range(test_size))

    print(f"Dataset loaded - Train: {len(dataset['train'])}, Test: {len(dataset['test'])}")
    return dataset
47
+
48
+ # ===== PREPROCESSING =====
49
+
50
def preprocess_data(dataset, max_length=256):
    """Tokenize both splits and format them for PyTorch training.

    Args:
        dataset: DatasetDict with "text" and "label" columns.
        max_length: Maximum token length; longer reviews are truncated.

    Returns:
        Tuple of (tokenized DatasetDict, tokenizer).
    """
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    def _tokenize(batch):
        # Fixed-length padding keeps every batch the same shape.
        return tokenizer(
            batch["text"],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

    tokenized = dataset.map(_tokenize, batched=True)

    # Trainer expects a "labels" column of torch tensors.
    tokenized = tokenized.rename_column("label", "labels")
    tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

    return tokenized, tokenizer
70
+
71
+ # ===== MODEL LOADING =====
72
+
73
def load_model():
    """Return a pre-trained DistilBERT with a fresh 2-label classification head."""
    return AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=2,
        return_dict=True,
    )
81
+
82
+ # ===== TRAINING SETUP =====
83
+
84
def get_training_args():
    """Build the TrainingArguments for fine-tuning.

    Small per-device batches plus gradient accumulation keep the
    effective train batch size at 4 while fitting limited GPU memory.
    """
    args = TrainingArguments(
        output_dir="./model",
        # Tiny batches + accumulation to fit a small GPU.
        per_device_train_batch_size=2,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=2,
        num_train_epochs=3,
        # "eval_strategy" is the current name for "evaluation_strategy".
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_dir="./logs",
        logging_steps=50,
        # Restore the checkpoint with the best F1 after training.
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        seed=42,
    )
    return args
101
+
102
def setup_trainer(model, tokenizer, train_dataset, eval_dataset):
    """Wire model, data, and metrics into a Trainer instance.

    Args:
        model: Sequence-classification model to fine-tune.
        tokenizer: Tokenizer (saved alongside checkpoints by Trainer).
        train_dataset: Tokenized training split.
        eval_dataset: Tokenized evaluation split.

    Returns:
        A configured ``Trainer``.
    """
    return Trainer(
        model=model,
        args=get_training_args(),
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
116
+
117
+ # ===== TRAIN & EVALUATE =====
118
+
119
def train_model(trainer):
    """Run the Trainer's training loop, bracketed by status messages."""
    print("Starting training...")
    trainer.train()  # blocks until all epochs complete
    print("Training completed!")
124
+
125
def evaluate_model(trainer):
    """Evaluate the trained model and print each metric to 4 decimals.

    Args:
        trainer: Trainer whose ``evaluate()`` returns a metrics dict.

    Returns:
        The metrics dict from ``trainer.evaluate()``, unmodified.
    """
    print("Evaluating model...")
    results = trainer.evaluate()

    print("=== Evaluation Results ===")
    for name in results:
        print(f"{name}: {results[name]:.4f}")

    return results
135
+
136
+ # ===== SAVE MODEL =====
137
+
138
def save_model(trainer, tokenizer, save_path="./model"):
    """Persist the trained model and its tokenizer side by side.

    Args:
        trainer: Trainer holding the trained model.
        tokenizer: Tokenizer to save alongside the weights.
        save_path: Destination directory (created by the save calls).
    """
    print(f"Saving model to {save_path}...")
    trainer.save_model(save_path)         # weights + model config
    tokenizer.save_pretrained(save_path)  # vocab + tokenizer config
    print("Model saved successfully!")
utils.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility functions for DistilBERT sentiment analysis
3
+ File: utils.py
4
+ """
5
+
6
+ import numpy as np
7
+ from sklearn.metrics import precision_recall_fscore_support, accuracy_score
8
+
9
def compute_metrics(eval_pred):
    """Compute accuracy/precision/recall/F1 for binary classification.

    Args:
        eval_pred: Tuple of (logits, labels) as supplied by the HF Trainer.

    Returns:
        Dict with 'accuracy', 'f1', 'precision', 'recall'.
    """
    logits, labels = eval_pred
    # argmax over the class dimension turns logits into hard predictions.
    preds = np.argmax(logits, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary'
    )

    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }
wandb/run-20250720_144411-9kwggmdj/files/config.yaml ADDED
@@ -0,0 +1,493 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _name_or_path:
2
+ value: distilbert-base-uncased
3
+ _wandb:
4
+ value:
5
+ cli_version: 0.21.0
6
+ e:
7
+ qd7dze61nxdy0n83hyx7lap6a5tql6xc:
8
+ codePath: main.py
9
+ codePathLocal: main.py
10
+ cpu_count: 4
11
+ cpu_count_logical: 8
12
+ cudaVersion: "12.7"
13
+ disk:
14
+ /:
15
+ total: "255230791680"
16
+ used: "208595525632"
17
+ email: shreshthkapai@gmail.com
18
+ executable: C:\Users\Legion\Miniconda3\envs\ML\python.exe
19
+ gpu: NVIDIA GeForce GTX 1650
20
+ gpu_count: 1
21
+ gpu_nvidia:
22
+ - architecture: Turing
23
+ cudaCores: 1024
24
+ memoryTotal: "4294967296"
25
+ name: NVIDIA GeForce GTX 1650
26
+ uuid: GPU-fbcd7647-fb67-66f5-b8c7-1a4198b7e4fa
27
+ host: DESKTOP-EIHJJJL
28
+ memory:
29
+ total: "8506298368"
30
+ os: Windows-11-10.0.26100-SP0
31
+ program: C:\Users\Legion\desktop\distilbert-sentiment\main.py
32
+ python: CPython 3.13.5
33
+ root: C:\Users\Legion\desktop\distilbert-sentiment
34
+ startedAt: "2025-07-20T09:14:11.312224Z"
35
+ writerId: qd7dze61nxdy0n83hyx7lap6a5tql6xc
36
+ m:
37
+ - "1": train/global_step
38
+ "6":
39
+ - 3
40
+ "7": []
41
+ - "2": '*'
42
+ "5": 1
43
+ "6":
44
+ - 1
45
+ "7": []
46
+ python_version: 3.13.5
47
+ t:
48
+ "1":
49
+ - 1
50
+ - 5
51
+ - 11
52
+ - 41
53
+ - 49
54
+ - 51
55
+ - 53
56
+ - 71
57
+ - 105
58
+ "2":
59
+ - 1
60
+ - 5
61
+ - 11
62
+ - 41
63
+ - 49
64
+ - 51
65
+ - 53
66
+ - 71
67
+ - 105
68
+ "3":
69
+ - 7
70
+ - 13
71
+ - 19
72
+ - 66
73
+ "4": 3.13.5
74
+ "5": 0.21.0
75
+ "6": 4.53.2
76
+ "8":
77
+ - 3
78
+ "9":
79
+ "1": transformers_trainer
80
+ "12": 0.21.0
81
+ "13": windows-amd64
82
+ accelerator_config:
83
+ value:
84
+ dispatch_batches: null
85
+ even_batches: true
86
+ gradient_accumulation_kwargs: null
87
+ non_blocking: false
88
+ split_batches: false
89
+ use_seedable_sampler: true
90
+ activation:
91
+ value: gelu
92
+ adafactor:
93
+ value: false
94
+ adam_beta1:
95
+ value: 0.9
96
+ adam_beta2:
97
+ value: 0.999
98
+ adam_epsilon:
99
+ value: 1e-08
100
+ add_cross_attention:
101
+ value: false
102
+ architectures:
103
+ value:
104
+ - DistilBertForMaskedLM
105
+ attention_dropout:
106
+ value: 0.1
107
+ auto_find_batch_size:
108
+ value: false
109
+ average_tokens_across_devices:
110
+ value: false
111
+ bad_words_ids:
112
+ value: null
113
+ batch_eval_metrics:
114
+ value: false
115
+ begin_suppress_tokens:
116
+ value: null
117
+ bf16:
118
+ value: false
119
+ bf16_full_eval:
120
+ value: false
121
+ bos_token_id:
122
+ value: null
123
+ chunk_size_feed_forward:
124
+ value: 0
125
+ cross_attention_hidden_size:
126
+ value: null
127
+ data_seed:
128
+ value: null
129
+ dataloader_drop_last:
130
+ value: false
131
+ dataloader_num_workers:
132
+ value: 0
133
+ dataloader_persistent_workers:
134
+ value: false
135
+ dataloader_pin_memory:
136
+ value: true
137
+ dataloader_prefetch_factor:
138
+ value: null
139
+ ddp_backend:
140
+ value: null
141
+ ddp_broadcast_buffers:
142
+ value: null
143
+ ddp_bucket_cap_mb:
144
+ value: null
145
+ ddp_find_unused_parameters:
146
+ value: null
147
+ ddp_timeout:
148
+ value: 1800
149
+ debug:
150
+ value: []
151
+ decoder_start_token_id:
152
+ value: null
153
+ deepspeed:
154
+ value: null
155
+ dim:
156
+ value: 768
157
+ disable_tqdm:
158
+ value: true
159
+ diversity_penalty:
160
+ value: 0
161
+ do_eval:
162
+ value: true
163
+ do_predict:
164
+ value: false
165
+ do_sample:
166
+ value: false
167
+ do_train:
168
+ value: false
169
+ dropout:
170
+ value: 0.1
171
+ early_stopping:
172
+ value: false
173
+ encoder_no_repeat_ngram_size:
174
+ value: 0
175
+ eos_token_id:
176
+ value: null
177
+ eval_accumulation_steps:
178
+ value: null
179
+ eval_delay:
180
+ value: 0
181
+ eval_do_concat_batches:
182
+ value: true
183
+ eval_on_start:
184
+ value: false
185
+ eval_steps:
186
+ value: null
187
+ eval_strategy:
188
+ value: epoch
189
+ eval_use_gather_object:
190
+ value: false
191
+ exponential_decay_length_penalty:
192
+ value: null
193
+ finetuning_task:
194
+ value: null
195
+ forced_bos_token_id:
196
+ value: null
197
+ forced_eos_token_id:
198
+ value: null
199
+ fp16:
200
+ value: false
201
+ fp16_backend:
202
+ value: auto
203
+ fp16_full_eval:
204
+ value: false
205
+ fp16_opt_level:
206
+ value: O1
207
+ fsdp:
208
+ value: []
209
+ fsdp_config:
210
+ value:
211
+ min_num_params: 0
212
+ xla: false
213
+ xla_fsdp_grad_ckpt: false
214
+ xla_fsdp_v2: false
215
+ fsdp_min_num_params:
216
+ value: 0
217
+ fsdp_transformer_layer_cls_to_wrap:
218
+ value: null
219
+ full_determinism:
220
+ value: false
221
+ gradient_accumulation_steps:
222
+ value: 1
223
+ gradient_checkpointing:
224
+ value: false
225
+ gradient_checkpointing_kwargs:
226
+ value: null
227
+ greater_is_better:
228
+ value: true
229
+ group_by_length:
230
+ value: false
231
+ half_precision_backend:
232
+ value: auto
233
+ hidden_dim:
234
+ value: 3072
235
+ hub_always_push:
236
+ value: false
237
+ hub_model_id:
238
+ value: null
239
+ hub_private_repo:
240
+ value: null
241
+ hub_revision:
242
+ value: null
243
+ hub_strategy:
244
+ value: every_save
245
+ hub_token:
246
+ value: <HUB_TOKEN>
247
+ id2label:
248
+ value:
249
+ "0": LABEL_0
250
+ "1": LABEL_1
251
+ ignore_data_skip:
252
+ value: false
253
+ include_for_metrics:
254
+ value: []
255
+ include_inputs_for_metrics:
256
+ value: false
257
+ include_num_input_tokens_seen:
258
+ value: false
259
+ include_tokens_per_second:
260
+ value: false
261
+ initializer_range:
262
+ value: 0.02
263
+ is_decoder:
264
+ value: false
265
+ is_encoder_decoder:
266
+ value: false
267
+ jit_mode_eval:
268
+ value: false
269
+ label_names:
270
+ value: null
271
+ label_smoothing_factor:
272
+ value: 0
273
+ label2id:
274
+ value:
275
+ LABEL_0: 0
276
+ LABEL_1: 1
277
+ learning_rate:
278
+ value: 5e-05
279
+ length_column_name:
280
+ value: length
281
+ length_penalty:
282
+ value: 1
283
+ liger_kernel_config:
284
+ value: null
285
+ load_best_model_at_end:
286
+ value: true
287
+ local_rank:
288
+ value: 0
289
+ log_level:
290
+ value: passive
291
+ log_level_replica:
292
+ value: warning
293
+ log_on_each_node:
294
+ value: true
295
+ logging_dir:
296
+ value: ./logs
297
+ logging_first_step:
298
+ value: false
299
+ logging_nan_inf_filter:
300
+ value: true
301
+ logging_steps:
302
+ value: 50
303
+ logging_strategy:
304
+ value: steps
305
+ lr_scheduler_type:
306
+ value: linear
307
+ max_grad_norm:
308
+ value: 1
309
+ max_length:
310
+ value: 20
311
+ max_position_embeddings:
312
+ value: 512
313
+ max_steps:
314
+ value: -1
315
+ metric_for_best_model:
316
+ value: f1
317
+ min_length:
318
+ value: 0
319
+ model/num_parameters:
320
+ value: 66955010
321
+ model_type:
322
+ value: distilbert
323
+ mp_parameters:
324
+ value: ""
325
+ n_heads:
326
+ value: 12
327
+ n_layers:
328
+ value: 6
329
+ neftune_noise_alpha:
330
+ value: null
331
+ no_cuda:
332
+ value: false
333
+ no_repeat_ngram_size:
334
+ value: 0
335
+ num_beam_groups:
336
+ value: 1
337
+ num_beams:
338
+ value: 1
339
+ num_return_sequences:
340
+ value: 1
341
+ num_train_epochs:
342
+ value: 3
343
+ optim:
344
+ value: adamw_torch
345
+ optim_args:
346
+ value: null
347
+ optim_target_modules:
348
+ value: null
349
+ output_attentions:
350
+ value: false
351
+ output_dir:
352
+ value: ./model
353
+ output_hidden_states:
354
+ value: false
355
+ output_scores:
356
+ value: false
357
+ overwrite_output_dir:
358
+ value: false
359
+ pad_token_id:
360
+ value: 0
361
+ past_index:
362
+ value: -1
363
+ per_device_eval_batch_size:
364
+ value: 16
365
+ per_device_train_batch_size:
366
+ value: 8
367
+ per_gpu_eval_batch_size:
368
+ value: null
369
+ per_gpu_train_batch_size:
370
+ value: null
371
+ prediction_loss_only:
372
+ value: false
373
+ prefix:
374
+ value: null
375
+ problem_type:
376
+ value: null
377
+ push_to_hub:
378
+ value: false
379
+ push_to_hub_model_id:
380
+ value: null
381
+ push_to_hub_organization:
382
+ value: null
383
+ push_to_hub_token:
384
+ value: <PUSH_TO_HUB_TOKEN>
385
+ qa_dropout:
386
+ value: 0.1
387
+ ray_scope:
388
+ value: last
389
+ remove_invalid_values:
390
+ value: false
391
+ remove_unused_columns:
392
+ value: true
393
+ repetition_penalty:
394
+ value: 1
395
+ report_to:
396
+ value:
397
+ - wandb
398
+ restore_callback_states_from_checkpoint:
399
+ value: false
400
+ resume_from_checkpoint:
401
+ value: null
402
+ return_dict:
403
+ value: true
404
+ return_dict_in_generate:
405
+ value: false
406
+ run_name:
407
+ value: ./model
408
+ save_on_each_node:
409
+ value: false
410
+ save_only_model:
411
+ value: false
412
+ save_safetensors:
413
+ value: true
414
+ save_steps:
415
+ value: 500
416
+ save_strategy:
417
+ value: epoch
418
+ save_total_limit:
419
+ value: null
420
+ seed:
421
+ value: 42
422
+ sep_token_id:
423
+ value: null
424
+ seq_classif_dropout:
425
+ value: 0.2
426
+ sinusoidal_pos_embds:
427
+ value: false
428
+ skip_memory_metrics:
429
+ value: true
430
+ suppress_tokens:
431
+ value: null
432
+ task_specific_params:
433
+ value: null
434
+ temperature:
435
+ value: 1
436
+ tf_legacy_loss:
437
+ value: false
438
+ tf32:
439
+ value: null
440
+ tie_encoder_decoder:
441
+ value: false
442
+ tie_weights_:
443
+ value: true
444
+ tie_word_embeddings:
445
+ value: true
446
+ tokenizer_class:
447
+ value: null
448
+ top_k:
449
+ value: 50
450
+ top_p:
451
+ value: 1
452
+ torch_compile:
453
+ value: false
454
+ torch_compile_backend:
455
+ value: null
456
+ torch_compile_mode:
457
+ value: null
458
+ torch_dtype:
459
+ value: float32
460
+ torch_empty_cache_steps:
461
+ value: null
462
+ torchdynamo:
463
+ value: null
464
+ torchscript:
465
+ value: false
466
+ tpu_metrics_debug:
467
+ value: false
468
+ tpu_num_cores:
469
+ value: null
470
+ transformers_version:
471
+ value: 4.53.2
472
+ typical_p:
473
+ value: 1
474
+ use_bfloat16:
475
+ value: false
476
+ use_cpu:
477
+ value: false
478
+ use_ipex:
479
+ value: false
480
+ use_legacy_prediction_loop:
481
+ value: false
482
+ use_liger_kernel:
483
+ value: false
484
+ use_mps_device:
485
+ value: false
486
+ vocab_size:
487
+ value: 30522
488
+ warmup_ratio:
489
+ value: 0
490
+ warmup_steps:
491
+ value: 0
492
+ weight_decay:
493
+ value: 0
wandb/run-20250720_144411-9kwggmdj/files/output.log ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {'loss': 0.5564, 'grad_norm': 3.3684804439544678, 'learning_rate': 4.9738666666666665e-05, 'epoch': 0.016}
2
+ {'loss': 0.527, 'grad_norm': 12.206518173217773, 'learning_rate': 4.9472e-05, 'epoch': 0.032}
3
+ {'loss': 0.4263, 'grad_norm': 23.95633316040039, 'learning_rate': 4.9205333333333335e-05, 'epoch': 0.048}
4
+ {'loss': 0.3658, 'grad_norm': 10.685762405395508, 'learning_rate': 4.893866666666667e-05, 'epoch': 0.064}
5
+ {'loss': 0.3694, 'grad_norm': 18.01938819885254, 'learning_rate': 4.8672000000000004e-05, 'epoch': 0.08}
6
+ {'loss': 0.3841, 'grad_norm': 6.812848091125488, 'learning_rate': 4.8405333333333336e-05, 'epoch': 0.096}
7
+ {'loss': 0.3934, 'grad_norm': 4.043306827545166, 'learning_rate': 4.8138666666666674e-05, 'epoch': 0.112}
8
+ {'loss': 0.3622, 'grad_norm': 21.34123992919922, 'learning_rate': 4.7872e-05, 'epoch': 0.128}
9
+ {'loss': 0.4146, 'grad_norm': 27.191320419311523, 'learning_rate': 4.7605333333333337e-05, 'epoch': 0.144}
10
+ {'loss': 0.4305, 'grad_norm': 16.240070343017578, 'learning_rate': 4.733866666666667e-05, 'epoch': 0.16}
11
+ {'loss': 0.403, 'grad_norm': 26.02972984313965, 'learning_rate': 4.7072000000000006e-05, 'epoch': 0.176}
12
+ {'loss': 0.3811, 'grad_norm': 11.078995704650879, 'learning_rate': 4.680533333333334e-05, 'epoch': 0.192}
13
+ {'loss': 0.3766, 'grad_norm': 9.874316215515137, 'learning_rate': 4.653866666666667e-05, 'epoch': 0.208}
14
+ {'loss': 0.3032, 'grad_norm': 18.219112396240234, 'learning_rate': 4.627200000000001e-05, 'epoch': 0.224}
15
+ {'loss': 0.3812, 'grad_norm': 14.96966552734375, 'learning_rate': 4.600533333333333e-05, 'epoch': 0.24}
16
+ {'loss': 0.3765, 'grad_norm': 25.871795654296875, 'learning_rate': 4.573866666666667e-05, 'epoch': 0.256}
17
+ {'loss': 0.3693, 'grad_norm': 3.639224052429199, 'learning_rate': 4.5472e-05, 'epoch': 0.272}
18
+ {'loss': 0.2971, 'grad_norm': 15.468314170837402, 'learning_rate': 4.520533333333333e-05, 'epoch': 0.288}
19
+ {'loss': 0.3572, 'grad_norm': 3.6710922718048096, 'learning_rate': 4.493866666666667e-05, 'epoch': 0.304}
20
+ {'loss': 0.3409, 'grad_norm': 7.864389896392822, 'learning_rate': 4.4672e-05, 'epoch': 0.32}
21
+ {'loss': 0.3285, 'grad_norm': 10.038674354553223, 'learning_rate': 4.440533333333334e-05, 'epoch': 0.336}
22
+ {'loss': 0.3317, 'grad_norm': 13.171808242797852, 'learning_rate': 4.4138666666666665e-05, 'epoch': 0.352}
23
+ {'loss': 0.3674, 'grad_norm': 4.481576919555664, 'learning_rate': 4.3872e-05, 'epoch': 0.368}
24
+ {'loss': 0.3642, 'grad_norm': 6.312211513519287, 'learning_rate': 4.3605333333333334e-05, 'epoch': 0.384}
25
+ {'loss': 0.3386, 'grad_norm': 4.072713851928711, 'learning_rate': 4.3338666666666666e-05, 'epoch': 0.4}
26
+ {'loss': 0.3776, 'grad_norm': 4.920267581939697, 'learning_rate': 4.3072000000000004e-05, 'epoch': 0.416}
27
+ {'loss': 0.3519, 'grad_norm': 13.408978462219238, 'learning_rate': 4.2805333333333335e-05, 'epoch': 0.432}
28
+ {'loss': 0.343, 'grad_norm': 8.910168647766113, 'learning_rate': 4.253866666666667e-05, 'epoch': 0.448}
29
+ {'loss': 0.345, 'grad_norm': 6.50616979598999, 'learning_rate': 4.2272e-05, 'epoch': 0.464}
30
+ {'loss': 0.2931, 'grad_norm': 6.88561487197876, 'learning_rate': 4.2005333333333336e-05, 'epoch': 0.48}
31
+ {'loss': 0.3541, 'grad_norm': 2.813678026199341, 'learning_rate': 4.173866666666667e-05, 'epoch': 0.496}
32
+ {'loss': 0.3005, 'grad_norm': 18.764328002929688, 'learning_rate': 4.1472e-05, 'epoch': 0.512}
33
+ {'loss': 0.3404, 'grad_norm': 13.757184028625488, 'learning_rate': 4.120533333333334e-05, 'epoch': 0.528}
34
+ {'loss': 0.3112, 'grad_norm': 11.426987648010254, 'learning_rate': 4.093866666666667e-05, 'epoch': 0.544}
35
+ {'loss': 0.285, 'grad_norm': 0.7347564697265625, 'learning_rate': 4.0672000000000006e-05, 'epoch': 0.56}
36
+ {'loss': 0.2978, 'grad_norm': 3.315498113632202, 'learning_rate': 4.040533333333333e-05, 'epoch': 0.576}
37
+ {'loss': 0.3928, 'grad_norm': 4.304668426513672, 'learning_rate': 4.013866666666667e-05, 'epoch': 0.592}
38
+ {'loss': 0.2773, 'grad_norm': 0.5143654942512512, 'learning_rate': 3.9872e-05, 'epoch': 0.608}
39
+ {'loss': 0.3937, 'grad_norm': 10.765504837036133, 'learning_rate': 3.960533333333333e-05, 'epoch': 0.624}
40
+ {'loss': 0.2931, 'grad_norm': 3.0576841831207275, 'learning_rate': 3.933866666666667e-05, 'epoch': 0.64}
41
+ {'loss': 0.2899, 'grad_norm': 1.09218430519104, 'learning_rate': 3.9072e-05, 'epoch': 0.656}
42
+ {'loss': 0.3039, 'grad_norm': 9.193467140197754, 'learning_rate': 3.880533333333333e-05, 'epoch': 0.672}
43
+ {'loss': 0.3191, 'grad_norm': 5.1164469718933105, 'learning_rate': 3.8538666666666664e-05, 'epoch': 0.688}
44
+ {'loss': 0.3206, 'grad_norm': 10.537883758544922, 'learning_rate': 3.8272e-05, 'epoch': 0.704}
45
+ {'loss': 0.3196, 'grad_norm': 10.457417488098145, 'learning_rate': 3.800533333333334e-05, 'epoch': 0.72}
46
+ {'loss': 0.3056, 'grad_norm': 2.776677370071411, 'learning_rate': 3.7738666666666665e-05, 'epoch': 0.736}
47
+ {'loss': 0.3273, 'grad_norm': 3.808607578277588, 'learning_rate': 3.7472e-05, 'epoch': 0.752}
48
+ {'loss': 0.3754, 'grad_norm': 8.255670547485352, 'learning_rate': 3.7205333333333334e-05, 'epoch': 0.768}
49
+ {'loss': 0.2756, 'grad_norm': 8.847413063049316, 'learning_rate': 3.6938666666666666e-05, 'epoch': 0.784}
50
+ {'loss': 0.2828, 'grad_norm': 9.775912284851074, 'learning_rate': 3.6672000000000004e-05, 'epoch': 0.8}
51
+ {'loss': 0.363, 'grad_norm': 3.9166083335876465, 'learning_rate': 3.6405333333333335e-05, 'epoch': 0.816}
52
+ {'loss': 0.295, 'grad_norm': 0.43537598848342896, 'learning_rate': 3.6138666666666673e-05, 'epoch': 0.832}
53
+ {'loss': 0.2519, 'grad_norm': 4.3010735511779785, 'learning_rate': 3.5872e-05, 'epoch': 0.848}
54
+ {'loss': 0.3011, 'grad_norm': 3.7882602214813232, 'learning_rate': 3.5605333333333336e-05, 'epoch': 0.864}
55
+ {'loss': 0.3489, 'grad_norm': 5.9410400390625, 'learning_rate': 3.533866666666667e-05, 'epoch': 0.88}
56
+ {'loss': 0.2948, 'grad_norm': 6.711633205413818, 'learning_rate': 3.5072e-05, 'epoch': 0.896}
57
+ {'loss': 0.3465, 'grad_norm': 12.11922836303711, 'learning_rate': 3.480533333333334e-05, 'epoch': 0.912}
58
+ {'loss': 0.3492, 'grad_norm': 5.701395511627197, 'learning_rate': 3.453866666666667e-05, 'epoch': 0.928}
59
+ {'loss': 0.2607, 'grad_norm': 15.726317405700684, 'learning_rate': 3.427200000000001e-05, 'epoch': 0.944}
60
+ {'loss': 0.2862, 'grad_norm': 11.121344566345215, 'learning_rate': 3.400533333333333e-05, 'epoch': 0.96}
61
+ {'loss': 0.2981, 'grad_norm': 4.980706214904785, 'learning_rate': 3.373866666666667e-05, 'epoch': 0.976}
62
+ {'loss': 0.284, 'grad_norm': 6.423090934753418, 'learning_rate': 3.3472e-05, 'epoch': 0.992}
63
+ {'eval_loss': 0.2969476878643036, 'eval_accuracy': 0.89448, 'eval_f1': 0.8900558472951571, 'eval_precision': 0.9290064381416391, 'eval_recall': 0.85424, 'eval_runtime': 511.6142, 'eval_samples_per_second': 48.865, 'eval_steps_per_second': 3.055, 'epoch': 1.0}
64
+ {'loss': 0.2406, 'grad_norm': 3.1205193996429443, 'learning_rate': 3.320533333333333e-05, 'epoch': 1.008}
65
+ {'loss': 0.2386, 'grad_norm': 11.420886039733887, 'learning_rate': 3.293866666666667e-05, 'epoch': 1.024}
66
+ {'loss': 0.2133, 'grad_norm': 0.3266797661781311, 'learning_rate': 3.2672e-05, 'epoch': 1.04}
67
+ {'loss': 0.2388, 'grad_norm': 20.907642364501953, 'learning_rate': 3.240533333333334e-05, 'epoch': 1.056}
68
+ {'loss': 0.2207, 'grad_norm': 34.85378646850586, 'learning_rate': 3.2138666666666664e-05, 'epoch': 1.072}
69
+ {'loss': 0.1863, 'grad_norm': 0.08423929661512375, 'learning_rate': 3.1872e-05, 'epoch': 1.088}
70
+ {'loss': 0.2122, 'grad_norm': 2.1192731857299805, 'learning_rate': 3.1605333333333334e-05, 'epoch': 1.104}
71
+ {'loss': 0.2274, 'grad_norm': 1.2625190019607544, 'learning_rate': 3.1338666666666665e-05, 'epoch': 1.12}
72
+ {'loss': 0.146, 'grad_norm': 0.3231733441352844, 'learning_rate': 3.1072e-05, 'epoch': 1.1360000000000001}
73
+ {'loss': 0.2008, 'grad_norm': 0.6839350461959839, 'learning_rate': 3.0805333333333335e-05, 'epoch': 1.152}
74
+ {'loss': 0.2068, 'grad_norm': 3.0773186683654785, 'learning_rate': 3.0538666666666666e-05, 'epoch': 1.168}
75
+ {'loss': 0.2084, 'grad_norm': 0.05034258961677551, 'learning_rate': 3.0272e-05, 'epoch': 1.184}
76
+ {'loss': 0.2462, 'grad_norm': 11.455129623413086, 'learning_rate': 3.0005333333333336e-05, 'epoch': 1.2}
77
+ {'loss': 0.1906, 'grad_norm': 0.09377483278512955, 'learning_rate': 2.973866666666667e-05, 'epoch': 1.216}
78
+ {'loss': 0.2032, 'grad_norm': 17.590801239013672, 'learning_rate': 2.9472e-05, 'epoch': 1.232}
79
+ {'loss': 0.24, 'grad_norm': 28.78790855407715, 'learning_rate': 2.9205333333333333e-05, 'epoch': 1.248}
80
+ {'loss': 0.1331, 'grad_norm': 1.1610554456710815, 'learning_rate': 2.8938666666666668e-05, 'epoch': 1.264}
81
+ {'loss': 0.2127, 'grad_norm': 0.30296802520751953, 'learning_rate': 2.8672e-05, 'epoch': 1.28}
82
+ {'loss': 0.1867, 'grad_norm': 0.15345898270606995, 'learning_rate': 2.8405333333333334e-05, 'epoch': 1.296}
83
+ {'loss': 0.24, 'grad_norm': 8.489642143249512, 'learning_rate': 2.813866666666667e-05, 'epoch': 1.312}
84
+ {'loss': 0.1471, 'grad_norm': 0.7609522342681885, 'learning_rate': 2.7872000000000004e-05, 'epoch': 1.328}
85
+ {'loss': 0.1787, 'grad_norm': 0.15069647133350372, 'learning_rate': 2.760533333333333e-05, 'epoch': 1.3439999999999999}
86
+ {'loss': 0.2256, 'grad_norm': 0.13076968491077423, 'learning_rate': 2.733866666666667e-05, 'epoch': 1.3599999999999999}
87
+ {'loss': 0.198, 'grad_norm': 0.29645389318466187, 'learning_rate': 2.7072000000000004e-05, 'epoch': 1.376}
88
+ {'loss': 0.2099, 'grad_norm': 9.831048011779785, 'learning_rate': 2.6805333333333332e-05, 'epoch': 1.392}
89
+ {'loss': 0.2126, 'grad_norm': 0.037026241421699524, 'learning_rate': 2.6538666666666667e-05, 'epoch': 1.408}
90
+ {'loss': 0.1393, 'grad_norm': 0.2884507179260254, 'learning_rate': 2.6272000000000002e-05, 'epoch': 1.424}
91
+ {'loss': 0.1837, 'grad_norm': 0.05694892257452011, 'learning_rate': 2.6005333333333337e-05, 'epoch': 1.44}
92
+ {'loss': 0.2323, 'grad_norm': 46.10319137573242, 'learning_rate': 2.5738666666666668e-05, 'epoch': 1.456}
93
+ {'loss': 0.1858, 'grad_norm': 26.698631286621094, 'learning_rate': 2.5472000000000003e-05, 'epoch': 1.472}
94
+ Traceback (most recent call last):
95
+ File "C:\Users\Legion\desktop\distilbert-sentiment\main.py", line 74, in <module>
96
+ main()
97
+ ~~~~^^
98
+ File "C:\Users\Legion\desktop\distilbert-sentiment\main.py", line 68, in main
99
+ train_pipeline(subset_size=args.subset)
100
+ ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
101
+ File "C:\Users\Legion\desktop\distilbert-sentiment\main.py", line 41, in train_pipeline
102
+ train_model(trainer)
103
+ ~~~~~~~~~~~^^^^^^^^^
104
+ File "C:\Users\Legion\desktop\distilbert-sentiment\train.py", line 121, in train_model
105
+ trainer.train()
106
+ ~~~~~~~~~~~~~^^
107
+ File "C:\Users\Legion\Miniconda3\envs\ML\Lib\site-packages\transformers\trainer.py", line 2206, in train
108
+ return inner_training_loop(
109
+ args=args,
110
+ ...<2 lines>...
111
+ ignore_keys_for_eval=ignore_keys_for_eval,
112
+ )
113
+ File "C:\Users\Legion\Miniconda3\envs\ML\Lib\site-packages\transformers\trainer.py", line 2548, in _inner_training_loop
114
+ tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
115
+ File "C:\Users\Legion\Miniconda3\envs\ML\Lib\site-packages\transformers\trainer.py", line 3797, in training_step
116
+ self.accelerator.backward(loss, **kwargs)
117
+ ~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^
118
+ File "C:\Users\Legion\Miniconda3\envs\ML\Lib\site-packages\accelerate\accelerator.py", line 2578, in backward
119
+ loss.backward(**kwargs)
120
+ ~~~~~~~~~~~~~^^^^^^^^^^
121
+ File "C:\Users\Legion\Miniconda3\envs\ML\Lib\site-packages\torch\_tensor.py", line 648, in backward
122
+ torch.autograd.backward(
123
+ ~~~~~~~~~~~~~~~~~~~~~~~^
124
+ self, gradient, retain_graph, create_graph, inputs=inputs
125
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
126
+ )
127
+ ^
128
+ File "C:\Users\Legion\Miniconda3\envs\ML\Lib\site-packages\torch\autograd\__init__.py", line 353, in backward
129
+ _engine_run_backward(
130
+ ~~~~~~~~~~~~~~~~~~~~^
131
+ tensors,
132
+ ^^^^^^^^
133
+ ...<5 lines>...
134
+ accumulate_grad=True,
135
+ ^^^^^^^^^^^^^^^^^^^^^
136
+ )
137
+ ^
138
+ File "C:\Users\Legion\Miniconda3\envs\ML\Lib\site-packages\torch\autograd\graph.py", line 824, in _engine_run_backward
139
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
140
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
141
+ t_outputs, *args, **kwargs
142
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^
143
+ ) # Calls into the C++ engine to run the backward pass
144
+ ^
145
+ RuntimeError: CUDA error: out of memory
146
+ CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
147
+ For debugging consider passing CUDA_LAUNCH_BLOCKING=1
148
+ Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
wandb/run-20250720_144411-9kwggmdj/files/requirements.txt ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==1.9.0
2
+ aiofiles==24.1.0
3
+ aiohappyeyeballs==2.6.1
4
+ aiohttp==3.12.14
5
+ aiosignal==1.4.0
6
+ alembic==1.16.2
7
+ altair==5.5.0
8
+ annotated-types==0.7.0
9
+ anyio==4.9.0
10
+ attrs==25.3.0
11
+ audioop-lts==0.2.1
12
+ blinker==1.9.0
13
+ Bottleneck==1.4.2
14
+ Brotli==1.1.0
15
+ cachetools==6.1.0
16
+ certifi==2025.6.15
17
+ charset-normalizer==3.4.2
18
+ click==8.2.1
19
+ cloudpickle==3.1.1
20
+ colorama==0.4.6
21
+ colorlog==6.9.0
22
+ contourpy==1.3.1
23
+ cycler==0.11.0
24
+ datasets==4.0.0
25
+ dill==0.3.8
26
+ fastapi==0.116.1
27
+ ffmpy==0.6.0
28
+ filelock==3.18.0
29
+ fonttools==4.55.3
30
+ frozenlist==1.7.0
31
+ fsspec==2025.3.0
32
+ gitdb==4.0.12
33
+ GitPython==3.1.44
34
+ gradio==5.37.0
35
+ gradio_client==1.10.4
36
+ greenlet==3.2.3
37
+ groovy==0.1.2
38
+ h11==0.16.0
39
+ httpcore==1.0.9
40
+ httpx==0.28.1
41
+ huggingface-hub==0.33.4
42
+ idna==3.10
43
+ imbalanced-learn==0.13.0
44
+ imblearn==0.0
45
+ Jinja2==3.1.6
46
+ joblib==1.4.2
47
+ jsonschema==4.24.0
48
+ jsonschema-specifications==2025.4.1
49
+ kiwisolver==1.4.8
50
+ llvmlite==0.44.0
51
+ Mako==1.3.10
52
+ markdown-it-py==3.0.0
53
+ MarkupSafe==3.0.2
54
+ matplotlib==3.9.2
55
+ mdurl==0.1.2
56
+ mpmath==1.3.0
57
+ multidict==6.6.3
58
+ multiprocess==0.70.16
59
+ narwhals==1.44.0
60
+ networkx==3.5
61
+ ninja==1.11.1.4
62
+ numba==0.61.2
63
+ numexpr==2.10.2
64
+ numpy==2.1.1
65
+ optuna==4.4.0
66
+ orjson==3.11.0
67
+ packaging==24.2
68
+ pandas==2.2.3
69
+ pillow==11.1.0
70
+ pip==25.1
71
+ platformdirs==4.3.8
72
+ plotly==6.2.0
73
+ propcache==0.3.2
74
+ protobuf==6.31.1
75
+ psutil==7.0.0
76
+ pyarrow==20.0.0
77
+ pybind11==3.0.0
78
+ pydantic==2.11.7
79
+ pydantic_core==2.33.2
80
+ pydeck==0.9.1
81
+ pydub==0.25.1
82
+ Pygments==2.19.2
83
+ pyparsing==3.2.0
84
+ PyQt6==6.7.1
85
+ PyQt6_sip==13.9.1
86
+ python-dateutil==2.9.0.post0
87
+ python-multipart==0.0.20
88
+ pytz==2024.1
89
+ PyYAML==6.0.2
90
+ referencing==0.36.2
91
+ regex==2024.11.6
92
+ requests==2.32.4
93
+ rich==14.0.0
94
+ rpds-py==0.26.0
95
+ ruff==0.12.3
96
+ safehttpx==0.1.6
97
+ safetensors==0.5.3
98
+ scikit-learn==1.5.2
99
+ scipy==1.15.2
100
+ seaborn==0.13.2
101
+ semantic-version==2.10.0
102
+ sentry-sdk==2.33.0
103
+ setuptools==78.1.1
104
+ shap==0.48.0
105
+ shellingham==1.5.4
106
+ sip==6.10.0
107
+ six==1.17.0
108
+ sklearn-compat==0.1.3
109
+ slicer==0.0.8
110
+ smmap==5.0.2
111
+ sniffio==1.3.1
112
+ SQLAlchemy==2.0.41
113
+ starlette==0.47.1
114
+ streamlit==1.46.1
115
+ sympy==1.14.0
116
+ tenacity==9.1.2
117
+ threadpoolctl==3.5.0
118
+ tokenizers==0.21.2
119
+ toml==0.10.2
120
+ tomlkit==0.13.3
121
+ torch==2.7.1+cu118
122
+ torchaudio==2.7.1+cu118
123
+ torchvision==0.22.1
124
+ tornado==6.5.1
125
+ tqdm==4.67.1
126
+ transformers==4.53.2
127
+ typer==0.16.0
128
+ typing_extensions==4.14.0
129
+ typing-inspection==0.4.1
130
+ tzdata==2025.2
131
+ urllib3==2.5.0
132
+ uvicorn==0.35.0
133
+ wandb==0.21.0
134
+ watchdog==6.0.0
135
+ websockets==15.0.1
136
+ wheel==0.45.1
137
+ xgboost==3.0.2
138
+ xxhash==3.5.0
139
+ yarl==1.20.1
wandb/run-20250720_144411-9kwggmdj/files/wandb-metadata.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Windows-11-10.0.26100-SP0",
3
+ "python": "CPython 3.13.5",
4
+ "startedAt": "2025-07-20T09:14:11.312224Z",
5
+ "program": "C:\\Users\\Legion\\desktop\\distilbert-sentiment\\main.py",
6
+ "codePath": "main.py",
7
+ "codePathLocal": "main.py",
8
+ "email": "shreshthkapai@gmail.com",
9
+ "root": "C:\\Users\\Legion\\desktop\\distilbert-sentiment",
10
+ "host": "DESKTOP-EIHJJJL",
11
+ "executable": "C:\\Users\\Legion\\Miniconda3\\envs\\ML\\python.exe",
12
+ "cpu_count": 4,
13
+ "cpu_count_logical": 8,
14
+ "gpu": "NVIDIA GeForce GTX 1650",
15
+ "gpu_count": 1,
16
+ "disk": {
17
+ "/": {
18
+ "total": "255230791680",
19
+ "used": "208595525632"
20
+ }
21
+ },
22
+ "memory": {
23
+ "total": "8506298368"
24
+ },
25
+ "gpu_nvidia": [
26
+ {
27
+ "name": "NVIDIA GeForce GTX 1650",
28
+ "memoryTotal": "4294967296",
29
+ "cudaCores": 1024,
30
+ "architecture": "Turing",
31
+ "uuid": "GPU-fbcd7647-fb67-66f5-b8c7-1a4198b7e4fa"
32
+ }
33
+ ],
34
+ "cudaVersion": "12.7",
35
+ "writerId": "qd7dze61nxdy0n83hyx7lap6a5tql6xc"
36
+ }
wandb/run-20250720_144411-9kwggmdj/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train/grad_norm":26.698631286621094,"train/learning_rate":2.5472000000000003e-05,"eval/samples_per_second":48.865,"_runtime":2637,"eval/runtime":511.6142,"eval/f1":0.8900558472951571,"eval/accuracy":0.89448,"eval/steps_per_second":3.055,"_timestamp":1.7530054751176744e+09,"train/loss":0.1858,"train/global_step":4600,"train/epoch":1.472,"eval/loss":0.2969476878643036,"eval/recall":0.85424,"_wandb":{"runtime":2637},"_step":92,"eval/precision":0.9290064381416391}
wandb/run-20250720_144411-9kwggmdj/logs/debug-internal.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-07-20T14:44:13.6837247+05:30","level":"INFO","msg":"stream: starting","core version":"0.21.0"}
2
+ {"time":"2025-07-20T14:44:14.9093201+05:30","level":"INFO","msg":"stream: created new stream","id":"9kwggmdj"}
3
+ {"time":"2025-07-20T14:44:14.9093201+05:30","level":"INFO","msg":"stream: started","id":"9kwggmdj"}
4
+ {"time":"2025-07-20T14:44:14.9093201+05:30","level":"INFO","msg":"handler: started","stream_id":"9kwggmdj"}
5
+ {"time":"2025-07-20T14:44:14.9093201+05:30","level":"INFO","msg":"sender: started","stream_id":"9kwggmdj"}
6
+ {"time":"2025-07-20T14:44:14.9093201+05:30","level":"INFO","msg":"writer: Do: started","stream_id":"9kwggmdj"}
7
+ {"time":"2025-07-20T15:28:13.4157038+05:30","level":"INFO","msg":"stream: closing","id":"9kwggmdj"}
8
+ {"time":"2025-07-20T15:28:16.7459113+05:30","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-07-20T15:28:17.5720383+05:30","level":"INFO","msg":"sender: closed","stream_id":"9kwggmdj"}
10
+ {"time":"2025-07-20T15:28:17.5720383+05:30","level":"INFO","msg":"handler: closed","stream_id":"9kwggmdj"}
11
+ {"time":"2025-07-20T15:28:17.5720383+05:30","level":"INFO","msg":"writer: Close: closed","stream_id":"9kwggmdj"}
12
+ {"time":"2025-07-20T15:28:17.5820507+05:30","level":"INFO","msg":"stream: closed","id":"9kwggmdj"}
wandb/run-20250720_144411-9kwggmdj/logs/debug.log ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-20 14:44:11,319 INFO MainThread:4228 [wandb_setup.py:_flush():80] Current SDK version is 0.21.0
2
+ 2025-07-20 14:44:11,319 INFO MainThread:4228 [wandb_setup.py:_flush():80] Configure stats pid to 4228
3
+ 2025-07-20 14:44:11,320 INFO MainThread:4228 [wandb_setup.py:_flush():80] Loading settings from C:\Users\Legion\.config\wandb\settings
4
+ 2025-07-20 14:44:11,320 INFO MainThread:4228 [wandb_setup.py:_flush():80] Loading settings from C:\Users\Legion\desktop\distilbert-sentiment\wandb\settings
5
+ 2025-07-20 14:44:11,320 INFO MainThread:4228 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-07-20 14:44:11,320 INFO MainThread:4228 [wandb_init.py:setup_run_log_directory():703] Logging user logs to C:\Users\Legion\desktop\distilbert-sentiment\wandb\run-20250720_144411-9kwggmdj\logs\debug.log
7
+ 2025-07-20 14:44:11,321 INFO MainThread:4228 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to C:\Users\Legion\desktop\distilbert-sentiment\wandb\run-20250720_144411-9kwggmdj\logs\debug-internal.log
8
+ 2025-07-20 14:44:11,321 INFO MainThread:4228 [wandb_init.py:init():830] calling init triggers
9
+ 2025-07-20 14:44:11,321 INFO MainThread:4228 [wandb_init.py:init():835] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-07-20 14:44:11,321 INFO MainThread:4228 [wandb_init.py:init():871] starting backend
12
+ 2025-07-20 14:44:12,739 INFO MainThread:4228 [wandb_init.py:init():874] sending inform_init request
13
+ 2025-07-20 14:44:13,671 INFO MainThread:4228 [wandb_init.py:init():882] backend started and connected
14
+ 2025-07-20 14:44:13,676 INFO MainThread:4228 [wandb_init.py:init():953] updated telemetry
15
+ 2025-07-20 14:44:13,680 INFO MainThread:4228 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout
16
+ 2025-07-20 14:44:15,444 INFO MainThread:4228 [wandb_init.py:init():1029] starting run threads in backend
17
+ 2025-07-20 14:44:15,878 INFO MainThread:4228 [wandb_run.py:_console_start():2458] atexit reg
18
+ 2025-07-20 14:44:15,879 INFO MainThread:4228 [wandb_run.py:_redirect():2306] redirect: wrap_raw
19
+ 2025-07-20 14:44:15,879 INFO MainThread:4228 [wandb_run.py:_redirect():2375] Wrapping output streams.
20
+ 2025-07-20 14:44:15,879 INFO MainThread:4228 [wandb_run.py:_redirect():2398] Redirects installed.
21
+ 2025-07-20 14:44:15,885 INFO MainThread:4228 [wandb_init.py:init():1075] run started, returning control to user process
22
+ 2025-07-20 14:44:15,889 INFO MainThread:4228 [wandb_run.py:_config_callback():1363] config_cb None None {'vocab_size': 30522, 'max_position_embeddings': 512, 'sinusoidal_pos_embds': False, 'n_layers': 6, 'n_heads': 12, 'dim': 768, 'hidden_dim': 3072, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation': 'gelu', 'initializer_range': 0.02, 'qa_dropout': 0.1, 'seq_classif_dropout': 0.2, 'return_dict': True, 'output_hidden_states': False, 'torchscript': False, 'torch_dtype': 'float32', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['DistilBertForMaskedLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': 0, 'eos_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'distilbert-base-uncased', 'transformers_version': '4.53.2', 'model_type': 'distilbert', 'tie_weights_': True, 'output_attentions': False, 'output_dir': './model', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 
'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 16, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './logs', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 50, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './model', 'disable_tqdm': True, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'f1', 'greater_is_better': True, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 
'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': False}
23
+ 2025-07-20 14:44:15,894 INFO MainThread:4228 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 66955010 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x0000015891724590>>
24
+ 2025-07-20 14:44:15,894 INFO MainThread:4228 [wandb_run.py:_config_callback():1363] config_cb model/num_parameters 66955010 None
25
+ 2025-07-20 15:28:12,363 INFO MsgRouterThr:4228 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
wandb/run-20250720_144411-9kwggmdj/run-9kwggmdj.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de1b57c0d7b948fcdacf3f80d9fa12fd8f80b6888eea1c8acc8593a8aa7b62d1
3
+ size 231840
wandb/run-20250720_154435-9xqrzjdo/files/config.yaml ADDED
@@ -0,0 +1,493 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _name_or_path:
2
+ value: distilbert-base-uncased
3
+ _wandb:
4
+ value:
5
+ cli_version: 0.21.0
6
+ e:
7
+ 0ygkgjf4tjw3nzhcstc0bi4ropv1pahk:
8
+ codePath: main.py
9
+ codePathLocal: main.py
10
+ cpu_count: 4
11
+ cpu_count_logical: 8
12
+ cudaVersion: "12.7"
13
+ disk:
14
+ /:
15
+ total: "255230791680"
16
+ used: "225197662208"
17
+ email: shreshthkapai@gmail.com
18
+ executable: C:\Users\Legion\Miniconda3\envs\ML\python.exe
19
+ gpu: NVIDIA GeForce GTX 1650
20
+ gpu_count: 1
21
+ gpu_nvidia:
22
+ - architecture: Turing
23
+ cudaCores: 1024
24
+ memoryTotal: "4294967296"
25
+ name: NVIDIA GeForce GTX 1650
26
+ uuid: GPU-fbcd7647-fb67-66f5-b8c7-1a4198b7e4fa
27
+ host: DESKTOP-EIHJJJL
28
+ memory:
29
+ total: "8506298368"
30
+ os: Windows-11-10.0.26100-SP0
31
+ program: C:\Users\Legion\desktop\distilbert-sentiment\main.py
32
+ python: CPython 3.13.5
33
+ root: C:\Users\Legion\desktop\distilbert-sentiment
34
+ startedAt: "2025-07-20T10:14:35.345095Z"
35
+ writerId: 0ygkgjf4tjw3nzhcstc0bi4ropv1pahk
36
+ m:
37
+ - "1": train/global_step
38
+ "6":
39
+ - 3
40
+ "7": []
41
+ - "2": '*'
42
+ "5": 1
43
+ "6":
44
+ - 1
45
+ "7": []
46
+ python_version: 3.13.5
47
+ t:
48
+ "1":
49
+ - 1
50
+ - 5
51
+ - 11
52
+ - 41
53
+ - 49
54
+ - 51
55
+ - 53
56
+ - 71
57
+ - 105
58
+ "2":
59
+ - 1
60
+ - 5
61
+ - 11
62
+ - 41
63
+ - 49
64
+ - 51
65
+ - 53
66
+ - 71
67
+ - 105
68
+ "3":
69
+ - 7
70
+ - 13
71
+ - 19
72
+ - 66
73
+ "4": 3.13.5
74
+ "5": 0.21.0
75
+ "6": 4.53.2
76
+ "8":
77
+ - 3
78
+ "9":
79
+ "1": transformers_trainer
80
+ "12": 0.21.0
81
+ "13": windows-amd64
82
+ accelerator_config:
83
+ value:
84
+ dispatch_batches: null
85
+ even_batches: true
86
+ gradient_accumulation_kwargs: null
87
+ non_blocking: false
88
+ split_batches: false
89
+ use_seedable_sampler: true
90
+ activation:
91
+ value: gelu
92
+ adafactor:
93
+ value: false
94
+ adam_beta1:
95
+ value: 0.9
96
+ adam_beta2:
97
+ value: 0.999
98
+ adam_epsilon:
99
+ value: 1e-08
100
+ add_cross_attention:
101
+ value: false
102
+ architectures:
103
+ value:
104
+ - DistilBertForMaskedLM
105
+ attention_dropout:
106
+ value: 0.1
107
+ auto_find_batch_size:
108
+ value: false
109
+ average_tokens_across_devices:
110
+ value: false
111
+ bad_words_ids:
112
+ value: null
113
+ batch_eval_metrics:
114
+ value: false
115
+ begin_suppress_tokens:
116
+ value: null
117
+ bf16:
118
+ value: false
119
+ bf16_full_eval:
120
+ value: false
121
+ bos_token_id:
122
+ value: null
123
+ chunk_size_feed_forward:
124
+ value: 0
125
+ cross_attention_hidden_size:
126
+ value: null
127
+ data_seed:
128
+ value: null
129
+ dataloader_drop_last:
130
+ value: false
131
+ dataloader_num_workers:
132
+ value: 0
133
+ dataloader_persistent_workers:
134
+ value: false
135
+ dataloader_pin_memory:
136
+ value: true
137
+ dataloader_prefetch_factor:
138
+ value: null
139
+ ddp_backend:
140
+ value: null
141
+ ddp_broadcast_buffers:
142
+ value: null
143
+ ddp_bucket_cap_mb:
144
+ value: null
145
+ ddp_find_unused_parameters:
146
+ value: null
147
+ ddp_timeout:
148
+ value: 1800
149
+ debug:
150
+ value: []
151
+ decoder_start_token_id:
152
+ value: null
153
+ deepspeed:
154
+ value: null
155
+ dim:
156
+ value: 768
157
+ disable_tqdm:
158
+ value: true
159
+ diversity_penalty:
160
+ value: 0
161
+ do_eval:
162
+ value: true
163
+ do_predict:
164
+ value: false
165
+ do_sample:
166
+ value: false
167
+ do_train:
168
+ value: false
169
+ dropout:
170
+ value: 0.1
171
+ early_stopping:
172
+ value: false
173
+ encoder_no_repeat_ngram_size:
174
+ value: 0
175
+ eos_token_id:
176
+ value: null
177
+ eval_accumulation_steps:
178
+ value: null
179
+ eval_delay:
180
+ value: 0
181
+ eval_do_concat_batches:
182
+ value: true
183
+ eval_on_start:
184
+ value: false
185
+ eval_steps:
186
+ value: null
187
+ eval_strategy:
188
+ value: epoch
189
+ eval_use_gather_object:
190
+ value: false
191
+ exponential_decay_length_penalty:
192
+ value: null
193
+ finetuning_task:
194
+ value: null
195
+ forced_bos_token_id:
196
+ value: null
197
+ forced_eos_token_id:
198
+ value: null
199
+ fp16:
200
+ value: false
201
+ fp16_backend:
202
+ value: auto
203
+ fp16_full_eval:
204
+ value: false
205
+ fp16_opt_level:
206
+ value: O1
207
+ fsdp:
208
+ value: []
209
+ fsdp_config:
210
+ value:
211
+ min_num_params: 0
212
+ xla: false
213
+ xla_fsdp_grad_ckpt: false
214
+ xla_fsdp_v2: false
215
+ fsdp_min_num_params:
216
+ value: 0
217
+ fsdp_transformer_layer_cls_to_wrap:
218
+ value: null
219
+ full_determinism:
220
+ value: false
221
+ gradient_accumulation_steps:
222
+ value: 1
223
+ gradient_checkpointing:
224
+ value: false
225
+ gradient_checkpointing_kwargs:
226
+ value: null
227
+ greater_is_better:
228
+ value: true
229
+ group_by_length:
230
+ value: false
231
+ half_precision_backend:
232
+ value: auto
233
+ hidden_dim:
234
+ value: 3072
235
+ hub_always_push:
236
+ value: false
237
+ hub_model_id:
238
+ value: null
239
+ hub_private_repo:
240
+ value: null
241
+ hub_revision:
242
+ value: null
243
+ hub_strategy:
244
+ value: every_save
245
+ hub_token:
246
+ value: <HUB_TOKEN>
247
+ id2label:
248
+ value:
249
+ "0": LABEL_0
250
+ "1": LABEL_1
251
+ ignore_data_skip:
252
+ value: false
253
+ include_for_metrics:
254
+ value: []
255
+ include_inputs_for_metrics:
256
+ value: false
257
+ include_num_input_tokens_seen:
258
+ value: false
259
+ include_tokens_per_second:
260
+ value: false
261
+ initializer_range:
262
+ value: 0.02
263
+ is_decoder:
264
+ value: false
265
+ is_encoder_decoder:
266
+ value: false
267
+ jit_mode_eval:
268
+ value: false
269
+ label_names:
270
+ value: null
271
+ label_smoothing_factor:
272
+ value: 0
273
+ label2id:
274
+ value:
275
+ LABEL_0: 0
276
+ LABEL_1: 1
277
+ learning_rate:
278
+ value: 5e-05
279
+ length_column_name:
280
+ value: length
281
+ length_penalty:
282
+ value: 1
283
+ liger_kernel_config:
284
+ value: null
285
+ load_best_model_at_end:
286
+ value: true
287
+ local_rank:
288
+ value: 0
289
+ log_level:
290
+ value: passive
291
+ log_level_replica:
292
+ value: warning
293
+ log_on_each_node:
294
+ value: true
295
+ logging_dir:
296
+ value: ./logs
297
+ logging_first_step:
298
+ value: false
299
+ logging_nan_inf_filter:
300
+ value: true
301
+ logging_steps:
302
+ value: 50
303
+ logging_strategy:
304
+ value: steps
305
+ lr_scheduler_type:
306
+ value: linear
307
+ max_grad_norm:
308
+ value: 1
309
+ max_length:
310
+ value: 20
311
+ max_position_embeddings:
312
+ value: 512
313
+ max_steps:
314
+ value: -1
315
+ metric_for_best_model:
316
+ value: f1
317
+ min_length:
318
+ value: 0
319
+ model/num_parameters:
320
+ value: 66955010
321
+ model_type:
322
+ value: distilbert
323
+ mp_parameters:
324
+ value: ""
325
+ n_heads:
326
+ value: 12
327
+ n_layers:
328
+ value: 6
329
+ neftune_noise_alpha:
330
+ value: null
331
+ no_cuda:
332
+ value: false
333
+ no_repeat_ngram_size:
334
+ value: 0
335
+ num_beam_groups:
336
+ value: 1
337
+ num_beams:
338
+ value: 1
339
+ num_return_sequences:
340
+ value: 1
341
+ num_train_epochs:
342
+ value: 3
343
+ optim:
344
+ value: adamw_torch
345
+ optim_args:
346
+ value: null
347
+ optim_target_modules:
348
+ value: null
349
+ output_attentions:
350
+ value: false
351
+ output_dir:
352
+ value: ./model
353
+ output_hidden_states:
354
+ value: false
355
+ output_scores:
356
+ value: false
357
+ overwrite_output_dir:
358
+ value: false
359
+ pad_token_id:
360
+ value: 0
361
+ past_index:
362
+ value: -1
363
+ per_device_eval_batch_size:
364
+ value: 16
365
+ per_device_train_batch_size:
366
+ value: 8
367
+ per_gpu_eval_batch_size:
368
+ value: null
369
+ per_gpu_train_batch_size:
370
+ value: null
371
+ prediction_loss_only:
372
+ value: false
373
+ prefix:
374
+ value: null
375
+ problem_type:
376
+ value: null
377
+ push_to_hub:
378
+ value: false
379
+ push_to_hub_model_id:
380
+ value: null
381
+ push_to_hub_organization:
382
+ value: null
383
+ push_to_hub_token:
384
+ value: <PUSH_TO_HUB_TOKEN>
385
+ qa_dropout:
386
+ value: 0.1
387
+ ray_scope:
388
+ value: last
389
+ remove_invalid_values:
390
+ value: false
391
+ remove_unused_columns:
392
+ value: true
393
+ repetition_penalty:
394
+ value: 1
395
+ report_to:
396
+ value:
397
+ - wandb
398
+ restore_callback_states_from_checkpoint:
399
+ value: false
400
+ resume_from_checkpoint:
401
+ value: null
402
+ return_dict:
403
+ value: true
404
+ return_dict_in_generate:
405
+ value: false
406
+ run_name:
407
+ value: ./model
408
+ save_on_each_node:
409
+ value: false
410
+ save_only_model:
411
+ value: false
412
+ save_safetensors:
413
+ value: true
414
+ save_steps:
415
+ value: 500
416
+ save_strategy:
417
+ value: epoch
418
+ save_total_limit:
419
+ value: null
420
+ seed:
421
+ value: 42
422
+ sep_token_id:
423
+ value: null
424
+ seq_classif_dropout:
425
+ value: 0.2
426
+ sinusoidal_pos_embds:
427
+ value: false
428
+ skip_memory_metrics:
429
+ value: true
430
+ suppress_tokens:
431
+ value: null
432
+ task_specific_params:
433
+ value: null
434
+ temperature:
435
+ value: 1
436
+ tf_legacy_loss:
437
+ value: false
438
+ tf32:
439
+ value: null
440
+ tie_encoder_decoder:
441
+ value: false
442
+ tie_weights_:
443
+ value: true
444
+ tie_word_embeddings:
445
+ value: true
446
+ tokenizer_class:
447
+ value: null
448
+ top_k:
449
+ value: 50
450
+ top_p:
451
+ value: 1
452
+ torch_compile:
453
+ value: false
454
+ torch_compile_backend:
455
+ value: null
456
+ torch_compile_mode:
457
+ value: null
458
+ torch_dtype:
459
+ value: float32
460
+ torch_empty_cache_steps:
461
+ value: null
462
+ torchdynamo:
463
+ value: null
464
+ torchscript:
465
+ value: false
466
+ tpu_metrics_debug:
467
+ value: false
468
+ tpu_num_cores:
469
+ value: null
470
+ transformers_version:
471
+ value: 4.53.2
472
+ typical_p:
473
+ value: 1
474
+ use_bfloat16:
475
+ value: false
476
+ use_cpu:
477
+ value: false
478
+ use_ipex:
479
+ value: false
480
+ use_legacy_prediction_loop:
481
+ value: false
482
+ use_liger_kernel:
483
+ value: false
484
+ use_mps_device:
485
+ value: false
486
+ vocab_size:
487
+ value: 30522
488
+ warmup_ratio:
489
+ value: 0
490
+ warmup_steps:
491
+ value: 0
492
+ weight_decay:
493
+ value: 0
wandb/run-20250720_154435-9xqrzjdo/files/output.log ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {'loss': 0.6215, 'grad_norm': 4.55625057220459, 'learning_rate': 4.9738666666666665e-05, 'epoch': 0.016}
2
+ {'loss': 0.5086, 'grad_norm': 13.619754791259766, 'learning_rate': 4.9472e-05, 'epoch': 0.032}
3
+ {'loss': 0.4128, 'grad_norm': 10.843639373779297, 'learning_rate': 4.9205333333333335e-05, 'epoch': 0.048}
4
+ {'loss': 0.3603, 'grad_norm': 7.094396114349365, 'learning_rate': 4.893866666666667e-05, 'epoch': 0.064}
5
+ {'loss': 0.3572, 'grad_norm': 32.03938674926758, 'learning_rate': 4.8672000000000004e-05, 'epoch': 0.08}
6
+ {'loss': 0.4255, 'grad_norm': 2.2694833278656006, 'learning_rate': 4.8405333333333336e-05, 'epoch': 0.096}
7
+ {'loss': 0.3592, 'grad_norm': 1.1852556467056274, 'learning_rate': 4.8138666666666674e-05, 'epoch': 0.112}
8
+ {'loss': 0.3759, 'grad_norm': 8.895912170410156, 'learning_rate': 4.7872e-05, 'epoch': 0.128}
9
+ {'loss': 0.4246, 'grad_norm': 16.175556182861328, 'learning_rate': 4.7605333333333337e-05, 'epoch': 0.144}
10
+ {'loss': 0.3949, 'grad_norm': 13.036661148071289, 'learning_rate': 4.733866666666667e-05, 'epoch': 0.16}
11
+ {'loss': 0.3442, 'grad_norm': 8.27635669708252, 'learning_rate': 4.7072000000000006e-05, 'epoch': 0.176}
12
+ {'loss': 0.4416, 'grad_norm': 19.103059768676758, 'learning_rate': 4.680533333333334e-05, 'epoch': 0.192}
13
+ {'loss': 0.3638, 'grad_norm': 22.78896713256836, 'learning_rate': 4.653866666666667e-05, 'epoch': 0.208}
14
+ {'loss': 0.2995, 'grad_norm': 21.197683334350586, 'learning_rate': 4.627200000000001e-05, 'epoch': 0.224}
15
+ {'loss': 0.3702, 'grad_norm': 6.810858726501465, 'learning_rate': 4.600533333333333e-05, 'epoch': 0.24}
16
+ {'loss': 0.3149, 'grad_norm': 16.848161697387695, 'learning_rate': 4.573866666666667e-05, 'epoch': 0.256}
17
+ Traceback (most recent call last):
18
+ File "C:\Users\Legion\desktop\distilbert-sentiment\main.py", line 74, in <module>
19
+ main()
20
+ ~~~~^^
21
+ File "C:\Users\Legion\desktop\distilbert-sentiment\main.py", line 68, in main
22
+ train_pipeline(subset_size=args.subset)
23
+ ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
24
+ File "C:\Users\Legion\desktop\distilbert-sentiment\main.py", line 41, in train_pipeline
25
+ train_model(trainer)
26
+ ~~~~~~~~~~~^^^^^^^^^
27
+ File "C:\Users\Legion\desktop\distilbert-sentiment\train.py", line 121, in train_model
28
+ print("Starting training...")
29
+ ^^^^^^^^^^^^^^^
30
+ File "C:\Users\Legion\Miniconda3\envs\ML\Lib\site-packages\transformers\trainer.py", line 2206, in train
31
+ return inner_training_loop(
32
+ args=args,
33
+ ...<2 lines>...
34
+ ignore_keys_for_eval=ignore_keys_for_eval,
35
+ )
36
+ File "C:\Users\Legion\Miniconda3\envs\ML\Lib\site-packages\transformers\trainer.py", line 2553, in _inner_training_loop
37
+ and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
38
+ ~~~~~~~~~~~^^^^^^^^^^^^^^
39
+ KeyboardInterrupt
wandb/run-20250720_154435-9xqrzjdo/files/requirements.txt ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==1.9.0
2
+ aiofiles==24.1.0
3
+ aiohappyeyeballs==2.6.1
4
+ aiohttp==3.12.14
5
+ aiosignal==1.4.0
6
+ alembic==1.16.2
7
+ altair==5.5.0
8
+ annotated-types==0.7.0
9
+ anyio==4.9.0
10
+ attrs==25.3.0
11
+ audioop-lts==0.2.1
12
+ blinker==1.9.0
13
+ Bottleneck==1.4.2
14
+ Brotli==1.1.0
15
+ cachetools==6.1.0
16
+ certifi==2025.6.15
17
+ charset-normalizer==3.4.2
18
+ click==8.2.1
19
+ cloudpickle==3.1.1
20
+ colorama==0.4.6
21
+ colorlog==6.9.0
22
+ contourpy==1.3.1
23
+ cycler==0.11.0
24
+ datasets==4.0.0
25
+ dill==0.3.8
26
+ fastapi==0.116.1
27
+ ffmpy==0.6.0
28
+ filelock==3.18.0
29
+ fonttools==4.55.3
30
+ frozenlist==1.7.0
31
+ fsspec==2025.3.0
32
+ gitdb==4.0.12
33
+ GitPython==3.1.44
34
+ gradio==5.37.0
35
+ gradio_client==1.10.4
36
+ greenlet==3.2.3
37
+ groovy==0.1.2
38
+ h11==0.16.0
39
+ httpcore==1.0.9
40
+ httpx==0.28.1
41
+ huggingface-hub==0.33.4
42
+ idna==3.10
43
+ imbalanced-learn==0.13.0
44
+ imblearn==0.0
45
+ Jinja2==3.1.6
46
+ joblib==1.4.2
47
+ jsonschema==4.24.0
48
+ jsonschema-specifications==2025.4.1
49
+ kiwisolver==1.4.8
50
+ llvmlite==0.44.0
51
+ Mako==1.3.10
52
+ markdown-it-py==3.0.0
53
+ MarkupSafe==3.0.2
54
+ matplotlib==3.9.2
55
+ mdurl==0.1.2
56
+ mpmath==1.3.0
57
+ multidict==6.6.3
58
+ multiprocess==0.70.16
59
+ narwhals==1.44.0
60
+ networkx==3.5
61
+ ninja==1.11.1.4
62
+ numba==0.61.2
63
+ numexpr==2.10.2
64
+ numpy==2.1.1
65
+ optuna==4.4.0
66
+ orjson==3.11.0
67
+ packaging==24.2
68
+ pandas==2.2.3
69
+ pillow==11.1.0
70
+ pip==25.1
71
+ platformdirs==4.3.8
72
+ plotly==6.2.0
73
+ propcache==0.3.2
74
+ protobuf==6.31.1
75
+ psutil==7.0.0
76
+ pyarrow==20.0.0
77
+ pybind11==3.0.0
78
+ pydantic==2.11.7
79
+ pydantic_core==2.33.2
80
+ pydeck==0.9.1
81
+ pydub==0.25.1
82
+ Pygments==2.19.2
83
+ pyparsing==3.2.0
84
+ PyQt6==6.7.1
85
+ PyQt6_sip==13.9.1
86
+ python-dateutil==2.9.0.post0
87
+ python-multipart==0.0.20
88
+ pytz==2024.1
89
+ PyYAML==6.0.2
90
+ referencing==0.36.2
91
+ regex==2024.11.6
92
+ requests==2.32.4
93
+ rich==14.0.0
94
+ rpds-py==0.26.0
95
+ ruff==0.12.3
96
+ safehttpx==0.1.6
97
+ safetensors==0.5.3
98
+ scikit-learn==1.5.2
99
+ scipy==1.15.2
100
+ seaborn==0.13.2
101
+ semantic-version==2.10.0
102
+ sentry-sdk==2.33.0
103
+ setuptools==78.1.1
104
+ shap==0.48.0
105
+ shellingham==1.5.4
106
+ sip==6.10.0
107
+ six==1.17.0
108
+ sklearn-compat==0.1.3
109
+ slicer==0.0.8
110
+ smmap==5.0.2
111
+ sniffio==1.3.1
112
+ SQLAlchemy==2.0.41
113
+ starlette==0.47.1
114
+ streamlit==1.46.1
115
+ sympy==1.14.0
116
+ tenacity==9.1.2
117
+ threadpoolctl==3.5.0
118
+ tokenizers==0.21.2
119
+ toml==0.10.2
120
+ tomlkit==0.13.3
121
+ torch==2.7.1+cu118
122
+ torchaudio==2.7.1+cu118
123
+ torchvision==0.22.1
124
+ tornado==6.5.1
125
+ tqdm==4.67.1
126
+ transformers==4.53.2
127
+ typer==0.16.0
128
+ typing_extensions==4.14.0
129
+ typing-inspection==0.4.1
130
+ tzdata==2025.2
131
+ urllib3==2.5.0
132
+ uvicorn==0.35.0
133
+ wandb==0.21.0
134
+ watchdog==6.0.0
135
+ websockets==15.0.1
136
+ wheel==0.45.1
137
+ xgboost==3.0.2
138
+ xxhash==3.5.0
139
+ yarl==1.20.1
wandb/run-20250720_154435-9xqrzjdo/files/wandb-metadata.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Windows-11-10.0.26100-SP0",
3
+ "python": "CPython 3.13.5",
4
+ "startedAt": "2025-07-20T10:14:35.345095Z",
5
+ "program": "C:\\Users\\Legion\\desktop\\distilbert-sentiment\\main.py",
6
+ "codePath": "main.py",
7
+ "codePathLocal": "main.py",
8
+ "email": "shreshthkapai@gmail.com",
9
+ "root": "C:\\Users\\Legion\\desktop\\distilbert-sentiment",
10
+ "host": "DESKTOP-EIHJJJL",
11
+ "executable": "C:\\Users\\Legion\\Miniconda3\\envs\\ML\\python.exe",
12
+ "cpu_count": 4,
13
+ "cpu_count_logical": 8,
14
+ "gpu": "NVIDIA GeForce GTX 1650",
15
+ "gpu_count": 1,
16
+ "disk": {
17
+ "/": {
18
+ "total": "255230791680",
19
+ "used": "225197662208"
20
+ }
21
+ },
22
+ "memory": {
23
+ "total": "8506298368"
24
+ },
25
+ "gpu_nvidia": [
26
+ {
27
+ "name": "NVIDIA GeForce GTX 1650",
28
+ "memoryTotal": "4294967296",
29
+ "cudaCores": 1024,
30
+ "architecture": "Turing",
31
+ "uuid": "GPU-fbcd7647-fb67-66f5-b8c7-1a4198b7e4fa"
32
+ }
33
+ ],
34
+ "cudaVersion": "12.7",
35
+ "writerId": "0ygkgjf4tjw3nzhcstc0bi4ropv1pahk"
36
+ }
wandb/run-20250720_154435-9xqrzjdo/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train/grad_norm":16.848161697387695,"_runtime":388,"_wandb":{"runtime":388},"train/epoch":0.256,"train/global_step":800,"_timestamp":1.753006850924072e+09,"train/learning_rate":4.573866666666667e-05,"_step":15,"train/loss":0.3149}
wandb/run-20250720_154435-9xqrzjdo/logs/debug-internal.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-07-20T15:44:35.9771205+05:30","level":"INFO","msg":"stream: starting","core version":"0.21.0"}
2
+ {"time":"2025-07-20T15:44:36.9333877+05:30","level":"INFO","msg":"stream: created new stream","id":"9xqrzjdo"}
3
+ {"time":"2025-07-20T15:44:36.9333877+05:30","level":"INFO","msg":"stream: started","id":"9xqrzjdo"}
4
+ {"time":"2025-07-20T15:44:36.9333877+05:30","level":"INFO","msg":"handler: started","stream_id":"9xqrzjdo"}
5
+ {"time":"2025-07-20T15:44:36.9333877+05:30","level":"INFO","msg":"sender: started","stream_id":"9xqrzjdo"}
6
+ {"time":"2025-07-20T15:44:36.9333877+05:30","level":"INFO","msg":"writer: Do: started","stream_id":"9xqrzjdo"}
7
+ {"time":"2025-07-20T15:51:06.1959407+05:30","level":"INFO","msg":"stream: closing","id":"9xqrzjdo"}
8
+ {"time":"2025-07-20T15:51:08.7071239+05:30","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-07-20T15:51:09.1729659+05:30","level":"INFO","msg":"sender: closed","stream_id":"9xqrzjdo"}
10
+ {"time":"2025-07-20T15:51:09.1735011+05:30","level":"INFO","msg":"handler: closed","stream_id":"9xqrzjdo"}
11
+ {"time":"2025-07-20T15:51:09.1735011+05:30","level":"INFO","msg":"writer: Close: closed","stream_id":"9xqrzjdo"}
12
+ {"time":"2025-07-20T15:51:09.1740459+05:30","level":"INFO","msg":"stream: closed","id":"9xqrzjdo"}
wandb/run-20250720_154435-9xqrzjdo/logs/debug.log ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-20 15:44:35,349 INFO MainThread:2896 [wandb_setup.py:_flush():80] Current SDK version is 0.21.0
2
+ 2025-07-20 15:44:35,350 INFO MainThread:2896 [wandb_setup.py:_flush():80] Configure stats pid to 2896
3
+ 2025-07-20 15:44:35,350 INFO MainThread:2896 [wandb_setup.py:_flush():80] Loading settings from C:\Users\Legion\.config\wandb\settings
4
+ 2025-07-20 15:44:35,350 INFO MainThread:2896 [wandb_setup.py:_flush():80] Loading settings from C:\Users\Legion\desktop\distilbert-sentiment\wandb\settings
5
+ 2025-07-20 15:44:35,350 INFO MainThread:2896 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-07-20 15:44:35,350 INFO MainThread:2896 [wandb_init.py:setup_run_log_directory():703] Logging user logs to C:\Users\Legion\desktop\distilbert-sentiment\wandb\run-20250720_154435-9xqrzjdo\logs\debug.log
7
+ 2025-07-20 15:44:35,351 INFO MainThread:2896 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to C:\Users\Legion\desktop\distilbert-sentiment\wandb\run-20250720_154435-9xqrzjdo\logs\debug-internal.log
8
+ 2025-07-20 15:44:35,351 INFO MainThread:2896 [wandb_init.py:init():830] calling init triggers
9
+ 2025-07-20 15:44:35,351 INFO MainThread:2896 [wandb_init.py:init():835] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-07-20 15:44:35,351 INFO MainThread:2896 [wandb_init.py:init():871] starting backend
12
+ 2025-07-20 15:44:35,877 INFO MainThread:2896 [wandb_init.py:init():874] sending inform_init request
13
+ 2025-07-20 15:44:35,966 INFO MainThread:2896 [wandb_init.py:init():882] backend started and connected
14
+ 2025-07-20 15:44:35,969 INFO MainThread:2896 [wandb_init.py:init():953] updated telemetry
15
+ 2025-07-20 15:44:35,973 INFO MainThread:2896 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout
16
+ 2025-07-20 15:44:37,312 INFO MainThread:2896 [wandb_init.py:init():1029] starting run threads in backend
17
+ 2025-07-20 15:44:37,744 INFO MainThread:2896 [wandb_run.py:_console_start():2458] atexit reg
18
+ 2025-07-20 15:44:37,744 INFO MainThread:2896 [wandb_run.py:_redirect():2306] redirect: wrap_raw
19
+ 2025-07-20 15:44:37,744 INFO MainThread:2896 [wandb_run.py:_redirect():2375] Wrapping output streams.
20
+ 2025-07-20 15:44:37,744 INFO MainThread:2896 [wandb_run.py:_redirect():2398] Redirects installed.
21
+ 2025-07-20 15:44:37,750 INFO MainThread:2896 [wandb_init.py:init():1075] run started, returning control to user process
22
+ 2025-07-20 15:44:37,752 INFO MainThread:2896 [wandb_run.py:_config_callback():1363] config_cb None None {'vocab_size': 30522, 'max_position_embeddings': 512, 'sinusoidal_pos_embds': False, 'n_layers': 6, 'n_heads': 12, 'dim': 768, 'hidden_dim': 3072, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation': 'gelu', 'initializer_range': 0.02, 'qa_dropout': 0.1, 'seq_classif_dropout': 0.2, 'return_dict': True, 'output_hidden_states': False, 'torchscript': False, 'torch_dtype': 'float32', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['DistilBertForMaskedLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': 0, 'eos_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'distilbert-base-uncased', 'transformers_version': '4.53.2', 'model_type': 'distilbert', 'tie_weights_': True, 'output_attentions': False, 'output_dir': './model', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 
'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 16, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './logs', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 50, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './model', 'disable_tqdm': True, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'f1', 'greater_is_better': True, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 
'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': False}
23
+ 2025-07-20 15:44:37,756 INFO MainThread:2896 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 66955010 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x0000017E91750440>>
24
+ 2025-07-20 15:44:37,756 INFO MainThread:2896 [wandb_run.py:_config_callback():1363] config_cb model/num_parameters 66955010 None
25
+ 2025-07-20 15:51:06,119 INFO MsgRouterThr:2896 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
wandb/run-20250720_154435-9xqrzjdo/run-9xqrzjdo.wandb ADDED
Binary file (42.9 kB). View file
 
wandb/run-20250720_155338-0h3fksuy/files/config.yaml ADDED
@@ -0,0 +1,494 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _name_or_path:
2
+ value: distilbert-base-uncased
3
+ _wandb:
4
+ value:
5
+ cli_version: 0.21.0
6
+ e:
7
+ fshn6fq4d357dfamunx9x96y44pdzcc6:
8
+ codePath: main.py
9
+ codePathLocal: main.py
10
+ cpu_count: 4
11
+ cpu_count_logical: 8
12
+ cudaVersion: "12.7"
13
+ disk:
14
+ /:
15
+ total: "255230791680"
16
+ used: "233129451520"
17
+ email: shreshthkapai@gmail.com
18
+ executable: C:\Users\Legion\Miniconda3\envs\ML\python.exe
19
+ gpu: NVIDIA GeForce GTX 1650
20
+ gpu_count: 1
21
+ gpu_nvidia:
22
+ - architecture: Turing
23
+ cudaCores: 1024
24
+ memoryTotal: "4294967296"
25
+ name: NVIDIA GeForce GTX 1650
26
+ uuid: GPU-fbcd7647-fb67-66f5-b8c7-1a4198b7e4fa
27
+ host: DESKTOP-EIHJJJL
28
+ memory:
29
+ total: "8506298368"
30
+ os: Windows-11-10.0.26100-SP0
31
+ program: C:\Users\Legion\desktop\distilbert-sentiment\main.py
32
+ python: CPython 3.13.5
33
+ root: C:\Users\Legion\desktop\distilbert-sentiment
34
+ startedAt: "2025-07-20T10:23:38.923772Z"
35
+ writerId: fshn6fq4d357dfamunx9x96y44pdzcc6
36
+ m:
37
+ - "1": train/global_step
38
+ "6":
39
+ - 3
40
+ "7": []
41
+ - "2": '*'
42
+ "5": 1
43
+ "6":
44
+ - 1
45
+ "7": []
46
+ python_version: 3.13.5
47
+ t:
48
+ "1":
49
+ - 1
50
+ - 5
51
+ - 11
52
+ - 41
53
+ - 49
54
+ - 51
55
+ - 53
56
+ - 71
57
+ - 105
58
+ "2":
59
+ - 1
60
+ - 5
61
+ - 11
62
+ - 41
63
+ - 49
64
+ - 51
65
+ - 53
66
+ - 71
67
+ - 105
68
+ "3":
69
+ - 7
70
+ - 13
71
+ - 19
72
+ - 62
73
+ - 66
74
+ "4": 3.13.5
75
+ "5": 0.21.0
76
+ "6": 4.53.2
77
+ "8":
78
+ - 3
79
+ "9":
80
+ "1": transformers_trainer
81
+ "12": 0.21.0
82
+ "13": windows-amd64
83
+ accelerator_config:
84
+ value:
85
+ dispatch_batches: null
86
+ even_batches: true
87
+ gradient_accumulation_kwargs: null
88
+ non_blocking: false
89
+ split_batches: false
90
+ use_seedable_sampler: true
91
+ activation:
92
+ value: gelu
93
+ adafactor:
94
+ value: false
95
+ adam_beta1:
96
+ value: 0.9
97
+ adam_beta2:
98
+ value: 0.999
99
+ adam_epsilon:
100
+ value: 1e-08
101
+ add_cross_attention:
102
+ value: false
103
+ architectures:
104
+ value:
105
+ - DistilBertForMaskedLM
106
+ attention_dropout:
107
+ value: 0.1
108
+ auto_find_batch_size:
109
+ value: false
110
+ average_tokens_across_devices:
111
+ value: false
112
+ bad_words_ids:
113
+ value: null
114
+ batch_eval_metrics:
115
+ value: false
116
+ begin_suppress_tokens:
117
+ value: null
118
+ bf16:
119
+ value: false
120
+ bf16_full_eval:
121
+ value: false
122
+ bos_token_id:
123
+ value: null
124
+ chunk_size_feed_forward:
125
+ value: 0
126
+ cross_attention_hidden_size:
127
+ value: null
128
+ data_seed:
129
+ value: null
130
+ dataloader_drop_last:
131
+ value: false
132
+ dataloader_num_workers:
133
+ value: 0
134
+ dataloader_persistent_workers:
135
+ value: false
136
+ dataloader_pin_memory:
137
+ value: true
138
+ dataloader_prefetch_factor:
139
+ value: null
140
+ ddp_backend:
141
+ value: null
142
+ ddp_broadcast_buffers:
143
+ value: null
144
+ ddp_bucket_cap_mb:
145
+ value: null
146
+ ddp_find_unused_parameters:
147
+ value: null
148
+ ddp_timeout:
149
+ value: 1800
150
+ debug:
151
+ value: []
152
+ decoder_start_token_id:
153
+ value: null
154
+ deepspeed:
155
+ value: null
156
+ dim:
157
+ value: 768
158
+ disable_tqdm:
159
+ value: true
160
+ diversity_penalty:
161
+ value: 0
162
+ do_eval:
163
+ value: true
164
+ do_predict:
165
+ value: false
166
+ do_sample:
167
+ value: false
168
+ do_train:
169
+ value: false
170
+ dropout:
171
+ value: 0.1
172
+ early_stopping:
173
+ value: false
174
+ encoder_no_repeat_ngram_size:
175
+ value: 0
176
+ eos_token_id:
177
+ value: null
178
+ eval_accumulation_steps:
179
+ value: null
180
+ eval_delay:
181
+ value: 0
182
+ eval_do_concat_batches:
183
+ value: true
184
+ eval_on_start:
185
+ value: false
186
+ eval_steps:
187
+ value: null
188
+ eval_strategy:
189
+ value: epoch
190
+ eval_use_gather_object:
191
+ value: false
192
+ exponential_decay_length_penalty:
193
+ value: null
194
+ finetuning_task:
195
+ value: null
196
+ forced_bos_token_id:
197
+ value: null
198
+ forced_eos_token_id:
199
+ value: null
200
+ fp16:
201
+ value: false
202
+ fp16_backend:
203
+ value: auto
204
+ fp16_full_eval:
205
+ value: false
206
+ fp16_opt_level:
207
+ value: O1
208
+ fsdp:
209
+ value: []
210
+ fsdp_config:
211
+ value:
212
+ min_num_params: 0
213
+ xla: false
214
+ xla_fsdp_grad_ckpt: false
215
+ xla_fsdp_v2: false
216
+ fsdp_min_num_params:
217
+ value: 0
218
+ fsdp_transformer_layer_cls_to_wrap:
219
+ value: null
220
+ full_determinism:
221
+ value: false
222
+ gradient_accumulation_steps:
223
+ value: 2
224
+ gradient_checkpointing:
225
+ value: false
226
+ gradient_checkpointing_kwargs:
227
+ value: null
228
+ greater_is_better:
229
+ value: true
230
+ group_by_length:
231
+ value: false
232
+ half_precision_backend:
233
+ value: auto
234
+ hidden_dim:
235
+ value: 3072
236
+ hub_always_push:
237
+ value: false
238
+ hub_model_id:
239
+ value: null
240
+ hub_private_repo:
241
+ value: null
242
+ hub_revision:
243
+ value: null
244
+ hub_strategy:
245
+ value: every_save
246
+ hub_token:
247
+ value: <HUB_TOKEN>
248
+ id2label:
249
+ value:
250
+ "0": LABEL_0
251
+ "1": LABEL_1
252
+ ignore_data_skip:
253
+ value: false
254
+ include_for_metrics:
255
+ value: []
256
+ include_inputs_for_metrics:
257
+ value: false
258
+ include_num_input_tokens_seen:
259
+ value: false
260
+ include_tokens_per_second:
261
+ value: false
262
+ initializer_range:
263
+ value: 0.02
264
+ is_decoder:
265
+ value: false
266
+ is_encoder_decoder:
267
+ value: false
268
+ jit_mode_eval:
269
+ value: false
270
+ label_names:
271
+ value: null
272
+ label_smoothing_factor:
273
+ value: 0
274
+ label2id:
275
+ value:
276
+ LABEL_0: 0
277
+ LABEL_1: 1
278
+ learning_rate:
279
+ value: 5e-05
280
+ length_column_name:
281
+ value: length
282
+ length_penalty:
283
+ value: 1
284
+ liger_kernel_config:
285
+ value: null
286
+ load_best_model_at_end:
287
+ value: true
288
+ local_rank:
289
+ value: 0
290
+ log_level:
291
+ value: passive
292
+ log_level_replica:
293
+ value: warning
294
+ log_on_each_node:
295
+ value: true
296
+ logging_dir:
297
+ value: ./logs
298
+ logging_first_step:
299
+ value: false
300
+ logging_nan_inf_filter:
301
+ value: true
302
+ logging_steps:
303
+ value: 50
304
+ logging_strategy:
305
+ value: steps
306
+ lr_scheduler_type:
307
+ value: linear
308
+ max_grad_norm:
309
+ value: 1
310
+ max_length:
311
+ value: 20
312
+ max_position_embeddings:
313
+ value: 512
314
+ max_steps:
315
+ value: -1
316
+ metric_for_best_model:
317
+ value: f1
318
+ min_length:
319
+ value: 0
320
+ model/num_parameters:
321
+ value: 66955010
322
+ model_type:
323
+ value: distilbert
324
+ mp_parameters:
325
+ value: ""
326
+ n_heads:
327
+ value: 12
328
+ n_layers:
329
+ value: 6
330
+ neftune_noise_alpha:
331
+ value: null
332
+ no_cuda:
333
+ value: false
334
+ no_repeat_ngram_size:
335
+ value: 0
336
+ num_beam_groups:
337
+ value: 1
338
+ num_beams:
339
+ value: 1
340
+ num_return_sequences:
341
+ value: 1
342
+ num_train_epochs:
343
+ value: 3
344
+ optim:
345
+ value: adamw_torch
346
+ optim_args:
347
+ value: null
348
+ optim_target_modules:
349
+ value: null
350
+ output_attentions:
351
+ value: false
352
+ output_dir:
353
+ value: ./model
354
+ output_hidden_states:
355
+ value: false
356
+ output_scores:
357
+ value: false
358
+ overwrite_output_dir:
359
+ value: false
360
+ pad_token_id:
361
+ value: 0
362
+ past_index:
363
+ value: -1
364
+ per_device_eval_batch_size:
365
+ value: 4
366
+ per_device_train_batch_size:
367
+ value: 2
368
+ per_gpu_eval_batch_size:
369
+ value: null
370
+ per_gpu_train_batch_size:
371
+ value: null
372
+ prediction_loss_only:
373
+ value: false
374
+ prefix:
375
+ value: null
376
+ problem_type:
377
+ value: null
378
+ push_to_hub:
379
+ value: false
380
+ push_to_hub_model_id:
381
+ value: null
382
+ push_to_hub_organization:
383
+ value: null
384
+ push_to_hub_token:
385
+ value: <PUSH_TO_HUB_TOKEN>
386
+ qa_dropout:
387
+ value: 0.1
388
+ ray_scope:
389
+ value: last
390
+ remove_invalid_values:
391
+ value: false
392
+ remove_unused_columns:
393
+ value: true
394
+ repetition_penalty:
395
+ value: 1
396
+ report_to:
397
+ value:
398
+ - wandb
399
+ restore_callback_states_from_checkpoint:
400
+ value: false
401
+ resume_from_checkpoint:
402
+ value: null
403
+ return_dict:
404
+ value: true
405
+ return_dict_in_generate:
406
+ value: false
407
+ run_name:
408
+ value: ./model
409
+ save_on_each_node:
410
+ value: false
411
+ save_only_model:
412
+ value: false
413
+ save_safetensors:
414
+ value: true
415
+ save_steps:
416
+ value: 500
417
+ save_strategy:
418
+ value: epoch
419
+ save_total_limit:
420
+ value: null
421
+ seed:
422
+ value: 42
423
+ sep_token_id:
424
+ value: null
425
+ seq_classif_dropout:
426
+ value: 0.2
427
+ sinusoidal_pos_embds:
428
+ value: false
429
+ skip_memory_metrics:
430
+ value: true
431
+ suppress_tokens:
432
+ value: null
433
+ task_specific_params:
434
+ value: null
435
+ temperature:
436
+ value: 1
437
+ tf_legacy_loss:
438
+ value: false
439
+ tf32:
440
+ value: null
441
+ tie_encoder_decoder:
442
+ value: false
443
+ tie_weights_:
444
+ value: true
445
+ tie_word_embeddings:
446
+ value: true
447
+ tokenizer_class:
448
+ value: null
449
+ top_k:
450
+ value: 50
451
+ top_p:
452
+ value: 1
453
+ torch_compile:
454
+ value: false
455
+ torch_compile_backend:
456
+ value: null
457
+ torch_compile_mode:
458
+ value: null
459
+ torch_dtype:
460
+ value: float32
461
+ torch_empty_cache_steps:
462
+ value: null
463
+ torchdynamo:
464
+ value: null
465
+ torchscript:
466
+ value: false
467
+ tpu_metrics_debug:
468
+ value: false
469
+ tpu_num_cores:
470
+ value: null
471
+ transformers_version:
472
+ value: 4.53.2
473
+ typical_p:
474
+ value: 1
475
+ use_bfloat16:
476
+ value: false
477
+ use_cpu:
478
+ value: false
479
+ use_ipex:
480
+ value: false
481
+ use_legacy_prediction_loop:
482
+ value: false
483
+ use_liger_kernel:
484
+ value: false
485
+ use_mps_device:
486
+ value: false
487
+ vocab_size:
488
+ value: 30522
489
+ warmup_ratio:
490
+ value: 0
491
+ warmup_steps:
492
+ value: 0
493
+ weight_decay:
494
+ value: 0
wandb/run-20250720_155338-0h3fksuy/files/output.log ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {'loss': 0.6669, 'grad_norm': 2.2732439041137695, 'learning_rate': 4.9869333333333334e-05, 'epoch': 0.008}
2
+ {'loss': 0.5064, 'grad_norm': 1.3875774145126343, 'learning_rate': 4.9736000000000006e-05, 'epoch': 0.016}
3
+ {'loss': 0.5648, 'grad_norm': 8.71848201751709, 'learning_rate': 4.960266666666667e-05, 'epoch': 0.024}
4
+ {'loss': 0.5271, 'grad_norm': 12.96942138671875, 'learning_rate': 4.946933333333333e-05, 'epoch': 0.032}
5
+ {'loss': 0.5786, 'grad_norm': 10.856029510498047, 'learning_rate': 4.9336e-05, 'epoch': 0.04}
6
+ {'loss': 0.5966, 'grad_norm': 27.62953758239746, 'learning_rate': 4.920266666666667e-05, 'epoch': 0.048}
7
+ {'loss': 0.4653, 'grad_norm': 23.872642517089844, 'learning_rate': 4.9069333333333335e-05, 'epoch': 0.056}
8
+ {'loss': 0.4805, 'grad_norm': 0.23732933402061462, 'learning_rate': 4.893600000000001e-05, 'epoch': 0.064}
9
+ {'loss': 0.5354, 'grad_norm': 20.000877380371094, 'learning_rate': 4.8802666666666666e-05, 'epoch': 0.072}
10
+ {'loss': 0.5105, 'grad_norm': 53.1633415222168, 'learning_rate': 4.866933333333333e-05, 'epoch': 0.08}
11
+ {'loss': 0.6534, 'grad_norm': 13.990724563598633, 'learning_rate': 4.8536000000000004e-05, 'epoch': 0.088}
12
+ {'loss': 0.4999, 'grad_norm': 4.82359504699707, 'learning_rate': 4.840266666666667e-05, 'epoch': 0.096}
13
+ {'loss': 0.5484, 'grad_norm': 0.24129296839237213, 'learning_rate': 4.8269333333333336e-05, 'epoch': 0.104}
14
+ {'loss': 0.4312, 'grad_norm': 0.23947849869728088, 'learning_rate': 4.8136e-05, 'epoch': 0.112}
15
+ {'loss': 0.6295, 'grad_norm': 9.900238037109375, 'learning_rate': 4.800266666666667e-05, 'epoch': 0.12}
16
+ {'loss': 0.5561, 'grad_norm': 52.11263656616211, 'learning_rate': 4.786933333333334e-05, 'epoch': 0.128}
17
+ {'loss': 0.5402, 'grad_norm': 0.6357279419898987, 'learning_rate': 4.7736000000000005e-05, 'epoch': 0.136}
18
+ {'loss': 0.4374, 'grad_norm': 0.25493714213371277, 'learning_rate': 4.760266666666667e-05, 'epoch': 0.144}
19
+ {'loss': 0.693, 'grad_norm': 0.580426812171936, 'learning_rate': 4.7469333333333336e-05, 'epoch': 0.152}
20
+ {'loss': 0.4743, 'grad_norm': 26.836181640625, 'learning_rate': 4.7336e-05, 'epoch': 0.16}
21
+ {'loss': 0.5578, 'grad_norm': 66.35722351074219, 'learning_rate': 4.720266666666667e-05, 'epoch': 0.168}
22
+ {'loss': 0.4348, 'grad_norm': 29.878002166748047, 'learning_rate': 4.706933333333334e-05, 'epoch': 0.176}
23
+ {'loss': 0.5593, 'grad_norm': 5.273430824279785, 'learning_rate': 4.6936e-05, 'epoch': 0.184}
24
+ {'loss': 0.446, 'grad_norm': 113.90290069580078, 'learning_rate': 4.6802666666666665e-05, 'epoch': 0.192}
25
+ {'loss': 0.481, 'grad_norm': 6.638715744018555, 'learning_rate': 4.666933333333334e-05, 'epoch': 0.2}
26
+ {'loss': 0.5115, 'grad_norm': 46.855735778808594, 'learning_rate': 4.6536e-05, 'epoch': 0.208}
27
+ {'loss': 0.5586, 'grad_norm': 18.6956729888916, 'learning_rate': 4.640266666666667e-05, 'epoch': 0.216}
28
+ {'loss': 0.3987, 'grad_norm': 7.595647811889648, 'learning_rate': 4.6269333333333334e-05, 'epoch': 0.224}
29
+ {'loss': 0.482, 'grad_norm': 8.240407943725586, 'learning_rate': 4.6136e-05, 'epoch': 0.232}
30
+ {'loss': 0.3789, 'grad_norm': 0.290462851524353, 'learning_rate': 4.6002666666666666e-05, 'epoch': 0.24}
31
+ {'loss': 0.5113, 'grad_norm': 11.75820541381836, 'learning_rate': 4.586933333333334e-05, 'epoch': 0.248}
32
+ {'loss': 0.4607, 'grad_norm': 11.622576713562012, 'learning_rate': 4.5736000000000004e-05, 'epoch': 0.256}
33
+ {'loss': 0.4786, 'grad_norm': 9.230450630187988, 'learning_rate': 4.560266666666667e-05, 'epoch': 0.264}
34
+ {'loss': 0.4143, 'grad_norm': 0.15386007726192474, 'learning_rate': 4.5469333333333335e-05, 'epoch': 0.272}
35
+ {'loss': 0.4111, 'grad_norm': 5.873915672302246, 'learning_rate': 4.5336e-05, 'epoch': 0.28}
36
+ {'loss': 0.3835, 'grad_norm': 1.5295137166976929, 'learning_rate': 4.5202666666666673e-05, 'epoch': 0.288}
37
+ {'loss': 0.4735, 'grad_norm': 3.8919050693511963, 'learning_rate': 4.506933333333333e-05, 'epoch': 0.296}
38
+ {'loss': 0.4442, 'grad_norm': 1.7330166101455688, 'learning_rate': 4.4936e-05, 'epoch': 0.304}
39
+ {'loss': 0.3383, 'grad_norm': 4.891812324523926, 'learning_rate': 4.480266666666667e-05, 'epoch': 0.312}
40
+ {'loss': 0.5515, 'grad_norm': 91.70783233642578, 'learning_rate': 4.4669333333333336e-05, 'epoch': 0.32}
41
+ {'loss': 0.498, 'grad_norm': 19.019271850585938, 'learning_rate': 4.4536e-05, 'epoch': 0.328}
42
+ {'loss': 0.4775, 'grad_norm': 1.9273958206176758, 'learning_rate': 4.440266666666667e-05, 'epoch': 0.336}
43
+ {'loss': 0.5587, 'grad_norm': 0.28679159283638, 'learning_rate': 4.426933333333333e-05, 'epoch': 0.344}
44
+ {'loss': 0.2665, 'grad_norm': 30.908130645751953, 'learning_rate': 4.4136e-05, 'epoch': 0.352}
45
+ {'loss': 0.3657, 'grad_norm': 28.822193145751953, 'learning_rate': 4.400266666666667e-05, 'epoch': 0.36}
46
+ {'loss': 0.5237, 'grad_norm': 17.60547637939453, 'learning_rate': 4.386933333333334e-05, 'epoch': 0.368}
47
+ {'loss': 0.5005, 'grad_norm': 67.82170104980469, 'learning_rate': 4.3736e-05, 'epoch': 0.376}
48
+ {'loss': 0.5195, 'grad_norm': 0.14335760474205017, 'learning_rate': 4.360266666666667e-05, 'epoch': 0.384}
49
+ {'loss': 0.3884, 'grad_norm': 0.36686429381370544, 'learning_rate': 4.3469333333333334e-05, 'epoch': 0.392}
50
+ {'loss': 0.4424, 'grad_norm': 5.366738796234131, 'learning_rate': 4.3336000000000007e-05, 'epoch': 0.4}
51
+ {'loss': 0.5222, 'grad_norm': 20.56273651123047, 'learning_rate': 4.320266666666667e-05, 'epoch': 0.408}
52
+ {'loss': 0.6078, 'grad_norm': 5.502252578735352, 'learning_rate': 4.306933333333333e-05, 'epoch': 0.416}
53
+ {'loss': 0.5062, 'grad_norm': 16.406768798828125, 'learning_rate': 4.2936000000000004e-05, 'epoch': 0.424}
54
+ {'loss': 0.3554, 'grad_norm': 0.1537816971540451, 'learning_rate': 4.280266666666667e-05, 'epoch': 0.432}
55
+ {'loss': 0.4329, 'grad_norm': 35.78837966918945, 'learning_rate': 4.2669333333333335e-05, 'epoch': 0.44}
56
+ {'loss': 0.4557, 'grad_norm': 8.288016319274902, 'learning_rate': 4.2536e-05, 'epoch': 0.448}
57
+ {'loss': 0.4099, 'grad_norm': 0.16398730874061584, 'learning_rate': 4.2402666666666666e-05, 'epoch': 0.456}
58
+ {'loss': 0.5485, 'grad_norm': 1.4127204418182373, 'learning_rate': 4.226933333333333e-05, 'epoch': 0.464}
59
+ {'loss': 0.4307, 'grad_norm': 0.23355980217456818, 'learning_rate': 4.2136000000000005e-05, 'epoch': 0.472}
60
+ {'loss': 0.422, 'grad_norm': 22.04464340209961, 'learning_rate': 4.200266666666667e-05, 'epoch': 0.48}
61
+ {'loss': 0.3782, 'grad_norm': 0.1723032295703888, 'learning_rate': 4.1869333333333336e-05, 'epoch': 0.488}
62
+ {'loss': 0.5829, 'grad_norm': 8.341532707214355, 'learning_rate': 4.1736e-05, 'epoch': 0.496}
63
+ {'loss': 0.3045, 'grad_norm': 8.966949462890625, 'learning_rate': 4.160266666666667e-05, 'epoch': 0.504}
64
+ {'loss': 0.5763, 'grad_norm': 0.5718376636505127, 'learning_rate': 4.146933333333334e-05, 'epoch': 0.512}
65
+ {'loss': 0.4403, 'grad_norm': 26.9694881439209, 'learning_rate': 4.1336000000000005e-05, 'epoch': 0.52}
66
+ {'loss': 0.4884, 'grad_norm': 49.227210998535156, 'learning_rate': 4.1202666666666664e-05, 'epoch': 0.528}
67
+ {'loss': 0.4022, 'grad_norm': 11.67745304107666, 'learning_rate': 4.106933333333334e-05, 'epoch': 0.536}
68
+ {'loss': 0.5127, 'grad_norm': 35.11846160888672, 'learning_rate': 4.0936e-05, 'epoch': 0.544}
69
+ {'loss': 0.3214, 'grad_norm': 0.11048085242509842, 'learning_rate': 4.080266666666667e-05, 'epoch': 0.552}
70
+ {'loss': 0.4918, 'grad_norm': 0.13820067048072815, 'learning_rate': 4.0669333333333334e-05, 'epoch': 0.56}
71
+ {'loss': 0.4171, 'grad_norm': 0.25762712955474854, 'learning_rate': 4.0536e-05, 'epoch': 0.568}
72
+ {'loss': 0.3955, 'grad_norm': 7.20747709274292, 'learning_rate': 4.0402666666666665e-05, 'epoch': 0.576}
73
+ {'loss': 0.4939, 'grad_norm': 9.815940856933594, 'learning_rate': 4.026933333333334e-05, 'epoch': 0.584}
74
+ {'loss': 0.4533, 'grad_norm': 1.6333082914352417, 'learning_rate': 4.0136e-05, 'epoch': 0.592}
75
+ {'loss': 0.5392, 'grad_norm': 17.70346450805664, 'learning_rate': 4.000266666666667e-05, 'epoch': 0.6}
76
+ {'loss': 0.3454, 'grad_norm': 0.13321377336978912, 'learning_rate': 3.9869333333333335e-05, 'epoch': 0.608}
77
+ {'loss': 0.5585, 'grad_norm': 14.663485527038574, 'learning_rate': 3.9736e-05, 'epoch': 0.616}
78
+ {'loss': 0.4151, 'grad_norm': 19.313182830810547, 'learning_rate': 3.960266666666667e-05, 'epoch': 0.624}
79
+ {'loss': 0.4268, 'grad_norm': 1.0081754922866821, 'learning_rate': 3.946933333333334e-05, 'epoch': 0.632}
80
+ {'loss': 0.4791, 'grad_norm': 16.18073081970215, 'learning_rate': 3.9336e-05, 'epoch': 0.64}
81
+ {'loss': 0.3551, 'grad_norm': 13.099615097045898, 'learning_rate': 3.920266666666667e-05, 'epoch': 0.648}
82
+ {'loss': 0.4028, 'grad_norm': 0.2873060703277588, 'learning_rate': 3.9069333333333336e-05, 'epoch': 0.656}
83
+ {'loss': 0.4578, 'grad_norm': 6.123228073120117, 'learning_rate': 3.8936e-05, 'epoch': 0.664}
84
+ {'loss': 0.3384, 'grad_norm': 15.485557556152344, 'learning_rate': 3.8802666666666674e-05, 'epoch': 0.672}
85
+ {'loss': 0.4599, 'grad_norm': 0.3142613470554352, 'learning_rate': 3.866933333333333e-05, 'epoch': 0.68}
86
+ {'loss': 0.5153, 'grad_norm': 0.08679840713739395, 'learning_rate': 3.8536e-05, 'epoch': 0.688}
87
+ {'loss': 0.4358, 'grad_norm': 4.982065677642822, 'learning_rate': 3.840266666666667e-05, 'epoch': 0.696}
88
+ {'loss': 0.3591, 'grad_norm': 0.03635261580348015, 'learning_rate': 3.8269333333333336e-05, 'epoch': 0.704}
89
+ {'loss': 0.3954, 'grad_norm': 60.536617279052734, 'learning_rate': 3.8136e-05, 'epoch': 0.712}
90
+ {'loss': 0.5115, 'grad_norm': 8.195839881896973, 'learning_rate': 3.800266666666667e-05, 'epoch': 0.72}
91
+ {'loss': 0.5059, 'grad_norm': 6.579557418823242, 'learning_rate': 3.7869333333333334e-05, 'epoch': 0.728}
92
+ {'loss': 0.4229, 'grad_norm': 0.11510950326919556, 'learning_rate': 3.7736e-05, 'epoch': 0.736}
93
+ {'loss': 0.3961, 'grad_norm': 77.99903869628906, 'learning_rate': 3.760266666666667e-05, 'epoch': 0.744}
94
+ {'loss': 0.4849, 'grad_norm': 0.16879667341709137, 'learning_rate': 3.746933333333334e-05, 'epoch': 0.752}
95
+ {'loss': 0.5883, 'grad_norm': 0.2887319326400757, 'learning_rate': 3.7336e-05, 'epoch': 0.76}
96
+ {'loss': 0.5342, 'grad_norm': 7.673031330108643, 'learning_rate': 3.720266666666667e-05, 'epoch': 0.768}
97
+ {'loss': 0.3816, 'grad_norm': 0.26005497574806213, 'learning_rate': 3.7069333333333334e-05, 'epoch': 0.776}
98
+ {'loss': 0.4011, 'grad_norm': 0.15825381875038147, 'learning_rate': 3.693600000000001e-05, 'epoch': 0.784}
99
+ {'loss': 0.4041, 'grad_norm': 9.251302719116211, 'learning_rate': 3.6802666666666666e-05, 'epoch': 0.792}
100
+ {'loss': 0.4039, 'grad_norm': 0.3029502332210541, 'learning_rate': 3.666933333333333e-05, 'epoch': 0.8}
101
+ {'loss': 0.4208, 'grad_norm': 12.424095153808594, 'learning_rate': 3.6536000000000004e-05, 'epoch': 0.808}
102
+ {'loss': 0.3997, 'grad_norm': 0.30793556571006775, 'learning_rate': 3.640266666666667e-05, 'epoch': 0.816}
103
+ {'loss': 0.3738, 'grad_norm': 35.160221099853516, 'learning_rate': 3.6269333333333335e-05, 'epoch': 0.824}
104
+ {'loss': 0.3587, 'grad_norm': 7.636879920959473, 'learning_rate': 3.6136e-05, 'epoch': 0.832}
105
+ {'loss': 0.3555, 'grad_norm': 25.151350021362305, 'learning_rate': 3.600266666666667e-05, 'epoch': 0.84}
106
+ {'loss': 0.3044, 'grad_norm': 0.08254121243953705, 'learning_rate': 3.586933333333333e-05, 'epoch': 0.848}
107
+ {'loss': 0.4428, 'grad_norm': 0.3885553777217865, 'learning_rate': 3.5736000000000005e-05, 'epoch': 0.856}
108
+ {'loss': 0.4558, 'grad_norm': 7.154929161071777, 'learning_rate': 3.560266666666667e-05, 'epoch': 0.864}
109
+ {'loss': 0.4543, 'grad_norm': 21.201276779174805, 'learning_rate': 3.5469333333333336e-05, 'epoch': 0.872}
110
+ {'loss': 0.4821, 'grad_norm': 24.018428802490234, 'learning_rate': 3.5336e-05, 'epoch': 0.88}
111
+ {'loss': 0.2944, 'grad_norm': 0.04061014950275421, 'learning_rate': 3.520266666666667e-05, 'epoch': 0.888}
112
+ {'loss': 0.5307, 'grad_norm': 0.3313358426094055, 'learning_rate': 3.506933333333334e-05, 'epoch': 0.896}
113
+ {'loss': 0.4825, 'grad_norm': 0.49799197912216187, 'learning_rate': 3.4936e-05, 'epoch': 0.904}
114
+ {'loss': 0.3783, 'grad_norm': 7.0047712326049805, 'learning_rate': 3.4802666666666665e-05, 'epoch': 0.912}
115
+ {'loss': 0.3574, 'grad_norm': 0.4024333953857422, 'learning_rate': 3.466933333333334e-05, 'epoch': 0.92}
116
+ {'loss': 0.4618, 'grad_norm': 0.1282418966293335, 'learning_rate': 3.4536e-05, 'epoch': 0.928}
117
+ {'loss': 0.2572, 'grad_norm': 7.137022018432617, 'learning_rate': 3.440266666666667e-05, 'epoch': 0.936}
118
+ {'loss': 0.3708, 'grad_norm': 14.057695388793945, 'learning_rate': 3.4269333333333334e-05, 'epoch': 0.944}
119
+ {'loss': 0.3674, 'grad_norm': 0.06444621086120605, 'learning_rate': 3.4136e-05, 'epoch': 0.952}
120
+ {'loss': 0.3021, 'grad_norm': 6.360774993896484, 'learning_rate': 3.4002666666666665e-05, 'epoch': 0.96}
121
+ {'loss': 0.5396, 'grad_norm': 6.526275157928467, 'learning_rate': 3.386933333333334e-05, 'epoch': 0.968}
122
+ {'loss': 0.3749, 'grad_norm': 1.6904962062835693, 'learning_rate': 3.3736000000000004e-05, 'epoch': 0.976}
123
+ {'loss': 0.4539, 'grad_norm': 0.31892621517181396, 'learning_rate': 3.360266666666666e-05, 'epoch': 0.984}
124
+ {'loss': 0.4297, 'grad_norm': 0.15581363439559937, 'learning_rate': 3.3469333333333335e-05, 'epoch': 0.992}
125
+ {'loss': 0.347, 'grad_norm': 23.677379608154297, 'learning_rate': 3.3336e-05, 'epoch': 1.0}
126
+ {'eval_loss': 0.4047098457813263, 'eval_accuracy': 0.8872, 'eval_f1': 0.8824412206103052, 'eval_precision': 0.9213091922005571, 'eval_recall': 0.84672, 'eval_runtime': 387.6575, 'eval_samples_per_second': 64.49, 'eval_steps_per_second': 16.122, 'epoch': 1.0}
127
+ {'loss': 0.2086, 'grad_norm': 6.972434043884277, 'learning_rate': 3.320266666666667e-05, 'epoch': 1.008}
128
+ {'loss': 0.2871, 'grad_norm': 19.224170684814453, 'learning_rate': 3.306933333333334e-05, 'epoch': 1.016}
129
+ {'loss': 0.3594, 'grad_norm': 18.384265899658203, 'learning_rate': 3.2936e-05, 'epoch': 1.024}
130
+ {'loss': 0.2588, 'grad_norm': 0.04717381298542023, 'learning_rate': 3.280266666666667e-05, 'epoch': 1.032}
131
+ {'loss': 0.2305, 'grad_norm': 0.06328094005584717, 'learning_rate': 3.2669333333333336e-05, 'epoch': 1.04}
132
+ {'loss': 0.2759, 'grad_norm': 19.435848236083984, 'learning_rate': 3.2536e-05, 'epoch': 1.048}
133
+ {'loss': 0.31, 'grad_norm': 23.59215545654297, 'learning_rate': 3.240266666666667e-05, 'epoch': 1.056}
134
+ {'loss': 0.2949, 'grad_norm': 0.08341117948293686, 'learning_rate': 3.226933333333333e-05, 'epoch': 1.064}
135
+ {'loss': 0.3302, 'grad_norm': 6.222505569458008, 'learning_rate': 3.2136e-05, 'epoch': 1.072}
136
+ {'loss': 0.2555, 'grad_norm': 0.12312066555023193, 'learning_rate': 3.200266666666667e-05, 'epoch': 1.08}
137
+ {'loss': 0.2143, 'grad_norm': 0.026080487295985222, 'learning_rate': 3.186933333333334e-05, 'epoch': 1.088}
138
+ {'loss': 0.2562, 'grad_norm': 0.078591488301754, 'learning_rate': 3.1736e-05, 'epoch': 1.096}
139
+ {'loss': 0.2386, 'grad_norm': 4.485652923583984, 'learning_rate': 3.160266666666667e-05, 'epoch': 1.104}
140
+ {'loss': 0.2672, 'grad_norm': 0.03333387151360512, 'learning_rate': 3.1469333333333334e-05, 'epoch': 1.112}
141
+ {'loss': 0.3072, 'grad_norm': 6.5600905418396, 'learning_rate': 3.1336000000000006e-05, 'epoch': 1.12}
142
+ {'loss': 0.1519, 'grad_norm': 0.8171893954277039, 'learning_rate': 3.120266666666667e-05, 'epoch': 1.1280000000000001}
143
+ {'loss': 0.201, 'grad_norm': 0.036586660891771317, 'learning_rate': 3.106933333333333e-05, 'epoch': 1.1360000000000001}
144
+ {'loss': 0.1602, 'grad_norm': 0.021485593169927597, 'learning_rate': 3.0936e-05, 'epoch': 1.144}
145
+ {'loss': 0.3508, 'grad_norm': 0.039208535104990005, 'learning_rate': 3.080266666666667e-05, 'epoch': 1.152}
146
+ {'loss': 0.2754, 'grad_norm': 0.05171596258878708, 'learning_rate': 3.0669333333333335e-05, 'epoch': 1.16}
147
+ {'loss': 0.2923, 'grad_norm': 0.48278653621673584, 'learning_rate': 3.0536e-05, 'epoch': 1.168}
148
+ {'loss': 0.2007, 'grad_norm': 0.03526414930820465, 'learning_rate': 3.040266666666667e-05, 'epoch': 1.176}
149
+ {'loss': 0.2515, 'grad_norm': 0.06429073214530945, 'learning_rate': 3.0269333333333332e-05, 'epoch': 1.184}
150
+ {'loss': 0.4943, 'grad_norm': 0.37491822242736816, 'learning_rate': 3.0136000000000004e-05, 'epoch': 1.192}
151
+ {'loss': 0.3227, 'grad_norm': 0.2309693992137909, 'learning_rate': 3.0002666666666666e-05, 'epoch': 1.2}
152
+ {'loss': 0.2738, 'grad_norm': 0.12286447733640671, 'learning_rate': 2.9869333333333332e-05, 'epoch': 1.208}
153
+ {'loss': 0.2563, 'grad_norm': 0.04936458542943001, 'learning_rate': 2.9736e-05, 'epoch': 1.216}
154
+ {'loss': 0.3207, 'grad_norm': 0.4713517725467682, 'learning_rate': 2.9602666666666667e-05, 'epoch': 1.224}
155
+ {'loss': 0.2795, 'grad_norm': 5.336559295654297, 'learning_rate': 2.9469333333333333e-05, 'epoch': 1.232}
156
+ {'loss': 0.3249, 'grad_norm': 1.1011492013931274, 'learning_rate': 2.9336000000000002e-05, 'epoch': 1.24}
157
+ {'loss': 0.3201, 'grad_norm': 8.649012565612793, 'learning_rate': 2.9202666666666667e-05, 'epoch': 1.248}
158
+ {'loss': 0.2676, 'grad_norm': 0.059582602232694626, 'learning_rate': 2.9069333333333336e-05, 'epoch': 1.256}
159
+ {'loss': 0.1626, 'grad_norm': 0.08991962671279907, 'learning_rate': 2.8936000000000002e-05, 'epoch': 1.264}
160
+ {'loss': 0.1377, 'grad_norm': 0.03133632242679596, 'learning_rate': 2.8802666666666668e-05, 'epoch': 1.272}
161
+ {'loss': 0.3352, 'grad_norm': 0.08367053419351578, 'learning_rate': 2.8669333333333337e-05, 'epoch': 1.28}
162
+ {'loss': 0.2317, 'grad_norm': 0.022324958816170692, 'learning_rate': 2.8536000000000003e-05, 'epoch': 1.288}
163
+ {'loss': 0.2879, 'grad_norm': 0.0481320321559906, 'learning_rate': 2.8402666666666665e-05, 'epoch': 1.296}
164
+ {'loss': 0.3309, 'grad_norm': 0.05768590420484543, 'learning_rate': 2.8269333333333337e-05, 'epoch': 1.304}
165
+ {'loss': 0.3335, 'grad_norm': 0.3378739356994629, 'learning_rate': 2.8136e-05, 'epoch': 1.312}
166
+ {'loss': 0.1505, 'grad_norm': 0.04841599985957146, 'learning_rate': 2.8002666666666665e-05, 'epoch': 1.32}
167
+ {'loss': 0.3089, 'grad_norm': 0.0761469379067421, 'learning_rate': 2.7869333333333338e-05, 'epoch': 1.328}
168
+ {'loss': 0.3643, 'grad_norm': 0.7006823420524597, 'learning_rate': 2.7736e-05, 'epoch': 1.336}
169
+ {'loss': 0.2356, 'grad_norm': 12.694981575012207, 'learning_rate': 2.7602666666666666e-05, 'epoch': 1.3439999999999999}
170
+ {'loss': 0.3755, 'grad_norm': 8.449514389038086, 'learning_rate': 2.7469333333333335e-05, 'epoch': 1.3519999999999999}
171
+ {'loss': 0.1797, 'grad_norm': 0.21266134083271027, 'learning_rate': 2.7336e-05, 'epoch': 1.3599999999999999}
172
+ {'loss': 0.2732, 'grad_norm': 0.2098928540945053, 'learning_rate': 2.720266666666667e-05, 'epoch': 1.3679999999999999}
173
+ {'loss': 0.2037, 'grad_norm': 0.09150354564189911, 'learning_rate': 2.7069333333333335e-05, 'epoch': 1.376}
174
+ {'loss': 0.2829, 'grad_norm': 0.0541178435087204, 'learning_rate': 2.6936e-05, 'epoch': 1.384}
175
+ {'loss': 0.3053, 'grad_norm': 0.387103408575058, 'learning_rate': 2.680266666666667e-05, 'epoch': 1.392}
176
+ {'loss': 0.3136, 'grad_norm': 0.058676812797784805, 'learning_rate': 2.6669333333333336e-05, 'epoch': 1.4}
177
+ {'loss': 0.2385, 'grad_norm': 0.05689304694533348, 'learning_rate': 2.6536e-05, 'epoch': 1.408}
178
+ {'loss': 0.2023, 'grad_norm': 0.05833113566040993, 'learning_rate': 2.640266666666667e-05, 'epoch': 1.416}
179
+ {'loss': 0.1863, 'grad_norm': 0.5199909806251526, 'learning_rate': 2.6269333333333336e-05, 'epoch': 1.424}
180
+ {'loss': 0.1431, 'grad_norm': 0.0910777747631073, 'learning_rate': 2.6136e-05, 'epoch': 1.432}
181
+ {'loss': 0.2813, 'grad_norm': 0.19901159405708313, 'learning_rate': 2.600266666666667e-05, 'epoch': 1.44}
182
+ {'loss': 0.2673, 'grad_norm': 20.136131286621094, 'learning_rate': 2.5869333333333333e-05, 'epoch': 1.448}
183
+ {'loss': 0.2245, 'grad_norm': 78.39840698242188, 'learning_rate': 2.5736e-05, 'epoch': 1.456}
184
+ {'loss': 0.3301, 'grad_norm': 27.54892921447754, 'learning_rate': 2.5602666666666668e-05, 'epoch': 1.464}
185
+ {'loss': 0.255, 'grad_norm': 0.4987935423851013, 'learning_rate': 2.5469333333333334e-05, 'epoch': 1.472}
186
+ {'loss': 0.2217, 'grad_norm': 0.08215348422527313, 'learning_rate': 2.5336e-05, 'epoch': 1.48}
187
+ {'loss': 0.3774, 'grad_norm': 6.633873462677002, 'learning_rate': 2.520266666666667e-05, 'epoch': 1.488}
188
+ {'loss': 0.2809, 'grad_norm': 0.473483145236969, 'learning_rate': 2.5069333333333334e-05, 'epoch': 1.496}
189
+ {'loss': 0.1802, 'grad_norm': 0.1088651642203331, 'learning_rate': 2.4936e-05, 'epoch': 1.504}
190
+ {'loss': 0.3397, 'grad_norm': 0.15446412563323975, 'learning_rate': 2.480266666666667e-05, 'epoch': 1.512}
191
+ {'loss': 0.2506, 'grad_norm': 0.13606055080890656, 'learning_rate': 2.4669333333333335e-05, 'epoch': 1.52}
192
+ {'loss': 0.2989, 'grad_norm': 0.12229656428098679, 'learning_rate': 2.4536e-05, 'epoch': 1.528}
193
+ {'loss': 0.175, 'grad_norm': 0.09148360043764114, 'learning_rate': 2.440266666666667e-05, 'epoch': 1.536}
194
+ {'loss': 0.3552, 'grad_norm': 0.07437633723020554, 'learning_rate': 2.4269333333333335e-05, 'epoch': 1.544}
195
+ {'loss': 0.2242, 'grad_norm': 98.57760620117188, 'learning_rate': 2.4136e-05, 'epoch': 1.552}
196
+ {'loss': 0.2344, 'grad_norm': 0.24384742975234985, 'learning_rate': 2.4002666666666666e-05, 'epoch': 1.56}
197
+ {'loss': 0.2868, 'grad_norm': 0.06279865652322769, 'learning_rate': 2.3869333333333335e-05, 'epoch': 1.568}
198
+ {'loss': 0.2874, 'grad_norm': 0.1516159474849701, 'learning_rate': 2.3736e-05, 'epoch': 1.576}
199
+ {'loss': 0.1706, 'grad_norm': 0.02717330865561962, 'learning_rate': 2.3602666666666667e-05, 'epoch': 1.584}
200
+ {'loss': 0.3318, 'grad_norm': 2.2730720043182373, 'learning_rate': 2.3469333333333336e-05, 'epoch': 1.592}
201
+ {'loss': 0.2772, 'grad_norm': 0.027159368619322777, 'learning_rate': 2.3336e-05, 'epoch': 1.6}
202
+ {'loss': 0.2545, 'grad_norm': 0.44568705558776855, 'learning_rate': 2.3202666666666667e-05, 'epoch': 1.608}
203
+ {'loss': 0.3444, 'grad_norm': 17.193021774291992, 'learning_rate': 2.3069333333333333e-05, 'epoch': 1.616}
204
+ {'loss': 0.1768, 'grad_norm': 0.15403099358081818, 'learning_rate': 2.2936000000000002e-05, 'epoch': 1.624}
205
+ {'loss': 0.1226, 'grad_norm': 251.0621337890625, 'learning_rate': 2.2802666666666668e-05, 'epoch': 1.6320000000000001}
206
+ {'loss': 0.2795, 'grad_norm': 25.017301559448242, 'learning_rate': 2.2669333333333333e-05, 'epoch': 1.6400000000000001}
207
+ {'loss': 0.3253, 'grad_norm': 49.36235427856445, 'learning_rate': 2.2536000000000002e-05, 'epoch': 1.6480000000000001}
208
+ {'loss': 0.3206, 'grad_norm': 0.045104943215847015, 'learning_rate': 2.2402666666666668e-05, 'epoch': 1.6560000000000001}
209
+ {'loss': 0.2719, 'grad_norm': 0.21639679372310638, 'learning_rate': 2.2269333333333334e-05, 'epoch': 1.6640000000000001}
210
+ {'loss': 0.3777, 'grad_norm': 0.08187518268823624, 'learning_rate': 2.2136000000000003e-05, 'epoch': 1.6720000000000002}
211
+ {'loss': 0.2435, 'grad_norm': 0.08419207483530045, 'learning_rate': 2.200266666666667e-05, 'epoch': 1.6800000000000002}
212
+ {'loss': 0.2798, 'grad_norm': 32.25635528564453, 'learning_rate': 2.1869333333333334e-05, 'epoch': 1.688}
213
+ {'loss': 0.2435, 'grad_norm': 0.03352827951312065, 'learning_rate': 2.1736e-05, 'epoch': 1.696}
214
+ {'loss': 0.2896, 'grad_norm': 0.11488524079322815, 'learning_rate': 2.160266666666667e-05, 'epoch': 1.704}
215
+ {'loss': 0.137, 'grad_norm': 0.9820640087127686, 'learning_rate': 2.1469333333333335e-05, 'epoch': 1.712}
216
+ {'loss': 0.2503, 'grad_norm': 0.0872233659029007, 'learning_rate': 2.1336e-05, 'epoch': 1.72}
217
+ {'loss': 0.331, 'grad_norm': 0.07821047306060791, 'learning_rate': 2.120266666666667e-05, 'epoch': 1.728}
218
+ {'loss': 0.2292, 'grad_norm': 13.81276798248291, 'learning_rate': 2.1069333333333335e-05, 'epoch': 1.736}
219
+ {'loss': 0.2239, 'grad_norm': 14.37901782989502, 'learning_rate': 2.0936e-05, 'epoch': 1.744}
220
+ {'loss': 0.2351, 'grad_norm': 0.09311486035585403, 'learning_rate': 2.0802666666666666e-05, 'epoch': 1.752}
221
+ {'loss': 0.2493, 'grad_norm': 0.04642907530069351, 'learning_rate': 2.0669333333333336e-05, 'epoch': 1.76}
222
+ {'loss': 0.2468, 'grad_norm': 187.65907287597656, 'learning_rate': 2.0536e-05, 'epoch': 1.768}
223
+ {'loss': 0.2195, 'grad_norm': 0.3666624128818512, 'learning_rate': 2.0402666666666667e-05, 'epoch': 1.776}
224
+ {'loss': 0.2567, 'grad_norm': 28.931724548339844, 'learning_rate': 2.0269333333333336e-05, 'epoch': 1.784}
225
+ {'loss': 0.2707, 'grad_norm': 110.09719848632812, 'learning_rate': 2.0136e-05, 'epoch': 1.792}
226
+ {'loss': 0.2216, 'grad_norm': 0.025822747498750687, 'learning_rate': 2.0002666666666667e-05, 'epoch': 1.8}
227
+ {'loss': 0.165, 'grad_norm': 21.93601417541504, 'learning_rate': 1.9869333333333333e-05, 'epoch': 1.808}
228
+ {'loss': 0.2316, 'grad_norm': 0.23445022106170654, 'learning_rate': 1.9736000000000002e-05, 'epoch': 1.8159999999999998}
229
+ {'loss': 0.3018, 'grad_norm': 24.560941696166992, 'learning_rate': 1.9602666666666668e-05, 'epoch': 1.8239999999999998}
230
+ {'loss': 0.1176, 'grad_norm': 0.01924316957592964, 'learning_rate': 1.9469333333333333e-05, 'epoch': 1.8319999999999999}
231
+ {'loss': 0.3031, 'grad_norm': 0.3726535439491272, 'learning_rate': 1.9336000000000003e-05, 'epoch': 1.8399999999999999}
232
+ {'loss': 0.2523, 'grad_norm': 0.10653215646743774, 'learning_rate': 1.9202666666666668e-05, 'epoch': 1.8479999999999999}
233
+ {'loss': 0.243, 'grad_norm': 0.07101219147443771, 'learning_rate': 1.9069333333333334e-05, 'epoch': 1.8559999999999999}
234
+ {'loss': 0.2008, 'grad_norm': 0.12322711199522018, 'learning_rate': 1.8936e-05, 'epoch': 1.8639999999999999}
235
+ {'loss': 0.2249, 'grad_norm': 0.09139817208051682, 'learning_rate': 1.880266666666667e-05, 'epoch': 1.8719999999999999}
236
+ {'loss': 0.2285, 'grad_norm': 31.605588912963867, 'learning_rate': 1.8669333333333334e-05, 'epoch': 1.88}
237
+ {'loss': 0.308, 'grad_norm': 0.2888055145740509, 'learning_rate': 1.8536e-05, 'epoch': 1.888}
238
+ {'loss': 0.2119, 'grad_norm': 0.16984781622886658, 'learning_rate': 1.840266666666667e-05, 'epoch': 1.896}
239
+ {'loss': 0.1807, 'grad_norm': 0.018442299216985703, 'learning_rate': 1.8269333333333335e-05, 'epoch': 1.904}
240
+ {'loss': 0.2367, 'grad_norm': 0.05777069553732872, 'learning_rate': 1.8136e-05, 'epoch': 1.912}
241
+ {'loss': 0.1747, 'grad_norm': 0.06527545303106308, 'learning_rate': 1.8002666666666666e-05, 'epoch': 1.92}
242
+ {'loss': 0.3092, 'grad_norm': 0.0599406436085701, 'learning_rate': 1.7869333333333335e-05, 'epoch': 1.928}
243
+ {'loss': 0.3103, 'grad_norm': 113.66268157958984, 'learning_rate': 1.7736e-05, 'epoch': 1.936}
244
+ {'loss': 0.2114, 'grad_norm': 0.2484273612499237, 'learning_rate': 1.7602666666666667e-05, 'epoch': 1.944}
245
+ {'loss': 0.2138, 'grad_norm': 0.0685097873210907, 'learning_rate': 1.7469333333333336e-05, 'epoch': 1.952}
246
+ {'loss': 0.178, 'grad_norm': 0.08626335859298706, 'learning_rate': 1.7335999999999998e-05, 'epoch': 1.96}
247
+ {'loss': 0.3075, 'grad_norm': 0.18472443521022797, 'learning_rate': 1.7202666666666667e-05, 'epoch': 1.968}
248
+ {'loss': 0.2595, 'grad_norm': 0.09902197122573853, 'learning_rate': 1.7069333333333333e-05, 'epoch': 1.976}
249
+ {'loss': 0.3426, 'grad_norm': 0.10281559079885483, 'learning_rate': 1.6936000000000002e-05, 'epoch': 1.984}
250
+ {'loss': 0.3031, 'grad_norm': 90.46196746826172, 'learning_rate': 1.6802666666666668e-05, 'epoch': 1.992}
251
+ {'loss': 0.2931, 'grad_norm': 0.13644857704639435, 'learning_rate': 1.6669333333333333e-05, 'epoch': 2.0}
252
+ {'eval_loss': 0.4506886303424835, 'eval_accuracy': 0.89908, 'eval_f1': 0.8966195451751691, 'eval_precision': 0.9190256194876103, 'eval_recall': 0.87528, 'eval_runtime': 393.988, 'eval_samples_per_second': 63.454, 'eval_steps_per_second': 15.863, 'epoch': 2.0}
253
+ {'loss': 0.1133, 'grad_norm': 0.19643454253673553, 'learning_rate': 1.6536000000000002e-05, 'epoch': 2.008}
254
+ {'loss': 0.0441, 'grad_norm': 0.020006030797958374, 'learning_rate': 1.6402666666666665e-05, 'epoch': 2.016}
255
+ {'loss': 0.0669, 'grad_norm': 0.017264680936932564, 'learning_rate': 1.6269333333333334e-05, 'epoch': 2.024}
256
+ {'loss': 0.0532, 'grad_norm': 0.061523064970970154, 'learning_rate': 1.6136000000000003e-05, 'epoch': 2.032}
257
+ {'loss': 0.0882, 'grad_norm': 0.009066939353942871, 'learning_rate': 1.600266666666667e-05, 'epoch': 2.04}
258
+ {'loss': 0.1001, 'grad_norm': 0.03136083111166954, 'learning_rate': 1.5869333333333334e-05, 'epoch': 2.048}
259
+ {'loss': 0.138, 'grad_norm': 0.008202377706766129, 'learning_rate': 1.5736e-05, 'epoch': 2.056}
260
+ {'loss': 0.0569, 'grad_norm': 0.07132015377283096, 'learning_rate': 1.560266666666667e-05, 'epoch': 2.064}
261
+ {'loss': 0.1, 'grad_norm': 0.18235626816749573, 'learning_rate': 1.5469333333333335e-05, 'epoch': 2.072}
262
+ {'loss': 0.0579, 'grad_norm': 0.008501987904310226, 'learning_rate': 1.5336e-05, 'epoch': 2.08}
263
+ {'loss': 0.1893, 'grad_norm': 0.017202647402882576, 'learning_rate': 1.5202666666666668e-05, 'epoch': 2.088}
264
+ {'loss': 0.1071, 'grad_norm': 0.04670681431889534, 'learning_rate': 1.5069333333333335e-05, 'epoch': 2.096}
265
+ {'loss': 0.0846, 'grad_norm': 0.013939165510237217, 'learning_rate': 1.4936e-05, 'epoch': 2.104}
266
+ {'loss': 0.0508, 'grad_norm': 4.487010478973389, 'learning_rate': 1.4802666666666668e-05, 'epoch': 2.112}
267
+ {'loss': 0.166, 'grad_norm': 0.014982378110289574, 'learning_rate': 1.4669333333333335e-05, 'epoch': 2.12}
268
+ {'loss': 0.0941, 'grad_norm': 0.03977168723940849, 'learning_rate': 1.4536e-05, 'epoch': 2.128}
269
+ {'loss': 0.138, 'grad_norm': 0.01852828450500965, 'learning_rate': 1.4402666666666667e-05, 'epoch': 2.136}
270
+ {'loss': 0.0893, 'grad_norm': 0.018985146656632423, 'learning_rate': 1.4269333333333334e-05, 'epoch': 2.144}
271
+ {'loss': 0.0016, 'grad_norm': 0.010966133326292038, 'learning_rate': 1.4136000000000002e-05, 'epoch': 2.152}
272
+ {'loss': 0.026, 'grad_norm': 0.02055787853896618, 'learning_rate': 1.4002666666666667e-05, 'epoch': 2.16}
273
+ {'loss': 0.1055, 'grad_norm': 0.021019885316491127, 'learning_rate': 1.3869333333333335e-05, 'epoch': 2.168}
274
+ {'loss': 0.1479, 'grad_norm': 0.06946071982383728, 'learning_rate': 1.3736000000000002e-05, 'epoch': 2.176}
275
+ {'loss': 0.0808, 'grad_norm': 0.014382677152752876, 'learning_rate': 1.3602666666666666e-05, 'epoch': 2.184}
276
+ {'loss': 0.1624, 'grad_norm': 0.02976427599787712, 'learning_rate': 1.3469333333333333e-05, 'epoch': 2.192}
277
+ {'loss': 0.1299, 'grad_norm': 0.11172953248023987, 'learning_rate': 1.3336e-05, 'epoch': 2.2}
278
+ {'loss': 0.0482, 'grad_norm': 0.08020364493131638, 'learning_rate': 1.3202666666666666e-05, 'epoch': 2.208}
279
+ {'loss': 0.0694, 'grad_norm': 0.013661920092999935, 'learning_rate': 1.3069333333333334e-05, 'epoch': 2.216}
280
+ {'loss': 0.1619, 'grad_norm': 0.02413208782672882, 'learning_rate': 1.2936000000000001e-05, 'epoch': 2.224}
281
+ {'loss': 0.1237, 'grad_norm': 0.007472939323633909, 'learning_rate': 1.2802666666666669e-05, 'epoch': 2.232}
282
+ {'loss': 0.0676, 'grad_norm': 0.02983078546822071, 'learning_rate': 1.2669333333333333e-05, 'epoch': 2.24}
283
+ {'loss': 0.0983, 'grad_norm': 0.04998508095741272, 'learning_rate': 1.2536e-05, 'epoch': 2.248}
284
+ {'loss': 0.1647, 'grad_norm': 13.296645164489746, 'learning_rate': 1.2402666666666667e-05, 'epoch': 2.2560000000000002}
285
+ {'loss': 0.0834, 'grad_norm': 0.016014471650123596, 'learning_rate': 1.2269333333333335e-05, 'epoch': 2.2640000000000002}
286
+ {'loss': 0.1467, 'grad_norm': 0.14326101541519165, 'learning_rate': 1.2136e-05, 'epoch': 2.2720000000000002}
287
+ {'loss': 0.0136, 'grad_norm': 0.014358256943523884, 'learning_rate': 1.2002666666666668e-05, 'epoch': 2.2800000000000002}
288
+ {'loss': 0.2312, 'grad_norm': 0.03325853496789932, 'learning_rate': 1.1869333333333333e-05, 'epoch': 2.288}
289
+ {'loss': 0.0823, 'grad_norm': 0.054809004068374634, 'learning_rate': 1.1736e-05, 'epoch': 2.296}
290
+ {'loss': 0.2533, 'grad_norm': 0.02338593825697899, 'learning_rate': 1.1602666666666666e-05, 'epoch': 2.304}
291
+ {'loss': 0.0905, 'grad_norm': 0.024055376648902893, 'learning_rate': 1.1469333333333334e-05, 'epoch': 2.312}
292
+ {'loss': 0.1688, 'grad_norm': 26.65433120727539, 'learning_rate': 1.1336000000000001e-05, 'epoch': 2.32}
293
+ {'loss': 0.1274, 'grad_norm': 0.05946606397628784, 'learning_rate': 1.1202666666666667e-05, 'epoch': 2.328}
294
+ {'loss': 0.0922, 'grad_norm': 0.018317028880119324, 'learning_rate': 1.1069333333333334e-05, 'epoch': 2.336}
295
+ {'loss': 0.1224, 'grad_norm': 0.014432383701205254, 'learning_rate': 1.0936e-05, 'epoch': 2.344}
296
+ {'loss': 0.0685, 'grad_norm': 0.013095813803374767, 'learning_rate': 1.0802666666666666e-05, 'epoch': 2.352}
297
+ {'loss': 0.0257, 'grad_norm': 0.028074130415916443, 'learning_rate': 1.0669333333333335e-05, 'epoch': 2.36}
298
+ {'loss': 0.1292, 'grad_norm': 0.02423202060163021, 'learning_rate': 1.0536e-05, 'epoch': 2.368}
299
+ {'loss': 0.1137, 'grad_norm': 0.013635743409395218, 'learning_rate': 1.0402666666666668e-05, 'epoch': 2.376}
300
+ {'loss': 0.1745, 'grad_norm': 0.016421562060713768, 'learning_rate': 1.0269333333333333e-05, 'epoch': 2.384}
301
+ {'loss': 0.1689, 'grad_norm': 0.01975177228450775, 'learning_rate': 1.0136000000000001e-05, 'epoch': 2.392}
302
+ {'loss': 0.1267, 'grad_norm': 0.05990523472428322, 'learning_rate': 1.0002666666666667e-05, 'epoch': 2.4}
303
+ {'loss': 0.0714, 'grad_norm': 0.023030275478959084, 'learning_rate': 9.869333333333334e-06, 'epoch': 2.408}
304
+ {'loss': 0.0303, 'grad_norm': 0.17459280788898468, 'learning_rate': 9.736000000000001e-06, 'epoch': 2.416}
305
+ {'loss': 0.0207, 'grad_norm': 0.024825584143400192, 'learning_rate': 9.602666666666667e-06, 'epoch': 2.424}
306
+ {'loss': 0.1338, 'grad_norm': 0.00718740513548255, 'learning_rate': 9.469333333333334e-06, 'epoch': 2.432}
307
+ {'loss': 0.001, 'grad_norm': 0.006329901050776243, 'learning_rate': 9.336e-06, 'epoch': 2.44}
308
+ {'loss': 0.1752, 'grad_norm': 0.016103368252515793, 'learning_rate': 9.202666666666667e-06, 'epoch': 2.448}
309
+ {'loss': 0.1168, 'grad_norm': 0.11804729700088501, 'learning_rate': 9.069333333333333e-06, 'epoch': 2.456}
310
+ {'loss': 0.2117, 'grad_norm': 35.67884826660156, 'learning_rate': 8.936e-06, 'epoch': 2.464}
311
+ {'loss': 0.1755, 'grad_norm': 0.016014249995350838, 'learning_rate': 8.802666666666668e-06, 'epoch': 2.472}
312
+ {'loss': 0.1497, 'grad_norm': 0.22153107821941376, 'learning_rate': 8.669333333333334e-06, 'epoch': 2.48}
313
+ {'loss': 0.1113, 'grad_norm': 0.01318784523755312, 'learning_rate': 8.536000000000001e-06, 'epoch': 2.488}
314
+ {'loss': 0.1143, 'grad_norm': 0.1176510900259018, 'learning_rate': 8.402666666666667e-06, 'epoch': 2.496}
315
+ {'loss': 0.1492, 'grad_norm': 0.06879352778196335, 'learning_rate': 8.269333333333332e-06, 'epoch': 2.504}
316
+ {'loss': 0.1984, 'grad_norm': 0.021879026666283607, 'learning_rate': 8.136000000000001e-06, 'epoch': 2.512}
317
+ {'loss': 0.0812, 'grad_norm': 0.03925799950957298, 'learning_rate': 8.002666666666667e-06, 'epoch': 2.52}
318
+ {'loss': 0.1615, 'grad_norm': 0.0319889560341835, 'learning_rate': 7.869333333333334e-06, 'epoch': 2.528}
319
+ {'loss': 0.0291, 'grad_norm': 0.015960585325956345, 'learning_rate': 7.736e-06, 'epoch': 2.536}
320
+ {'loss': 0.135, 'grad_norm': 0.020564408972859383, 'learning_rate': 7.6026666666666675e-06, 'epoch': 2.544}
321
+ {'loss': 0.1479, 'grad_norm': 0.03615148738026619, 'learning_rate': 7.469333333333334e-06, 'epoch': 2.552}
322
+ {'loss': 0.0368, 'grad_norm': 0.016910186037421227, 'learning_rate': 7.336e-06, 'epoch': 2.56}
323
+ {'loss': 0.2437, 'grad_norm': 8.867321968078613, 'learning_rate': 7.202666666666667e-06, 'epoch': 2.568}
324
+ {'loss': 0.049, 'grad_norm': 10.037091255187988, 'learning_rate': 7.069333333333334e-06, 'epoch': 2.576}
325
+ {'loss': 0.0398, 'grad_norm': 0.25611355900764465, 'learning_rate': 6.936000000000001e-06, 'epoch': 2.584}
326
+ {'loss': 0.0257, 'grad_norm': 0.05507563799619675, 'learning_rate': 6.802666666666667e-06, 'epoch': 2.592}
327
+ {'loss': 0.1173, 'grad_norm': 0.09031017869710922, 'learning_rate': 6.669333333333333e-06, 'epoch': 2.6}
328
+ {'loss': 0.1151, 'grad_norm': 0.013525927439332008, 'learning_rate': 6.536000000000001e-06, 'epoch': 2.608}
329
+ {'loss': 0.0917, 'grad_norm': 0.031039560213685036, 'learning_rate': 6.402666666666666e-06, 'epoch': 2.616}
330
+ {'loss': 0.1611, 'grad_norm': 0.02152109332382679, 'learning_rate': 6.269333333333334e-06, 'epoch': 2.624}
331
+ {'loss': 0.1082, 'grad_norm': 0.02339756488800049, 'learning_rate': 6.136e-06, 'epoch': 2.632}
332
+ {'loss': 0.0367, 'grad_norm': 0.012301336042582989, 'learning_rate': 6.002666666666667e-06, 'epoch': 2.64}
333
+ {'loss': 0.0914, 'grad_norm': 215.3618621826172, 'learning_rate': 5.869333333333333e-06, 'epoch': 2.648}
334
+ {'loss': 0.1905, 'grad_norm': 20.581954956054688, 'learning_rate': 5.736000000000001e-06, 'epoch': 2.656}
335
+ {'loss': 0.0908, 'grad_norm': 0.013410776853561401, 'learning_rate': 5.602666666666667e-06, 'epoch': 2.664}
336
+ {'loss': 0.0603, 'grad_norm': 0.13063132762908936, 'learning_rate': 5.469333333333333e-06, 'epoch': 2.672}
337
+ {'loss': 0.1159, 'grad_norm': 0.05968919396400452, 'learning_rate': 5.336e-06, 'epoch': 2.68}
338
+ {'loss': 0.178, 'grad_norm': 0.07835003733634949, 'learning_rate': 5.202666666666667e-06, 'epoch': 2.6879999999999997}
339
+ {'loss': 0.1182, 'grad_norm': 38.63554000854492, 'learning_rate': 5.069333333333333e-06, 'epoch': 2.6959999999999997}
340
+ {'loss': 0.065, 'grad_norm': 1.430072546005249, 'learning_rate': 4.936000000000001e-06, 'epoch': 2.7039999999999997}
341
+ {'loss': 0.0336, 'grad_norm': 0.009959719143807888, 'learning_rate': 4.802666666666667e-06, 'epoch': 2.7119999999999997}
342
+ {'loss': 0.0495, 'grad_norm': 0.6715738773345947, 'learning_rate': 4.669333333333334e-06, 'epoch': 2.7199999999999998}
343
+ {'loss': 0.0314, 'grad_norm': 0.010251459665596485, 'learning_rate': 4.536e-06, 'epoch': 2.7279999999999998}
344
+ {'loss': 0.0264, 'grad_norm': 0.12389620393514633, 'learning_rate': 4.402666666666667e-06, 'epoch': 2.7359999999999998}
345
+ {'loss': 0.0462, 'grad_norm': 0.008583267219364643, 'learning_rate': 4.269333333333333e-06, 'epoch': 2.7439999999999998}
346
+ {'loss': 0.0661, 'grad_norm': 0.007292643189430237, 'learning_rate': 4.136e-06, 'epoch': 2.752}
347
+ {'loss': 0.1706, 'grad_norm': 0.047004811465740204, 'learning_rate': 4.002666666666667e-06, 'epoch': 2.76}
348
+ {'loss': 0.0777, 'grad_norm': 0.020715517923235893, 'learning_rate': 3.869333333333334e-06, 'epoch': 2.768}
349
+ {'loss': 0.031, 'grad_norm': 0.008295822888612747, 'learning_rate': 3.736e-06, 'epoch': 2.776}
350
+ {'loss': 0.1627, 'grad_norm': 0.015728944912552834, 'learning_rate': 3.602666666666667e-06, 'epoch': 2.784}
351
+ {'loss': 0.0297, 'grad_norm': 0.1496945321559906, 'learning_rate': 3.4693333333333334e-06, 'epoch': 2.792}
352
+ {'loss': 0.1484, 'grad_norm': 0.02585836499929428, 'learning_rate': 3.3360000000000003e-06, 'epoch': 2.8}
353
+ {'loss': 0.0878, 'grad_norm': 0.0086339320987463, 'learning_rate': 3.202666666666667e-06, 'epoch': 2.808}
354
+ {'loss': 0.2442, 'grad_norm': 8.712865829467773, 'learning_rate': 3.0693333333333334e-06, 'epoch': 2.816}
355
+ {'loss': 0.0921, 'grad_norm': 0.02131008356809616, 'learning_rate': 2.9360000000000003e-06, 'epoch': 2.824}
356
+ {'loss': 0.2405, 'grad_norm': 0.00918051227927208, 'learning_rate': 2.8026666666666665e-06, 'epoch': 2.832}
357
+ {'loss': 0.0815, 'grad_norm': 0.020189447328448296, 'learning_rate': 2.6693333333333334e-06, 'epoch': 2.84}
358
+ {'loss': 0.0638, 'grad_norm': 0.18166711926460266, 'learning_rate': 2.5360000000000004e-06, 'epoch': 2.848}
359
+ {'loss': 0.0522, 'grad_norm': 0.00875813141465187, 'learning_rate': 2.402666666666667e-06, 'epoch': 2.856}
360
+ {'loss': 0.0009, 'grad_norm': 0.0431634895503521, 'learning_rate': 2.2693333333333334e-06, 'epoch': 2.864}
361
+ {'loss': 0.1156, 'grad_norm': 0.023334724828600883, 'learning_rate': 2.136e-06, 'epoch': 2.872}
362
+ {'loss': 0.1775, 'grad_norm': 36.20563507080078, 'learning_rate': 2.002666666666667e-06, 'epoch': 2.88}
363
+ {'loss': 0.2143, 'grad_norm': 25.47490882873535, 'learning_rate': 1.8693333333333334e-06, 'epoch': 2.888}
364
+ {'loss': 0.0035, 'grad_norm': 0.013057650066912174, 'learning_rate': 1.7360000000000002e-06, 'epoch': 2.896}
365
+ {'loss': 0.0009, 'grad_norm': 0.01746312901377678, 'learning_rate': 1.602666666666667e-06, 'epoch': 2.904}
366
+ {'loss': 0.1258, 'grad_norm': 0.012785250321030617, 'learning_rate': 1.4693333333333333e-06, 'epoch': 2.912}
367
+ {'loss': 0.1393, 'grad_norm': 0.026742149144411087, 'learning_rate': 1.336e-06, 'epoch': 2.92}
368
+ {'loss': 0.1293, 'grad_norm': 31.66493797302246, 'learning_rate': 1.2026666666666667e-06, 'epoch': 2.928}
369
+ {'loss': 0.0443, 'grad_norm': 0.12351831048727036, 'learning_rate': 1.0693333333333333e-06, 'epoch': 2.936}
370
+ {'loss': 0.0358, 'grad_norm': 0.01323748379945755, 'learning_rate': 9.360000000000001e-07, 'epoch': 2.944}
371
+ {'loss': 0.0679, 'grad_norm': 0.010095755569636822, 'learning_rate': 8.026666666666667e-07, 'epoch': 2.952}
372
+ {'loss': 0.075, 'grad_norm': 0.09313926100730896, 'learning_rate': 6.693333333333334e-07, 'epoch': 2.96}
373
+ {'loss': 0.1417, 'grad_norm': 0.014779884368181229, 'learning_rate': 5.36e-07, 'epoch': 2.968}
374
+ {'loss': 0.0665, 'grad_norm': 0.011904980055987835, 'learning_rate': 4.026666666666666e-07, 'epoch': 2.976}
375
+ {'loss': 0.1183, 'grad_norm': 0.03782917186617851, 'learning_rate': 2.693333333333333e-07, 'epoch': 2.984}
376
+ {'loss': 0.167, 'grad_norm': 8.537976264953613, 'learning_rate': 1.3600000000000003e-07, 'epoch': 2.992}
377
+ {'loss': 0.033, 'grad_norm': 0.010841709561645985, 'learning_rate': 2.666666666666667e-09, 'epoch': 3.0}
378
+ {'eval_loss': 0.5613933801651001, 'eval_accuracy': 0.90404, 'eval_f1': 0.903332393117621, 'eval_precision': 0.9100430299585938, 'eval_recall': 0.89672, 'eval_runtime': 403.0149, 'eval_samples_per_second': 62.032, 'eval_steps_per_second': 15.508, 'epoch': 3.0}
379
+ {'train_runtime': 17654.2392, 'train_samples_per_second': 4.248, 'train_steps_per_second': 1.062, 'train_loss': 0.2737119940789541, 'epoch': 3.0}
380
+ Training completed!
381
+ Evaluating model...
382
+ {'eval_loss': 0.5613933801651001, 'eval_accuracy': 0.90404, 'eval_f1': 0.903332393117621, 'eval_precision': 0.9100430299585938, 'eval_recall': 0.89672, 'eval_runtime': 383.8561, 'eval_samples_per_second': 65.129, 'eval_steps_per_second': 16.282, 'epoch': 3.0}
383
+ === Evaluation Results ===
384
+ eval_loss: 0.5614
385
+ eval_accuracy: 0.9040
386
+ eval_f1: 0.9033
387
+ eval_precision: 0.9100
388
+ eval_recall: 0.8967
389
+ eval_runtime: 383.8561
390
+ eval_samples_per_second: 65.1290
391
+ eval_steps_per_second: 16.2820
392
+ epoch: 3.0000
393
+ Saving model to ./model...
394
+ Model saved successfully!
395
+ === Training Pipeline Completed ===
396
+
397
+ 🎉 Training completed!
398
+ To run the app: python app.py
wandb/run-20250720_155338-0h3fksuy/files/requirements.txt ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==1.9.0
2
+ aiofiles==24.1.0
3
+ aiohappyeyeballs==2.6.1
4
+ aiohttp==3.12.14
5
+ aiosignal==1.4.0
6
+ alembic==1.16.2
7
+ altair==5.5.0
8
+ annotated-types==0.7.0
9
+ anyio==4.9.0
10
+ attrs==25.3.0
11
+ audioop-lts==0.2.1
12
+ blinker==1.9.0
13
+ Bottleneck==1.4.2
14
+ Brotli==1.1.0
15
+ cachetools==6.1.0
16
+ certifi==2025.6.15
17
+ charset-normalizer==3.4.2
18
+ click==8.2.1
19
+ cloudpickle==3.1.1
20
+ colorama==0.4.6
21
+ colorlog==6.9.0
22
+ contourpy==1.3.1
23
+ cycler==0.11.0
24
+ datasets==4.0.0
25
+ dill==0.3.8
26
+ fastapi==0.116.1
27
+ ffmpy==0.6.0
28
+ filelock==3.18.0
29
+ fonttools==4.55.3
30
+ frozenlist==1.7.0
31
+ fsspec==2025.3.0
32
+ gitdb==4.0.12
33
+ GitPython==3.1.44
34
+ gradio==5.37.0
35
+ gradio_client==1.10.4
36
+ greenlet==3.2.3
37
+ groovy==0.1.2
38
+ h11==0.16.0
39
+ httpcore==1.0.9
40
+ httpx==0.28.1
41
+ huggingface-hub==0.33.4
42
+ idna==3.10
43
+ imbalanced-learn==0.13.0
44
+ imblearn==0.0
45
+ Jinja2==3.1.6
46
+ joblib==1.4.2
47
+ jsonschema==4.24.0
48
+ jsonschema-specifications==2025.4.1
49
+ kiwisolver==1.4.8
50
+ llvmlite==0.44.0
51
+ Mako==1.3.10
52
+ markdown-it-py==3.0.0
53
+ MarkupSafe==3.0.2
54
+ matplotlib==3.9.2
55
+ mdurl==0.1.2
56
+ mpmath==1.3.0
57
+ multidict==6.6.3
58
+ multiprocess==0.70.16
59
+ narwhals==1.44.0
60
+ networkx==3.5
61
+ ninja==1.11.1.4
62
+ numba==0.61.2
63
+ numexpr==2.10.2
64
+ numpy==2.1.1
65
+ optuna==4.4.0
66
+ orjson==3.11.0
67
+ packaging==24.2
68
+ pandas==2.2.3
69
+ pillow==11.1.0
70
+ pip==25.1
71
+ platformdirs==4.3.8
72
+ plotly==6.2.0
73
+ propcache==0.3.2
74
+ protobuf==6.31.1
75
+ psutil==7.0.0
76
+ pyarrow==20.0.0
77
+ pybind11==3.0.0
78
+ pydantic==2.11.7
79
+ pydantic_core==2.33.2
80
+ pydeck==0.9.1
81
+ pydub==0.25.1
82
+ Pygments==2.19.2
83
+ pyparsing==3.2.0
84
+ PyQt6==6.7.1
85
+ PyQt6_sip==13.9.1
86
+ python-dateutil==2.9.0.post0
87
+ python-multipart==0.0.20
88
+ pytz==2024.1
89
+ PyYAML==6.0.2
90
+ referencing==0.36.2
91
+ regex==2024.11.6
92
+ requests==2.32.4
93
+ rich==14.0.0
94
+ rpds-py==0.26.0
95
+ ruff==0.12.3
96
+ safehttpx==0.1.6
97
+ safetensors==0.5.3
98
+ scikit-learn==1.5.2
99
+ scipy==1.15.2
100
+ seaborn==0.13.2
101
+ semantic-version==2.10.0
102
+ sentry-sdk==2.33.0
103
+ setuptools==78.1.1
104
+ shap==0.48.0
105
+ shellingham==1.5.4
106
+ sip==6.10.0
107
+ six==1.17.0
108
+ sklearn-compat==0.1.3
109
+ slicer==0.0.8
110
+ smmap==5.0.2
111
+ sniffio==1.3.1
112
+ SQLAlchemy==2.0.41
113
+ starlette==0.47.1
114
+ streamlit==1.46.1
115
+ sympy==1.14.0
116
+ tenacity==9.1.2
117
+ threadpoolctl==3.5.0
118
+ tokenizers==0.21.2
119
+ toml==0.10.2
120
+ tomlkit==0.13.3
121
+ torch==2.7.1+cu118
122
+ torchaudio==2.7.1+cu118
123
+ torchvision==0.22.1
124
+ tornado==6.5.1
125
+ tqdm==4.67.1
126
+ transformers==4.53.2
127
+ typer==0.16.0
128
+ typing_extensions==4.14.0
129
+ typing-inspection==0.4.1
130
+ tzdata==2025.2
131
+ urllib3==2.5.0
132
+ uvicorn==0.35.0
133
+ wandb==0.21.0
134
+ watchdog==6.0.0
135
+ websockets==15.0.1
136
+ wheel==0.45.1
137
+ xgboost==3.0.2
138
+ xxhash==3.5.0
139
+ yarl==1.20.1
wandb/run-20250720_155338-0h3fksuy/files/wandb-metadata.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Windows-11-10.0.26100-SP0",
3
+ "python": "CPython 3.13.5",
4
+ "startedAt": "2025-07-20T10:23:38.923772Z",
5
+ "program": "C:\\Users\\Legion\\desktop\\distilbert-sentiment\\main.py",
6
+ "codePath": "main.py",
7
+ "codePathLocal": "main.py",
8
+ "email": "shreshthkapai@gmail.com",
9
+ "root": "C:\\Users\\Legion\\desktop\\distilbert-sentiment",
10
+ "host": "DESKTOP-EIHJJJL",
11
+ "executable": "C:\\Users\\Legion\\Miniconda3\\envs\\ML\\python.exe",
12
+ "cpu_count": 4,
13
+ "cpu_count_logical": 8,
14
+ "gpu": "NVIDIA GeForce GTX 1650",
15
+ "gpu_count": 1,
16
+ "disk": {
17
+ "/": {
18
+ "total": "255230791680",
19
+ "used": "233129451520"
20
+ }
21
+ },
22
+ "memory": {
23
+ "total": "8506298368"
24
+ },
25
+ "gpu_nvidia": [
26
+ {
27
+ "name": "NVIDIA GeForce GTX 1650",
28
+ "memoryTotal": "4294967296",
29
+ "cudaCores": 1024,
30
+ "architecture": "Turing",
31
+ "uuid": "GPU-fbcd7647-fb67-66f5-b8c7-1a4198b7e4fa"
32
+ }
33
+ ],
34
+ "cudaVersion": "12.7",
35
+ "writerId": "fshn6fq4d357dfamunx9x96y44pdzcc6"
36
+ }
wandb/run-20250720_155338-0h3fksuy/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train/global_step":18750,"train/epoch":3,"_timestamp":1.75302505616767e+09,"eval/steps_per_second":16.282,"train_samples_per_second":4.248,"eval/recall":0.89672,"train_loss":0.2737119940789541,"train_runtime":17654.2392,"total_flos":4.9675274496e+15,"eval/loss":0.5613933801651001,"_wandb":{"runtime":18036},"eval/runtime":383.8561,"train_steps_per_second":1.062,"eval/accuracy":0.90404,"eval/samples_per_second":65.129,"train/loss":0.033,"train/learning_rate":2.666666666666667e-09,"_step":379,"_runtime":18036,"train/grad_norm":0.010841709561645985,"eval/precision":0.9100430299585938,"eval/f1":0.903332393117621}
wandb/run-20250720_155338-0h3fksuy/logs/debug-internal.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-07-20T15:53:39.5114812+05:30","level":"INFO","msg":"stream: starting","core version":"0.21.0"}
2
+ {"time":"2025-07-20T15:53:41.0961508+05:30","level":"INFO","msg":"stream: created new stream","id":"0h3fksuy"}
3
+ {"time":"2025-07-20T15:53:41.0967346+05:30","level":"INFO","msg":"stream: started","id":"0h3fksuy"}
4
+ {"time":"2025-07-20T15:53:41.0967346+05:30","level":"INFO","msg":"handler: started","stream_id":"0h3fksuy"}
5
+ {"time":"2025-07-20T15:53:41.0967346+05:30","level":"INFO","msg":"sender: started","stream_id":"0h3fksuy"}
6
+ {"time":"2025-07-20T15:53:41.0967346+05:30","level":"INFO","msg":"writer: Do: started","stream_id":"0h3fksuy"}
7
+ {"time":"2025-07-20T20:27:20.3681207+05:30","level":"WARN","msg":"sender: taking a long time","seconds":11118.8689693,"work":"WorkRecord(*service_go_proto.Request_StopStatus); Control(local:true mailbox_slot:\"qx9z56z7vy8w\" connection_id:\"1(127.0.0.1:59166)\")"}
8
+ {"time":"2025-07-20T20:27:20.6988531+05:30","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
9
+ {"time":"2025-07-20T20:27:20.856851+05:30","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/shreshth/huggingface/0h3fksuy/file_stream\": net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)"}
10
+ {"time":"2025-07-20T20:27:24.1042083+05:30","level":"INFO","msg":"sender: succeeded after taking longer than expected","seconds":11124.0731302,"work":"WorkRecord(*service_go_proto.Request_StopStatus); Control(local:true mailbox_slot:\"qx9z56z7vy8w\" connection_id:\"1(127.0.0.1:59166)\")"}
11
+ {"time":"2025-07-20T20:54:18.4689135+05:30","level":"INFO","msg":"stream: closing","id":"0h3fksuy"}
12
+ {"time":"2025-07-20T20:54:19.7177233+05:30","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
13
+ {"time":"2025-07-20T20:54:20.2321569+05:30","level":"INFO","msg":"handler: closed","stream_id":"0h3fksuy"}
14
+ {"time":"2025-07-20T20:54:20.2321569+05:30","level":"INFO","msg":"sender: closed","stream_id":"0h3fksuy"}
15
+ {"time":"2025-07-20T20:54:20.2321569+05:30","level":"INFO","msg":"writer: Close: closed","stream_id":"0h3fksuy"}
16
+ {"time":"2025-07-20T20:54:20.2327206+05:30","level":"INFO","msg":"stream: closed","id":"0h3fksuy"}
wandb/run-20250720_155338-0h3fksuy/logs/debug.log ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-20 15:53:38,929 INFO MainThread:1648 [wandb_setup.py:_flush():80] Current SDK version is 0.21.0
2
+ 2025-07-20 15:53:38,930 INFO MainThread:1648 [wandb_setup.py:_flush():80] Configure stats pid to 1648
3
+ 2025-07-20 15:53:38,930 INFO MainThread:1648 [wandb_setup.py:_flush():80] Loading settings from C:\Users\Legion\.config\wandb\settings
4
+ 2025-07-20 15:53:38,930 INFO MainThread:1648 [wandb_setup.py:_flush():80] Loading settings from C:\Users\Legion\desktop\distilbert-sentiment\wandb\settings
5
+ 2025-07-20 15:53:38,930 INFO MainThread:1648 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-07-20 15:53:38,930 INFO MainThread:1648 [wandb_init.py:setup_run_log_directory():703] Logging user logs to C:\Users\Legion\desktop\distilbert-sentiment\wandb\run-20250720_155338-0h3fksuy\logs\debug.log
7
+ 2025-07-20 15:53:38,931 INFO MainThread:1648 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to C:\Users\Legion\desktop\distilbert-sentiment\wandb\run-20250720_155338-0h3fksuy\logs\debug-internal.log
8
+ 2025-07-20 15:53:38,931 INFO MainThread:1648 [wandb_init.py:init():830] calling init triggers
9
+ 2025-07-20 15:53:38,931 INFO MainThread:1648 [wandb_init.py:init():835] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-07-20 15:53:38,931 INFO MainThread:1648 [wandb_init.py:init():871] starting backend
12
+ 2025-07-20 15:53:39,433 INFO MainThread:1648 [wandb_init.py:init():874] sending inform_init request
13
+ 2025-07-20 15:53:39,505 INFO MainThread:1648 [wandb_init.py:init():882] backend started and connected
14
+ 2025-07-20 15:53:39,507 INFO MainThread:1648 [wandb_init.py:init():953] updated telemetry
15
+ 2025-07-20 15:53:39,511 INFO MainThread:1648 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout
16
+ 2025-07-20 15:53:41,579 INFO MainThread:1648 [wandb_init.py:init():1029] starting run threads in backend
17
+ 2025-07-20 15:53:42,183 INFO MainThread:1648 [wandb_run.py:_console_start():2458] atexit reg
18
+ 2025-07-20 15:53:42,183 INFO MainThread:1648 [wandb_run.py:_redirect():2306] redirect: wrap_raw
19
+ 2025-07-20 15:53:42,184 INFO MainThread:1648 [wandb_run.py:_redirect():2375] Wrapping output streams.
20
+ 2025-07-20 15:53:42,184 INFO MainThread:1648 [wandb_run.py:_redirect():2398] Redirects installed.
21
+ 2025-07-20 15:53:42,191 INFO MainThread:1648 [wandb_init.py:init():1075] run started, returning control to user process
22
+ 2025-07-20 15:53:42,195 INFO MainThread:1648 [wandb_run.py:_config_callback():1363] config_cb None None {'vocab_size': 30522, 'max_position_embeddings': 512, 'sinusoidal_pos_embds': False, 'n_layers': 6, 'n_heads': 12, 'dim': 768, 'hidden_dim': 3072, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation': 'gelu', 'initializer_range': 0.02, 'qa_dropout': 0.1, 'seq_classif_dropout': 0.2, 'return_dict': True, 'output_hidden_states': False, 'torchscript': False, 'torch_dtype': 'float32', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['DistilBertForMaskedLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': 0, 'eos_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'distilbert-base-uncased', 'transformers_version': '4.53.2', 'model_type': 'distilbert', 'tie_weights_': True, 'output_attentions': False, 'output_dir': './model', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 
'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 2, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 2, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './logs', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 50, 'logging_nan_inf_filter': True, 'save_strategy': 'epoch', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './model', 'disable_tqdm': True, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'f1', 'greater_is_better': True, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 
'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': False}
23
+ 2025-07-20 15:53:42,202 INFO MainThread:1648 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 66955010 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x0000024ABE770590>>
24
+ 2025-07-20 15:53:42,202 INFO MainThread:1648 [wandb_run.py:_config_callback():1363] config_cb model/num_parameters 66955010 None
25
+ 2025-07-20 20:54:18,327 INFO MsgRouterThr:1648 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 1 handles.
wandb/run-20250720_155338-0h3fksuy/run-0h3fksuy.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ee10c6e19f6dde4c416e8ee5e2f7791dacbb667462dbe42de372ff1eaca5b68
3
+ size 703284