{ "_hub_mixin_config": { "optimizer_class": "adamw", "scheduler_class": "cosine", "input_files": null, "dataset_subset_size": 1000000, "block_size": 128, "dataset_type": "fixed", "train_ratio": 0.98, "val_ratio": 0.01, "encoding_name": "gpt2", "n_heads": 12, "n_blocks": 12, "norm": "prenorm", "pos_embed_type": "absolute", "dropout": 0.1, "embed_size": 64, "hidden_size": 64, "k": 3, "num_inner_layers": 1, "embedding_type": "full", "checkpoint": null, "batch_size": 64, "num_epochs": 5, "learning_rate": 0.0003, "warmup_ratio": 0.1, "log_interval_steps": 250, "save_interval_steps": 5625, "save_dir": "./saved_models/lstm_tiny", "save_latest": true, "save_best": true, "loss_metric_for_best_model": "val", "prompt": "Once upon a", "max_new_tokens": 50, "top_p": 0.9, "use_wandb": true, "wandb_entity": "pico-llm", "wandb_project": "training", "wandb_name": "lstm-tiny", "upload_model_to_hub": true, "repo_id": "pico-llm/lstm-tiny", "device": "cuda:0", "seed": 42, "monosemantic_analysis": true, "num_steps": 76565 }, "hf_api": "", "wandb_writer": "", "wandb_table": "", "optimizer": "AdamW (\nParameter Group 0\n amsgrad: False\n betas: (0.9, 0.999)\n capturable: False\n decoupled_weight_decay: True\n differentiable: False\n eps: 1e-08\n foreach: None\n fused: None\n initial_lr: 0.0003\n lr: 7.350326937918619e-06\n maximize: False\n weight_decay: 0.05\n)", "scheduler": "", "optimizer_class": "adamw", "scheduler_class": "cosine", "model": "LSTMSeqModel(\n (embedding): Embedding(50257, 64)\n (lstm): LSTM(64, 64)\n (linear): Linear(in_features=64, out_features=50257, bias=True)\n)", "learning_rate": 0.0003, "_init_kwargs": { "input_files": null, "dataset_subset_size": 1000000, "block_size": 128, "dataset_type": "fixed", "train_ratio": 0.98, "val_ratio": 0.01, "encoding_name": "gpt2", "n_heads": 12, "n_blocks": 12, "norm": "prenorm", "pos_embed_type": "absolute", "dropout": 0.1, "embed_size": 64, "hidden_size": 64, "k": 3, "num_inner_layers": 1, "embedding_type": "full", "checkpoint": null, "batch_size": 64, "num_epochs": 5, "warmup_ratio": 0.1, "log_interval_steps": 250, "save_interval_steps": 5625, "save_dir": "./saved_models/lstm_tiny", "save_latest": true, "save_best": true, "loss_metric_for_best_model": "val", "prompt": "Once upon a", "max_new_tokens": 50, "top_p": 0.9, "use_wandb": true, "wandb_entity": "pico-llm", "wandb_project": "training", "wandb_name": "lstm-tiny", "upload_model_to_hub": true, "repo_id": "pico-llm/lstm-tiny", "device": "cuda:0", "seed": 42, "monosemantic_analysis": true, "num_steps": 76565 } }