Spaces:

ujjwalpardeshi
/

pytorch-training-debugger

Sleeping

File size: 3,138 Bytes

4f58e42
aa0bed2
206438f
aa0bed2
 
 
 
 
4f58e42
 
 
 
 
aa0bed2
 
4f58e42
aa0bed2
 
4f58e42
 
 
 
 
 
 
aa0bed2
4f58e42
 
 
 
 
 
 
 
 
 
 
aa0bed2
 
4f58e42
 
 
 
 
 
 
aa0bed2
 
 
 
4f58e42
 
 
 
 
 
 
 
 
aa0bed2
 
 
4f58e42
 
 
 
 
 
 
 
 
 
 
aa0bed2
4f58e42
 
 
aa0bed2
4f58e42
 
 
aa0bed2
4f58e42
 
 
aa0bed2
4f58e42
 
aa0bed2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f58e42
 
 
 
 
aa0bed2
 
4f58e42

{
  "methodology": "Real PyTorch 20-epoch mini-training with fault injection",
  "torch_version": "2.5.1+cpu",
  "models": [
    "SimpleCNN (~50K params)",
    "SimpleMLP (~20K params)"
  ],
  "training_approach": "Real forward+backward passes on random CIFAR-10 style data, cached per (task_id, seed)",
  "results": [
    {
      "task": "task_001",
      "fault": "exploding_gradients",
      "checks": {
        "gradient_instability_detected": true,
        "loss_shows_instability": true,
        "max_gradient_norm": 111.8,
        "max_loss": 43.27,
        "real_pytorch_training": true
      },
      "pass": true
    },
    {
      "task": "task_002",
      "fault": "vanishing_gradients",
      "checks": {
        "vanishing_detected": true,
        "min_gradient_norm": 0.0,
        "real_pytorch_gradients": true
      },
      "pass": true
    },
    {
      "task": "task_003",
      "fault": "data_leakage",
      "checks": {
        "class_overlap_above_0.5": true,
        "class_overlap_score": 0.83,
        "real_training_runs": true,
        "has_confusion_matrix": true
      },
      "pass": true
    },
    {
      "task": "task_004",
      "fault": "overfitting",
      "checks": {
        "real_training_runs": true,
        "clean_data": true,
        "final_train_loss": 0.1017,
        "final_val_loss": 2.6519
      },
      "pass": true
    },
    {
      "task": "task_005",
      "fault": "batchnorm_eval_mode",
      "checks": {
        "all_layers_in_eval_mode": true,
        "no_layer_is_exploding": true,
        "real_training_runs": true,
        "real_model_eval_mode": true,
        "red_herring_spike_layer": "conv1"
      },
      "pass": true
    },
    {
      "task": "task_006",
      "fault": "code_bug",
      "checks": {
        "variants_tested": 4,
        "variant_results": {
          "eval_mode": {
            "correct_fix_accepted": true,
            "wrong_fix_rejected": true
          },
          "detach_loss": {
            "correct_fix_accepted": true,
            "wrong_fix_rejected": true
          },
          "zero_grad_missing": {
            "correct_fix_accepted": true,
            "wrong_fix_rejected": true
          },
          "inplace_relu": {
            "correct_fix_accepted": true,
            "wrong_fix_rejected": true
          }
        },
        "fix_validation_pipeline": "normalize -> tokenize -> semantic -> AST"
      },
      "pass": true
    },
    {
      "task": "task_007",
      "fault": "scheduler_misconfigured",
      "checks": {
        "real_training_runs": true,
        "scheduler_gamma": 0.0001,
        "scheduler_step_size": 2,
        "final_loss": 2.5911
      },
      "pass": true
    },
    {
      "task": "architecture",
      "fault": "dual_model_support",
      "checks": {
        "cnn_output_shape": [
          4,
          10
        ],
        "mlp_output_shape": [
          4,
          10
        ],
        "cnn_params": 66890,
        "mlp_params": 411658,
        "both_produce_10_classes": true
      },
      "pass": true
    }
  ],
  "summary": {
    "total": 8,
    "passed": 8,
    "failed": 0
  }
}