{ "caption_model": { "architecture": "CNN (ResNet50) + LSTM", "test_loss": 2.6034744274139405, "vocab_size": 2988, "trainable_params": 6501036, "model_file": "caption_model_final.pth", "config_file": "caption_model_config.pkl", "vocab_file": "../vocab/vocab.pkl" }, "action_model": { "architecture": "ResNet50 (Fine-tuned)", "test_accuracy": 78.57142857142857, "test_loss": 0.6965014625951726, "num_classes": 15, "class_names": [ "calling", "clapping", "cycling", "dancing", "drinking", "eating", "fighting", "hugging", "laughing", "listening_to_music", "running", "sitting", "sleeping", "texting", "using_laptop" ], "trainable_params": 24565839, "model_file": "action_model_final.pth", "config_file": "action_model_config.pkl" }, "training_info": { "date": "January 2026", "framework": "PyTorch", "device": "Kaggle 2x T4 GPUs", "caption_epochs": 15, "action_epochs": 10 } }