{ "Visual Narrator VLM": { "training_cost": 344.69, "inference_cost_per_1k": 0.0, "model_size": "3B", "development_time": "11 phases", "infrastructure": "Lambda GPU", "deployment": "Local/Free" }, "GPT-4 Turbo": { "training_cost": "Estimated $10M+", "inference_cost_per_1k": 8.0, "model_size": "~1.7T", "development_time": "Years", "infrastructure": "Proprietary", "deployment": "API/Paid" }, "Claude 3.5 Sonnet": { "training_cost": "Estimated $5M+", "inference_cost_per_1k": 5.0, "model_size": "70B", "development_time": "Years", "infrastructure": "Proprietary", "deployment": "API/Paid" }, "BLIP-2": { "training_cost": "Estimated $50K", "inference_cost_per_1k": 0.0, "model_size": "3.4B", "development_time": "Months", "infrastructure": "Academic", "deployment": "Local/Free" }, "LLaVA": { "training_cost": "Estimated $100K", "inference_cost_per_1k": 0.0, "model_size": "7B", "development_time": "Months", "infrastructure": "Academic", "deployment": "Local/Free" } }