#!/usr/bin/env python3
# /// script
# dependencies = []
# ///
"""
Estimate training time and cost for TRL jobs.

Usage:
    python estimate_cost.py --model <model> --dataset <dataset> --hardware <flavor>

Example:
    python estimate_cost.py --model Qwen/Qwen2.5-0.5B --dataset trl-lib/Capybara --hardware a10g-large
"""
import argparse

# Hardware costs per hour (approximate)
HARDWARE_COSTS = {
    "t4-small": 0.75,
    "t4-medium": 1.50,
    "l4x1": 2.50,
    "a10g-small": 3.50,
    "a10g-large": 5.00,
    "a10g-largex2": 10.00,
    "a10g-largex4": 20.00,
    "a100-large": 10.00,
}

# Model sizes in billions of parameters
MODEL_SIZES = {
    "0.5B": 0.5,
    "1.5B": 1.5,
    "3B": 3,
    "7B": 7,
    "13B": 13,
}
def estimate_training_time(model_params, dataset_size, epochs, hardware):
    """Estimate training time in hours."""
    # Rough estimates based on empirical observations.
    # These are approximations and actual times will vary.
    base_time_per_1k_examples = 0.1  # hours per 1k examples for a 1B model on a10g-large

    # Scale linearly with model size, dataset size, and epochs
    time = base_time_per_1k_examples * model_params * (dataset_size / 1000) * epochs

    # Adjust for hardware (relative to a10g-large baseline)
    hardware_multipliers = {
        "t4-small": 2.0,
        "t4-medium": 1.5,
        "l4x1": 1.2,
        "a10g-small": 1.3,
        "a10g-large": 1.0,
        "a10g-largex2": 0.6,
        "a10g-largex4": 0.4,
        "a100-large": 0.7,
    }
    multiplier = hardware_multipliers.get(hardware, 1.0)
    time *= multiplier

    return time
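
# Worked example (using the constants above): a 0.5B model on a 16,000-example
# dataset for 3 epochs on a10g-large gives
#   0.1 * 0.5 * (16000 / 1000) * 3 * 1.0 = 2.4 hours  ->  ~$12.00 at $5.00/hour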
def parse_args():
    parser = argparse.ArgumentParser(description="Estimate training cost for TRL jobs")
    parser.add_argument("--model", required=True, help="Model name or size (e.g., 'Qwen/Qwen2.5-0.5B' or '0.5B')")
    parser.add_argument("--dataset", required=True, help="Dataset name")
    parser.add_argument("--hardware", required=True, choices=HARDWARE_COSTS.keys(), help="Hardware flavor")
    parser.add_argument("--dataset-size", type=int, help="Override dataset size (number of examples)")
    parser.add_argument("--epochs", type=int, default=3, help="Number of training epochs")
    return parser.parse_args()
def extract_model_size(model_name):
    """Extract model size in billions of parameters from a model name."""
    for size_str, size_val in MODEL_SIZES.items():
        if size_str in model_name:
            return size_val
    # Try to parse a bare size string like "7B" directly
    try:
        if "B" in model_name:
            return float(model_name.replace("B", ""))
    except ValueError:
        pass
    return 1.0  # Default to 1B if the size can't be determined
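
# Examples (the third name is hypothetical, to show the fallback):
#   extract_model_size("Qwen/Qwen2.5-0.5B")  -> 0.5
#   extract_model_size("7B")                 -> 7
#   extract_model_size("my-custom-model")    -> 1.0 (fallback)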
def main():
    args = parse_args()

    # Extract model parameters
    model_params = extract_model_size(args.model)
    print(f"📊 Model: {args.model} (~{model_params}B parameters)")

    # Estimate dataset size (would need to load the dataset to get the real size)
    if args.dataset_size:
        dataset_size = args.dataset_size
    else:
        # Common dataset sizes (approximations)
        dataset_sizes = {
            "trl-lib/Capybara": 16000,
            "Anthropic/hh-rlhf": 160000,
        }
        dataset_size = dataset_sizes.get(args.dataset, 10000)
    print(f"📦 Dataset: {args.dataset} (~{dataset_size} examples)")
    print(f"🔄 Epochs: {args.epochs}")
    print(f"💻 Hardware: {args.hardware}")
    print()

    # Estimate training time and cost
    estimated_hours = estimate_training_time(model_params, dataset_size, args.epochs, args.hardware)
    estimated_cost = estimated_hours * HARDWARE_COSTS[args.hardware]

    # Recommend a timeout with buffer
    recommended_timeout_hours = estimated_hours * 1.3  # 30% buffer

    print(f"⏱️ Estimated training time: {estimated_hours:.1f} hours")
    print(f"💰 Estimated cost: ${estimated_cost:.2f}")
    print(f"⏰ Recommended timeout: {recommended_timeout_hours:.1f}h (with 30% buffer)")
    print()

    # Warnings and recommendations
    if estimated_hours > 4:
        print("⚠️ Long training time - consider:")
        print(" - Using faster hardware")
        print(" - Reducing epochs")
        print(" - Using a smaller dataset subset for testing")
    if model_params >= 7 and args.hardware not in ["a10g-largex2", "a10g-largex4", "a100-large"]:
        print("⚠️ Large model - consider using:")
        print(" - Larger GPU (a100-large)")
        print(" - Multi-GPU setup (a10g-largex2 or a10g-largex4)")
        print(" - LoRA/PEFT for memory efficiency")
    print()

    print("📋 Example job configuration:")
    print(f"""
hf_jobs("uv", {{
    "script": "your_training_script.py",
    "flavor": "{args.hardware}",
    "timeout": "{recommended_timeout_hours:.0f}h",
    "secrets": {{"HF_TOKEN": "$HF_TOKEN"}}
}})
""")


if __name__ == "__main__":
    main()
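
# Example run (numbers follow from the constants above; actual training time
# and cost will vary):
#
#   $ python estimate_cost.py --model Qwen/Qwen2.5-0.5B \
#         --dataset trl-lib/Capybara --hardware a10g-large
#   📊 Model: Qwen/Qwen2.5-0.5B (~0.5B parameters)
#   📦 Dataset: trl-lib/Capybara (~16000 examples)
#   🔄 Epochs: 3
#   💻 Hardware: a10g-large
#
#   ⏱️ Estimated training time: 2.4 hours
#   💰 Estimated cost: $12.00
#   ⏰ Recommended timeout: 3.1h (with 30% buffer)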