#!/usr/bin/env python3
# /// script
# dependencies = []
# ///
"""
Estimate training time and cost for TRL jobs.

Usage:
    python estimate_cost.py --model <model> --dataset <dataset> --hardware <flavor>

Example:
    python estimate_cost.py --model Qwen/Qwen2.5-0.5B --dataset trl-lib/Capybara --hardware a10g-large
"""
import argparse

# Hardware costs per hour (approximate)
HARDWARE_COSTS = {
    "t4-small": 0.75,
    "t4-medium": 1.50,
    "l4x1": 2.50,
    "a10g-small": 3.50,
    "a10g-large": 5.00,
    "a10g-largex2": 10.00,
    "a10g-largex4": 20.00,
    "a100-large": 10.00,
}

# Model sizes in billions of parameters
MODEL_SIZES = {
    "0.5B": 0.5,
    "1.5B": 1.5,
    "3B": 3,
    "7B": 7,
    "13B": 13,
}
def estimate_training_time(model_params, dataset_size, epochs, hardware):
    """Estimate training time in hours."""
    # Rough estimates based on empirical observations.
    # These are approximations and actual times will vary.
    base_time_per_1k_examples = 0.1  # hours per 1k examples for a 1B model on a10g-large

    # Scale linearly with model size, dataset size, and epochs
    time = base_time_per_1k_examples * model_params * (dataset_size / 1000) * epochs

    # Adjust for hardware (relative to a10g-large baseline)
    hardware_multipliers = {
        "t4-small": 2.0,
        "t4-medium": 1.5,
        "l4x1": 1.2,
        "a10g-small": 1.3,
        "a10g-large": 1.0,
        "a10g-largex2": 0.6,
        "a10g-largex4": 0.4,
        "a100-large": 0.7,
    }
    multiplier = hardware_multipliers.get(hardware, 1.0)
    time *= multiplier

    return time
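
# Worked example (using the constants above): a 0.5B model on a 16,000-example
# dataset for 3 epochs on a10g-large gives
#   0.1 * 0.5 * (16000 / 1000) * 3 * 1.0 = 2.4 hours  ->  ~$12.00 at $5.00/hour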
def parse_args():
    parser = argparse.ArgumentParser(description="Estimate training cost for TRL jobs")
    parser.add_argument("--model", required=True, help="Model name or size (e.g., 'Qwen/Qwen2.5-0.5B' or '0.5B')")
    parser.add_argument("--dataset", required=True, help="Dataset name")
    parser.add_argument("--hardware", required=True, choices=HARDWARE_COSTS.keys(), help="Hardware flavor")
    parser.add_argument("--dataset-size", type=int, help="Override dataset size (number of examples)")
    parser.add_argument("--epochs", type=int, default=3, help="Number of training epochs")
    return parser.parse_args()
def extract_model_size(model_name):
    """Extract model size in billions of parameters from a model name."""
    for size_str, size_val in MODEL_SIZES.items():
        if size_str in model_name:
            return size_val
    # Try to parse a bare size string like "7B" directly
    try:
        if "B" in model_name:
            return float(model_name.replace("B", ""))
    except ValueError:
        pass
    return 1.0  # Default to 1B if the size can't be determined
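
# Examples (the third name is hypothetical, to show the fallback):
#   extract_model_size("Qwen/Qwen2.5-0.5B")  -> 0.5
#   extract_model_size("7B")                 -> 7
#   extract_model_size("my-custom-model")    -> 1.0 (fallback)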
def main():
    args = parse_args()

    # Extract model parameters
    model_params = extract_model_size(args.model)
    print(f"📊 Model: {args.model} (~{model_params}B parameters)")

    # Estimate dataset size (would need to load the dataset to get the real size)
    if args.dataset_size:
        dataset_size = args.dataset_size
    else:
        # Common dataset sizes (approximations)
        dataset_sizes = {
            "trl-lib/Capybara": 16000,
            "Anthropic/hh-rlhf": 160000,
        }
        dataset_size = dataset_sizes.get(args.dataset, 10000)
    print(f"📦 Dataset: {args.dataset} (~{dataset_size} examples)")
    print(f"🔄 Epochs: {args.epochs}")
    print(f"💻 Hardware: {args.hardware}")
    print()

    # Estimate training time and cost
    estimated_hours = estimate_training_time(model_params, dataset_size, args.epochs, args.hardware)
    estimated_cost = estimated_hours * HARDWARE_COSTS[args.hardware]

    # Recommend a timeout with buffer
    recommended_timeout_hours = estimated_hours * 1.3  # 30% buffer

    print(f"⏱️ Estimated training time: {estimated_hours:.1f} hours")
    print(f"💰 Estimated cost: ${estimated_cost:.2f}")
    print(f"⏰ Recommended timeout: {recommended_timeout_hours:.1f}h (with 30% buffer)")
    print()

    # Warnings and recommendations
    if estimated_hours > 4:
        print("⚠️ Long training time - consider:")
        print(" - Using faster hardware")
        print(" - Reducing epochs")
        print(" - Using a smaller dataset subset for testing")
    if model_params >= 7 and args.hardware not in ["a10g-largex2", "a10g-largex4", "a100-large"]:
        print("⚠️ Large model - consider using:")
        print(" - Larger GPU (a100-large)")
        print(" - Multi-GPU setup (a10g-largex2 or a10g-largex4)")
        print(" - LoRA/PEFT for memory efficiency")
    print()

    print("📋 Example job configuration:")
    print(f"""
hf_jobs("uv", {{
    "script": "your_training_script.py",
    "flavor": "{args.hardware}",
    "timeout": "{recommended_timeout_hours:.0f}h",
    "secrets": {{"HF_TOKEN": "$HF_TOKEN"}}
}})
""")


if __name__ == "__main__":
    main()
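
# Example run (numbers follow from the constants above; actual training time
# and cost will vary):
#
#   $ python estimate_cost.py --model Qwen/Qwen2.5-0.5B \
#         --dataset trl-lib/Capybara --hardware a10g-large
#   📊 Model: Qwen/Qwen2.5-0.5B (~0.5B parameters)
#   📦 Dataset: trl-lib/Capybara (~16000 examples)
#   🔄 Epochs: 3
#   💻 Hardware: a10g-large
#
#   ⏱️ Estimated training time: 2.4 hours
#   💰 Estimated cost: $12.00
#   ⏰ Recommended timeout: 3.1h (with 30% buffer)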