| | """ |
| | RunPod Serverless Handler - Wrapper for AI-Toolkit |
| | Does NOT modify ai-toolkit code, only wraps it |
| | |
| | Supports RunPod model caching via HuggingFace integration. |
| | """ |

import os
import sys
import subprocess
import traceback
import logging
import uuid
from pathlib import Path

# ---------------------------------------------------------------------------
# Environment setup (must run before importing torch / HuggingFace libraries)
# ---------------------------------------------------------------------------

# Cache locations on the RunPod network volume
RUNPOD_CACHE_BASE = "/runpod-volume/huggingface-cache"
RUNPOD_HF_CACHE = "/runpod-volume/huggingface-cache/hub"

# A mounted network volume means the persistent cache is available
IS_RUNPOD_CACHE = os.path.exists("/runpod-volume")

if IS_RUNPOD_CACHE:
    # Redirect all HuggingFace caches to the network volume
    os.environ["HF_HOME"] = RUNPOD_CACHE_BASE
    os.environ["HUGGINGFACE_HUB_CACHE"] = RUNPOD_HF_CACHE
    os.environ["TRANSFORMERS_CACHE"] = RUNPOD_HF_CACHE
    os.environ["HF_DATASETS_CACHE"] = f"{RUNPOD_CACHE_BASE}/datasets"

# Faster downloads and less noise from third-party libraries
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["NO_ALBUMENTATIONS_UPDATE"] = "1"
os.environ["DISABLE_TELEMETRY"] = "YES"

# Forward an optional HuggingFace token (needed for gated repos)
HF_TOKEN = os.environ.get("HF_TOKEN", "")
if HF_TOKEN:
    os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN

# ai-toolkit is expected to be checked out next to this script
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
AI_TOOLKIT_DIR = os.path.join(SCRIPT_DIR, "ai-toolkit")

# Heavy imports come after the cache environment variables are in place
import runpod
import torch
import yaml
import gc
import shutil

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Model key of the last training run (drives the cleanup strategy)
CURRENT_MODEL = None

# ---------------------------------------------------------------------------
# Model presets
# ---------------------------------------------------------------------------

# Model key -> example config shipped with ai-toolkit (config/examples/)
MODEL_PRESETS = {
    "wan21_1b": "train_lora_wan21_1b_24gb.yaml",
    "wan21_14b": "train_lora_wan21_14b_24gb.yaml",
    "wan22_14b": "train_lora_wan22_14b_24gb.yaml",
    "qwen_image": "train_lora_qwen_image_24gb.yaml",
    "qwen_image_edit": "train_lora_qwen_image_edit_32gb.yaml",
    "qwen_image_edit_2509": "train_lora_qwen_image_edit_2509_32gb.yaml",
    "flux_dev": "train_lora_flux_24gb.yaml",
    "flux_schnell": "train_lora_flux_schnell_24gb.yaml",
}
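# Illustrative example: a request with model="flux_dev" resolves to
# <AI_TOOLKIT_DIR>/config/examples/train_lora_flux_24gb.yaml via load_example_config() below.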

# HuggingFace repos each preset downloads from (used for cache status checks)
MODEL_HF_REPOS = {
    "wan21_1b": ["Wan-AI/Wan2.1-T2V-1.3B-Diffusers"],
    "wan21_14b": ["Wan-AI/Wan2.1-T2V-14B-Diffusers"],
    "wan22_14b": ["ai-toolkit/Wan2.2-T2V-A14B-Diffusers-bf16"],
    "qwen_image": ["Qwen/Qwen-Image"],
    "qwen_image_edit": ["Qwen/Qwen-Image-Edit"],
    "qwen_image_edit_2509": ["Qwen/Qwen-Image-Edit"],
    "flux_dev": ["black-forest-labs/FLUX.1-dev"],
    "flux_schnell": ["black-forest-labs/FLUX.1-schnell"],
}

# Accuracy recovery adapter files published for some quantized presets
ARA_FILES = {
    "wan22_14b": "ostris/accuracy_recovery_adapters/wan22_14b_t2i_torchao_uint4.safetensors",
    "qwen_image": "ostris/accuracy_recovery_adapters/qwen_image_torchao_uint3.safetensors",
}

# ---------------------------------------------------------------------------
# Cleanup helpers
# ---------------------------------------------------------------------------

def cleanup_gpu_memory():
    """Aggressively clean up GPU memory."""
    logger.info("Cleaning up GPU memory...")

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

    # Drop Python-side references, then empty the CUDA cache again
    gc.collect()

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    logger.info(f"GPU memory after cleanup: {get_gpu_info()}")

def cleanup_temp_files():
    """Clean up temporary training files."""
    logger.info("Cleaning up temporary files...")

    # Remove generated job configs left in ai-toolkit/config
    config_dir = os.path.join(AI_TOOLKIT_DIR, "config")
    if os.path.isdir(config_dir):
        for f in os.listdir(config_dir):
            if f.endswith('.yaml') and f.startswith(('lora_', 'test_', 'my_')):
                try:
                    os.remove(os.path.join(config_dir, f))
                    logger.info(f"Removed temp config: {f}")
                except Exception as e:
                    logger.warning(f"Failed to remove {f}: {e}")

    # Remove latent / text-encoder caches left in the workspace
    workspace_dirs = ["/workspace/dataset", "/workspace/output"]
    for ws_dir in workspace_dirs:
        if os.path.exists(ws_dir):
            for item in os.listdir(ws_dir):
                item_path = os.path.join(ws_dir, item)
                if item.startswith(('_latent_cache', '_t_e_cache', '.aitk')):
                    try:
                        if os.path.isdir(item_path):
                            shutil.rmtree(item_path)
                        else:
                            os.remove(item_path)
                        logger.info(f"Removed cache: {item_path}")
                    except Exception as e:
                        logger.warning(f"Failed to remove {item_path}: {e}")

def cleanup_before_training(new_model: str):
    """Full cleanup before starting new model training."""
    global CURRENT_MODEL

    if CURRENT_MODEL and CURRENT_MODEL != new_model:
        logger.info(f"Switching from {CURRENT_MODEL} to {new_model} - performing full cleanup")
        cleanup_gpu_memory()
        cleanup_temp_files()
    elif CURRENT_MODEL == new_model:
        logger.info(f"Same model {new_model} - light cleanup only")
        cleanup_gpu_memory()
    else:
        logger.info(f"First training run with {new_model}")

    CURRENT_MODEL = new_model

    # .get() avoids a KeyError when no GPU is visible
    gpu_info = get_gpu_info()
    logger.info(
        f"Ready for training. GPU: {gpu_info.get('name', 'unavailable')}, "
        f"Free: {gpu_info.get('free_gb', 'n/a')}GB"
    )

# ---------------------------------------------------------------------------
# Info / cache helpers
# ---------------------------------------------------------------------------

def get_gpu_info():
    """Get GPU information."""
    if not torch.cuda.is_available():
        return {"available": False}
    props = torch.cuda.get_device_properties(0)
    free_mem, total_mem = torch.cuda.mem_get_info(0)
    return {
        "available": True,
        "name": props.name,
        "total_gb": round(total_mem / (1024**3), 2),
        "free_gb": round(free_mem / (1024**3), 2),
    }

def get_environment_info():
    """Get environment information for debugging."""
    return {
        "is_runpod_cache": IS_RUNPOD_CACHE,
        "hf_home": os.environ.get("HF_HOME", "not set"),
        "hf_token_set": bool(HF_TOKEN),
        "gpu": get_gpu_info(),
        "ai_toolkit_dir": AI_TOOLKIT_DIR,
        "cache_exists": os.path.exists(RUNPOD_HF_CACHE) if IS_RUNPOD_CACHE else False,
    }

def find_cached_model(hf_repo: str) -> str:
    """
    Find cached model path on RunPod.

    Args:
        hf_repo: HuggingFace repo ID (e.g., 'black-forest-labs/FLUX.1-dev')

    Returns:
        Path to cached model, or original repo ID if not cached
    """
    if not IS_RUNPOD_CACHE:
        return hf_repo

    # HF hub cache layout: models--<org>--<name>/snapshots/<revision>
    cache_name = hf_repo.replace("/", "--")
    snapshots_dir = Path(RUNPOD_HF_CACHE) / f"models--{cache_name}" / "snapshots"

    if snapshots_dir.exists():
        snapshots = list(snapshots_dir.iterdir())
        if snapshots:
            cached_path = str(snapshots[0])
            logger.info(f"Using cached model: {hf_repo} -> {cached_path}")
            return cached_path

    logger.info(f"Model not cached, will download: {hf_repo}")
    return hf_repo
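
# Illustrative on-volume layout matching the lookup above:
#   /runpod-volume/huggingface-cache/hub/
#       models--black-forest-labs--FLUX.1-dev/
#           snapshots/
#               <revision-hash>/   <- this snapshot directory is returned as the model path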

def check_model_cache_status(model_key: str) -> dict:
    """Check if model files are cached."""
    if model_key not in MODEL_HF_REPOS:
        return {"cached": False, "reason": "unknown model"}

    repos = MODEL_HF_REPOS[model_key]
    status = {"repos": {}}

    for repo in repos:
        cache_name = repo.replace("/", "--")
        snapshots_dir = Path(RUNPOD_HF_CACHE) / f"models--{cache_name}" / "snapshots"

        if snapshots_dir.exists() and list(snapshots_dir.iterdir()):
            status["repos"][repo] = "cached"
        else:
            status["repos"][repo] = "not cached"

    status["all_cached"] = all(s == "cached" for s in status["repos"].values())
    return status
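
# Example return value (illustrative):
#   {"repos": {"Qwen/Qwen-Image": "cached"}, "all_cached": True}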

# ---------------------------------------------------------------------------
# Config building and training
# ---------------------------------------------------------------------------

def load_example_config(model_key):
    """Load example config from ai-toolkit."""
    if model_key not in MODEL_PRESETS:
        raise ValueError(f"Unknown model: {model_key}. Available: {list(MODEL_PRESETS.keys())}")

    config_file = MODEL_PRESETS[model_key]
    config_path = os.path.join(AI_TOOLKIT_DIR, "config", "examples", config_file)

    with open(config_path, 'r') as f:
        return yaml.safe_load(f)
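
# The example configs share the shape that run_training() edits below (illustrative excerpt):
#   config:
#     name: ...
#     process:
#       - model: {name_or_path: ...}
#         network: {linear: ..., linear_alpha: ...}
#         train: {steps: ..., batch_size: ..., lr: ...}
#         save: {save_every: ...}
#         sample: {sample_every: ..., prompts: [...]}
#         datasets: [{folder_path: ..., resolution: ..., num_frames: ...}]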

def run_training(params):
    """Run training using ai-toolkit."""
    model_key = params.get("model", "wan22_14b")

    # Free memory / temp files left over from a previous run
    cleanup_before_training(model_key)

    # Start from the example config that ships with ai-toolkit
    config = load_example_config(model_key)

    # The job name doubles as the config file name and output folder name
    job_name = params.get("name", f"lora_{model_key}_{uuid.uuid4().hex[:6]}")
    config["config"]["name"] = job_name

    process = config["config"]["process"][0]

    # Dataset and output locations
    process["datasets"][0]["folder_path"] = params.get("dataset_path", "/workspace/dataset")
    process["training_folder"] = params.get("output_path", "/workspace/output")

    # Apply user overrides on top of the example config
    if "steps" in params:
        process["train"]["steps"] = params["steps"]
    if "batch_size" in params:
        process["train"]["batch_size"] = params["batch_size"]
    if "learning_rate" in params:
        process["train"]["lr"] = params["learning_rate"]
    if "lora_rank" in params:
        process["network"]["linear"] = params["lora_rank"]
        process["network"]["linear_alpha"] = params.get("lora_alpha", params["lora_rank"])
    if "save_every" in params:
        process["save"]["save_every"] = params["save_every"]
    if "sample_every" in params:
        process["sample"]["sample_every"] = params["sample_every"]
    if "resolution" in params:
        process["datasets"][0]["resolution"] = params["resolution"]
    if "num_frames" in params:
        process["datasets"][0]["num_frames"] = params["num_frames"]
    if "sample_prompts" in params:
        process["sample"]["prompts"] = params["sample_prompts"]
    if "trigger_word" in params:
        process["trigger_word"] = params["trigger_word"]

    # Swap in the local cache path if the base model is already on the volume
    if IS_RUNPOD_CACHE and "model" in process:
        original_path = process["model"].get("name_or_path", "")
        if original_path:
            cached_path = find_cached_model(original_path)
            if cached_path != original_path:
                process["model"]["name_or_path"] = cached_path
                logger.info(f"Using cached model path: {cached_path}")

    # Write the merged config where ai-toolkit expects it
    config_dir = os.path.join(AI_TOOLKIT_DIR, "config")
    config_path = os.path.join(config_dir, f"{job_name}.yaml")

    with open(config_path, 'w') as f:
        yaml.dump(config, f, default_flow_style=False)

    logger.info(f"Config saved: {config_path}")
    logger.info(f"Starting: {job_name}")

    # Run ai-toolkit in a subprocess so its code is never modified
    cmd = [sys.executable, os.path.join(AI_TOOLKIT_DIR, "run.py"), config_path]
    logger.info(f"Command: {' '.join(cmd)}")

    proc = subprocess.Popen(
        cmd,
        cwd=AI_TOOLKIT_DIR,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
    )

    # Stream ai-toolkit output into the worker log
    for line in proc.stdout:
        logger.info(line.rstrip())

    proc.wait()

    # Release GPU memory whether training succeeded or not
    cleanup_gpu_memory()

    if proc.returncode != 0:
        raise RuntimeError(f"Training failed with code {proc.returncode}")

    return {
        "status": "success",
        "job_name": job_name,
        "output_path": process["training_folder"],
        "model": model_key,
    }
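
# Example "params" accepted by run_training() (illustrative values only):
#   {
#       "model": "wan22_14b",
#       "name": "lora_wan22_demo",
#       "dataset_path": "/workspace/dataset",
#       "steps": 2000,
#       "batch_size": 1,
#       "learning_rate": 1e-4,
#       "lora_rank": 32,
#       "trigger_word": "mytoken",
#       "sample_prompts": ["a photo of mytoken"],
#   }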

# ---------------------------------------------------------------------------
# RunPod handler
# ---------------------------------------------------------------------------

def handler(job):
    """RunPod handler."""
    job_input = job.get("input", {})
    action = job_input.get("action", "train")

    logger.info(f"Action: {action}, GPU: {get_gpu_info()}")

    try:
        if action == "list_models":
            return {"status": "success", "models": list(MODEL_PRESETS.keys())}

        elif action == "status":
            return {
                "status": "success",
                "environment": get_environment_info(),
            }

        elif action == "check_cache":
            model_key = job_input.get("model")
            if model_key:
                cache_status = check_model_cache_status(model_key)
            else:
                cache_status = {m: check_model_cache_status(m) for m in MODEL_PRESETS.keys()}
            return {"status": "success", "cache": cache_status}

        elif action == "cleanup":
            # Manual reset: free GPU memory, drop temp files, forget the last model
            cleanup_gpu_memory()
            cleanup_temp_files()
            global CURRENT_MODEL
            CURRENT_MODEL = None
            return {
                "status": "success",
                "message": "Cleanup complete",
                "gpu": get_gpu_info(),
            }

        elif action == "train":
            params = job_input.get("params", {})
            params["model"] = job_input.get("model", params.get("model", "wan22_14b"))
            return run_training(params)

        else:
            return {"status": "error", "error": f"Unknown action: {action}"}

    except Exception as e:
        logger.error(traceback.format_exc())
        return {"status": "error", "error": str(e)}

if __name__ == "__main__":
    logger.info("Starting AI-Toolkit RunPod Handler")
    logger.info(f"Environment: {get_environment_info()}")
    runpod.serverless.start({"handler": handler})