llava_lora / train_lora.py

Create train_lora.py

4d217d6 verified 8 months ago

8.06 kB

	import argparse
	import json
	from pathlib import Path
	from datetime import datetime
	import numpy as np
	import torch
	from PIL import Image, ImageDraw, ImageFont
	from torchvision.utils import make_grid
	from diffusers import StableDiffusionXLPipeline, AutoencoderKL

	try:
	    from pytorch_msssim import ssim, ms_ssim
	except ImportError:
	    # Best-effort bootstrap of the optional dependency. pip is invoked through
	    # the interpreter that is running this script so the package lands in the
	    # same environment; a bare "pip" on PATH may belong to another Python.
	    import importlib
	    import subprocess
	    import sys

	    print("Installing pytorch-msssim...")
	    subprocess.check_call([sys.executable, "-m", "pip", "install", "pytorch-msssim"])
	    # Make the import system notice the freshly written package directory.
	    importlib.invalidate_caches()
	    from pytorch_msssim import ssim, ms_ssim


	def add_caption_to_image(image, caption, font_size=20):
	    """Draw *caption* on a white strip under *image* and return a float tensor.

	    Args:
	        image: A PIL image, or a ``torch.Tensor``. A tensor is multiplied by
	            255 and converted with ``permute(1, 2, 0)``, so it is treated as
	            channel-first with values in [0, 1].
	        caption: Text to draw; horizontally centred when it fits, otherwise
	            started at the left edge.
	        font_size: Size requested for the DejaVu Sans font; also determines
	            the height of the caption strip.

	    Returns:
	        A channel-first float tensor with values in [0, 1]. It is
	        ``font_size + 2 * margin`` pixels taller than the input image.
	    """
	    # Convert tensor to PIL Image if needed
	    if isinstance(image, torch.Tensor):
	        scaled = image.detach() * 255
	        if scaled.is_floating_point():
	            # Round before the uint8 cast: the cast truncates, so a value
	            # that came back from a /255 round trip as e.g. 253.99998 would
	            # otherwise lose a full intensity level.
	            scaled = scaled.round()
	        scaled = scaled.clamp(0, 255).to(torch.uint8)
	        image = Image.fromarray(scaled.permute(1, 2, 0).cpu().numpy())

	    # Create new image with space for caption
	    margin = 10
	    width = image.width
	    height = image.height + font_size + 2 * margin
	    new_image = Image.new('RGB', (width, height), 'white')
	    new_image.paste(image, (0, 0))

	    # Add caption
	    draw = ImageDraw.Draw(new_image)
	    try:
	        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", font_size)
	    except (OSError, ImportError):
	        # Font file missing/unreadable or FreeType support unavailable: fall
	        # back to Pillow's built-in font instead of failing the whole run.
	        font = ImageFont.load_default()

	    # Center the text; never start left of the image, so that an over-long
	    # caption keeps its beginning visible instead of being cut on both sides.
	    text_width = draw.textlength(caption, font=font)
	    x = max((width - text_width) // 2, 0)
	    y = height - font_size - margin

	    draw.text((x, y), caption, fill='black', font=font)

	    # Convert back to tensor
	    return torch.from_numpy(np.array(new_image)).permute(2, 0, 1).float() / 255.0


	def create_image_grid(images, prompts, images_per_prompt, font_size=20):
	    """Caption every image and tile the captioned results into one grid tensor.

	    Image ``i`` is labelled with ``prompts[i // images_per_prompt]`` followed
	    by a 1-based ``(k/images_per_prompt)`` counter. The grid is built with
	    ``nrow=images_per_prompt`` and a padding of 10.
	    """
	    tiles = []
	    for position, picture in enumerate(images):
	        which_prompt, offset = divmod(position, images_per_prompt)
	        label = f"{prompts[which_prompt]} ({offset + 1}/{images_per_prompt})"
	        # Height-width-channel pixel data -> channel-first float tensor in [0, 1].
	        pixels = torch.from_numpy(np.array(picture)).permute(2, 0, 1).float() / 255.0
	        tiles.append(add_caption_to_image(pixels, label, font_size))

	    # Stack into a single batch and let torchvision lay out the grid.
	    return make_grid(torch.stack(tiles), nrow=images_per_prompt, padding=10)


	def _positive_int(value):
	    """argparse ``type`` callable that accepts only integers >= 1."""
	    number = int(value)
	    if number < 1:
	        raise argparse.ArgumentTypeError(f"expected a positive integer, got {value!r}")
	    return number


	def parse_args(argv=None):
	    """Parse the command-line options of the evaluation script.

	    Args:
	        argv: Optional list of argument strings. When ``None`` (the default)
	            argparse reads ``sys.argv[1:]``, exactly as before; passing a
	            list allows programmatic use.

	    Returns:
	        The populated ``argparse.Namespace``.

	    Raises:
	        SystemExit: Raised by argparse on missing or invalid arguments,
	            including a ``--num_images_per_prompt`` value below 1.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument(
	        "--output_path", type=str, required=True, help="path to save the images"
	    )
	    parser.add_argument(
	        "--content_LoRA", type=str, default=None, help="path for the content LoRA"
	    )
	    parser.add_argument(
	        "--content_alpha", type=float, default=1.0, help="scale factor for content LoRA weights"
	    )
	    parser.add_argument(
	        "--style_LoRA", type=str, default=None, help="path for the style LoRA"
	    )
	    parser.add_argument(
	        "--style_alpha", type=float, default=1.0, help="scale factor for style LoRA weights"
	    )
	    # Validated up front: the value is later used as a divisor when images
	    # are assigned to prompts, so zero must be rejected before any work starts.
	    parser.add_argument(
	        "--num_images_per_prompt", type=_positive_int, default=4, help="number of images per prompt"
	    )
	    parser.add_argument(
	        "--evaluation_prompt_file", type=str, required=True, help="path to evaluation prompts file"
	    )
	    parser.add_argument(
	        "--placeholder_style", type=str, required=True, help="placeholder for the style prompt"
	    )
	    parser.add_argument(
	        "--placeholder_content", type=str, required=True, help="placeholder for the content prompt"
	    )
	    parser.add_argument(
	        "--name_concept", type=str, required=True, help="name of the concept being evaluated"
	    )
	    parser.add_argument(
	        "--font_size", type=int, default=20, help="font size for image captions"
	    )
	    return parser.parse_args(argv)


	def process_prompts(pipeline, prompts, output_dir, args, prompt_type, lora_type, start_idx=0):
	    """Generate, save and collect images for each prompt template.

	    For every prompt, ``"{}"`` is replaced by ``args.placeholder_style`` when
	    ``lora_type == "style"`` and by ``args.placeholder_content`` otherwise.
	    A ``prompt_<idx>_params.json`` file describing the run is written to
	    ``output_dir`` and the generated images are saved as ``NNN.jpg`` under
	    ``output_dir/output/ours/prompt_<idx>_<prompt_type>``.

	    The alpha recorded in the params file is also handed to the pipeline
	    call as ``cross_attention_kwargs={"scale": alpha}``, so the scale that
	    is logged is the one applied during generation.

	    Args:
	        pipeline: Callable invoked as ``pipeline(prompt, **kwargs)``; its
	            result must expose an ``images`` sequence whose items have
	            ``save(path)``.
	        prompts: List of prompt templates.
	        output_dir: ``pathlib.Path``-like directory (combined with ``/``).
	        args: Namespace providing the placeholder, LoRA path, alpha and
	            ``num_images_per_prompt`` attributes.
	        prompt_type: Suffix used in the per-prompt image directory name.
	        lora_type: ``"content"`` or ``"style"``.
	        start_idx: Index assigned to the first prompt.

	    Returns:
	        ``(all_images, formatted_prompts, next_idx)`` where ``next_idx`` is
	        ``start_idx`` plus the number of prompts processed.
	    """
	    is_content = lora_type == "content"
	    is_style = lora_type == "style"

	    # Resolve the placeholder once and format every prompt a single time.
	    placeholder = args.placeholder_style if is_style else args.placeholder_content
	    formatted_prompts = [prompt.replace("{}", placeholder) for prompt in prompts]

	    if is_style:
	        lora_scale = args.style_alpha
	    elif is_content:
	        lora_scale = args.content_alpha
	    else:
	        lora_scale = None

	    call_kwargs = {"num_images_per_prompt": args.num_images_per_prompt}
	    if lora_scale is not None:
	        call_kwargs["cross_attention_kwargs"] = {"scale": lora_scale}

	    all_images = []
	    current_idx = start_idx

	    for formatted_prompt in formatted_prompts:
	        config = {
	            "gen_prompt": formatted_prompt,
	            "content_LoRA": args.content_LoRA if is_content else None,
	            "content_alpha": args.content_alpha if is_content else None,
	            "style_LoRA": args.style_LoRA if is_style else None,
	            "style_alpha": args.style_alpha if is_style else None,
	        }

	        # Save config with consecutive numbering
	        config_path = output_dir / f'prompt_{current_idx}_params.json'
	        with open(config_path, 'w', encoding='utf-8') as f:
	            json.dump(config, f, indent=4)

	        # Generate images
	        images = pipeline(formatted_prompt, **call_kwargs).images
	        all_images.extend(images)

	        # Save individual images with consecutive numbering
	        prompt_dir = output_dir / 'output' / 'ours' / f'prompt_{current_idx}_{prompt_type}'
	        prompt_dir.mkdir(parents=True, exist_ok=True)
	        for img_idx, img in enumerate(images):
	            img.save(prompt_dir / f'{img_idx:03d}.jpg')

	        current_idx += 1

	    return all_images, formatted_prompts, current_idx


	def _save_grid(images, prompts, args, grid_path):
	    """Write a captioned grid of *images* to *grid_path*; skip when there are none."""
	    if not images:
	        # An empty prompt list yields no images, and stacking zero tensors fails.
	        print(f"No images generated for {grid_path.name}; skipping grid.")
	        return
	    grid = create_image_grid(images, prompts, args.num_images_per_prompt, args.font_size)
	    # Round before the uint8 cast: astype() truncates, so a float that is a
	    # hair below an integer level would otherwise drop to the level beneath it.
	    pixels = (grid.permute(1, 2, 0).numpy() * 255).round().clip(0, 255).astype(np.uint8)
	    Image.fromarray(pixels).save(grid_path)


	if __name__ == '__main__':
	    # Script entry point: generate benchmark images for the content and/or
	    # style LoRA, store them per prompt and write one overview grid per group.
	    args = parse_args()

	    # Create timestamped output directory
	    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
	    result_dir = Path(args.output_path) / f'{args.name_concept}_{timestamp}'
	    result_dir.mkdir(parents=True, exist_ok=True)

	    # Load benchmark prompts. The file is indexed below as
	    # {"content": {category: [prompt, ...]}, "style": [prompt, ...]}.
	    # JSON is read as UTF-8 explicitly so the platform default encoding
	    # cannot corrupt non-ASCII prompts.
	    with open(args.evaluation_prompt_file, 'r', encoding='utf-8') as f:
	        benchmark_prompts = json.load(f)

	    # Initialize pipeline
	    vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
	    pipeline = StableDiffusionXLPipeline.from_pretrained(
	        "stabilityai/stable-diffusion-xl-base-1.0",
	        vae=vae,
	        torch_dtype=torch.float16
	    ).to("cuda")

	    # Prompt numbering runs consecutively across all categories and both LoRAs.
	    current_prompt_idx = 0

	    # Process content prompts if content LoRA is provided
	    if args.content_LoRA is not None:
	        print("Loading content LoRA...")
	        # NOTE(review): confirm that the installed diffusers version honours a
	        # `scale` keyword here; the documented ways to weight a LoRA are
	        # cross_attention_kwargs, set_adapters and fuse_lora.
	        pipeline.load_lora_weights(args.content_LoRA, scale=args.content_alpha)

	        for category, prompts in benchmark_prompts["content"].items():
	            print(f"Processing content {category} prompts...")
	            images, formatted_prompts, current_prompt_idx = process_prompts(
	                pipeline, prompts, result_dir, args, f"content_{category}", "content",
	                start_idx=current_prompt_idx
	            )
	            _save_grid(images, formatted_prompts, args, result_dir / f'grid_content_{category}.png')

	        # Unload content LoRA so it does not influence the style run
	        pipeline.unload_lora_weights()

	    # Process style prompts if style LoRA is provided
	    if args.style_LoRA is not None:
	        print("Loading style LoRA...")
	        # NOTE(review): same question about the `scale` keyword as above.
	        pipeline.load_lora_weights(args.style_LoRA, scale=args.style_alpha)

	        print("Processing style prompts...")
	        images, formatted_prompts, _ = process_prompts(
	            pipeline, benchmark_prompts["style"], result_dir, args, "style", "style",
	            start_idx=current_prompt_idx
	        )
	        _save_grid(images, formatted_prompts, args, result_dir / 'grid_style.png')

	    print(f"Results saved to {result_dir}")