flux-blaze-2 / src /pipeline.py

Upload folder using huggingface_hub

e94238e verified about 1 year ago

6.05 kB

	import os
	from diffusers import FluxPipeline, AutoencoderKL, FluxTransformer2DModel
	from diffusers.image_processor import VaeImageProcessor
	from transformers import T5EncoderModel, T5TokenizerFast, CLIPTokenizer, CLIPTextModel, CLIPTextConfig, T5Config
	import torch
	import gc
	from PIL.Image import Image
	from pipelines.models import TextToImageRequest
	from torch import Generator
	from torchao.quantization import quantize_, int8_weight_only
	from time import perf_counter


	# Home directory of the current user; a missing HOME variable raises KeyError at import time.
	HOME = os.environ["HOME"]
	# Names of the pipeline components that quantize() processes; components not listed are skipped.
	QUANTIZED_MODEL = ["text_encoder_2", "vae"]
	# CUDA caching-allocator options. This assignment runs before the first torch.cuda call below.
	# NOTE(review): presumably it has to precede CUDA initialisation to take effect - confirm.
	os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:False,garbage_collection_threshold:0.01"
	# Hub repository id passed to FluxPipeline.from_pretrained() in load_pipeline().
	FLUX_CHECKPOINT = "jokerbit/flux.1-schnell-Robert-int8wo"
	# Pinned snapshot directory inside the local Hugging Face cache; load_pipeline() reads the
	# "transformer" sub-folder from here, so the snapshot must already be present on disk.
	FLUX_CACHE = os.path.join(HOME, ".cache/huggingface/hub/models--jokerbit--flux.1-schnell-Robert-int8wo/snapshots/5ef0012f11a863e5111ec56540302a023bc8587b")
	# Let cuDNN benchmark and pick convolution algorithms.
	torch.backends.cudnn.benchmark = True
	# Allow TF32 for CUDA matrix multiplications.
	torch.backends.cuda.matmul.allow_tf32 = True
	# Cap this process at 99% of the device memory. This touches torch.cuda at import time.
	torch.cuda.set_per_process_memory_fraction(0.99)

	# torchao int8 weight-only configuration handed to quantize().
	QUANT_CONFIG = int8_weight_only()
	# dtype used when loading the transformer and the pipeline.
	DTYPE = torch.bfloat16
	# Number of denoising steps used by infer().
	NUM_STEPS = 4
	# Prompt used for the warm-up run in load_pipeline() and for the __main__ benchmark.
	PROMPT = 'martyr, semiconformity, peregrination, quip, twineless, emotionless, tawa, depickle'


	def empty_cache():
	    """Free unreferenced objects and cached CUDA memory, then reset the peak-memory counters.

	    The Python garbage collector runs first so that tensors that are only
	    reachable through garbage are released before the CUDA caching
	    allocator is asked to give its unused blocks back.

	    ``torch.cuda.reset_max_memory_allocated()`` is intentionally not called:
	    it is a deprecated alias that simply forwards to
	    ``torch.cuda.reset_peak_memory_stats()``, which is invoked here directly.
	    """
	    gc.collect()
	    torch.cuda.empty_cache()
	    torch.cuda.reset_peak_memory_stats()


	def quantize(pipe, config):
	    """Quantize the pipeline components whose names appear in QUANTIZED_MODEL.

	    Components are visited in a fixed order: text_encoder, text_encoder_2,
	    transformer, vae. The transformer is the only component for which
	    ``quantize_`` is called with ``device="cuda"``. The return value of
	    ``quantize_`` is not used.

	    Args:
	        pipe: Object exposing the component attributes named above.
	        config: Quantization configuration forwarded to ``quantize_``.

	    Returns:
	        The same ``pipe`` object that was passed in.
	    """
	    # (component attribute, extra keyword arguments for quantize_)
	    plan = (
	        ("text_encoder", {}),
	        ("text_encoder_2", {}),
	        ("transformer", {"device": "cuda"}),
	        ("vae", {}),
	    )
	    for component_name, extra_kwargs in plan:
	        if component_name not in QUANTIZED_MODEL:
	            continue
	        quantize_(getattr(pipe, component_name), config, **extra_kwargs)
	    return pipe


	def load_pipeline() -> FluxPipeline:
	    """Build the FLUX pipeline, quantize it, move it to CUDA and warm it up.

	    The transformer is loaded separately from the "transformer" folder of
	    the local snapshot (FLUX_CACHE) with ``use_safetensors=False`` and then
	    injected into the pipeline loaded from FLUX_CHECKPOINT. VAE tiling and
	    slicing are enabled, the components selected by QUANTIZED_MODEL are
	    quantized, and a single 1024x1024 inference with PROMPT is executed
	    before the pipeline is handed back.

	    Returns:
	        The ready-to-use pipeline, resident on the CUDA device.
	    """
	    empty_cache()

	    transformer_dir = os.path.join(FLUX_CACHE, "transformer")
	    flux_transformer = FluxTransformer2DModel.from_pretrained(
	        transformer_dir,
	        use_safetensors=False,
	        torch_dtype=DTYPE,
	    )
	    pipeline = FluxPipeline.from_pretrained(
	        FLUX_CHECKPOINT,
	        transformer=flux_transformer,
	        torch_dtype=DTYPE,
	    )

	    autoencoder = pipeline.vae
	    autoencoder.enable_tiling()
	    autoencoder.enable_slicing()

	    # Quantization happens before the move to the GPU, as in the original flow.
	    quantize(pipeline, QUANT_CONFIG)
	    pipeline.to("cuda")

	    # Warm-up run: the generated image is discarded.
	    warmup_request = TextToImageRequest(prompt=PROMPT, height=1024, width=1024, seed=666)
	    infer(warmup_request, pipeline)
	    return pipeline


	def encode_prompt(_pipeline, prompt: str):
	    """Encode ``prompt`` using a pipeline view that has no transformer and no VAE.

	    A derived pipeline is created with ``FluxPipeline.from_pipe`` (transformer
	    and VAE set to ``None``), moved to CUDA, used once under ``no_grad`` and
	    then dropped, after which ``empty_cache()`` is called.

	    Args:
	        _pipeline: Source pipeline whose text components are reused.
	        prompt: Text to encode; ``prompt_2`` is passed as ``None``.

	    Returns:
	        Whatever ``FluxPipeline.encode_prompt`` returns for a maximum
	        sequence length of 256.
	    """
	    text_only = FluxPipeline.from_pipe(_pipeline, transformer=None, vae=None)
	    text_only = text_only.to("cuda")
	    with torch.no_grad():
	        encoded = text_only.encode_prompt(
	            prompt=prompt,
	            prompt_2=None,
	            max_sequence_length=256,
	        )
	    # Drop the temporary pipeline object before clearing the caches.
	    del text_only
	    empty_cache()
	    return encoded


	def infer_latents(_pipeline, prompt_embeds, pooled_prompt_embeds, width: int | None, height: int | None, seed: int | None):
	    """Run the denoising loop on pre-computed prompt embeddings and return latents.

	    A derived pipeline without text encoders, tokenizers and VAE is created
	    with ``FluxPipeline.from_pipe``, moved to CUDA, called once with
	    ``output_type="latent"`` and then dropped, after which ``empty_cache()``
	    is called.

	    Args:
	        _pipeline: Source pipeline whose transformer is reused.
	        prompt_embeds: Prompt embeddings forwarded to the pipeline call.
	        pooled_prompt_embeds: Pooled prompt embeddings forwarded to the pipeline call.
	        width: Requested width, or ``None`` to let the pipeline decide.
	        height: Requested height, or ``None`` to let the pipeline decide.
	        seed: Seed for the random generator; ``None`` means no generator is passed.

	    Returns:
	        The ``images`` attribute of the pipeline output (latents, because
	        ``output_type="latent"`` is requested).
	    """
	    pipeline = FluxPipeline.from_pipe(
	        _pipeline,
	        text_encoder=None,
	        text_encoder_2=None,
	        tokenizer=None,
	        tokenizer_2=None,
	        vae=None,
	    ).to("cuda")

	    if seed is None:
	        generator = None
	    else:
	        # The generator lives on the same device as the derived pipeline.
	        generator = Generator(pipeline.device).manual_seed(seed)

	    outputs = pipeline(
	        prompt_embeds=prompt_embeds,
	        pooled_prompt_embeds=pooled_prompt_embeds,
	        # Use the shared module constant so this path stays in sync with infer().
	        num_inference_steps=NUM_STEPS,
	        guidance_scale=0.0,
	        width=width,
	        height=height,
	        generator=generator,
	        output_type="latent",
	    ).images
	    # Drop the temporary pipeline object before clearing the caches.
	    del pipeline
	    empty_cache()
	    return outputs


	def decode_latents(vae, latents, width, height):
	    """Decode packed FLUX latents into a PIL image with the given VAE.

	    Args:
	        vae: Autoencoder; it is moved to CUDA by this function.
	        latents: Packed latents, unpacked via ``FluxPipeline._unpack_latents``.
	        width: Target width; a falsy value selects ``64 * scale``.
	        height: Target height; a falsy value selects ``64 * scale``.

	    Returns:
	        The first image produced by ``VaeImageProcessor.postprocess`` with
	        ``output_type="pil"``.
	    """
	    vae.to("cuda")
	    # NOTE(review): the scale is 2 ** len(block_out_channels); whether this matches
	    # FluxPipeline._unpack_latents depends on the installed diffusers version - confirm.
	    scale = 2 ** len(vae.config.block_out_channels)
	    fallback_side = 64 * scale
	    out_width = width if width else fallback_side
	    out_height = height if height else fallback_side
	    postprocessor = VaeImageProcessor(vae_scale_factor=scale)
	    with torch.no_grad():
	        z = FluxPipeline._unpack_latents(latents, out_height, out_width, scale)
	        # Undo the latent normalisation: divide by the scaling factor, then add the shift.
	        z = (z / vae.config.scaling_factor) + vae.config.shift_factor
	        decoded = vae.decode(z, return_dict=False)[0]
	    pil_images = postprocessor.postprocess(decoded, output_type="pil")
	    return pil_images[0]


	# def infer(request: TextToImageRequest, _pipeline: FluxPipeline) -> Image:
	# prompt_embeds, pooled_prompt_embeds, text_ids = encode_prompt(_pipeline, request.prompt)
	# latents = infer_latents(_pipeline, prompt_embeds, pooled_prompt_embeds, request.width, request.height, request.seed)
	# del prompt_embeds
	# del pooled_prompt_embeds
	# del text_ids
	# # _pipeline.transformer.to("cpu")
	# image = decode_latents(_pipeline.vae, latents, request.width, request.height)
	# torch.cuda.reset_peak_memory_stats()
	# return image


	def infer(request: TextToImageRequest, _pipeline: FluxPipeline) -> Image:
	    """Generate one image for ``request`` with the full pipeline.

	    Args:
	        request: Carries ``prompt``, ``width``, ``height`` and ``seed``.
	            A ``seed`` of ``None`` means no generator is passed.
	        _pipeline: Loaded pipeline that is called directly.

	    Returns:
	        The first image of the pipeline output, requested as PIL.
	    """
	    seed = request.seed
	    generator = None if seed is None else Generator(device="cuda").manual_seed(seed)
	    # Peak-memory statistics are reset before every generation.
	    torch.cuda.reset_peak_memory_stats()
	    result = _pipeline(
	        prompt=request.prompt,
	        width=request.width,
	        height=request.height,
	        guidance_scale=0.0,
	        generator=generator,
	        output_type="pil",
	        max_sequence_length=256,
	        num_inference_steps=NUM_STEPS,
	    )
	    return result.images[0]

	if __name__ == "__main__":
	    # Simple benchmark: time the pipeline load, then four consecutive requests.
	    request = TextToImageRequest(prompt=PROMPT, height=None, width=None, seed=666)

	    load_started = perf_counter()
	    pipe_ = load_pipeline()
	    load_elapsed = perf_counter() - load_started
	    print(f"Pipeline is loaded in {load_elapsed}s")

	    for _ in range(4):
	        request_started = perf_counter()
	        infer(request, pipe_)
	        request_elapsed = perf_counter() - request_started
	        print(f"Request in {request_elapsed}s")

	# pipe("cat holding a womboai sign", num_inference_steps=4, guidance_scale=0, generator=torch.Generator(pipe.device).manual_seed(666)).images[0].save("sample.png")