import os

# The CUDA caching-allocator config is read when torch initializes CUDA,
# so set it before importing torch (or anything that imports torch).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:False,garbage_collection_threshold:0.01"

import gc

import torch
from diffusers import FluxPipeline
from PIL.Image import Image
from torch import Generator

from pipelines.models import TextToImageRequest

FLUX_CHECKPOINT = "jokerbit/flux.1-schnell-city96"
DTYPE = torch.bfloat16

# Enable cuDNN autotuning and TF32 matmuls; allow this process to use
# up to 98% of the GPU's memory.
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.cuda.set_per_process_memory_fraction(0.98)
def clear() -> None:
    """Free Python garbage, release cached CUDA memory, and reset peak stats."""
    gc.collect()
    torch.cuda.empty_cache()
    # reset_max_memory_allocated() is a deprecated alias of this call.
    torch.cuda.reset_peak_memory_stats()
def load_pipeline() -> FluxPipeline:
    clear()
    pipe = FluxPipeline.from_pretrained(FLUX_CHECKPOINT, torch_dtype=DTYPE)

    # channels_last memory format tends to improve GPU throughput.
    pipe.text_encoder.to(memory_format=torch.channels_last)
    pipe.text_encoder_2.to(memory_format=torch.channels_last)
    pipe.transformer.to(memory_format=torch.channels_last)
    pipe.vae.to(memory_format=torch.channels_last)

    # Slice and tile the VAE decode to cap its peak memory use.
    pipe.vae.enable_slicing()
    pipe.vae.enable_tiling()
    # pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead")

    # Keep the VAE resident on the GPU; offload the other modules to CPU
    # and stream them back layer by layer during inference.
    pipe._exclude_from_cpu_offload = ["vae"]
    pipe.enable_sequential_cpu_offload()

    # Warm-up runs prime cuDNN autotuning and kernel caches before
    # the first timed request.
    for _ in range(3):
        clear()
        pipe("testing speed warmups ", guidance_scale=0.0, max_sequence_length=256, num_inference_steps=4)
    clear()
    return pipe
# One-shot flag: the first infer() call clears warm-up allocations.
sample = True


@torch.inference_mode()
def infer(request: TextToImageRequest, pipeline: FluxPipeline) -> Image:
    global sample
    if sample:
        clear()
        sample = False
        torch.cuda.reset_peak_memory_stats()
    generator = Generator("cuda").manual_seed(request.seed)
    image = pipeline(
        request.prompt, generator=generator, guidance_scale=0.0,
        num_inference_steps=4, max_sequence_length=256,
        height=request.height, width=request.width, output_type="pil",
    ).images[0]
    return image
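

# A minimal usage sketch, assuming TextToImageRequest can be constructed
# with prompt, seed, height, and width keyword arguments (only those
# fields are read above; the real request model may differ).
if __name__ == "__main__":
    pipe = load_pipeline()
    request = TextToImageRequest(prompt="a city skyline at dusk", seed=0, height=1024, width=1024)
    infer(request, pipe).save("output.png")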