MyApricity/OpsTorch

Model card · Files and versions

xet

Community

Update src/pipeline.py

by Shinichie - opened Feb 9, 2025

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+105

-107

Files changed (1) hide show

src/pipeline.py +105 -107

src/pipeline.py CHANGED Viewed

@@ -2,139 +2,137 @@ import os
 import torch
 import torch._dynamo
 import gc
 import json
 import transformers
 from huggingface_hub.constants import HF_HUB_CACHE
 from transformers import T5EncoderModel, T5TokenizerFast
 from PIL.Image import Image
-from diffusers import FluxPipeline, AutoencoderKL, AutoencoderTiny
 from pipelines.models import TextToImageRequest
 from optimum.quanto import requantize
-import json
 from torch import Generator
-from diffusers import FluxTransformer2DModel, DiffusionPipeline
-# MYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMYMY
-# ApricityApricityApricityApricityApricityApricityApricityApricityApricityApricityApricityApricityApricity
 from torch._dynamo import config
 from torch._inductor import config as ind_config
-import torch
-import math
-from typing import Dict, Any
-torch._dynamo.config.suppress_errors = True
-os.environ['PYTORCH_CUDA_ALLOC_CONF']="expandable_segments:True"
-os.environ["TOKENIZERS_PARALLELISM"] = "True"
-ckpt_root = "MyApricity/FLUX_OPT_SCHNELL_1.2"
-revision_root = "488528b6f815bff1bbc747cf1e0947c77c544665"
-Pipeline = None
-use_com = False
-def optimize_torch():
-    torch.backends.cuda.matmul.allow_tf32 = True
-    torch.backends.cudnn.allow_tf32 = True
-    torch.backends.cudnn.benchmark = True
-    # torch.backends.cudnn.benchmark_limit = 20
-    torch.set_float32_matmul_precision("high")
-    # config.cache_size_limit = 10000000000
-    # ind_config.shape_padding = True
-try:
-    optimize_torch()
-except:
-    print("nothing wrong")
-def delete_ca_che():
-    torch.cuda.empty_cache()
-    torch.cuda.reset_max_memory_allocated()
-    torch.cuda.reset_peak_memory_stats()
-def pipeline_loader() -> Pipeline:
-    print("Loading text encoder...")
-    en = T5EncoderModel.from_pretrained(
-        "city96/t5-v1_1-xxl-encoder-bf16",
-        revision="1b9c856aadb864af93c1dcdc226c2774fa67bc86",
-        torch_dtype=torch.bfloat16,
-    )
-    transformer_path_main = os.path.join(HF_HUB_CACHE, "models--MyApricity--FLUX_OPT_SCHNELL_1.2/snapshots/488528b6f815bff1bbc747cf1e0947c77c544665")
-    transformer_model = FluxTransformer2DModel.from_pretrained(transformer_path_main, torch_dtype=torch.bfloat16, use_safetensors=False)
-    pipe = DiffusionPipeline.from_pretrained(ckpt_root,
-                        revision=revision_root,
-                        transformer=transformer_model,
-                        torch_dtype=torch.bfloat16)
-    pipe.to("cuda")
-    try:
-        # fuse QKV projections in Transformer and VAE
         pipe.transformer.fuse_qkv_projections()
         pipe.vae.fuse_qkv_projections()
-        # switch memory layout to Torch's preferred, channels_last
         pipe.transformer.to(memory_format=torch.channels_last)
         pipe.vae.to(memory_format=torch.channels_last)
-        # set torch compile flags
         config = torch._inductor.config
-        config.disable_progress = False  # show progress bar
-        config.conv_1x1_as_mm = True  # treat 1x1 convolutions as matrix muls
-        # tag the compute-intensive modules, the Transformer and VAE decoder, for compilation
         pipe.transformer = torch.compile(
-            pipe.transformer, mode="max-autotune", fullgraph=True
         )
         pipe.vae.decode = torch.compile(
-            pipe.vae.decode, mode="max-autotune", fullgraph=True
         )
-        # trigger torch compilation
-        print("running torch compiliation..")
-        pipe(
-            "dummy prompt to trigger torch compilation",
-            output_type="pil",
-            num_inference_steps=4,  # use ~50 for [dev], smaller for [schnell]
-        ).images[0]
-        print("finished torch compilation")
-    except:
         pipe(
-            "a beautiful girl",
             output_type="pil",
-            num_inference_steps=4,  # use ~50 for [dev], smaller for [schnell]
         ).images[0]
-        print("Pass error")
-    return pipe
-@torch.no_grad()
-def inference(request: TextToImageRequest, pipeline: Pipeline) -> Image:
-    delete_ca_che()
-    generator = Generator(pipeline.device).manual_seed(request.seed)
-    return pipeline(
-        request.prompt,
-        generator=generator,
-        guidance_scale=0.0,
-        num_inference_steps=4,
-        max_sequence_length=256,
-        height=request.height,
-        width=request.width,
-    ).images[0]

 import torch
 import torch._dynamo
 import gc
 import json
 import transformers
 from huggingface_hub.constants import HF_HUB_CACHE
 from transformers import T5EncoderModel, T5TokenizerFast
 from PIL.Image import Image
+from diffusers import FluxPipeline, AutoencoderKL, AutoencoderTiny, FluxTransformer2DModel, DiffusionPipeline
 from pipelines.models import TextToImageRequest
 from optimum.quanto import requantize
 from torch import Generator
 from torch._dynamo import config
 from torch._inductor import config as ind_config
+from typing import Dict, Any, Callable
+from functools import wraps
+def error_handler(func: Callable):
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        try:
+            return func(*args, **kwargs)
+        except Exception as e:
+            print(f"Error in {func.__name__}: {str(e)}")
+    return wrapper
+class TorchOptimizer:
+    def optimize_settings(self):
+        torch.backends.cuda.matmul.allow_tf32 = True
+        torch.backends.cudnn.allow_tf32 = True
+        torch.backends.cudnn.benchmark = True
+        torch.set_float32_matmul_precision("high")
+    def clear_cache(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+class PipelineManager:
+    def __init__(self):
+        self.ckpt_root = "MyApricity/FLUX_OPT_SCHNELL_1.2"
+        self.revision_root = "488528b6f815bff1bbc747cf1e0947c77c544665"
+        self.pipeline = None
+        self.optimizer = TorchOptimizer()
+        # Configure environment
+        torch._dynamo.config.suppress_errors = True
+        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = "expandable_segments:True"
+        os.environ["TOKENIZERS_PARALLELISM"] = "True"
+        # Initialize torch settings
+        self.optimizer.optimize_settings()
+    def load_transformer(self):
+        transformer_path = os.path.join(
+            HF_HUB_CACHE,
+            "models--MyApricity--FLUX_OPT_SCHNELL_1.2/snapshots/488528b6f815bff1bbc747cf1e0947c77c544665"
+        )
+        return FluxTransformer2DModel.from_pretrained(
+            transformer_path,
+            torch_dtype=torch.bfloat16,
+            use_safetensors=False
+        )
+    @error_handler
+    def optimize_pipeline(self, pipe):
+        # Fuse QKV projections
         pipe.transformer.fuse_qkv_projections()
         pipe.vae.fuse_qkv_projections()
+        # Optimize memory layout
         pipe.transformer.to(memory_format=torch.channels_last)
         pipe.vae.to(memory_format=torch.channels_last)
+        # Configure torch inductor
         config = torch._inductor.config
+        config.disable_progress = False
+        config.conv_1x1_as_mm = True
+        # Compile modules
         pipe.transformer = torch.compile(
+            pipe.transformer,
+            mode="max-autotune",
+            fullgraph=True
         )
         pipe.vae.decode = torch.compile(
+            pipe.vae.decode,
+            mode="max-autotune",
+            fullgraph=True
         )
+        return pipe
+    def load_pipeline(self):
+        # Load transformer model
+        transformer_model = self.load_transformer()
+        # Create pipeline
+        pipe = DiffusionPipeline.from_pretrained(
+            self.ckpt_root,
+            revision=self.revision_root,
+            transformer=transformer_model,
+            torch_dtype=torch.bfloat16
+        )
+        pipe.to("cuda")
+        # Optimize pipeline
+        pipe = self.optimize_pipeline(pipe)
+        # Trigger compilation
+        print("Running torch compilation...")
         pipe(
+            "dummy prompt to trigger torch compilation",
             output_type="pil",
+            num_inference_steps=4
         ).images[0]
+        print("Finished torch compilation")
+        return pipe
+    def run_inference(self, request: TextToImageRequest) -> Image:
+        if self.pipeline is None:
+            self.pipeline = self.load_pipeline()
+        self.optimizer.clear_cache()
+        generator = Generator(self.pipeline.device).manual_seed(request.seed)
+        return self.pipeline(
+            request.prompt,
+            generator=generator,
+            guidance_scale=0.0,
+            num_inference_steps=4,
+            max_sequence_length=256,
+            height=request.height,
+            width=request.width,
+        ).images[0]