Your Name committed
Commit · d25aa2d
1 Parent(s): 24f9b3f
- src/pipeline.py  +22 -32
src/pipeline.py
CHANGED
@@ -7,9 +7,8 @@ from bitsandbytes.nn.modules import Params4bit, QuantState
 import json
 import transformers
 from huggingface_hub.constants import HF_HUB_CACHE
-from transformers import T5EncoderModel, T5TokenizerFast
+from transformers import T5EncoderModel, T5TokenizerFast
 
-from torchao.quantization import quantize_, int8_weight_only, fpx_weight_only
 from torch import Generator
 from diffusers import FluxTransformer2DModel, DiffusionPipeline
 
@@ -20,7 +19,6 @@ import json
 
 
 
-
 torch._dynamo.config.suppress_errors = True
 os.environ['PYTORCH_CUDA_ALLOC_CONF']="expandable_segments:True"
 os.environ["TOKENIZERS_PARALLELISM"] = "True"
@@ -30,16 +28,6 @@ REVISION = "741f7c3ce8b383c54771c7003378a50191e9efe9"
 Pipeline = None
 
 
-import torch
-import math
-from typing import Dict, Any
-
-def remove_cache():
-    gc.collect()
-    torch.cuda.empty_cache()
-    torch.cuda.reset_max_memory_allocated()
-    torch.cuda.reset_peak_memory_stats()
-
 # ---------------- NF4 ----------------
 def functional_linear_4bits(x, weight, bias):
     out = bnb.matmul_4bit(x, weight.t(), bias=bias, quant_state=weight.quant_state)
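
Note: functional_linear_4bits dispatches straight to the bitsandbytes 4-bit kernel. A hedged call sketch (shapes and dtypes are illustrative; `w` is assumed to be an already-quantized Params4bit on CUDA):

    # x: (batch, in_features) activations in bf16/fp16 on CUDA
    # w.quant_state carries the per-block absmax scales and NF4 codebook
    y = bnb.matmul_4bit(x, w.t(), bias=None, quant_state=w.quant_state)
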
@@ -47,7 +35,7 @@ def functional_linear_4bits(x, weight, bias):
     return out
 
 
-def copy_quant_state(state, device=None):
+def quant_state_copier(state, device=None):
     if state is None:
         return None
 
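
Note: the rest of this function (old lines 54-77) is unchanged, so the diff elides it. A minimal sketch of what a QuantState copier of this shape typically does, assuming the standard bitsandbytes QuantState fields (`quant_state_copier_sketch` is a hypothetical name, not the commit's exact body):

    def quant_state_copier_sketch(state, device=None):
        if state is None:
            return None
        # Rebuild the quantization metadata on the target device.
        return QuantState(
            absmax=state.absmax.to(device),
            shape=state.shape,
            code=state.code.to(device),
            blocksize=state.blocksize,
            quant_type=state.quant_type,
            dtype=state.dtype,
        )
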
@@ -78,16 +66,16 @@ def copy_quant_state(state, device=None):
     )
 
 
-class ForgeParams4bit(Params4bit):
+class Forge_Params_4Bit(Params4bit):
     def to(self, *args, **kwargs):
         device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
         if device is not None and device.type == "cuda" and not self.bnb_quantized:
             return self._quantize(device)
         else:
-            n = ForgeParams4bit(
+            n = Forge_Params_4Bit(
                 torch.nn.Parameter.to(self, device=device, dtype=dtype, non_blocking=non_blocking),
                 requires_grad=self.requires_grad,
-                quant_state=copy_quant_state(self.quant_state, device),
+                quant_state=quant_state_copier(self.quant_state, device),
                 compress_statistics=False,
                 blocksize=64,
                 quant_type=self.quant_type,
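
Note: the overridden to() quantizes lazily; the first move to CUDA runs Params4bit._quantize, and later moves re-wrap the already-packed tensor with a copied quant state. A hedged usage sketch (shape and dtype are illustrative):

    w = Forge_Params_4Bit(
        torch.randn(64, 64, dtype=torch.float32),
        requires_grad=False,
        quant_type='nf4',
    )
    w = w.to('cuda')  # first CUDA move: packs the data to NF4
    w = w.to('cuda')  # already quantized: copies data and quant_state instead
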
@@ -101,7 +89,7 @@ class ForgeParams4bit(Params4bit):
             return n
 
 
-class ForgeLoader4Bit(torch.nn.Module):
+class Force_Loader_4Bits(torch.nn.Module):
     def __init__(self, *, device, dtype, quant_type, **kwargs):
         super().__init__()
         self.dummy = torch.nn.Parameter(torch.empty(1, device=device, dtype=dtype))
@@ -124,7 +112,7 @@ class ForgeLoader4Bit(torch.nn.Module):
         if any('bitsandbytes' in k for k in quant_state_keys):
             quant_state_dict = {k: state_dict[prefix + "weight." + k] for k in quant_state_keys}
 
-            self.weight = ForgeParams4bit.from_prequantized(
+            self.weight = Forge_Params_4Bit.from_prequantized(
                 data=state_dict[prefix + 'weight'],
                 quantized_stats=quant_state_dict,
                 requires_grad=False,
@@ -139,7 +127,7 @@ class ForgeLoader4Bit(torch.nn.Module):
             del self.dummy
         elif hasattr(self, 'dummy'):
             if prefix + 'weight' in state_dict:
-                self.weight = ForgeParams4bit(
+                self.weight = Forge_Params_4Bit(
                     state_dict[prefix + 'weight'].to(self.dummy),
                     requires_grad=False,
                     compress_statistics=True,
@@ -157,7 +145,7 @@ class ForgeLoader4Bit(torch.nn.Module):
         super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
 
 
-class Linear(ForgeLoader4Bit):
+class CustomLinear(Force_Loader_4Bits):
     def __init__(self, *args, device=None, dtype=None, **kwargs):
         super().__init__(device=device, dtype=dtype, quant_type='nf4')
 
@@ -170,9 +158,6 @@ class Linear(ForgeLoader4Bit):
         return functional_linear_4bits(x, self.weight, self.bias)
 
 
-# Replace nn.Linear with the 4-bit quantized Linear
-# torch.nn.Linear = Linear
-
 class InitModel:
 
     @staticmethod
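
Note: the commented-out global patch (torch.nn.Linear = Linear) is dropped here. For reference, a hedged sketch of the per-module alternative usually used to route a model through the NF4 layer (`swap_linear` is a hypothetical helper, not part of this commit):

    def swap_linear(module):
        # Recursively replace nn.Linear children with the NF4 CustomLinear;
        # weights would still need to be loaded via a state dict afterwards.
        for name, child in module.named_children():
            if isinstance(child, torch.nn.Linear):
                setattr(module, name, CustomLinear(device='cpu', dtype=torch.bfloat16))
            else:
                swap_linear(child)
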
@@ -209,26 +194,28 @@ class InitModel:
 
 def load_pipeline() -> Pipeline:
 
+    t5_encoder_2 = InitModel.load_text_encoder()
+    vae = InitModel.load_vae()
 
     transformer_path = os.path.join(HF_HUB_CACHE, "models--MyApricity--Flux_Transformer_float8/snapshots/66c5f182385555a00ec90272ab711bb6d3c197db")
     transformer = InitModel.load_transformer(transformer_path)
-
-    text_encoder_2 = InitModel.load_text_encoder()
-    vae = InitModel.load_vae()
-
+
 
     pipeline = DiffusionPipeline.from_pretrained(CHECKPOINT,
                                                  revision=REVISION,
                                                  vae=vae,
                                                  transformer=transformer,
-                                                 text_encoder_2=text_encoder_2,
+                                                 text_encoder_2=t5_encoder_2,
                                                  torch_dtype=torch.bfloat16)
     pipeline.to("cuda")
+
     try:
+        # Enable some options for better vae
         pipeline.enable_vae_slicing()
-
+        pipeline.enable_vae_tiling()
+        torch.nn.LinearLayer = CustomLinear
     except:
-        print("
+        print("Debug here")
 
 
     prms = [
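
Note: torch.nn.LinearLayer is not an attribute PyTorch defines, so the assignment in the try block only binds a new name; it does not reroute any existing or future nn.Linear layers. A global patch matching the intent of the removed comment would be the hedged sketch below, and it only affects modules constructed after it runs:

    torch.nn.Linear = CustomLinear  # hedged sketch; must run before model construction
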
@@ -252,7 +239,10 @@ def load_pipeline() -> Pipeline:
 @torch.no_grad()
 def infer(request: TextToImageRequest, pipeline: Pipeline) -> Image:
 
-
+    torch.cuda.empty_cache()
+    torch.cuda.reset_max_memory_allocated()
+    torch.cuda.reset_peak_memory_stats()
+
     # remove cache here for better result
     generator = Generator(pipeline.device).manual_seed(request.seed)
 
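
Note: the three torch.cuda calls added here inline the remove_cache() helper deleted from the module top, minus its gc.collect(). For comparison, the removed helper read:

    def remove_cache():
        gc.collect()
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()
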