manbeast3b committed (verified)
Commit fbd72be · 1 Parent(s): 6005ab8

Update src/pipeline.py

Files changed (1):
1. src/pipeline.py (+48, -21)
src/pipeline.py CHANGED
@@ -1,4 +1,4 @@
-from diffusers import FluxPipeline, AutoencoderKL #AutoencoderTiny
+from diffusers import FluxPipeline, AutoencoderKL, FluxTransformer2DModel #AutoencoderTiny
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
 import torch.nn.functional as F
@@ -16,36 +16,61 @@ import torch.nn as nn
 # from torchao.quantization import quantize_, float8_dynamic_activation_float8_weight #PerRow,
 import os
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:False,garbage_collection_threshold:0.01"
+os.environ["HUGGINGFACE_HUB_TOKEN"] = ""
 Pipeline = None
 
-def w8_a16_forward(weight, input, scales, bias=None):
-    casted_weights = weight.to(input.dtype)
-    output = F.linear(input, casted_weights) * scales # overhead
-    if bias is not None:
-        output = output + bias
-    return output
+# def w8_a16_forward(weight, input, scales, bias=None):
+#     casted_weights = weight.to(input.dtype)
+#     output = F.linear(input, casted_weights) * scales # overhead
+#     if bias is not None:
+#         output = output + bias
+#     return output
 
+# class W8A16LinearLayer(nn.Module):
+#     def __init__(self, in_features, out_features, bias=True, dtype=torch.float32):
+#         super().__init__()
+#         self.register_buffer(
+#             "int8_weights",
+#             torch.randint(-128, 127, (out_features, in_features), dtype=torch.int8))
+#         self.register_buffer("scales", torch.randn((out_features), dtype=dtype))
+#         if bias:
+#             self.register_buffer("bias", torch.randn((1, out_features), dtype=dtype))
+
+#     def quantize(self, weights):
+#         w_fp32 = weights.clone().to(torch.float32)
+#         scales = w_fp32.abs().max(dim=-1).values / 127
+#         scales = scales.to(weights.dtype)
+#         int8_weights = torch.round(weights/scales.unsqueeze(1)).to(torch.int8)
+#         self.int8_weights = int8_weights
+#         self.scales = scales
+#         self.bias = None
+
+#     def forward(self, input):
+#         return w8_a16_forward(self.int8_weights, input, self.scales, self.bias)
+
 class W8A16LinearLayer(nn.Module):
     def __init__(self, in_features, out_features, bias=True, dtype=torch.float32):
         super().__init__()
-        self.register_buffer(
-            "int8_weights",
-            torch.randint(-128, 127, (out_features, in_features), dtype=torch.int8))
-        self.register_buffer("scales", torch.randn((out_features), dtype=dtype))
+        self.weight = nn.Parameter(torch.randn(out_features, in_features, dtype=dtype))
         if bias:
-            self.register_buffer("bias", torch.randn((1, out_features), dtype=dtype))
-
+            self.bias = nn.Parameter(torch.randn(1, out_features, dtype=dtype))
+        else:
+            self.bias = None
+        self.scales = nn.Parameter(torch.randn(out_features, dtype=dtype))
+
     def quantize(self, weights):
         w_fp32 = weights.clone().to(torch.float32)
         scales = w_fp32.abs().max(dim=-1).values / 127
         scales = scales.to(weights.dtype)
-        int8_weights = torch.round(weights/scales.unsqueeze(1)).to(torch.int8)
-        self.int8_weights = int8_weights
-        self.scales = scales
-        self.bias = None
+        self.weight.data = torch.round(weights/scales.unsqueeze(1)).to(torch.int8)
+        self.scales.data = scales
 
     def forward(self, input):
-        return w8_a16_forward(self.int8_weights, input, self.scales, self.bias)
+        casted_weights = self.weight.to(input.dtype)
+        output = F.linear(input, casted_weights) * self.scales
+        if self.bias is not None:
+            output = output + self.bias
+        return output
 
 def replace_linear_with_target_and_quantize(module, target_class, module_name_to_exclude):
     # with open("/root/.cache/huggingface/hub/output_layers.txt", "a") as f:
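A note on the scheme this hunk implements: weights are stored in int8 with one absmax scale per output row, activations stay in 16-bit, and each forward pass casts the int8 block back to the activation dtype and rescales. Below is a minimal self-contained sketch of the same W8A16 idea; the class and variable names are illustrative, not from this repo, and it wraps an existing nn.Linear instead of quantizing in place:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class W8A16Linear(nn.Module):
    """Int8 weights + per-output-channel scales; activations stay bf16/fp16."""
    def __init__(self, linear: nn.Linear):
        super().__init__()
        w = linear.weight.data
        # absmax per output row: maps that row's weights into [-127, 127]
        scales = w.abs().max(dim=-1).values.clamp(min=1e-8) / 127
        self.register_buffer("int8_weight",
                             torch.round(w / scales.unsqueeze(1)).to(torch.int8))
        self.register_buffer("scales", scales.to(w.dtype))
        self.bias = linear.bias  # may be None; forward checks for it

    def forward(self, x):
        w = self.int8_weight.to(x.dtype)    # dequantize lazily, per call
        out = F.linear(x, w) * self.scales  # undo the per-row scaling
        return out if self.bias is None else out + self.bias

lin = nn.Linear(64, 32, dtype=torch.bfloat16)
x = torch.randn(4, 64, dtype=torch.bfloat16)
print((W8A16Linear(lin)(x) - lin(x)).abs().max())  # small round-off only
```

Multiplying by the scales after F.linear is the same trick as the commit's forward: each scale belongs to one output row of the weight matrix, hence to exactly one output feature, so a single broadcasted multiply dequantizes the whole matmul result.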
@@ -54,8 +77,9 @@ def replace_linear_with_target_and_quantize(module, target_class, module_name_to
             old_bias = child.bias
             old_weight = child.weight
             new_module = target_class(child.in_features, child.out_features, old_bias is not None, child.weight.dtype)
+            new_module.quantize(old_weight)
+            delattr(module, name)
             setattr(module, name, new_module)
-            getattr(module, name).quantize(old_weight)
             if old_bias is not None:
                 getattr(module, name).bias = old_bias
 
@@ -84,10 +108,12 @@ def load_pipeline() -> Pipeline:
         "city96/t5-v1_1-xxl-encoder-bf16", torch_dtype=torch.bfloat16
     )
     vae=AutoencoderKL.from_pretrained(ckpt_id, subfolder="vae", torch_dtype=dtype)
+    transformer = FluxTransformer2DModel.from_pretrained("manbeast3b/transfomer-flux-schnell-int8") # torch_dtype=dtype
     pipeline = DiffusionPipeline.from_pretrained(
         ckpt_id,
         vae=vae,
         text_encoder_2 = text_encoder_2,
+        transformer=transformer,
         torch_dtype=dtype,
     )
     # quantize_(pipeline.transformer, float8_dynamic_activation_float8_weight())
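Loading a pre-quantized transformer from the Hub matters mostly for memory: FLUX.1's transformer is commonly cited at around 12B parameters, so int8 weight storage roughly halves the footprint relative to bf16. Back-of-envelope, assuming that parameter count:

```python
params = 12e9  # ~12B, the commonly cited FLUX.1 transformer size (assumption)
print(f"bf16: {params * 2 / 2**30:.1f} GiB")  # two bytes per weight -> ~22.4 GiB
print(f"int8: {params * 1 / 2**30:.1f} GiB")  # one byte per weight  -> ~11.2 GiB
# per-output-channel scales add one 16-bit value per row: a few MiB, negligible
```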
@@ -98,8 +124,9 @@ def load_pipeline() -> Pipeline:
     pipeline.text_encoder.to(memory_format=torch.channels_last)
     pipeline.transformer.to(memory_format=torch.channels_last)
     replace_linear_with_target_and_quantize(pipeline.transformer, W8A16LinearLayer, [])
-    pipeline.transformer.save_pretrained("/root/.cache/huggingface/hub/transformer-flux")
-    exit()
+    # pipeline.transformer.save_pretrained("manbeast3b/transfomer-flux-schnell-int8-new", push_to_hub=True, token="")
+    # pipeline.transformer.save_pretrained("/root/.cache/huggingface/hub/transformer-flux")
+    # exit()
 
     pipeline.vae.to(memory_format=torch.channels_last)
     pipeline.vae = torch.compile(pipeline.vae)
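One note on the unchanged tail: the VAE is moved to channels_last and compiled whole. If compile time or graph breaks become an issue, a common diffusers variant is to compile only the decode path; the mode and fullgraph settings below are illustrative knobs, not settings from this commit:

```python
pipeline.vae.to(memory_format=torch.channels_last)
pipeline.vae.decode = torch.compile(
    pipeline.vae.decode, mode="max-autotune", fullgraph=True
)
```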
 