Commit dec981f
Parent(s): 341de7e

style lora fusion with character lora

Files changed:
- sequential_timer.py  +25 -0
- serve_loras.py       +123 -26
sequential_timer.py
ADDED
@@ -0,0 +1,25 @@
+from time import perf_counter
+
+class SequentialTimer:
+    def __init__(self, make_print=False):
+        self.timings = []
+        self.make_print = make_print
+
+    def time(self, message: str):
+        if self.make_print:
+            print(message)
+        self.timings.append((perf_counter(), message))
+
+    def to_str(self) -> str:
+        s = ""
+        if len(self.timings) <= 1:
+            s = "No timings"
+            return s
+        t0 = self.timings[0][0]
+        for ((t1, m1), (t2, _)) in zip(self.timings, self.timings[1:]):
+            s += f"TIME: step: {t2 - t1:06.3f} | cum {t2 - t0:06.3f} - {m1}\n"
+        s += f"ALL TIME: {self.timings[-1][0] - self.timings[0][0]:07.3f}\n"
+        return s
+
+    def printall(self):
+        print(self.to_str())
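The helper records (timestamp, label) pairs and formats per-step and cumulative durations. A minimal usage sketch (the labels and sleep calls are illustrative, not part of the commit):

from time import sleep
from sequential_timer import SequentialTimer

st = SequentialTimer()
st.time("start")            # record a timestamp with a label
sleep(0.5)                  # stand-in for real work
st.time("loaded weights")
sleep(0.2)
st.time("end")
st.printall()
# Expected output, approximately:
# TIME: step: 00.500 | cum 00.500 - start
# TIME: step: 00.200 | cum 00.700 - loaded weights
# ALL TIME: 000.700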
serve_loras.py
CHANGED
@@ -5,7 +5,7 @@ import uuid
 
 import diffusers
 import torch
-from diffusers import StableDiffusionXLPipeline
+from diffusers import StableDiffusionXLPipeline, DiffusionPipeline
 
 import numpy as np
 import threading
@@ -14,13 +14,15 @@ import base64
 from io import BytesIO
 from PIL import Image
 import numpy as np
-import uuid
 from tempfile import TemporaryFile
 from google.cloud import storage
 import sys
 import sentry_sdk
 from flask import Flask, request, jsonify
 import os
+from sequential_timer import SequentialTimer
+from safetensors.torch import load_file
+import copy
 
 logger = logging.getLogger(__name__)
 logger.info("Diffusers version %s", diffusers.__version__)
@@ -34,6 +36,24 @@ sentry_sdk.init(
 
 LORAS_DIR = './safetensors'
 
+handler_lock = threading.Lock()
+handler_index = 0
+
+class LoraCache():
+    def __init__(self, loras_dir: str = LORAS_DIR):
+        self.loras_dir = loras_dir
+        self.cache = {}
+
+    def load_lora(self, lora_name: str):
+        if lora_name.endswith('.safetensors'):
+            lora_name = lora_name.rstrip('.safetensors')
+        if lora_name not in self.cache:
+            lora = load_file(os.path.join(self.loras_dir, lora_name+'.safetensors'))
+            self.cache[lora_name] = lora
+        return copy.deepcopy(self.cache[lora_name])
+
+lora_cache = LoraCache()
+
 class DiffusersHandler(ABC):
     """
     Diffusers handler class for text to image generation.
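Note on LoraCache.load_lora: str.rstrip strips a character set, not a suffix, so stems ending in characters from ".safetensors" get mangled ("cartoons.safetensors".rstrip('.safetensors') == "c"). In the hunks shown, lora_cache is instantiated but not yet used; if it does get wired into the handler, a suffix-safe variant would be (a sketch; removesuffix needs Python 3.9+):

def _normalize(name: str) -> str:
    # strip the extension as a suffix, not as a character set
    return name.removesuffix('.safetensors')

Note also that copy.deepcopy on every cache hit duplicates the full tensor dict per request; whether that cost is acceptable depends on LoRA size.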
@@ -65,8 +85,31 @@ class DiffusersHandler(ABC):
             torch_dtype=torch.float16,
             use_safetensors=True,
         )
+        # self.refiner = DiffusionPipeline.from_pretrained(
+        #     "stabilityai/stable-diffusion-xl-refiner-1.0",
+        #     text_encoder_2=self.pipe.text_encoder_2,
+        #     vae=self.pipe.vae,
+        #     torch_dtype=torch.float16,
+        #     use_safetensors=True,
+        #     variant="fp16",
+        # )
+        # self.refiner.enable_model_cpu_offload(properties.get("gpu_id"))
+        # logger.info("Refiner initialized and o")
+
+        self.compel_base = Compel(
+            tokenizer=[self.pipe.tokenizer, self.pipe.tokenizer_2],
+            text_encoder=[self.pipe.text_encoder, self.pipe.text_encoder_2],
+            returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
+            requires_pooled=[False, True])
+        logger.info("Compel initialized")
+
+        # self.compel_refiner = Compel(
+        #     tokenizer=[self.refiner.tokenizer_2],
+        #     text_encoder=[self.refiner.text_encoder_2],
+        #     returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
+        #     requires_pooled=[True])
 
-        logger.info("moving model to device: %s", device_str)
+        logger.info("moving base model to device: %s", device_str)
         self.pipe.to(self.device)
 
         logger.info(self.device)
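Moving the Compel construction into initialize builds the prompt-weighting wrapper once per handler instead of once per request (the old inference body created it inline; see the hunk below). A short sketch of the call shape the rest of the code relies on (prompt text is illustrative):

# conditioning: token-level embeddings from both SDXL text encoders
# pooled: pooled embedding, produced because requires_pooled=[False, True]
conditioning, pooled = self.compel_base("a portrait photo, detailed++")
images = self.pipe(prompt_embeds=conditioning, pooled_prompt_embeds=pooled).images

Compel's "++" syntax up-weights a token; plain prompts pass through unchanged.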
@@ -86,20 +129,30 @@ class DiffusersHandler(ABC):
         logger.info("Received requests: '%s'", raw_requests)
         self.working = True
 
         model_args = {
             "prompt": raw_requests[0]["prompt"],
             "negative_prompt": raw_requests[0].get("negative_prompt"),
             "width": raw_requests[0].get("width"),
             "height": raw_requests[0].get("height"),
-            "num_inference_steps": raw_requests[0].get("num_inference_steps", …
-            "guidance_scale": raw_requests[0].get("guidance_scale", …
-            "lora_weights": raw_requests[0].get("lora_name", None)
-            "cross_attention_kwargs": {"scale": raw_requests[0].get("lora_scale", 0.0)}
+            "num_inference_steps": raw_requests[0].get("num_inference_steps", 25),
+            "guidance_scale": raw_requests[0].get("guidance_scale", 8.5)
+            # "lora_weights": raw_requests[0].get("lora_name", None)
+            # "cross_attention_kwargs": {"scale": raw_requests[0].get("lora_scale", 0.0)}
         }
+
+        extra_args = {
+            "seed": raw_requests[0].get("seed", None),
+            "style_lora": raw_requests[0].get("style_lora", None),
+            "style_scale": raw_requests[0].get("style_scale", 1.0),
+            "char_lora": raw_requests[0].get("char_lora", None),
+            "char_scale": raw_requests[0].get("char_scale", 1.0)
+        }
+
+
 
-        logger.info("Processed request: '%s'", …
-        axiom_logger.info("Processed request:" + str( …
-        return …
+        logger.info("Processed request: '%s'", model_args)
+        axiom_logger.info("Processed request:" + str(model_args), request_id=self.req_id, device=self.device_str)
+        return model_args, extra_args
 
 
     def inference(self, request):
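preprocess reads everything from raw_requests[0], so the POST body is expected to be a one-element JSON array. An illustrative payload covering the new extra_args keys (the LoRA stems are hypothetical entries under ./safetensors):

payload = [{
    "prompt": "a knight in a neon city",
    "negative_prompt": "blurry",
    "width": 1024,
    "height": 1024,
    "num_inference_steps": 25,
    "guidance_scale": 8.5,
    "seed": 1234,
    "style_lora": "watercolor_style",   # hypothetical ./safetensors/watercolor_style.safetensors
    "style_scale": 0.8,
    "char_lora": "hero_character",      # hypothetical ./safetensors/hero_character.safetensors
    "char_scale": 1.0
}]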
@@ -111,29 +164,70 @@ class DiffusersHandler(ABC):
         """
 
         # Handling inference for sequence_classification.
-        compel = Compel(tokenizer=[self.pipe.tokenizer, self.pipe.tokenizer_2] , text_encoder=[self.pipe.text_encoder, self.pipe.text_encoder_2], returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED, requires_pooled=[False, True])
+        # compel = Compel(tokenizer=[self.pipe.tokenizer, self.pipe.tokenizer_2] , text_encoder=[self.pipe.text_encoder, self.pipe.text_encoder_2], returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED, requires_pooled=[False, True])
+        st = SequentialTimer()
+        model_args, extra_args = request
+
+        use_char_lora = extra_args['char_lora'] is not None
+        use_style_lora = extra_args['style_lora'] is not None
+
 
-        self.prompt = model_args.pop("prompt")
-        conditioning, pooled = compel(self.prompt)
+        style_lora = extra_args['style_lora']
+        char_lora = extra_args['char_lora']
+
+        cross_attention_kwargs = {"scale": extra_args['char_scale'] if use_char_lora else extra_args['style_scale']}
 
-        lora_weights = model_args.pop("lora_weights")
-        if lora_weights is not None:
-            lora_path = os.path.join(LORAS_DIR, lora_weights + '.safetensors')
-            logger.info('LOADING LORA FROM: ' + lora_path)
-            self.pipe.load_lora_weights(lora_path)
+        generator = torch.Generator(device="cuda").manual_seed(extra_args['seed']) if extra_args['seed'] else None
+
+
+        self.prompt = model_args.pop("prompt")
+
+        st.time("Base compel embedding")
+        conditioning, pooled = self.compel_base(self.prompt)
+
+        if use_style_lora:
+            style_lora = os.path.join(LORAS_DIR, style_lora + '.safetensors')
+            st.time("Load style lora")
+            self.pipe.load_lora_weights(style_lora)
+            if use_char_lora:
+                st.time("Fuse style lora into model")
+                self.pipe.fuse_lora(lora_scale=extra_args['style_scale'], fuse_text_encoder=False)
+
+        if use_char_lora:
+            char_lora = os.path.join(LORAS_DIR, char_lora + '.safetensors')
+            st.time('load character lora')
+            self.pipe.load_lora_weights(char_lora)
+
+        # lora_weights = model_args.pop("lora_weights")
+        # if lora_weights is not None:
+        #     lora_path = os.path.join(LORAS_DIR, lora_weights + '.safetensors')
+        #     logger.info('LOADING LORA FROM: ' + lora_path)
+        #     self.pipe.load_lora_weights(lora_path)
 
         # Handling inference for sequence_classification.
+        st.time("base model inference")
         inferences = self.pipe(
             prompt_embeds=conditioning,
             pooled_prompt_embeds=pooled,
-            **model_args
+            generator=generator,
+            cross_attention_kwargs=cross_attention_kwargs,
+            **model_args
         ).images
 
-        if lora_weights is not None:
+        # if lora_weights is not None:
+        #     self.pipe.unload_lora_weights()
+        if use_style_lora and use_char_lora:
+            st.time("unfuse lora weights")
+            self.pipe.unfuse_lora(unfuse_text_encoder=False)
+
+        if use_style_lora or use_char_lora:
+            st.time("unload lora weights")
             self.pipe.unload_lora_weights()
 
-        logger.info("Generated image: '%s'", inferences)
-        axiom_logger.info("Generated images", request_id=self.req_id, device=self.device_str)
+        st.time('end')
+
+        # logger.info("Generated image: '%s'", inferences)
+        axiom_logger.info("Generated images", request_id=self.req_id, device=self.device_str, timings=st.to_str())
         return inferences
 
     def postprocess(self, inference_outputs):
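This hunk is the heart of the commit: to stack two LoRAs, the style LoRA is loaded and fused into the base weights at style_scale (fuse_text_encoder=False leaves the text encoders untouched), then the character LoRA is loaded as the live adapter and scaled at inference time through cross_attention_kwargs. After generation the order reverses: unfuse_lora restores the original base weights and unload_lora_weights drops the live adapter. The same pattern in isolation, as a sketch assuming an SDXL pipeline and two hypothetical LoRA files:

import torch
from diffusers import StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

pipe.load_lora_weights("./safetensors/style.safetensors")  # hypothetical file
pipe.fuse_lora(lora_scale=0.8)                             # bake style into the base weights
pipe.load_lora_weights("./safetensors/char.safetensors")   # hypothetical file

image = pipe("a portrait",
             cross_attention_kwargs={"scale": 1.0}).images[0]  # character scale

pipe.unfuse_lora()          # restore the original base weights
pipe.unload_lora_weights()  # drop the character adapter

Note the design consequence: once fused, the style strength is frozen; only the unfused (character) adapter responds to the per-call scale.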
@@ -178,16 +272,19 @@ handlers = [DiffusersHandler() for i in range(gpu_count)]
 for i in range(gpu_count):
     handlers[i].initialize({"gpu_id": i})
 
-handler_lock = threading.Lock()
-handler_index = 0
+
+
+
 
 @app.route('/generate', methods=['POST'])
 def generate_image():
     req_id = str(uuid.uuid4())
     global handler_index
+    selected_handler = None
     try:
         # Extract raw requests from HTTP POST body
         raw_requests = request.json
+        axiom_logger.info(message="Received request", request_id=req_id, **raw_requests)
 
         with handler_lock:
             selected_handler = handlers[handler_index]
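Two observations on this hunk. First, the new logging call spreads **raw_requests as keyword arguments, which requires a dict with string keys, while preprocess indexes raw_requests[0] as a list; one of the two shapes will fail at runtime, so the body format deserves a check. Second, handler selection is a lock-protected round-robin over the shared index (the definitions moved to module top above); in isolation the pattern looks roughly like this, with stand-in handler objects and an assumed modulo advance:

import threading

handler_lock = threading.Lock()
handler_index = 0
handlers = ["gpu0-handler", "gpu1-handler"]   # stand-ins for DiffusersHandler instances

def next_handler():
    global handler_index
    with handler_lock:                         # serialize access to the shared index
        h = handlers[handler_index]
        handler_index = (handler_index + 1) % len(handlers)
    return h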
@@ -202,7 +299,7 @@ def generate_image():
         return jsonify({"image_urls": outputs})
     except Exception as e:
         logger.error("Error during image generation: %s", str(e))
-        axiom_logger.critical("Error during image generation: " + str(e), request_id=req_id)
+        axiom_logger.critical("Error during image generation: " + str(e), request_id=req_id, device=selected_handler.device_str)
         return jsonify({"error": "Failed to generate image", "details": str(e)}), 500
 
 if __name__ == '__main__':
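Initializing selected_handler = None makes the name defined inside the except block, but if the exception fires before a handler is picked, selected_handler.device_str itself raises AttributeError and masks the original error. A defensive variant of the new logging line (a sketch; the fallback string is illustrative):

device = selected_handler.device_str if selected_handler is not None else "unassigned"
axiom_logger.critical("Error during image generation: " + str(e),
                      request_id=req_id, device=device)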