SD-XL max tokens optimization (with compel)
app.py CHANGED
@@ -8,11 +8,23 @@ from gradio_client import Client
 
 client = Client("https://fffiloni-test-llama-api.hf.space/", hf_token=hf_token)
 
+from compel import Compel, ReturnedEmbeddingsType
 from diffusers import DiffusionPipeline
 import torch
 
-pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0",
+pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0",
+                                         torch_dtype=torch.float16,
+                                         use_safetensors=True,
+                                         variant="fp16")
 pipe.to("cuda")
+
+compel = Compel(
+    tokenizer=[pipe.tokenizer, pipe.tokenizer_2],
+    text_encoder=[pipe.text_encoder, pipe.text_encoder_2],
+    returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
+    requires_pooled=[False, True]
+)
+
 #pipe.enable_model_cpu_offload()
 
 # if using torch < 2.0
@@ -79,9 +91,7 @@ def infer(audio_file):
 I'll give you a music description, from i want you to provide an illustrative image description that would fit well with the music.
 Do not processs each segment or song, but provide a summary for the whole instead.
 Answer with only one image description. Never do lists. Maximum 77 tokens.
-
 Here's the music description :
-
 {cap_result}
 
 """
@@ -95,7 +105,11 @@ def infer(audio_file):
 
 print(f"Llama2 result: {result}")
 
-
+# βββ
+
+prompt = result
+conditioning, pooled = compel(prompt)
+images = pipe(prompt_embeds=conditioning, pooled_prompt_embeds=pooled).images[0]
 
 print("Finished")
 
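For context, here is a minimal standalone sketch of the compel + SDXL flow this commit introduces, outside the Space. It assumes the same SDXL checkpoint as the diff; the prompt text and output filename below are illustrative stand-ins for the Llama-generated description that the Space passes in as `result`, not code from the repository.

# Minimal standalone sketch of the compel + SDXL flow added by this commit.
# Assumptions: same checkpoint as the diff; the prompt text and output path
# below are illustrative, not taken from the Space.
import torch
from compel import Compel, ReturnedEmbeddingsType
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    use_safetensors=True,
    variant="fp16",
).to("cuda")

# SDXL carries two tokenizer/text-encoder pairs; only the second encoder
# supplies the pooled embedding, hence requires_pooled=[False, True].
compel = Compel(
    tokenizer=[pipe.tokenizer, pipe.tokenizer_2],
    text_encoder=[pipe.text_encoder, pipe.text_encoder_2],
    returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
    requires_pooled=[False, True],
)

# Stand-in for the Llama-generated description (`result` in the Space).
prompt = (
    "a rain-soaked neon city at night, reflections on wet asphalt, "
    "melancholic cinematic mood, wide angle, volumetric light"
)

# compel encodes the prompt with both text encoders and returns the tensors
# the SDXL pipeline expects in place of a plain `prompt=` string.
conditioning, pooled = compel(prompt)
image = pipe(prompt_embeds=conditioning, pooled_prompt_embeds=pooled).images[0]
image.save("illustration.png")

In the Space itself, the last hunk does the same thing with `prompt = result`, so the Llama output is what gets encoded; swapping the plain `prompt=` argument for `prompt_embeds`/`pooled_prompt_embeds` is what routes the description through compel, which also offers prompt weighting and, optionally, handling of prompts past the 77-token CLIP window (presumably the "max tokens optimization" named in the commit title).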