SD-XL max tokens optimization (with compel)
app.py CHANGED
@@ -8,11 +8,23 @@ from gradio_client import Client
 
 client = Client("https://fffiloni-test-llama-api.hf.space/", hf_token=hf_token)
 
+from compel import Compel, ReturnedEmbeddingsType
 from diffusers import DiffusionPipeline
 import torch
 
-pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0",
+pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0",
+                                         torch_dtype=torch.float16,
+                                         use_safetensors=True,
+                                         variant="fp16")
 pipe.to("cuda")
+
+compel = Compel(
+    tokenizer=[pipe.tokenizer, pipe.tokenizer_2],
+    text_encoder=[pipe.text_encoder, pipe.text_encoder_2],
+    returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
+    requires_pooled=[False, True]
+)
+
 #pipe.enable_model_cpu_offload()
 
 # if using torch < 2.0
@@ -79,9 +91,7 @@ def infer(audio_file):
 I'll give you a music description, from i want you to provide an illustrative image description that would fit well with the music.
 Do not processs each segment or song, but provide a summary for the whole instead.
 Answer with only one image description. Never do lists. Maximum 77 tokens.
-
 Here's the music description :
-
 {cap_result}
 
 """
@@ -95,7 +105,11 @@ def infer(audio_file):
 
 print(f"Llama2 result: {result}")
 
-
+# βββ
+
+prompt = result
+conditioning, pooled = compel(prompt)
+images = pipe(prompt_embeds=conditioning, pooled_prompt_embeds=pooled).images[0]
 
 print("Finished")
 
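For context, here is a minimal standalone sketch of the compel + SDXL flow this commit introduces, outside the Space. It assumes the same SDXL checkpoint as the diff; the prompt text and output filename below are illustrative stand-ins for the Llama-generated description that the Space passes in as `result`, not code from the repository.

# Minimal standalone sketch of the compel + SDXL flow added by this commit.
# Assumptions: same checkpoint as the diff; the prompt text and output path
# below are illustrative, not taken from the Space.
import torch
from compel import Compel, ReturnedEmbeddingsType
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    use_safetensors=True,
    variant="fp16",
).to("cuda")

# SDXL carries two tokenizer/text-encoder pairs; only the second encoder
# supplies the pooled embedding, hence requires_pooled=[False, True].
compel = Compel(
    tokenizer=[pipe.tokenizer, pipe.tokenizer_2],
    text_encoder=[pipe.text_encoder, pipe.text_encoder_2],
    returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
    requires_pooled=[False, True],
)

# Stand-in for the Llama-generated description (`result` in the Space).
prompt = (
    "a rain-soaked neon city at night, reflections on wet asphalt, "
    "melancholic cinematic mood, wide angle, volumetric light"
)

# compel encodes the prompt with both text encoders and returns the tensors
# the SDXL pipeline expects in place of a plain `prompt=` string.
conditioning, pooled = compel(prompt)
image = pipe(prompt_embeds=conditioning, pooled_prompt_embeds=pooled).images[0]
image.save("illustration.png")

In the Space itself, the last hunk does the same thing with `prompt = result`, so the Llama output is what gets encoded; swapping the plain `prompt=` argument for `prompt_embeds`/`pooled_prompt_embeds` is what routes the description through compel, which also offers prompt weighting and, optionally, handling of prompts past the 77-token CLIP window (presumably the "max tokens optimization" named in the commit title).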