Update README.md
Browse files
README.md
CHANGED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
---
|
| 2 |
-
license:
|
| 3 |
base_model:
|
| 4 |
-
- black-forest-labs/FLUX.1-
|
| 5 |
base_model_relation: quantized
|
| 6 |
pipeline_tag: text-to-image
|
| 7 |
---
|
| 8 |
|
| 9 |
-
# Elastic model: FLUX.1-
|
| 10 |
|
| 11 |
|
| 12 |
## Overview
|
|
@@ -66,13 +66,13 @@ pip install 'thestage-elastic-models[nvidia]' \
|
|
| 66 |
---
|
| 67 |
|
| 68 |
|
| 69 |
-
Elastic Models provides the same interface as HuggingFace Diffusers. Here is an example of how to use the FLUX.1-
|
| 70 |
|
| 71 |
```python
|
| 72 |
import torch
|
| 73 |
from elastic_models.diffusers import FluxPipeline
|
| 74 |
|
| 75 |
-
mode_name = 'black-forest-labs/FLUX.1-
|
| 76 |
hf_token = ''
|
| 77 |
device = torch.device("cuda")
|
| 78 |
|
|
@@ -99,9 +99,9 @@ for prompt, output_image in zip(prompts, output.images):
|
|
| 99 |
---
|
| 100 |
|
| 101 |
|
| 102 |
-
We have used PartiPrompts and DrawBench datasets to evaluate the quality of images generated by different sizes of FLUX.1-
|
| 103 |
|
| 104 |
-
** |
|
| 113 |
-
| **ARNIQA (DrawBench)** |
|
| 114 |
-
| **CLIP IQA (PartiPrompts)** |
|
| 115 |
-
| **CLIP IQA (DrawBench)** |
|
| 116 |
-
| **VQA Faithfulness (PartiPrompts)** | 87
|
| 117 |
-
| **VQA Faithfulness (DrawBench)** |
|
| 118 |
-
| **PSNR (PartiPrompts)** |
|
| 119 |
-
| **SSIM (PartiPrompts)** | 0.
|
| 120 |
|
| 121 |
|
| 122 |
## Datasets
|
|
@@ -145,9 +145,9 @@ We have used PartiPrompts and DrawBench datasets to evaluate the quality of imag
|
|
| 145 |
---
|
| 146 |
|
| 147 |
|
| 148 |
-
We have measured the latency of different sizes of FLUX.1-
|
| 149 |
|
| 150 |
-
 for generating a 1024x1024 image using different model size
|
|
| 157 |
|
| 158 |
| **GPU/Model Size**| **S**| **M**| **L**| **XL**| **Original** |
|
| 159 |
| --- | --- | --- | --- | --- | --- |
|
| 160 |
-
| **H100** |
|
| 161 |
-
| **L40s** |
|
| 162 |
-
| **B200** |
|
| 163 |
-
| **GeForce RTX 5090** |
|
| 164 |
|
| 165 |
|
| 166 |
## Benchmarking Methodology
|
|
@@ -171,7 +171,7 @@ Latency (in seconds) for generating a 1024x1024 image using different model size
|
|
| 171 |
The benchmarking was performed on a single GPU with a batch size of 1. Each model was run for 10 iterations, and the average latency was calculated.
|
| 172 |
|
| 173 |
> **Algorithm summary:**
|
| 174 |
-
> 1. Load the FLUX.1-
|
| 175 |
> 2. Move the model to the GPU.
|
| 176 |
> 3. Prepare a sample prompt for image generation.
|
| 177 |
> 4. Run the model for a number of iterations (e.g., 10) and measure the time taken for each iteration. On each iteration:
|
|
@@ -191,7 +191,7 @@ The benchmarking was performed on a single GPU with a batch size of 1. Each mode
|
|
| 191 |
import torch
|
| 192 |
from elastic_models.diffusers import FluxPipeline
|
| 193 |
|
| 194 |
-
mode_name = 'black-forest-labs/FLUX.1-
|
| 195 |
hf_token = ''
|
| 196 |
device = torch.device("cuda")
|
| 197 |
|
|
@@ -209,7 +209,7 @@ prompt = ["Kitten eating a banana"]
|
|
| 209 |
generate_kwargs={
|
| 210 |
"height": 1024,
|
| 211 |
"width": 1024,
|
| 212 |
-
"num_inference_steps":
|
| 213 |
"cfg_scale": 0.0
|
| 214 |
}
|
| 215 |
|
|
@@ -244,6 +244,69 @@ print(f"Average Latency over {num_runs} runs: {average_latency} seconds")
|
|
| 244 |
```
|
| 245 |
|
| 246 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
## Serving with Docker Image
|
| 248 |
|
| 249 |
---
|
|
@@ -267,7 +330,7 @@ docker run --rm -ti \
|
|
| 267 |
--name serving_thestage_model \
|
| 268 |
-p 8000:80 \
|
| 269 |
-e AUTH_TOKEN=<AUTH_TOKEN> \
|
| 270 |
-
-e MODEL_REPO=black-forest-labs/FLUX.1-
|
| 271 |
-e MODEL_SIZE=<MODEL_SIZE> \
|
| 272 |
-e MODEL_BATCH=<MAX_BATCH_SIZE> \
|
| 273 |
-e HUGGINGFACE_ACCESS_TOKEN=<HUGGINGFACE_ACCESS_TOKEN> \
|
|
@@ -295,13 +358,13 @@ You can invoke the endpoint using CURL as follows:
|
|
| 295 |
curl -X POST <http://127.0.0.1:8000/v1/images/generations> \
|
| 296 |
-H "Authorization: Bearer <AUTH_TOKEN>" \
|
| 297 |
-H "Content-Type: application/json" \
|
| 298 |
-
-H "X-Model-Name: flux-1-
|
| 299 |
-d '{
|
| 300 |
"prompt": "Cat eating banana",
|
| 301 |
"seed": 12,
|
| 302 |
"aspect_ratio": "1:1",
|
| 303 |
"guidance_scale": 6.5,
|
| 304 |
-
"num_inference_steps":
|
| 305 |
}' \
|
| 306 |
--output sunset.webp -D -
|
| 307 |
```
|
|
@@ -317,12 +380,12 @@ payload = json.dumps({
|
|
| 317 |
"seed": 12,
|
| 318 |
"aspect_ratio": "1:1",
|
| 319 |
"guidance_scale": 6.5,
|
| 320 |
-
"num_inference_steps":
|
| 321 |
})
|
| 322 |
headers = {
|
| 323 |
'Authorization': 'Bearer <AUTH_TOKEN>',
|
| 324 |
'Content-Type': 'application/json',
|
| 325 |
-
'X-Model-Name': 'flux-1-
|
| 326 |
}
|
| 327 |
response = requests.request("POST", url, headers=headers, data=payload)
|
| 328 |
with open("sunset.webp", "wb") as f:
|
|
@@ -337,7 +400,7 @@ from openai import OpenAI
|
|
| 337 |
|
| 338 |
BASE_URL = "http://<your_ip>/v1"
|
| 339 |
API_KEY = ""
|
| 340 |
-
MODEL = "flux-1-
|
| 341 |
|
| 342 |
client = OpenAI(
|
| 343 |
api_key=API_KEY,
|
|
@@ -353,7 +416,7 @@ response = client.with_raw_response.images.generate(
|
|
| 353 |
"seed": 111,
|
| 354 |
"aspect_ratio": "1:1",
|
| 355 |
"guidance_scale": 3.5,
|
| 356 |
-
"num_inference_steps":
|
| 357 |
},
|
| 358 |
)
|
| 359 |
|
|
@@ -386,7 +449,7 @@ with open("thestage_image.webp", "wb") as f:
|
|
| 386 |
|
| 387 |
> `X-Model-Name`: `string`
|
| 388 |
>
|
| 389 |
-
> Specifies the model to use for generation. Format: `flux-1-
|
| 390 |
|
| 391 |
### Input Body
|
| 392 |
|
|
@@ -402,7 +465,7 @@ with open("thestage_image.webp", "wb") as f:
|
|
| 402 |
|
| 403 |
> `num_inference_steps`: `int32`
|
| 404 |
>
|
| 405 |
-
> Number of diffusion steps to use for generation. Higher values yield better quality but take longer. Default is
|
| 406 |
|
| 407 |
> `aspect_ratio`: `string`
|
| 408 |
>
|
|
@@ -451,7 +514,7 @@ Set your environment variables in `modal_serving.py`:
|
|
| 451 |
# modal_serving.py
|
| 452 |
|
| 453 |
ENVS = {
|
| 454 |
-
"MODEL_REPO": "black-forest-labs/FLUX.1-
|
| 455 |
"MODEL_BATCH": "4",
|
| 456 |
"THESTAGE_AUTH_TOKEN": "",
|
| 457 |
"HUGGINGFACE_ACCESS_TOKEN": "",
|
|
@@ -482,7 +545,7 @@ Set your desired GPU type and autoscaling variables in `modal_serving.py`:
|
|
| 482 |
)
|
| 483 |
@modal.web_server(
|
| 484 |
80,
|
| 485 |
-
label="black-forest-labs/FLUX.1-
|
| 486 |
startup_timeout=60*20
|
| 487 |
)
|
| 488 |
def serve():
|
|
|
|
| 1 |
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
base_model:
|
| 4 |
+
- black-forest-labs/FLUX.1-schnell
|
| 5 |
base_model_relation: quantized
|
| 6 |
pipeline_tag: text-to-image
|
| 7 |
---
|
| 8 |
|
| 9 |
+
# Elastic model: FLUX.1-schnell
|
| 10 |
|
| 11 |
|
| 12 |
## Overview
|
|
|
|
| 66 |
---
|
| 67 |
|
| 68 |
|
| 69 |
+
Elastic Models provides the same interface as HuggingFace Diffusers. Here is an example of how to use the FLUX.1-schnell model:
|
| 70 |
|
| 71 |
```python
|
| 72 |
import torch
|
| 73 |
from elastic_models.diffusers import FluxPipeline
|
| 74 |
|
| 75 |
+
mode_name = 'black-forest-labs/FLUX.1-schnell'
|
| 76 |
hf_token = ''
|
| 77 |
device = torch.device("cuda")
|
| 78 |
|
|
|
|
| 99 |
---
|
| 100 |
|
| 101 |
|
| 102 |
+
We have used PartiPrompts and DrawBench datasets to evaluate the quality of images generated by different sizes of FLUX.1-schnell models (S, M, L, XL) compared to the original model. The evaluation metrics include ARNIQA, CLIP IQA, PSNR, SSIM, and VQA Faithfulness.
|
| 103 |
|
| 104 |
+

|
| 105 |
|
| 106 |
### Quality Benchmark Results
|
| 107 |
|
|
|
|
| 109 |
|
| 110 |
| **Metric/Model Size**| **S**| **M**| **L**| **XL**| **Original** |
|
| 111 |
| --- | --- | --- | --- | --- | --- |
|
| 112 |
+
| **ARNIQA (PartiPrompts)** | 62.8 | 63.2 | 64.3 | 65.2 | 65.2 |
|
| 113 |
+
| **ARNIQA (DrawBench)** | 61.4 | 62.5 | 63.9 | 64 | 64 |
|
| 114 |
+
| **CLIP IQA (PartiPrompts)** | 83.6 | 84.1 | 84.9 | 85.7 | 85.7 |
|
| 115 |
+
| **CLIP IQA (DrawBench)** | 82.7 | 84 | 84.4 | 84.5 | 84.5 |
|
| 116 |
+
| **VQA Faithfulness (PartiPrompts)** | 87 | 86 | 86.2 | 85.7 | 85.7 |
|
| 117 |
+
| **VQA Faithfulness (DrawBench)** | 73.8 | 72.7 | 74.4 | 74.3 | 74.3 |
|
| 118 |
+
| **PSNR (PartiPrompts)** | 29.9 | 30.2 | 31 | N/A | N/A |
|
| 119 |
+
| **SSIM (PartiPrompts)** | 0.66 | 0.71 | 0.86 | 1.0 | 1.0 |
|
| 120 |
|
| 121 |
|
| 122 |
## Datasets
|
|
|
|
| 145 |
---
|
| 146 |
|
| 147 |
|
| 148 |
+
We have measured the latency of different sizes of the FLUX.1-schnell model (S, M, L, XL, original) on various GPUs. The measurements were taken for generating images of size 1024x1024 pixels.
|
| 149 |
|
| 150 |
+

|
| 151 |
|
| 152 |
### Latency Benchmark Results
|
| 153 |
|
|
|
|
| 157 |
|
| 158 |
| **GPU/Model Size**| **S**| **M**| **L**| **XL**| **Original** |
|
| 159 |
| --- | --- | --- | --- | --- | --- |
|
| 160 |
+
| **H100** | 0.51 | 0.51 | 0.51 | 0.71 | 1.04 |
|
| 161 |
+
| **L40s** | 1.59 | 1.6 | 1.6 | 2.19 | 2.5 |
|
| 162 |
+
| **B200** | 0.38 | 0.38 | 0.38 | 0.39 | 0.75 |
|
| 163 |
+
| **GeForce RTX 5090** | 1.19 | N/A | N/A | N/A | N/A |
|
| 164 |
|
| 165 |
|
| 166 |
## Benchmarking Methodology
|
|
|
|
| 171 |
The benchmarking was performed on a single GPU with a batch size of 1. Each model was run for 10 iterations, and the average latency was calculated.
|
| 172 |
|
| 173 |
> **Algorithm summary:**
|
| 174 |
+
> 1. Load the FLUX.1-schnell model with the specified size (S, M, L, XL, original).
|
| 175 |
> 2. Move the model to the GPU.
|
| 176 |
> 3. Prepare a sample prompt for image generation.
|
| 177 |
> 4. Run the model for a number of iterations (e.g., 10) and measure the time taken for each iteration. On each iteration:
|
|
|
|
| 191 |
import torch
|
| 192 |
from elastic_models.diffusers import FluxPipeline
|
| 193 |
|
| 194 |
+
mode_name = 'black-forest-labs/FLUX.1-schnell'
|
| 195 |
hf_token = ''
|
| 196 |
device = torch.device("cuda")
|
| 197 |
|
|
|
|
| 209 |
generate_kwargs={
|
| 210 |
"height": 1024,
|
| 211 |
"width": 1024,
|
| 212 |
+
"num_inference_steps": 4,
|
| 213 |
"cfg_scale": 0.0
|
| 214 |
}
|
| 215 |
|
|
|
|
| 244 |
```
|
| 245 |
|
| 246 |
|
| 247 |
+
## LoRA Support
|
| 248 |
+
|
| 249 |
+
---
|
| 250 |
+
|
| 251 |
+
Elastic FLUX.1-schnell engines support **runtime LoRA hot-swap** — load, switch, or disable LoRA files without recompilation or engine reload. LoRA weights are dynamic tensor inputs to the compiled engine.
|
| 252 |
+
|
| 253 |
+
- **Supported ranks**: 1–256 (compiled with dynamic rank)
|
| 254 |
+
- **Supported formats**: XLabs, diffusers, BFL Control (auto-detected)
|
| 255 |
+
- **Hot-swap**: switch LoRA instantly by calling `load_lora_weights()`
|
| 256 |
+
- **Disable**: `unload_lora_weights()` removes LoRA with minimal overhead
|
| 257 |
+
|
| 258 |
+
> LoRA adds ~5-15% latency overhead. LoRA files must be downloaded locally before use (e.g. via `huggingface-cli download`).
|
| 259 |
+
|
| 260 |
+
### Usage with LoRA
|
| 261 |
+
|
| 262 |
+
---
|
| 263 |
+
|
| 264 |
+
```python
|
| 265 |
+
import torch
|
| 266 |
+
from elastic_models.diffusers import FluxPipeline
|
| 267 |
+
|
| 268 |
+
model_name = "black-forest-labs/FLUX.1-schnell"
|
| 269 |
+
device = torch.device("cuda")
|
| 270 |
+
|
| 271 |
+
pipeline = FluxPipeline.from_pretrained(
|
| 272 |
+
model_name,
|
| 273 |
+
torch_dtype=torch.bfloat16,
|
| 274 |
+
mode="S",
|
| 275 |
+
lora_support=True,
|
| 276 |
+
)
|
| 277 |
+
pipeline.to(device)
|
| 278 |
+
|
| 279 |
+
# Load a LoRA and generate
|
| 280 |
+
pipeline.load_lora_weights("./loras/realism_lora.safetensors", strength=1.0)
|
| 281 |
+
output = pipeline(prompt=["A portrait photo of a woman in golden hour light"])
|
| 282 |
+
output.images[0].save("realism_lora.png")
|
| 283 |
+
|
| 284 |
+
# Hot-swap to a different LoRA (no engine reload)
|
| 285 |
+
pipeline.load_lora_weights("./loras/anime_lora.safetensors", strength=1.0)
|
| 286 |
+
output = pipeline(prompt=["Anime girl with blue hair in a garden"])
|
| 287 |
+
output.images[0].save("anime_lora.png")
|
| 288 |
+
|
| 289 |
+
# Disable LoRA
|
| 290 |
+
pipeline.unload_lora_weights()
|
| 291 |
+
output = pipeline(prompt=["A castle on a hill at sunset"])
|
| 292 |
+
output.images[0].save("no_lora.png")
|
| 293 |
+
```
|
| 294 |
+
|
| 295 |
+
### LoRA Latency Benchmarks
|
| 296 |
+
|
| 297 |
+
---
|
| 298 |
+
|
| 299 |
+
Time in seconds to generate one 1024x1024 image (average over 3 LoRAs — ranks 32, 32, and 256).
|
| 300 |
+
|
| 301 |
+
| **GPU/Model Size**| **S**| **M**| **L**| **XL**| **Original (unfused)** |
|
| 302 |
+
| --- | --- | --- | --- | --- | --- |
|
| 303 |
+
| **H100** | 0.71 | 0.71 | 0.71 | 0.87 | 1.24 |
|
| 304 |
+
| **L40s** | 1.9 | 1.9 | 1.9 | 2.4 | 2.93 |
|
| 305 |
+
| **B200** | 0.59 | 0.59 | 0.59 | 0.53 | 0.89 |
|
| 306 |
+
| **GeForce RTX 5090** | 1.46 | N/A | N/A | N/A | N/A |
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
|
| 310 |
## Serving with Docker Image
|
| 311 |
|
| 312 |
---
|
|
|
|
| 330 |
--name serving_thestage_model \
|
| 331 |
-p 8000:80 \
|
| 332 |
-e AUTH_TOKEN=<AUTH_TOKEN> \
|
| 333 |
+
-e MODEL_REPO=black-forest-labs/FLUX.1-schnell \
|
| 334 |
-e MODEL_SIZE=<MODEL_SIZE> \
|
| 335 |
-e MODEL_BATCH=<MAX_BATCH_SIZE> \
|
| 336 |
-e HUGGINGFACE_ACCESS_TOKEN=<HUGGINGFACE_ACCESS_TOKEN> \
|
|
|
|
| 358 |
curl -X POST <http://127.0.0.1:8000/v1/images/generations> \
|
| 359 |
-H "Authorization: Bearer <AUTH_TOKEN>" \
|
| 360 |
-H "Content-Type: application/json" \
|
| 361 |
+
-H "X-Model-Name: flux-1-schnell-<MODEL_SIZE>-bs<MAX_BATCH_SIZE>" \
|
| 362 |
-d '{
|
| 363 |
"prompt": "Cat eating banana",
|
| 364 |
"seed": 12,
|
| 365 |
"aspect_ratio": "1:1",
|
| 366 |
"guidance_scale": 6.5,
|
| 367 |
+
"num_inference_steps": 4
|
| 368 |
}' \
|
| 369 |
--output sunset.webp -D -
|
| 370 |
```
|
|
|
|
| 380 |
"seed": 12,
|
| 381 |
"aspect_ratio": "1:1",
|
| 382 |
"guidance_scale": 6.5,
|
| 383 |
+
"num_inference_steps": 4
|
| 384 |
})
|
| 385 |
headers = {
|
| 386 |
'Authorization': 'Bearer <AUTH_TOKEN>',
|
| 387 |
'Content-Type': 'application/json',
|
| 388 |
+
'X-Model-Name': 'flux-1-schnell-<MODEL_SIZE>-bs<MAX_BATCH_SIZE>'
|
| 389 |
}
|
| 390 |
response = requests.request("POST", url, headers=headers, data=payload)
|
| 391 |
with open("sunset.webp", "wb") as f:
|
|
|
|
| 400 |
|
| 401 |
BASE_URL = "http://<your_ip>/v1"
|
| 402 |
API_KEY = ""
|
| 403 |
+
MODEL = "flux-1-schnell-<MODEL_SIZE>-bs<MAX_BATCH_SIZE>"
|
| 404 |
|
| 405 |
client = OpenAI(
|
| 406 |
api_key=API_KEY,
|
|
|
|
| 416 |
"seed": 111,
|
| 417 |
"aspect_ratio": "1:1",
|
| 418 |
"guidance_scale": 3.5,
|
| 419 |
+
"num_inference_steps": 4
|
| 420 |
},
|
| 421 |
)
|
| 422 |
|
|
|
|
| 449 |
|
| 450 |
> `X-Model-Name`: `string`
|
| 451 |
>
|
| 452 |
+
> Specifies the model to use for generation. Format: `flux-1-schnell-<size>-bs<batch_size>`, where `<size>` is one of `S`, `M`, `L`, `XL`, `original` and `<batch_size>` is the maximum batch size configured during container startup.
|
| 453 |
|
| 454 |
### Input Body
|
| 455 |
|
|
|
|
| 465 |
|
| 466 |
> `num_inference_steps`: `int32`
|
| 467 |
>
|
| 468 |
+
> Number of diffusion steps to use for generation. Higher values yield better quality but take longer. Default is 4.
|
| 469 |
|
| 470 |
> `aspect_ratio`: `string`
|
| 471 |
>
|
|
|
|
| 514 |
# modal_serving.py
|
| 515 |
|
| 516 |
ENVS = {
|
| 517 |
+
"MODEL_REPO": "black-forest-labs/FLUX.1-schnell",
|
| 518 |
"MODEL_BATCH": "4",
|
| 519 |
"THESTAGE_AUTH_TOKEN": "",
|
| 520 |
"HUGGINGFACE_ACCESS_TOKEN": "",
|
|
|
|
| 545 |
)
|
| 546 |
@modal.web_server(
|
| 547 |
80,
|
| 548 |
+
label="black-forest-labs/FLUX.1-schnell-test",
|
| 549 |
startup_timeout=60*20
|
| 550 |
)
|
| 551 |
def serve():
|