Update README.md
Browse files
README.md
CHANGED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
---
|
| 2 |
-
license:
|
| 3 |
base_model:
|
| 4 |
-
- black-forest-labs/FLUX.1-schnell
|
| 5 |
base_model_relation: quantized
|
| 6 |
pipeline_tag: text-to-image
|
| 7 |
---
|
| 8 |
|
| 9 |
-
# Elastic model: FLUX.1-schnell
|
| 10 |
|
| 11 |
|
| 12 |
## Overview
|
|
@@ -66,13 +66,13 @@ pip install 'thestage-elastic-models[nvidia]' \
|
|
| 66 |
---
|
| 67 |
|
| 68 |
|
| 69 |
-
Elastic Models provides the same interface as HuggingFace Diffusers. Here is an example of how to use the FLUX.1-schnell model:
|
| 70 |
|
| 71 |
```python
|
| 72 |
import torch
|
| 73 |
from elastic_models.diffusers import FluxPipeline
|
| 74 |
|
| 75 |
-
mode_name = 'black-forest-labs/FLUX.1-schnell'
|
| 76 |
hf_token = ''
|
| 77 |
device = torch.device("cuda")
|
| 78 |
|
|
@@ -99,9 +99,9 @@ for prompt, output_image in zip(prompts, output.images):
|
|
| 99 |
---
|
| 100 |
|
| 101 |
|
| 102 |
-
We have used PartiPrompts and DrawBench datasets to evaluate the quality of images generated by different sizes of FLUX.1-schnell models (S, M, L, XL) compared to the original model. The evaluation metrics include ARNIQA, CLIP IQA, PSNR, SSIM, and VQA Faithfulness.
|
| 103 |
|
| 104 |
-
** |
|
| 113 |
-
| **ARNIQA (DrawBench)** |
|
| 114 |
-
| **CLIP IQA (PartiPrompts)** |
|
| 115 |
-
| **CLIP IQA (DrawBench)** |
|
| 116 |
-
| **VQA Faithfulness (PartiPrompts)** | 87 |
|
| 117 |
-
| **VQA Faithfulness (DrawBench)** |
|
| 118 |
-
| **PSNR (PartiPrompts)** |
|
| 119 |
-
| **SSIM (PartiPrompts)** | 0.
|
| 120 |
|
| 121 |
|
| 122 |
## Datasets
|
|
@@ -145,9 +145,9 @@ We have used PartiPrompts and DrawBench datasets to evaluate the quality of imag
|
|
| 145 |
---
|
| 146 |
|
| 147 |
|
| 148 |
-
We have measured the latency of different sizes of FLUX.1-schnell model (S, M, L, XL, original) on various GPUs. The measurements were taken for generating images of size 1024x1024 pixels.
|
| 149 |
|
| 150 |
-
 for generating a 1024x1024 image using different model size
|
|
| 157 |
|
| 158 |
| **GPU/Model Size**| **S**| **M**| **L**| **XL**| **Original** |
|
| 159 |
| --- | --- | --- | --- | --- | --- |
|
| 160 |
-
| **H100** |
|
| 161 |
-
| **L40s** |
|
| 162 |
-
| **B200** |
|
| 163 |
-
| **GeForce RTX 5090** |
|
| 164 |
|
| 165 |
|
| 166 |
## Benchmarking Methodology
|
|
@@ -171,7 +171,7 @@ Latency (in seconds) for generating a 1024x1024 image using different model size
|
|
| 171 |
The benchmarking was performed on a single GPU with a batch size of 1. Each model was run for 10 iterations, and the average latency was calculated.
|
| 172 |
|
| 173 |
> **Algorithm summary:**
|
| 174 |
-
> 1. Load the FLUX.1-schnell model with the specified size (S, M, L, XL, original).
|
| 175 |
> 2. Move the model to the GPU.
|
| 176 |
> 3. Prepare a sample prompt for image generation.
|
| 177 |
> 4. Run the model for a number of iterations (e.g., 10) and measure the time taken for each iteration. On each iteration:
|
|
@@ -191,7 +191,7 @@ The benchmarking was performed on a single GPU with a batch size of 1. Each mode
|
|
| 191 |
import torch
|
| 192 |
from elastic_models.diffusers import FluxPipeline
|
| 193 |
|
| 194 |
-
mode_name = 'black-forest-labs/FLUX.1-schnell'
|
| 195 |
hf_token = ''
|
| 196 |
device = torch.device("cuda")
|
| 197 |
|
|
@@ -209,7 +209,7 @@ prompt = ["Kitten eating a banana"]
|
|
| 209 |
generate_kwargs={
|
| 210 |
"height": 1024,
|
| 211 |
"width": 1024,
|
| 212 |
-
"num_inference_steps":
|
| 213 |
"cfg_scale": 0.0
|
| 214 |
}
|
| 215 |
|
|
@@ -248,7 +248,7 @@ print(f"Average Latency over {num_runs} runs: {average_latency} seconds")
|
|
| 248 |
|
| 249 |
---
|
| 250 |
|
| 251 |
-
Elastic FLUX.1-schnell engines support **runtime LoRA hot-swap** — load, switch, or disable LoRA files without recompilation or engine reload. LoRA weights are dynamic tensor inputs to the compiled engine.
|
| 252 |
|
| 253 |
- **Supported ranks**: 1–256 (compiled with dynamic rank)
|
| 254 |
- **Supported formats**: XLabs, diffusers, BFL Control (auto-detected)
|
|
@@ -265,7 +265,7 @@ Elastic FLUX.1-schnell engines support **runtime LoRA hot-swap** — load, switc
|
|
| 265 |
import torch
|
| 266 |
from elastic_models.diffusers import FluxPipeline
|
| 267 |
|
| 268 |
-
model_name = "black-forest-labs/FLUX.1-schnell"
|
| 269 |
device = torch.device("cuda")
|
| 270 |
|
| 271 |
pipeline = FluxPipeline.from_pretrained(
|
|
@@ -300,10 +300,10 @@ Time in seconds to generate one 1024x1024 image (average over 3 LoRAs — rank 3
|
|
| 300 |
|
| 301 |
| **GPU/Model Size**| **S**| **M**| **L**| **XL**| **Original (unfused)** |
|
| 302 |
| --- | --- | --- | --- | --- | --- |
|
| 303 |
-
| **H100** |
|
| 304 |
-
| **L40s** |
|
| 305 |
-
| **B200** |
|
| 306 |
-
| **GeForce RTX 5090** |
|
| 307 |
|
| 308 |
|
| 309 |
|
|
@@ -330,7 +330,7 @@ docker run --rm -ti \
|
|
| 330 |
--name serving_thestage_model \
|
| 331 |
-p 8000:80 \
|
| 332 |
-e AUTH_TOKEN=<AUTH_TOKEN> \
|
| 333 |
-
-e MODEL_REPO=black-forest-labs/FLUX.1-schnell \
|
| 334 |
-e MODEL_SIZE=<MODEL_SIZE> \
|
| 335 |
-e MODEL_BATCH=<MAX_BATCH_SIZE> \
|
| 336 |
-e HUGGINGFACE_ACCESS_TOKEN=<HUGGINGFACE_ACCESS_TOKEN> \
|
|
@@ -358,13 +358,13 @@ You can invoke the endpoint using CURL as follows:
|
|
| 358 |
curl -X POST <http://127.0.0.1:8000/v1/images/generations> \
|
| 359 |
-H "Authorization: Bearer <AUTH_TOKEN>" \
|
| 360 |
-H "Content-Type: application/json" \
|
| 361 |
-
-H "X-Model-Name: flux-1-schnell-<MODEL_SIZE>-bs<MAX_BATCH_SIZE>" \
|
| 362 |
-d '{
|
| 363 |
"prompt": "Cat eating banana",
|
| 364 |
"seed": 12,
|
| 365 |
"aspect_ratio": "1:1",
|
| 366 |
"guidance_scale": 6.5,
|
| 367 |
-
"num_inference_steps":
|
| 368 |
}' \
|
| 369 |
--output sunset.webp -D -
|
| 370 |
```
|
|
@@ -380,12 +380,12 @@ payload = json.dumps({
|
|
| 380 |
"seed": 12,
|
| 381 |
"aspect_ratio": "1:1",
|
| 382 |
"guidance_scale": 6.5,
|
| 383 |
-
"num_inference_steps":
|
| 384 |
})
|
| 385 |
headers = {
|
| 386 |
'Authorization': 'Bearer <AUTH_TOKEN>',
|
| 387 |
'Content-Type': 'application/json',
|
| 388 |
-
'X-Model-Name': 'flux-1-
|
| 389 |
}
|
| 390 |
response = requests.request("POST", url, headers=headers, data=payload)
|
| 391 |
with open("sunset.webp", "wb") as f:
|
|
@@ -400,7 +400,7 @@ from openai import OpenAI
|
|
| 400 |
|
| 401 |
BASE_URL = "http://<your_ip>/v1"
|
| 402 |
API_KEY = ""
|
| 403 |
-
MODEL = "flux-1-schnell-<MODEL_SIZE>-bs<MAX_BATCH_SIZE>"
|
| 404 |
|
| 405 |
client = OpenAI(
|
| 406 |
api_key=API_KEY,
|
|
@@ -416,7 +416,7 @@ response = client.with_raw_response.images.generate(
|
|
| 416 |
"seed": 111,
|
| 417 |
"aspect_ratio": "1:1",
|
| 418 |
"guidance_scale": 3.5,
|
| 419 |
-
"num_inference_steps":
|
| 420 |
},
|
| 421 |
)
|
| 422 |
|
|
@@ -449,7 +449,7 @@ with open("thestage_image.webp", "wb") as f:
|
|
| 449 |
|
| 450 |
> `X-Model-Name`: `string`
|
| 451 |
>
|
| 452 |
-
> Specifies the model to use for generation. Format: `flux-1-
|
| 453 |
|
| 454 |
### Input Body
|
| 455 |
|
|
@@ -465,7 +465,7 @@ with open("thestage_image.webp", "wb") as f:
|
|
| 465 |
|
| 466 |
> `num_inference_steps`: `int32`
|
| 467 |
>
|
| 468 |
-
> Number of diffusion steps to use for generation. Higher values yield better quality but take longer. Default is
|
| 469 |
|
| 470 |
> `aspect_ratio`: `string`
|
| 471 |
>
|
|
@@ -514,7 +514,7 @@ Set your environment variables in `modal_serving.py`:
|
|
| 514 |
# modal_serving.py
|
| 515 |
|
| 516 |
ENVS = {
|
| 517 |
-
"MODEL_REPO": "black-forest-labs/FLUX.1-
|
| 518 |
"MODEL_BATCH": "4",
|
| 519 |
"THESTAGE_AUTH_TOKEN": "",
|
| 520 |
"HUGGINGFACE_ACCESS_TOKEN": "",
|
|
@@ -545,7 +545,7 @@ Set your desired GPU type and autoscaling variables in `modal_serving.py`:
|
|
| 545 |
)
|
| 546 |
@modal.web_server(
|
| 547 |
80,
|
| 548 |
-
label="black-forest-labs/FLUX.1-
|
| 549 |
startup_timeout=60*20
|
| 550 |
)
|
| 551 |
def serve():
|
|
|
|
| 1 |
---
|
| 2 |
+
license: other
|
| 3 |
base_model:
|
| 4 |
+
- black-forest-labs/FLUX.1-dev
|
| 5 |
base_model_relation: quantized
|
| 6 |
pipeline_tag: text-to-image
|
| 7 |
---
|
| 8 |
|
| 9 |
+
# Elastic model: FLUX.1-dev
|
| 10 |
|
| 11 |
|
| 12 |
## Overview
|
|
|
|
| 66 |
---
|
| 67 |
|
| 68 |
|
| 69 |
+
Elastic Models provides the same interface as HuggingFace Diffusers. Here is an example of how to use the FLUX.1-dev model:
|
| 70 |
|
| 71 |
```python
|
| 72 |
import torch
|
| 73 |
from elastic_models.diffusers import FluxPipeline
|
| 74 |
|
| 75 |
+
mode_name = 'black-forest-labs/FLUX.1-dev'
|
| 76 |
hf_token = ''
|
| 77 |
device = torch.device("cuda")
|
| 78 |
|
|
|
|
| 99 |
---
|
| 100 |
|
| 101 |
|
| 102 |
+
We have used PartiPrompts and DrawBench datasets to evaluate the quality of images generated by different sizes of FLUX.1-dev models (S, M, L, XL) compared to the original model. The evaluation metrics include ARNIQA, CLIP IQA, PSNR, SSIM, and VQA Faithfulness.
|
| 103 |
|
| 104 |
+

|
| 105 |
|
| 106 |
### Quality Benchmark Results
|
| 107 |
|
|
|
|
| 109 |
|
| 110 |
| **Metric/Model Size**| **S**| **M**| **L**| **XL**| **Original** |
|
| 111 |
| --- | --- | --- | --- | --- | --- |
|
| 112 |
+
| **ARNIQA (PartiPrompts)** | 64.1 | 63.2 | 61.9 | 66.8 | 66.9 |
|
| 113 |
+
| **ARNIQA (DrawBench)** | 64.3 | 63.5 | 63.6 | 68.2 | 68.5 |
|
| 114 |
+
| **CLIP IQA (PartiPrompts)** | 85.5 | 86.4 | 83.8 | 88.3 | 87.9 |
|
| 115 |
+
| **CLIP IQA (DrawBench)** | 86.4 | 86.5 | 84.5 | 89.5 | 90.0 |
|
| 116 |
+
| **VQA Faithfulness (PartiPrompts)** | 87.5 | 85.5 | 85.5 | 85.5 | 88.6 |
|
| 117 |
+
| **VQA Faithfulness (DrawBench)** | 69.3 | 64.7 | 64.8 | 67.8 | 65.2 |
|
| 118 |
+
| **PSNR (PartiPrompts)** | 30.22 | 30.24 | 30.38 | N/A | N/A |
|
| 119 |
+
| **SSIM (PartiPrompts)** | 0.72 | 0.72 | 0.76 | 1.0 | 1.0 |
|
| 120 |
|
| 121 |
|
| 122 |
## Datasets
|
|
|
|
| 145 |
---
|
| 146 |
|
| 147 |
|
| 148 |
+
We have measured the latency of different sizes of FLUX.1-dev model (S, M, L, XL, original) on various GPUs. The measurements were taken for generating images of size 1024x1024 pixels.
|
| 149 |
|
| 150 |
+

|
| 151 |
|
| 152 |
### Latency Benchmark Results
|
| 153 |
|
|
|
|
| 157 |
|
| 158 |
| **GPU/Model Size**| **S**| **M**| **L**| **XL**| **Original** |
|
| 159 |
| --- | --- | --- | --- | --- | --- |
|
| 160 |
+
| **H100** | 2.88 | 3.06 | 3.25 | 4.18 | 6.46 |
|
| 161 |
+
| **L40s** | 9.22 | 10.07 | 10.67 | 14.39 | 16 |
|
| 162 |
+
| **B200** | 1.93 | 2.04 | 2.15 | 2.77 | 4.52 |
|
| 163 |
+
| **GeForce RTX 5090** | 5.79 | N/A | N/A | N/A | N/A |
|
| 164 |
|
| 165 |
|
| 166 |
## Benchmarking Methodology
|
|
|
|
| 171 |
The benchmarking was performed on a single GPU with a batch size of 1. Each model was run for 10 iterations, and the average latency was calculated.
|
| 172 |
|
| 173 |
> **Algorithm summary:**
|
| 174 |
+
> 1. Load the FLUX.1-dev model with the specified size (S, M, L, XL, original).
|
| 175 |
> 2. Move the model to the GPU.
|
| 176 |
> 3. Prepare a sample prompt for image generation.
|
| 177 |
> 4. Run the model for a number of iterations (e.g., 10) and measure the time taken for each iteration. On each iteration:
|
|
|
|
| 191 |
import torch
|
| 192 |
from elastic_models.diffusers import FluxPipeline
|
| 193 |
|
| 194 |
+
mode_name = 'black-forest-labs/FLUX.1-dev'
|
| 195 |
hf_token = ''
|
| 196 |
device = torch.device("cuda")
|
| 197 |
|
|
|
|
| 209 |
generate_kwargs={
|
| 210 |
"height": 1024,
|
| 211 |
"width": 1024,
|
| 212 |
+
"num_inference_steps": 28,
|
| 213 |
"cfg_scale": 0.0
|
| 214 |
}
|
| 215 |
|
|
|
|
| 248 |
|
| 249 |
---
|
| 250 |
|
| 251 |
+
Elastic FLUX.1-dev engines support **runtime LoRA hot-swap** — load, switch, or disable LoRA files without recompilation or engine reload. LoRA weights are dynamic tensor inputs to the compiled engine.
|
| 252 |
|
| 253 |
- **Supported ranks**: 1–256 (compiled with dynamic rank)
|
| 254 |
- **Supported formats**: XLabs, diffusers, BFL Control (auto-detected)
|
|
|
|
| 265 |
import torch
|
| 266 |
from elastic_models.diffusers import FluxPipeline
|
| 267 |
|
| 268 |
+
model_name = "black-forest-labs/FLUX.1-dev"
|
| 269 |
device = torch.device("cuda")
|
| 270 |
|
| 271 |
pipeline = FluxPipeline.from_pretrained(
|
|
|
|
| 300 |
|
| 301 |
| **GPU/Model Size**| **S**| **M**| **L**| **XL**| **Original (unfused)** |
|
| 302 |
| --- | --- | --- | --- | --- | --- |
|
| 303 |
+
| **H100** | 4.45 | 4.56 | 4.69 | 5.38 | 7.64 |
|
| 304 |
+
| **L40s** | 11.36 | 11.99 | 12.59 | 15.63 | 19.02 |
|
| 305 |
+
| **B200** | 3.16 | 3.23 | 3.29 | 2.79 | 5.2 |
|
| 306 |
+
| **GeForce RTX 5090** | 7.54 | N/A | N/A | N/A | N/A |
|
| 307 |
|
| 308 |
|
| 309 |
|
|
|
|
| 330 |
--name serving_thestage_model \
|
| 331 |
-p 8000:80 \
|
| 332 |
-e AUTH_TOKEN=<AUTH_TOKEN> \
|
| 333 |
+
-e MODEL_REPO=black-forest-labs/FLUX.1-dev \
|
| 334 |
-e MODEL_SIZE=<MODEL_SIZE> \
|
| 335 |
-e MODEL_BATCH=<MAX_BATCH_SIZE> \
|
| 336 |
-e HUGGINGFACE_ACCESS_TOKEN=<HUGGINGFACE_ACCESS_TOKEN> \
|
|
|
|
| 358 |
curl -X POST <http://127.0.0.1:8000/v1/images/generations> \
|
| 359 |
-H "Authorization: Bearer <AUTH_TOKEN>" \
|
| 360 |
-H "Content-Type: application/json" \
|
| 361 |
+
-H "X-Model-Name: flux-1-dev-<MODEL_SIZE>-bs<MAX_BATCH_SIZE>" \
|
| 362 |
-d '{
|
| 363 |
"prompt": "Cat eating banana",
|
| 364 |
"seed": 12,
|
| 365 |
"aspect_ratio": "1:1",
|
| 366 |
"guidance_scale": 6.5,
|
| 367 |
+
"num_inference_steps": 28
|
| 368 |
}' \
|
| 369 |
--output sunset.webp -D -
|
| 370 |
```
|
|
|
|
| 380 |
"seed": 12,
|
| 381 |
"aspect_ratio": "1:1",
|
| 382 |
"guidance_scale": 6.5,
|
| 383 |
+
"num_inference_steps": 28
|
| 384 |
})
|
| 385 |
headers = {
|
| 386 |
'Authorization': 'Bearer <AUTH_TOKEN>',
|
| 387 |
'Content-Type': 'application/json',
|
| 388 |
+
'X-Model-Name': 'flux-1-dev-<MODEL_SIZE>-bs<MAX_BATCH_SIZE>'
|
| 389 |
}
|
| 390 |
response = requests.request("POST", url, headers=headers, data=payload)
|
| 391 |
with open("sunset.webp", "wb") as f:
|
|
|
|
| 400 |
|
| 401 |
BASE_URL = "http://<your_ip>/v1"
|
| 402 |
API_KEY = ""
|
| 403 |
+
MODEL = "flux-1-dev-<MODEL_SIZE>-bs<MAX_BATCH_SIZE>"
|
| 404 |
|
| 405 |
client = OpenAI(
|
| 406 |
api_key=API_KEY,
|
|
|
|
| 416 |
"seed": 111,
|
| 417 |
"aspect_ratio": "1:1",
|
| 418 |
"guidance_scale": 3.5,
|
| 419 |
+
"num_inference_steps": 28
|
| 420 |
},
|
| 421 |
)
|
| 422 |
|
|
|
|
| 449 |
|
| 450 |
> `X-Model-Name`: `string`
|
| 451 |
>
|
| 452 |
+
> Specifies the model to use for generation. Format: `flux-1-dev-<size>-bs<batch_size>`, where `<size>` is one of `S`, `M`, `L`, `XL`, `original` and `<batch_size>` is the maximum batch size configured during container startup.
|
| 453 |
|
| 454 |
### Input Body
|
| 455 |
|
|
|
|
| 465 |
|
| 466 |
> `num_inference_steps`: `int32`
|
| 467 |
>
|
| 468 |
+
> Number of diffusion steps to use for generation. Higher values yield better quality but take longer. Default is 28.
|
| 469 |
|
| 470 |
> `aspect_ratio`: `string`
|
| 471 |
>
|
|
|
|
| 514 |
# modal_serving.py
|
| 515 |
|
| 516 |
ENVS = {
|
| 517 |
+
"MODEL_REPO": "black-forest-labs/FLUX.1-dev",
|
| 518 |
"MODEL_BATCH": "4",
|
| 519 |
"THESTAGE_AUTH_TOKEN": "",
|
| 520 |
"HUGGINGFACE_ACCESS_TOKEN": "",
|
|
|
|
| 545 |
)
|
| 546 |
@modal.web_server(
|
| 547 |
80,
|
| 548 |
+
label="black-forest-labs/FLUX.1-dev-test",
|
| 549 |
startup_timeout=60*20
|
| 550 |
)
|
| 551 |
def serve():
|