Spaces:
Running on Zero
Running on Zero
Commit ·
0c0b8b9
1
Parent(s): b2e1f99
added optional SR
Browse files
- app.py +23 -4
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -48,6 +48,13 @@ MODEL_NAME = 'FlexTok d18-d28 (DFN)'
|
|
| 48 |
# Load FlexTok model from HF Hub
|
| 49 |
flextok_model = FlexTokFromHub.from_pretrained(MODEL_ID).to(device).eval()
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
def img_from_path(
|
| 53 |
path: str,
|
|
@@ -71,7 +78,7 @@ def img_from_path(
|
|
| 71 |
|
| 72 |
|
| 73 |
@spaces.GPU(duration=20)
|
| 74 |
-
def infer(img_path, seed=
|
| 75 |
if randomize_seed:
|
| 76 |
seed = None
|
| 77 |
|
|
@@ -102,6 +109,9 @@ def infer(img_path, seed=0, randomize_seed=False, timesteps=20, cfg_scale=7.5, p
|
|
| 102 |
for reconst_k, k_keep in zip(all_reconst, K_KEEP_LIST)
|
| 103 |
]
|
| 104 |
|
|
|
|
|
|
|
|
|
|
| 105 |
return all_images
|
| 106 |
|
| 107 |
|
|
@@ -143,7 +153,10 @@ with gr.Blocks(css=css, theme=gr.themes.Base()) as demo:
|
|
| 143 |
Official demo for: <br>
|
| 144 |
[**FlexTok: Resampling Images into 1D Token Sequences of Flexible Length**](https://arxiv.org/abs/2502.13967), arXiv 2025 <br>
|
| 145 |
|
| 146 |
-
This demo uses the FlexTok tokenizer to autoencode the given RGB input, using [{MODEL_ID}](https://huggingface.co/{MODEL_ID}), running on *{power_device}*.
|
|
|
|
|
|
|
|
|
|
| 147 |
""")
|
| 148 |
|
| 149 |
img_path = gr.Image(label='RGB input image', type='filepath')
|
|
@@ -151,13 +164,19 @@ with gr.Blocks(css=css, theme=gr.themes.Base()) as demo:
|
|
| 151 |
|
| 152 |
with gr.Accordion("Advanced Settings", open=False):
|
| 153 |
gr.Markdown(f"""
|
| 154 |
-
The FlexTok decoder is a rectified flow model. The following settings control the seed of the initial noise, the number of denoising timesteps,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
""")
|
| 156 |
seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=1000)
|
| 157 |
randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
|
| 158 |
timesteps = gr.Slider(label="Denoising timesteps", minimum=1, maximum=1000, step=1, value=25)
|
| 159 |
cfg_scale = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=15.0, step=0.1, value=7.5)
|
| 160 |
perform_norm_guidance = gr.Checkbox(label="Perform Adaptive Projected Guidance", value=True)
|
|
|
|
| 161 |
|
| 162 |
result = gr.Gallery(
|
| 163 |
label="Reconstructions", show_label=True, elem_id="gallery", type='pil',
|
|
@@ -174,7 +193,7 @@ with gr.Blocks(css=css, theme=gr.themes.Base()) as demo:
|
|
| 174 |
|
| 175 |
run_button.click(
|
| 176 |
fn = infer,
|
| 177 |
-
inputs = [img_path, seed, randomize_seed, timesteps, cfg_scale, perform_norm_guidance],
|
| 178 |
outputs = [result]
|
| 179 |
)
|
| 180 |
|
|
|
|
| 48 |
# Load FlexTok model from HF Hub
|
| 49 |
flextok_model = FlexTokFromHub.from_pretrained(MODEL_ID).to(device).eval()
|
| 50 |
|
| 51 |
+
# Load AuraSR model from HF Hub
|
| 52 |
+
try:
|
| 53 |
+
from aura_sr import AuraSR
|
| 54 |
+
aura_sr = AuraSR.from_pretrained("fal-ai/AuraSR")
|
| 55 |
+
except ImportError:
|
| 56 |
+
aura_sr = None
|
| 57 |
+
|
| 58 |
|
| 59 |
def img_from_path(
|
| 60 |
path: str,
|
|
|
|
| 78 |
|
| 79 |
|
| 80 |
@spaces.GPU(duration=20)
|
| 81 |
+
def infer(img_path, seed=1000, randomize_seed=False, timesteps=25, cfg_scale=7.5, perform_norm_guidance=True, super_res=True):
|
| 82 |
if randomize_seed:
|
| 83 |
seed = None
|
| 84 |
|
|
|
|
| 109 |
for reconst_k, k_keep in zip(all_reconst, K_KEEP_LIST)
|
| 110 |
]
|
| 111 |
|
| 112 |
+
if super_res:
|
| 113 |
+
all_images = [(aura_sr.upscale_4x(img), label) for img, label in all_images]
|
| 114 |
+
|
| 115 |
return all_images
|
| 116 |
|
| 117 |
|
|
|
|
| 153 |
Official demo for: <br>
|
| 154 |
[**FlexTok: Resampling Images into 1D Token Sequences of Flexible Length**](https://arxiv.org/abs/2502.13967), arXiv 2025 <br>
|
| 155 |
|
| 156 |
+
This demo uses the FlexTok tokenizer to autoencode the given RGB input, using [{MODEL_ID}](https://huggingface.co/{MODEL_ID}), running on *{power_device}*.
|
| 157 |
+
The FlexTok encoder produces a 1D sequence of discrete tokens that are ordered in a coarse-to-fine manner.
|
| 158 |
+
We show reconstructions from truncated subsequences, using the first 1, 2, 4, 8, ..., 256 tokens.
|
| 159 |
+
As you will see, the first tokens capture more high-level semantic content, while subsequent ones add fine-grained detail.
|
| 160 |
""")
|
| 161 |
|
| 162 |
img_path = gr.Image(label='RGB input image', type='filepath')
|
|
|
|
| 164 |
|
| 165 |
with gr.Accordion("Advanced Settings", open=False):
|
| 166 |
gr.Markdown(f"""
|
| 167 |
+
The FlexTok decoder is a rectified flow model. The following settings control the seed of the initial noise, the number of denoising timesteps,
|
| 168 |
+
the guidance scale, and whether to perform [Adaptive Projected Guidance](https://arxiv.org/abs/2410.02416) (we recommend enabling it).
|
| 169 |
+
|
| 170 |
+
This FlexTok model operates at 256x256 resolution. You can optionally super-resolve the reconstructions to 1024x1024 using Aura-SR for
|
| 171 |
+
sharper details, without changing the underlying reconstructed image too much. We enable it by default, but you can disable it if you would
|
| 172 |
+
like to see the raw 256x256 FlexTok reconstructions.
|
| 173 |
""")
|
| 174 |
seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=1000)
|
| 175 |
randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
|
| 176 |
timesteps = gr.Slider(label="Denoising timesteps", minimum=1, maximum=1000, step=1, value=25)
|
| 177 |
cfg_scale = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=15.0, step=0.1, value=7.5)
|
| 178 |
perform_norm_guidance = gr.Checkbox(label="Perform Adaptive Projected Guidance", value=True)
|
| 179 |
+
super_res = gr.Checkbox(label="Super-resolve reconstructions from 256x256 to 1024x1024 with Aura-SR", value=True)
|
| 180 |
|
| 181 |
result = gr.Gallery(
|
| 182 |
label="Reconstructions", show_label=True, elem_id="gallery", type='pil',
|
|
|
|
| 193 |
|
| 194 |
run_button.click(
|
| 195 |
fn = infer,
|
| 196 |
+
inputs = [img_path, seed, randomize_seed, timesteps, cfg_scale, perform_norm_guidance, super_res],
|
| 197 |
outputs = [result]
|
| 198 |
)
|
| 199 |
|
requirements.txt
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
flextok @ git+https://github.com/apple/ml-flextok
|
|
|
|
| 2 |
spaces
|
|
|
|
| 1 |
flextok @ git+https://github.com/apple/ml-flextok
|
| 2 |
+
aura-sr
|
| 3 |
spaces
|