programmersd committed on
Commit
9cd03c5
·
verified ·
1 Parent(s): a0d0bbb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -82
app.py CHANGED
@@ -1,29 +1,26 @@
1
  import os
 
2
  import time
3
  import random
4
- import gc
5
  import torch
6
  import gradio as gr
7
 
8
  from huggingface_hub import hf_hub_download
9
- from transformers import AutoTokenizer, AutoModel
10
  from diffusers import (
11
  ZImagePipeline,
12
  ZImageTransformer2DModel,
13
  GGUFQuantizationConfig,
14
- AutoencoderKL,
15
  FlowMatchEulerDiscreteScheduler
16
  )
17
 
18
  # =========================
19
- # FORCE CPU ENV
20
  # =========================
21
  os.environ["CUDA_VISIBLE_DEVICES"] = ""
22
  os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
23
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
24
 
25
  cpu_cores = os.cpu_count() or 1
26
-
27
  torch.set_num_threads(cpu_cores)
28
  torch.set_num_interop_threads(cpu_cores)
29
 
@@ -32,41 +29,33 @@ os.environ["MKL_NUM_THREADS"] = str(cpu_cores)
32
 
33
  torch.backends.mkldnn.enabled = True
34
  torch.backends.quantized.engine = "fbgemm"
35
- torch.backends.cudnn.enabled = False
36
- torch.set_float32_matmul_precision("high")
37
 
38
- dtype = torch.float32
39
  device = torch.device("cpu")
 
40
 
41
  # =========================
42
  # MODEL CONFIG
43
  # =========================
44
  BASE_MODEL_ID = "Tongyi-MAI/Z-Image-Turbo"
45
- TEXT_ENCODER_ID = "Qwen/Qwen3-4B"
46
  GGUF_REPO_ID = "unsloth/Z-Image-Turbo-GGUF"
47
  GGUF_FILENAME = "z-image-turbo-Q2_K.gguf"
48
  CACHE_DIR = "models"
49
 
50
  os.makedirs(CACHE_DIR, exist_ok=True)
51
 
52
def download_if_needed(repo_id, filename):
    """Return a local path to `filename` from `repo_id`, downloading if needed.

    Looks for a previously saved copy at CACHE_DIR/filename first; otherwise
    downloads via hf_hub_download into CACHE_DIR and returns the cached path.
    """
    local_path = os.path.join(CACHE_DIR, filename)
    if os.path.exists(local_path):
        print("Model cached locally.")
        return local_path

    print("Downloading model (first run)...")
    # NOTE(review): `resume_download=True` was deprecated (and later removed)
    # in huggingface_hub; partial downloads resume by default, so the flag is
    # dropped to avoid a TypeError on current library versions.
    path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        cache_dir=CACHE_DIR,
    )
    print("Download finished.")
    return path
67
 
68
  # =========================
69
- # LOAD PIPELINE CPU ONLY
70
  # =========================
71
  def load_pipeline():
72
  scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
@@ -75,25 +64,15 @@ def load_pipeline():
75
  cache_dir=CACHE_DIR
76
  )
77
 
78
- vae = AutoencoderKL.from_pretrained(
79
  BASE_MODEL_ID,
80
- subfolder="vae",
81
  torch_dtype=dtype,
82
- cache_dir=CACHE_DIR
 
83
  )
84
 
85
- tokenizer = AutoTokenizer.from_pretrained(
86
- TEXT_ENCODER_ID,
87
- cache_dir=CACHE_DIR
88
- )
89
-
90
- text_encoder = AutoModel.from_pretrained(
91
- TEXT_ENCODER_ID,
92
- torch_dtype=dtype,
93
- cache_dir=CACHE_DIR
94
- ).to(device)
95
-
96
- gguf_path = download_if_needed(GGUF_REPO_ID, GGUF_FILENAME)
97
 
98
  transformer = ZImageTransformer2DModel.from_single_file(
99
  gguf_path,
@@ -101,36 +80,20 @@ def load_pipeline():
101
  torch_dtype=dtype
102
  ).to(device)
103
 
104
- pipe = ZImagePipeline(
105
- vae=vae.to(device),
106
- text_encoder=text_encoder,
107
- tokenizer=tokenizer,
108
- transformer=transformer,
109
- scheduler=scheduler
110
- ).to(device)
111
 
112
- pipe.unet.to(memory_format=torch.channels_last)
113
- pipe.text_encoder.to(memory_format=torch.channels_last)
 
114
 
115
- pipe.unet = torch.compile(pipe.unet, mode="max-autotune", fullgraph=True)
116
- pipe.text_encoder = torch.compile(pipe.text_encoder, mode="max-autotune", fullgraph=True)
117
 
118
  return pipe
119
 
120
  pipe = load_pipeline()
121
 
122
- # Warmup compile
123
- with torch.inference_mode():
124
- _ = pipe(
125
- prompt="warmup",
126
- width=256,
127
- height=256,
128
- num_inference_steps=1,
129
- guidance_scale=1.0
130
- )
131
-
132
  # =========================
133
- # GENERATION WITH PROGRESS
134
  # =========================
135
  def generate(prompt, seed, progress=gr.Progress()):
136
  if not prompt:
@@ -141,39 +104,40 @@ def generate(prompt, seed, progress=gr.Progress()):
141
 
142
  generator = torch.Generator(device=device).manual_seed(seed)
143
 
144
- total_steps = 4
145
- start_time = time.time()
 
 
 
146
 
147
- def step_callback(step, timestep, latents):
148
- elapsed = time.time() - start_time
149
  done = step + 1
 
150
  avg = elapsed / done
151
- eta = avg * (total_steps - done)
152
- progress(done / total_steps, desc=f"Step {done}/{total_steps} | ETA {eta:.1f}s")
153
 
154
  with torch.inference_mode():
155
- gc.disable()
156
- try:
157
- image = pipe(
158
- prompt=prompt,
159
- width=256,
160
- height=256,
161
- num_inference_steps=total_steps,
162
- guidance_scale=1.0,
163
- generator=generator,
164
- callback=step_callback,
165
- callback_steps=1
166
- ).images[0]
167
- finally:
168
- gc.enable()
169
 
170
  return image, seed
171
 
172
  # =========================
173
- # UI + QUEUE
174
  # =========================
175
- with gr.Blocks(title="Z-Image Turbo Q2_K CPU MAX") as demo:
176
- gr.Markdown("# Z-Image Turbo Q2_K — FULL CPU MAX MODE")
177
 
178
  prompt = gr.Textbox(label="Prompt", lines=3)
179
  seed = gr.Number(label="Seed (-1 random)", value=-1, precision=0)
@@ -184,7 +148,8 @@ with gr.Blocks(title="Z-Image Turbo Q2_K CPU MAX") as demo:
184
 
185
  btn.click(generate, inputs=[prompt, seed], outputs=[image_out, seed_out])
186
 
187
- demo.queue(max_size=10, concurrency_count=1)
188
 
189
  if __name__ == "__main__":
190
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
1
  import os
2
+ import gc
3
  import time
4
  import random
 
5
  import torch
6
  import gradio as gr
7
 
8
  from huggingface_hub import hf_hub_download
 
9
  from diffusers import (
10
  ZImagePipeline,
11
  ZImageTransformer2DModel,
12
  GGUFQuantizationConfig,
 
13
  FlowMatchEulerDiscreteScheduler
14
  )
15
 
16
  # =========================
17
+ # HARD CPU MODE
18
  # =========================
19
  os.environ["CUDA_VISIBLE_DEVICES"] = ""
20
  os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
21
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
22
 
23
  cpu_cores = os.cpu_count() or 1
 
24
  torch.set_num_threads(cpu_cores)
25
  torch.set_num_interop_threads(cpu_cores)
26
 
 
29
 
30
  torch.backends.mkldnn.enabled = True
31
  torch.backends.quantized.engine = "fbgemm"
 
 
32
 
 
33
  device = torch.device("cpu")
34
+ dtype = torch.float16
35
 
36
  # =========================
37
  # MODEL CONFIG
38
  # =========================
39
  BASE_MODEL_ID = "Tongyi-MAI/Z-Image-Turbo"
 
40
  GGUF_REPO_ID = "unsloth/Z-Image-Turbo-GGUF"
41
  GGUF_FILENAME = "z-image-turbo-Q2_K.gguf"
42
  CACHE_DIR = "models"
43
 
44
  os.makedirs(CACHE_DIR, exist_ok=True)
45
 
46
def download_gguf():
    """Return a local path to the Q2_K GGUF weights, downloading on first use.

    Checks for a file saved directly at CACHE_DIR/GGUF_FILENAME before hitting
    the Hub; otherwise defers to hf_hub_download, which caches and resumes
    interrupted downloads on its own.

    NOTE(review): hf_hub_download stores files under a `models--<org>--<repo>`
    snapshot tree inside cache_dir, not at CACHE_DIR/GGUF_FILENAME, so the
    short-circuit check only matches manually placed files — the Hub cache
    itself still prevents re-downloads. Verify this is the intent.
    """
    local_path = os.path.join(CACHE_DIR, GGUF_FILENAME)
    if os.path.exists(local_path):
        return local_path
    # `resume_download=True` was deprecated (and later removed) in
    # huggingface_hub; resuming is the default, so the flag is dropped.
    return hf_hub_download(
        repo_id=GGUF_REPO_ID,
        filename=GGUF_FILENAME,
        cache_dir=CACHE_DIR,
    )
 
 
56
 
57
  # =========================
58
+ # LOAD PIPELINE ULTRA LEAN
59
  # =========================
60
  def load_pipeline():
61
  scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
 
64
  cache_dir=CACHE_DIR
65
  )
66
 
67
+ pipe = ZImagePipeline.from_pretrained(
68
  BASE_MODEL_ID,
69
+ scheduler=scheduler,
70
  torch_dtype=dtype,
71
+ cache_dir=CACHE_DIR,
72
+ low_cpu_mem_usage=True
73
  )
74
 
75
+ gguf_path = download_gguf()
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  transformer = ZImageTransformer2DModel.from_single_file(
78
  gguf_path,
 
80
  torch_dtype=dtype
81
  ).to(device)
82
 
83
+ pipe.transformer = transformer
 
 
 
 
 
 
84
 
85
+ pipe.enable_attention_slicing()
86
+ pipe.enable_vae_slicing()
87
+ pipe.enable_sequential_cpu_offload()
88
 
89
+ pipe = pipe.to(device)
 
90
 
91
  return pipe
92
 
93
  pipe = load_pipeline()
94
 
 
 
 
 
 
 
 
 
 
 
95
  # =========================
96
+ # GENERATION (MIN RAM)
97
  # =========================
98
  def generate(prompt, seed, progress=gr.Progress()):
99
  if not prompt:
 
104
 
105
  generator = torch.Generator(device=device).manual_seed(seed)
106
 
107
+ steps = 4
108
+ width = 256
109
+ height = 256
110
+
111
+ start = time.time()
112
 
113
+ def callback(step, timestep, latents):
 
114
  done = step + 1
115
+ elapsed = time.time() - start
116
  avg = elapsed / done
117
+ eta = avg * (steps - done)
118
+ progress(done / steps, desc=f"Step {done}/{steps} | ETA {eta:.1f}s")
119
 
120
  with torch.inference_mode():
121
+ gc.collect()
122
+ image = pipe(
123
+ prompt=prompt,
124
+ width=width,
125
+ height=height,
126
+ num_inference_steps=steps,
127
+ guidance_scale=1.0,
128
+ generator=generator,
129
+ callback=callback,
130
+ callback_steps=1
131
+ ).images[0]
132
+ gc.collect()
 
 
133
 
134
  return image, seed
135
 
136
  # =========================
137
+ # UI
138
  # =========================
139
+ with gr.Blocks(title="Z-Image Turbo Ultra Lean CPU") as demo:
140
+ gr.Markdown("# Z-Image Turbo Q2_K — Ultra Lean 16GB CPU Mode")
141
 
142
  prompt = gr.Textbox(label="Prompt", lines=3)
143
  seed = gr.Number(label="Seed (-1 random)", value=-1, precision=0)
 
148
 
149
  btn.click(generate, inputs=[prompt, seed], outputs=[image_out, seed_out])
150
 
151
+ demo.queue(max_size=5, concurrency_count=1)
152
 
153
  if __name__ == "__main__":
154
+ demo.launch(server_name="0.0.0.0", server_port=7860)
155
+