programmersd committed on
Commit
3ef2431
·
verified ·
1 Parent(s): d14589e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -206
app.py CHANGED
@@ -6,16 +6,14 @@ import torch
6
  import gradio as gr
7
 
8
  # =====================================================
9
- # 🔥 EXTREME CPU + RAM CONTROL - ULTIMATE OPTIMIZATION
10
  # =====================================================
11
 
12
- CPU_THREADS = 1 # Minimum safe value for HF Spaces
13
- MAX_RESOLUTION = 512
14
- MAX_STEPS = 4
15
-
16
  os.environ["CUDA_VISIBLE_DEVICES"] = ""
17
  os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
18
  os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
 
 
19
  os.environ["OMP_NUM_THREADS"] = str(CPU_THREADS)
20
  os.environ["MKL_NUM_THREADS"] = str(CPU_THREADS)
21
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -24,18 +22,13 @@ os.environ["HF_DATASETS_CACHE"] = "./hf_cache"
24
 
25
  torch.set_num_threads(CPU_THREADS)
26
  torch.set_grad_enabled(False)
27
- torch.set_float32_matmul_precision('lowest')
28
 
29
  DEVICE = "cpu"
30
- DTYPE = torch.float16 # CRITICAL: Use float16 to save 50% memory
31
  CACHE_DIR = "./hf_cache"
32
  os.makedirs(CACHE_DIR, exist_ok=True)
33
 
34
- print("⚡ Z-Image Turbo ULTRA CPU - EXTREME MODE (HF Spaces 16GB)")
35
-
36
- # =====================================================
37
- # 📦 MINIMAL IMPORTS
38
- # =====================================================
39
 
40
  try:
41
  from huggingface_hub import hf_hub_download
@@ -46,83 +39,34 @@ try:
46
  AutoencoderKL,
47
  FlowMatchEulerDiscreteScheduler
48
  )
49
- from transformers import (
50
- AutoTokenizer,
51
- CLIPTextModel,
52
- BertModel,
53
- BertTokenizer
54
- )
55
  except ImportError as e:
56
- print(f"⚠️ Import error (models may not load): {e}")
57
-
58
- # =====================================================
59
- # 🧠 GLOBAL PIPELINE STATE (Lazy Loading)
60
- # =====================================================
61
 
62
  pipe = None
63
  _pipe_lock = False
64
 
65
- # =====================================================
66
- # 🎯 LIGHTWEIGHT TEXT ENCODER LOADER
67
- # =====================================================
68
-
69
- def load_text_encoder_lightweight():
70
- """Load absolute minimum text encoder"""
71
- print("📝 Loading lightweight text encoder...")
72
- try:
73
- # Try tiny CLIP first
74
- from transformers import CLIPTokenizer, CLIPTextModel
75
- tokenizer = CLIPTokenizer.from_pretrained(
76
- "openai/clip-vit-base-patch32",
77
- cache_dir=CACHE_DIR,
78
- local_files_only=False
79
- )
80
- text_encoder = CLIPTextModel.from_pretrained(
81
- "openai/clip-vit-base-patch32",
82
- torch_dtype=DTYPE,
83
- low_cpu_mem_usage=True,
84
- cache_dir=CACHE_DIR,
85
- local_files_only=False
86
- )
87
- return tokenizer, text_encoder
88
- except Exception as e:
89
- print(f"⚠️ CLIP failed: {e}, using fallback...")
90
- # Fallback: Use BERT-tiny (much smaller)
91
- from transformers import AutoTokenizer, AutoModel
92
- try:
93
- tokenizer = AutoTokenizer.from_pretrained(
94
- "prajjwal1/bert-tiny",
95
- cache_dir=CACHE_DIR
96
- )
97
- text_encoder = AutoModel.from_pretrained(
98
- "prajjwal1/bert-tiny",
99
- torch_dtype=DTYPE,
100
- low_cpu_mem_usage=True,
101
- cache_dir=CACHE_DIR
102
- )
103
- return tokenizer, text_encoder
104
- except Exception as e2:
105
- print(f"❌ Both encoders failed: {e2}")
106
- raise
107
-
108
- # =====================================================
109
- # 🚀 LAZY-LOADED PIPELINE WITH MEMORY CONTROL
110
- # =====================================================
111
 
112
  def load_pipeline():
113
- """Load pipeline once, keep in memory"""
114
  global pipe, _pipe_lock
115
-
116
  if pipe is not None:
117
  return pipe
118
-
119
  if _pipe_lock:
120
- raise gr.Error("Pipeline already loading. Please wait...")
121
-
122
  _pipe_lock = True
123
-
124
  try:
125
- print("⚡ Loading scheduler...")
126
  scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
127
  "Tongyi-MAI/Z-Image-Turbo",
128
  subfolder="scheduler",
@@ -130,26 +74,21 @@ def load_pipeline():
130
  low_cpu_mem_usage=True
131
  )
132
 
133
- print("⚡ Loading VAE (memory-optimized)...")
134
  vae = AutoencoderKL.from_pretrained(
135
  "Tongyi-MAI/Z-Image-Turbo",
136
  subfolder="vae",
137
- torch_dtype=DTYPE,
138
- low_cpu_mem_usage=True,
139
  cache_dir=CACHE_DIR,
140
- variant="fp16" # Force fp16 variant
 
141
  )
142
 
143
- print("⚡ Loading text encoder (lightweight)...")
144
- tokenizer, text_encoder = load_text_encoder_lightweight()
145
 
146
- print("⚡ Loading transformer (GGUF quantized)...")
147
  gguf_path = hf_hub_download(
148
  repo_id="unsloth/Z-Image-Turbo-GGUF",
149
  filename="z-image-turbo-Q2_K.gguf",
150
  cache_dir=CACHE_DIR,
151
- resume_download=True,
152
- local_files_only=False
153
  )
154
 
155
  transformer = ZImageTransformer2DModel.from_single_file(
@@ -159,7 +98,6 @@ def load_pipeline():
159
  low_cpu_mem_usage=True
160
  )
161
 
162
- # Build pipeline
163
  pipe = ZImagePipeline(
164
  vae=vae,
165
  text_encoder=text_encoder,
@@ -168,161 +106,89 @@ def load_pipeline():
168
  scheduler=scheduler
169
  ).to(DEVICE)
170
 
171
- # EXTREME memory optimization
172
  pipe.enable_attention_slicing()
173
  pipe.enable_vae_slicing()
174
  pipe.enable_vae_tiling()
175
  pipe.set_progress_bar_config(disable=True)
176
-
177
- # Explicitly set to eval mode and disable gradients
178
  pipe.vae.eval()
179
  pipe.text_encoder.eval()
180
  pipe.transformer.eval()
181
 
182
- print("✅ Pipeline loaded successfully")
183
  return pipe
184
 
185
  except Exception as e:
186
- print(f" Pipeline load failed: {e}")
187
- raise gr.Error(f"Failed to load model: {str(e)}")
188
  finally:
189
  _pipe_lock = False
190
 
191
-
192
- # =====================================================
193
- # 🎨 ULTRA-OPTIMIZED GENERATION
194
- # =====================================================
195
-
196
  @torch.inference_mode()
197
  def generate(prompt, width, height, steps, seed, progress=gr.Progress()):
198
- """Generate image with aggressive memory management"""
199
-
200
- if not prompt or not prompt.strip():
201
- raise gr.Error("❌ Prompt is required")
202
-
203
- # HARD safety limits for HF Spaces
204
- width = max(256, min(int(width), 512))
205
- height = max(256, min(int(height), 512))
206
- steps = max(1, min(int(steps), 4))
207
 
208
- # Reduce to multiple of 64
209
- width = (width // 64) * 64
210
- height = (height // 64) * 64
211
 
212
  if seed < 0 or seed == "":
213
  seed = random.randint(0, 2**31 - 1)
214
  else:
215
  seed = int(seed)
216
 
217
- # Pre-generation cleanup
218
  gc.collect()
219
- if torch.cuda.is_available():
220
- torch.cuda.empty_cache()
221
-
222
- try:
223
- # Load pipeline on first use
224
- pipe = load_pipeline()
225
-
226
- generator = torch.Generator(device=DEVICE).manual_seed(seed)
227
-
228
- start_time = time.time()
229
-
230
- def callback(step, timestep, latents=None):
231
- elapsed = time.time() - start_time
232
- avg = elapsed / (step + 1) if step > 0 else 0
233
- remaining = avg * (steps - step - 1) if step < steps - 1 else 0
234
- progress(
235
- (step + 1) / steps,
236
- desc=f"Step {step+1}/{steps} | ETA: {remaining:.1f}s"
237
- )
238
-
239
- print(f"🎨 Generating {width}x{height} in {steps} steps...")
240
-
241
- result = pipe(
242
- prompt=prompt,
243
- negative_prompt=None,
244
- width=width,
245
- height=height,
246
- num_inference_steps=steps,
247
- guidance_scale=1.0,
248
- generator=generator,
249
- callback=callback,
250
- callback_steps=1,
251
- output_type="pil"
252
- )
253
-
254
- image = result.images[0]
255
-
256
- # Post-generation cleanup
257
- del result
258
- gc.collect()
259
-
260
- return image, seed
261
-
262
- except torch.cuda.OutOfMemoryError:
263
- gc.collect()
264
- raise gr.Error("❌ Out of memory! Try smaller size or fewer steps")
265
- except Exception as e:
266
- gc.collect()
267
- raise gr.Error(f"❌ Generation error: {str(e)}")
268
-
269
-
270
- # =====================================================
271
- # 🎛️ MINIMAL GRADIO UI
272
- # =====================================================
273
-
274
- with gr.Blocks(title="Z-Image Turbo CPU") as demo:
275
- gr.Markdown("""
276
- # ⚡ Z-Image Turbo — CPU ULTRA MODE
277
- **HF Spaces Optimized | 16GB RAM | No GPU**
278
 
279
- ⚠️ Slow generation expected on CPU. Start with 256x256 and low steps.
280
- """)
281
-
282
- with gr.Row():
283
- with gr.Column(scale=2):
284
- prompt = gr.Textbox(
285
- label="Prompt",
286
- placeholder="Describe what you want...",
287
- lines=3
288
- )
 
 
 
 
 
 
 
 
 
 
 
 
289
 
290
- with gr.Row():
291
- width = gr.Slider(256, 512, 256, step=64, label="Width")
292
- height = gr.Slider(256, 512, 256, step=64, label="Height")
 
293
 
294
- with gr.Row():
295
- steps = gr.Slider(1, 4, 2, step=1, label="Steps")
296
- seed = gr.Number(value=-1, precision=0, label="Seed (-1=random)")
297
 
298
- btn = gr.Button("🚀 Generate", variant="primary", scale=2)
 
 
 
 
299
 
300
- with gr.Column(scale=1):
301
- output = gr.Image(label="Output")
302
- used_seed = gr.Number(label="Seed Used", interactive=False)
303
 
304
  btn.click(
305
  generate,
306
  inputs=[prompt, width, height, steps, seed],
307
- outputs=[output, used_seed]
 
308
  )
309
 
310
- gr.Markdown("""
311
- ### Performance Tips
312
- - Start with **256x256** resolution
313
- - Use **1-2 steps** for fast results
314
- - Each step takes ~30-60s on CPU
315
- - Results improve with more steps
316
- - Negative seeds auto-randomize
317
-
318
- ### 💾 Memory Strategy
319
- - Models loaded on first request only
320
- - Aggressive garbage collection after each run
321
- - float16 reduces memory by 50%
322
- - VAE tiling saves additional ~2GB
323
- """)
324
-
325
- demo.queue(concurrency_count=1, max_size=2)
326
 
327
  if __name__ == "__main__":
328
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
6
  import gradio as gr
7
 
8
  # =====================================================
9
+ # 🛠 CPU OPTIMIZED SETTINGS
10
  # =====================================================
11
 
 
 
 
 
12
  os.environ["CUDA_VISIBLE_DEVICES"] = ""
13
  os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
14
  os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
15
+
16
+ CPU_THREADS = min(4, os.cpu_count() or 1)
17
  os.environ["OMP_NUM_THREADS"] = str(CPU_THREADS)
18
  os.environ["MKL_NUM_THREADS"] = str(CPU_THREADS)
19
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
22
 
23
  torch.set_num_threads(CPU_THREADS)
24
  torch.set_grad_enabled(False)
 
25
 
26
  DEVICE = "cpu"
27
+ DTYPE = torch.float32
28
  CACHE_DIR = "./hf_cache"
29
  os.makedirs(CACHE_DIR, exist_ok=True)
30
 
31
+ print("⚡ Z-Image Turbo CPU Optimized (Latest Docs)")
 
 
 
 
32
 
33
  try:
34
  from huggingface_hub import hf_hub_download
 
39
  AutoencoderKL,
40
  FlowMatchEulerDiscreteScheduler
41
  )
42
+ from transformers import CLIPTokenizer, CLIPTextModel
 
 
 
 
 
43
  except ImportError as e:
44
+ print(f"⚠️ Imports may not load: {e}")
 
 
 
 
45
 
46
  pipe = None
47
  _pipe_lock = False
48
 
49
def load_text_encoder_min():
    """Load the CLIP tokenizer and text encoder from one checkpoint.

    Both objects come from "openai/clip-vit-base-patch32" and are cached
    under CACHE_DIR. The encoder is loaded with the module-level DTYPE and
    low_cpu_mem_usage=True.

    Returns:
        A ``(tokenizer, text_encoder)`` tuple.
    """
    model_id = "openai/clip-vit-base-patch32"

    clip_tokenizer = CLIPTokenizer.from_pretrained(model_id, cache_dir=CACHE_DIR)

    # Encoder-only options are collected separately from the shared cache_dir.
    encoder_options = {
        "cache_dir": CACHE_DIR,
        "torch_dtype": DTYPE,
        "low_cpu_mem_usage": True,
    }
    clip_encoder = CLIPTextModel.from_pretrained(model_id, **encoder_options)

    return clip_tokenizer, clip_encoder
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
  def load_pipeline():
 
62
  global pipe, _pipe_lock
 
63
  if pipe is not None:
64
  return pipe
 
65
  if _pipe_lock:
66
+ raise gr.Error("Pipeline already loading")
 
67
  _pipe_lock = True
68
+
69
  try:
 
70
  scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
71
  "Tongyi-MAI/Z-Image-Turbo",
72
  subfolder="scheduler",
 
74
  low_cpu_mem_usage=True
75
  )
76
 
 
77
  vae = AutoencoderKL.from_pretrained(
78
  "Tongyi-MAI/Z-Image-Turbo",
79
  subfolder="vae",
 
 
80
  cache_dir=CACHE_DIR,
81
+ torch_dtype=DTYPE,
82
+ low_cpu_mem_usage=True
83
  )
84
 
85
+ tokenizer, text_encoder = load_text_encoder_min()
 
86
 
 
87
  gguf_path = hf_hub_download(
88
  repo_id="unsloth/Z-Image-Turbo-GGUF",
89
  filename="z-image-turbo-Q2_K.gguf",
90
  cache_dir=CACHE_DIR,
91
+ resume_download=True
 
92
  )
93
 
94
  transformer = ZImageTransformer2DModel.from_single_file(
 
98
  low_cpu_mem_usage=True
99
  )
100
 
 
101
  pipe = ZImagePipeline(
102
  vae=vae,
103
  text_encoder=text_encoder,
 
106
  scheduler=scheduler
107
  ).to(DEVICE)
108
 
 
109
  pipe.enable_attention_slicing()
110
  pipe.enable_vae_slicing()
111
  pipe.enable_vae_tiling()
112
  pipe.set_progress_bar_config(disable=True)
113
+
 
114
  pipe.vae.eval()
115
  pipe.text_encoder.eval()
116
  pipe.transformer.eval()
117
 
 
118
  return pipe
119
 
120
  except Exception as e:
121
+ raise gr.Error(f"Failed to load model: {e}")
122
+
123
  finally:
124
  _pipe_lock = False
125
 
 
 
 
 
 
@torch.inference_mode()
def generate(prompt, width, height, steps, seed, progress=gr.Progress()):
    """Generate one image for ``prompt`` and report the seed that was used.

    Args:
        prompt: Text prompt; must contain non-whitespace characters.
        width: Requested width. Clamped to 256..512, then rounded down to a
            multiple of 64.
        height: Requested height. Clamped and rounded like ``width``.
        steps: Number of inference steps. Clamped to 1..4.
        seed: Seed for the CPU generator. ``None``, an empty string, or a
            negative value selects a random seed in ``[0, 2**31 - 1]``.
        progress: Gradio progress tracker, updated once per step with an ETA.

    Returns:
        A ``(image, seed)`` tuple: the first image of the pipeline result and
        the integer seed actually used.

    Raises:
        gr.Error: If the prompt is missing or blank. ``load_pipeline`` may
            also raise ``gr.Error`` while the model is loading or fails to load.
    """
    # Guard against None as well as blank text: calling .strip() on None
    # would raise AttributeError instead of a user-facing error.
    if not prompt or not prompt.strip():
        raise gr.Error("Prompt required")

    width = (max(256, min(int(width), 512)) // 64) * 64
    height = (max(256, min(int(height), 512)) // 64) * 64
    steps = max(1, min(int(steps), 4))

    # Test for a missing seed BEFORE the numeric comparison; `None < 0` and
    # `"" < 0` both raise TypeError.
    if seed is None or seed == "" or seed < 0:
        seed = random.randint(0, 2**31 - 1)
    else:
        seed = int(seed)

    # Free whatever the previous request left behind before the heavy work.
    gc.collect()

    pipeline = load_pipeline()
    generator = torch.Generator(device="cpu").manual_seed(seed)

    start = time.time()

    def callback(step, *_):
        # ETA = average time per finished step * steps still to run.
        elapsed = time.time() - start
        avg = elapsed / (step + 1)
        remaining = avg * (steps - step - 1)
        progress((step + 1) / steps, desc=f"Step {step+1}/{steps} | ETA {remaining:.1f}s")

    # NOTE(review): `callback` / `callback_steps` are the legacy diffusers
    # hooks; newer pipelines take `callback_on_step_end` instead. Confirm that
    # ZImagePipeline.__call__ accepts these keyword arguments.
    result = pipeline(
        prompt=prompt,
        negative_prompt=None,
        width=width,
        height=height,
        num_inference_steps=steps,
        guidance_scale=1.0,
        generator=generator,
        callback=callback,
        callback_steps=1,
        output_type="pil",
    )

    image = result.images[0]
    # Drop the pipeline output before returning; only the image is kept.
    del result
    gc.collect()
    return image, seed
169
 
170
with gr.Blocks() as demo:
    # Page heading.
    gr.Markdown("# Z-Image Turbo — CPU Optimized")

    # Inputs: the slider ranges mirror the limits applied inside generate().
    prompt = gr.Textbox(label="Prompt")
    width = gr.Slider(minimum=256, maximum=512, value=256, step=64, label="Width")
    height = gr.Slider(minimum=256, maximum=512, value=256, step=64, label="Height")
    steps = gr.Slider(minimum=1, maximum=4, value=2, step=1, label="Steps")
    seed = gr.Number(value=-1, precision=0, label="Seed (-1=random)")

    # Trigger and outputs; the seed field is read-only and shows the seed used.
    btn = gr.Button("🚀 Generate")
    output = gr.Image(label="Output")
    used_seed = gr.Number(label="Seed Used", interactive=False)

    # At most one generate() call runs at a time for this event.
    btn.click(
        fn=generate,
        inputs=[prompt, width, height, steps, seed],
        outputs=[output, used_seed],
        concurrency_limit=1,
    )
189
 
190
+ # Enable queue with up to 2 pending jobs
191
+ demo.queue(max_size=2) # queues events per current Gradio docs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
 
193
  if __name__ == "__main__":
194
  demo.launch(server_name="0.0.0.0", server_port=7860)