Molbap HF Staff committed on
Commit
64c9a18
·
1 Parent(s): 0548742

some improvements

Browse files
Files changed (1) hide show
  1. app.py +101 -11
app.py CHANGED
@@ -190,9 +190,11 @@ def build_attn_vis():
190
  from transformers import AutoModelForCausalLM, modeling_utils as MU # noqa: E402
191
 
192
  def _measure_load_timeline(model_id: str, disable_warmup: bool):
 
193
  orig = getattr(MU, "caching_allocator_warmup", None)
194
  if disable_warmup and orig is not None:
195
  MU.caching_allocator_warmup = lambda *a, **k: None # type: ignore[attr-defined]
 
196
  try:
197
  device = "cuda" if torch.cuda.is_available() else "cpu"
198
  tl = []
@@ -201,33 +203,46 @@ def _measure_load_timeline(model_id: str, disable_warmup: bool):
201
  while not stop_evt.is_set():
202
  if device == "cuda":
203
  torch.cuda.synchronize()
204
- alloc = torch.cuda.memory_allocated()
 
 
205
  else:
206
  alloc = 0
207
  tl.append({"t": time.perf_counter() - start_t, "MiB": alloc / (1024**2)})
208
- time.sleep(0.05)
209
 
210
  if device == "cuda":
211
  torch.cuda.empty_cache()
212
  torch.cuda.reset_peak_memory_stats()
 
 
 
213
 
214
  start = time.perf_counter()
215
  stop_evt = threading.Event()
216
  th = threading.Thread(target=sample, args=(start, stop_evt), daemon=True)
217
  th.start()
218
 
219
- kwargs = {}
 
220
  if device == "cuda":
221
- kwargs.update(dict(torch_dtype=torch.float16, device_map="cuda:0", low_cpu_mem_usage=True))
 
 
 
 
222
  model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
223
 
224
  stop_evt.set()
225
  th.join()
226
 
 
227
  if device == "cuda":
228
  torch.cuda.synchronize()
229
- tl.append({"t": time.perf_counter() - start, "MiB": torch.cuda.memory_allocated() / (1024**2)})
 
230
 
 
231
  del model
232
  if device == "cuda":
233
  torch.cuda.empty_cache()
@@ -240,11 +255,37 @@ def _measure_load_timeline(model_id: str, disable_warmup: bool):
240
 
241
  @spaces.GPU(duration=240)
242
  def profile_warmup(model_id: str):
243
- on = _measure_load_timeline(model_id, disable_warmup=False)
244
- off = _measure_load_timeline(model_id, disable_warmup=True)
245
- rows = [{"t": r["t"], "MiB": r["MiB"], "mode": "warmup ON"} for r in on] + \
246
- [{"t": r["t"], "MiB": r["MiB"], "mode": "warmup OFF"} for r in off]
247
- return pd.DataFrame(rows)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
 
249
  def build_alloc_plot():
250
  with gr.Group():
@@ -258,7 +299,6 @@ def build_alloc_plot():
258
  "openai-community/gpt2",
259
  "google/gemma-2-2b",
260
  "microsoft/DialoGPT-small",
261
- "distilbert-base-uncased",
262
  "facebook/opt-125m"
263
  ],
264
  value="openai-community/gpt2",
@@ -457,6 +497,56 @@ hr { border: 0; border-top: 1px solid var(--border-color); margin: 2rem 0; }
457
  background: #1d4ed8 !important;
458
  }
459
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
460
  """
461
 
462
  with gr.Blocks(css=CSS, fill_height=True, title="Interactive Blog β€” Transformers Feature Showcase") as demo:
 
190
  from transformers import AutoModelForCausalLM, modeling_utils as MU # noqa: E402
191
 
192
  def _measure_load_timeline(model_id: str, disable_warmup: bool):
193
+ """Measure memory usage during model loading with/without cache warmup."""
194
  orig = getattr(MU, "caching_allocator_warmup", None)
195
  if disable_warmup and orig is not None:
196
  MU.caching_allocator_warmup = lambda *a, **k: None # type: ignore[attr-defined]
197
+
198
  try:
199
  device = "cuda" if torch.cuda.is_available() else "cpu"
200
  tl = []
 
203
  while not stop_evt.is_set():
204
  if device == "cuda":
205
  torch.cuda.synchronize()
206
+ # Use max memory to capture peaks better
207
+ alloc = torch.cuda.max_memory_allocated()
208
+ torch.cuda.reset_peak_memory_stats()
209
  else:
210
  alloc = 0
211
  tl.append({"t": time.perf_counter() - start_t, "MiB": alloc / (1024**2)})
212
+ time.sleep(0.02) # Sample more frequently
213
 
214
  if device == "cuda":
215
  torch.cuda.empty_cache()
216
  torch.cuda.reset_peak_memory_stats()
217
+ initial_mem = torch.cuda.memory_allocated()
218
+ else:
219
+ initial_mem = 0
220
 
221
  start = time.perf_counter()
222
  stop_evt = threading.Event()
223
  th = threading.Thread(target=sample, args=(start, stop_evt), daemon=True)
224
  th.start()
225
 
226
+ # Load model with appropriate settings
227
+ kwargs = {"low_cpu_mem_usage": True}
228
  if device == "cuda":
229
+ kwargs.update({
230
+ "torch_dtype": torch.float16,
231
+ "device_map": "cuda:0"
232
+ })
233
+
234
  model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
235
 
236
  stop_evt.set()
237
  th.join()
238
 
239
+ # Final memory measurement
240
  if device == "cuda":
241
  torch.cuda.synchronize()
242
+ final_mem = torch.cuda.memory_allocated()
243
+ tl.append({"t": time.perf_counter() - start, "MiB": final_mem / (1024**2)})
244
 
245
+ # Clean up
246
  del model
247
  if device == "cuda":
248
  torch.cuda.empty_cache()
 
255
 
256
  @spaces.GPU(duration=240)
257
  def profile_warmup(model_id: str):
258
+ if not torch.cuda.is_available():
259
+ # Create dummy data for CPU demo
260
+ import numpy as np
261
+ t_points = np.linspace(0, 5, 50)
262
+ base_mem = np.cumsum(np.random.exponential(50, 50))
263
+ warmup_on = [{"t": t, "MiB": mem, "mode": "warmup ON"} for t, mem in zip(t_points, base_mem * 0.8)]
264
+ warmup_off = [{"t": t, "MiB": mem, "mode": "warmup OFF"} for t, mem in zip(t_points, base_mem)]
265
+ return pd.DataFrame(warmup_on + warmup_off)
266
+
267
+ try:
268
+ on_data = _measure_load_timeline(model_id, disable_warmup=False)
269
+ off_data = _measure_load_timeline(model_id, disable_warmup=True)
270
+
271
+ # Create DataFrame with better labeling
272
+ rows = [{"t": r["t"], "MiB": r["MiB"], "mode": "πŸš€ Warmup ON (Optimized)"} for r in on_data] + \
273
+ [{"t": r["t"], "MiB": r["MiB"], "mode": "πŸ“ˆ Warmup OFF (Standard)"} for r in off_data]
274
+
275
+ df = pd.DataFrame(rows)
276
+
277
+ # Add summary stats if we have data
278
+ if len(on_data) > 0 and len(off_data) > 0:
279
+ on_peak = max(r["MiB"] for r in on_data)
280
+ off_peak = max(r["MiB"] for r in off_data)
281
+ savings = ((off_peak - on_peak) / off_peak * 100) if off_peak > 0 else 0
282
+ print(f"Memory savings: {savings:.1f}% (Peak: {on_peak:.0f} MiB vs {off_peak:.0f} MiB)")
283
+
284
+ return df
285
+ except Exception as e:
286
+ print(f"Error profiling {model_id}: {e}")
287
+ # Return empty DataFrame on error
288
+ return pd.DataFrame(columns=["t", "MiB", "mode"])
289
 
290
  def build_alloc_plot():
291
  with gr.Group():
 
299
  "openai-community/gpt2",
300
  "google/gemma-2-2b",
301
  "microsoft/DialoGPT-small",
 
302
  "facebook/opt-125m"
303
  ],
304
  value="openai-community/gpt2",
 
497
  background: #1d4ed8 !important;
498
  }
499
 
500
+ /* Dropdown styling - fix contrast and visibility */
501
+ .gr-dropdown {
502
+ background: #ffffff !important;
503
+ border: 1px solid var(--border-color) !important;
504
+ border-radius: 8px !important;
505
+ }
506
+
507
+ .gr-dropdown .gr-box {
508
+ background: #ffffff !important;
509
+ border: 1px solid var(--border-color) !important;
510
+ }
511
+
512
+ .gr-dropdown input {
513
+ background: #ffffff !important;
514
+ color: #1f2937 !important;
515
+ border: none !important;
516
+ font-weight: 500 !important;
517
+ }
518
+
519
+ .gr-dropdown .options {
520
+ background: #ffffff !important;
521
+ border: 1px solid var(--border-color) !important;
522
+ border-radius: 8px !important;
523
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1) !important;
524
+ }
525
+
526
+ .gr-dropdown .option {
527
+ background: #ffffff !important;
528
+ color: #1f2937 !important;
529
+ padding: 0.75rem !important;
530
+ font-weight: 500 !important;
531
+ }
532
+
533
+ .gr-dropdown .option:hover {
534
+ background: #f8fafc !important;
535
+ color: #1f2937 !important;
536
+ }
537
+
538
+ .gr-dropdown .option.selected {
539
+ background: var(--link-text-color) !important;
540
+ color: white !important;
541
+ }
542
+
543
+ /* Fix label styling */
544
+ .gr-dropdown label {
545
+ color: #374151 !important;
546
+ font-weight: 600 !important;
547
+ margin-bottom: 0.5rem !important;
548
+ }
549
+
550
  """
551
 
552
  with gr.Blocks(css=CSS, fill_height=True, title="Interactive Blog β€” Transformers Feature Showcase") as demo: