some improvements
Browse files
app.py
CHANGED
|
@@ -190,9 +190,11 @@ def build_attn_vis():
|
|
| 190 |
from transformers import AutoModelForCausalLM, modeling_utils as MU # noqa: E402
|
| 191 |
|
| 192 |
def _measure_load_timeline(model_id: str, disable_warmup: bool):
|
|
|
|
| 193 |
orig = getattr(MU, "caching_allocator_warmup", None)
|
| 194 |
if disable_warmup and orig is not None:
|
| 195 |
MU.caching_allocator_warmup = lambda *a, **k: None # type: ignore[attr-defined]
|
|
|
|
| 196 |
try:
|
| 197 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 198 |
tl = []
|
|
@@ -201,33 +203,46 @@ def _measure_load_timeline(model_id: str, disable_warmup: bool):
|
|
| 201 |
while not stop_evt.is_set():
|
| 202 |
if device == "cuda":
|
| 203 |
torch.cuda.synchronize()
|
| 204 |
-
|
|
|
|
|
|
|
| 205 |
else:
|
| 206 |
alloc = 0
|
| 207 |
tl.append({"t": time.perf_counter() - start_t, "MiB": alloc / (1024**2)})
|
| 208 |
-
time.sleep(0.
|
| 209 |
|
| 210 |
if device == "cuda":
|
| 211 |
torch.cuda.empty_cache()
|
| 212 |
torch.cuda.reset_peak_memory_stats()
|
|
|
|
|
|
|
|
|
|
| 213 |
|
| 214 |
start = time.perf_counter()
|
| 215 |
stop_evt = threading.Event()
|
| 216 |
th = threading.Thread(target=sample, args=(start, stop_evt), daemon=True)
|
| 217 |
th.start()
|
| 218 |
|
| 219 |
-
|
|
|
|
| 220 |
if device == "cuda":
|
| 221 |
-
kwargs.update(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
|
| 223 |
|
| 224 |
stop_evt.set()
|
| 225 |
th.join()
|
| 226 |
|
|
|
|
| 227 |
if device == "cuda":
|
| 228 |
torch.cuda.synchronize()
|
| 229 |
-
|
|
|
|
| 230 |
|
|
|
|
| 231 |
del model
|
| 232 |
if device == "cuda":
|
| 233 |
torch.cuda.empty_cache()
|
|
@@ -240,11 +255,37 @@ def _measure_load_timeline(model_id: str, disable_warmup: bool):
|
|
| 240 |
|
| 241 |
@spaces.GPU(duration=240)
|
| 242 |
def profile_warmup(model_id: str):
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
|
| 249 |
def build_alloc_plot():
|
| 250 |
with gr.Group():
|
|
@@ -258,7 +299,6 @@ def build_alloc_plot():
|
|
| 258 |
"openai-community/gpt2",
|
| 259 |
"google/gemma-2-2b",
|
| 260 |
"microsoft/DialoGPT-small",
|
| 261 |
-
"distilbert-base-uncased",
|
| 262 |
"facebook/opt-125m"
|
| 263 |
],
|
| 264 |
value="openai-community/gpt2",
|
|
@@ -457,6 +497,56 @@ hr { border: 0; border-top: 1px solid var(--border-color); margin: 2rem 0; }
|
|
| 457 |
background: #1d4ed8 !important;
|
| 458 |
}
|
| 459 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 460 |
"""
|
| 461 |
|
| 462 |
with gr.Blocks(css=CSS, fill_height=True, title="Interactive Blog β Transformers Feature Showcase") as demo:
|
|
|
|
| 190 |
from transformers import AutoModelForCausalLM, modeling_utils as MU # noqa: E402
|
| 191 |
|
| 192 |
def _measure_load_timeline(model_id: str, disable_warmup: bool):
|
| 193 |
+
"""Measure memory usage during model loading with/without cache warmup."""
|
| 194 |
orig = getattr(MU, "caching_allocator_warmup", None)
|
| 195 |
if disable_warmup and orig is not None:
|
| 196 |
MU.caching_allocator_warmup = lambda *a, **k: None # type: ignore[attr-defined]
|
| 197 |
+
|
| 198 |
try:
|
| 199 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 200 |
tl = []
|
|
|
|
| 203 |
while not stop_evt.is_set():
|
| 204 |
if device == "cuda":
|
| 205 |
torch.cuda.synchronize()
|
| 206 |
+
# Use max memory to capture peaks better
|
| 207 |
+
alloc = torch.cuda.max_memory_allocated()
|
| 208 |
+
torch.cuda.reset_peak_memory_stats()
|
| 209 |
else:
|
| 210 |
alloc = 0
|
| 211 |
tl.append({"t": time.perf_counter() - start_t, "MiB": alloc / (1024**2)})
|
| 212 |
+
time.sleep(0.02) # Sample more frequently
|
| 213 |
|
| 214 |
if device == "cuda":
|
| 215 |
torch.cuda.empty_cache()
|
| 216 |
torch.cuda.reset_peak_memory_stats()
|
| 217 |
+
initial_mem = torch.cuda.memory_allocated()
|
| 218 |
+
else:
|
| 219 |
+
initial_mem = 0
|
| 220 |
|
| 221 |
start = time.perf_counter()
|
| 222 |
stop_evt = threading.Event()
|
| 223 |
th = threading.Thread(target=sample, args=(start, stop_evt), daemon=True)
|
| 224 |
th.start()
|
| 225 |
|
| 226 |
+
# Load model with appropriate settings
|
| 227 |
+
kwargs = {"low_cpu_mem_usage": True}
|
| 228 |
if device == "cuda":
|
| 229 |
+
kwargs.update({
|
| 230 |
+
"torch_dtype": torch.float16,
|
| 231 |
+
"device_map": "cuda:0"
|
| 232 |
+
})
|
| 233 |
+
|
| 234 |
model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
|
| 235 |
|
| 236 |
stop_evt.set()
|
| 237 |
th.join()
|
| 238 |
|
| 239 |
+
# Final memory measurement
|
| 240 |
if device == "cuda":
|
| 241 |
torch.cuda.synchronize()
|
| 242 |
+
final_mem = torch.cuda.memory_allocated()
|
| 243 |
+
tl.append({"t": time.perf_counter() - start, "MiB": final_mem / (1024**2)})
|
| 244 |
|
| 245 |
+
# Clean up
|
| 246 |
del model
|
| 247 |
if device == "cuda":
|
| 248 |
torch.cuda.empty_cache()
|
|
|
|
| 255 |
|
| 256 |
@spaces.GPU(duration=240)
def profile_warmup(model_id: str):
    """Build a memory-over-time table for loading *model_id* with and without warmup.

    The result is a ``pandas.DataFrame`` with the columns ``t``, ``MiB`` and
    ``mode``; ``mode`` tells the two runs apart.

    * Without CUDA, nothing is measured: two synthetic 50-point curves are
      returned, built from a cumulative sum of random exponential draws, with
      the "warmup ON" curve scaled to 0.8x the "warmup OFF" curve.
    * With CUDA, ``_measure_load_timeline`` is run twice (warmup enabled first,
      then disabled) and the samples of both runs are stacked into one frame.
      When both runs produced samples, a peak-memory comparison is printed.
    * Any exception raised on the CUDA path is printed and an empty frame with
      the same three columns is returned instead of propagating.
    """
    if not torch.cuda.is_available():
        # No GPU available: fabricate illustrative data rather than measure.
        import numpy as np

        timestamps = np.linspace(0, 5, 50)
        synthetic_mem = np.cumsum(np.random.exponential(50, 50))

        demo_rows = []
        for stamp, mib in zip(timestamps, synthetic_mem * 0.8):
            demo_rows.append({"t": stamp, "MiB": mib, "mode": "warmup ON"})
        for stamp, mib in zip(timestamps, synthetic_mem):
            demo_rows.append({"t": stamp, "MiB": mib, "mode": "warmup OFF"})
        return pd.DataFrame(demo_rows)

    def tagged(samples, label):
        # Copy each sample's time/memory pair and attach the series label.
        return [{"t": s["t"], "MiB": s["MiB"], "mode": label} for s in samples]

    try:
        with_warmup = _measure_load_timeline(model_id, disable_warmup=False)
        without_warmup = _measure_load_timeline(model_id, disable_warmup=True)

        # NOTE(review): the first character of both labels looks like a
        # mis-encoded emoji; kept verbatim here -- confirm against the real file.
        combined = tagged(with_warmup, "π Warmup ON (Optimized)")
        combined.extend(tagged(without_warmup, "π Warmup OFF (Standard)"))
        frame = pd.DataFrame(combined)

        # The summary line only makes sense when both runs yielded samples.
        if len(with_warmup) == 0 or len(without_warmup) == 0:
            return frame

        on_peak = max(sample["MiB"] for sample in with_warmup)
        off_peak = max(sample["MiB"] for sample in without_warmup)
        if off_peak > 0:
            savings = (off_peak - on_peak) / off_peak * 100
        else:
            savings = 0
        print(f"Memory savings: {savings:.1f}% (Peak: {on_peak:.0f} MiB vs {off_peak:.0f} MiB)")

        return frame
    except Exception as exc:
        print(f"Error profiling {model_id}: {exc}")
        # Hand back an empty but correctly-shaped frame so the caller still gets a table.
        return pd.DataFrame(columns=["t", "MiB", "mode"])
|
| 289 |
|
| 290 |
def build_alloc_plot():
|
| 291 |
with gr.Group():
|
|
|
|
| 299 |
"openai-community/gpt2",
|
| 300 |
"google/gemma-2-2b",
|
| 301 |
"microsoft/DialoGPT-small",
|
|
|
|
| 302 |
"facebook/opt-125m"
|
| 303 |
],
|
| 304 |
value="openai-community/gpt2",
|
|
|
|
| 497 |
background: #1d4ed8 !important;
|
| 498 |
}
|
| 499 |
|
| 500 |
+
/* Dropdown styling - fix contrast and visibility */
|
| 501 |
+
.gr-dropdown {
|
| 502 |
+
background: #ffffff !important;
|
| 503 |
+
border: 1px solid var(--border-color) !important;
|
| 504 |
+
border-radius: 8px !important;
|
| 505 |
+
}
|
| 506 |
+
|
| 507 |
+
.gr-dropdown .gr-box {
|
| 508 |
+
background: #ffffff !important;
|
| 509 |
+
border: 1px solid var(--border-color) !important;
|
| 510 |
+
}
|
| 511 |
+
|
| 512 |
+
.gr-dropdown input {
|
| 513 |
+
background: #ffffff !important;
|
| 514 |
+
color: #1f2937 !important;
|
| 515 |
+
border: none !important;
|
| 516 |
+
font-weight: 500 !important;
|
| 517 |
+
}
|
| 518 |
+
|
| 519 |
+
.gr-dropdown .options {
|
| 520 |
+
background: #ffffff !important;
|
| 521 |
+
border: 1px solid var(--border-color) !important;
|
| 522 |
+
border-radius: 8px !important;
|
| 523 |
+
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1) !important;
|
| 524 |
+
}
|
| 525 |
+
|
| 526 |
+
.gr-dropdown .option {
|
| 527 |
+
background: #ffffff !important;
|
| 528 |
+
color: #1f2937 !important;
|
| 529 |
+
padding: 0.75rem !important;
|
| 530 |
+
font-weight: 500 !important;
|
| 531 |
+
}
|
| 532 |
+
|
| 533 |
+
.gr-dropdown .option:hover {
|
| 534 |
+
background: #f8fafc !important;
|
| 535 |
+
color: #1f2937 !important;
|
| 536 |
+
}
|
| 537 |
+
|
| 538 |
+
.gr-dropdown .option.selected {
|
| 539 |
+
background: var(--link-text-color) !important;
|
| 540 |
+
color: white !important;
|
| 541 |
+
}
|
| 542 |
+
|
| 543 |
+
/* Fix label styling */
|
| 544 |
+
.gr-dropdown label {
|
| 545 |
+
color: #374151 !important;
|
| 546 |
+
font-weight: 600 !important;
|
| 547 |
+
margin-bottom: 0.5rem !important;
|
| 548 |
+
}
|
| 549 |
+
|
| 550 |
"""
|
| 551 |
|
| 552 |
with gr.Blocks(css=CSS, fill_height=True, title="Interactive Blog β Transformers Feature Showcase") as demo:
|