Spaces:
Running on Zero
Running on Zero
Upload app.py with huggingface_hub
Browse files
app.py
CHANGED
|
@@ -38,20 +38,30 @@ MAX_INPUT_CHARS = 1200 # roughly the 220-token benchmark cap
|
|
| 38 |
|
| 39 |
|
| 40 |
def model_path(volume_path: str, model_id: str) -> str:
|
| 41 |
-
"""Prefer a mounted hf:// volume (see `hf spaces volumes`) over a download."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
return volume_path if os.path.isdir(volume_path) else model_id
|
| 43 |
|
| 44 |
|
| 45 |
DG_PATH = model_path("/models/dg", "google/diffusiongemma-26B-A4B-it")
|
| 46 |
G4_PATH = model_path("/models/gemma", "google/gemma-4-E4B-it")
|
| 47 |
|
|
|
|
| 48 |
print(f"loading DiffusionGemma from {DG_PATH} ...")
|
| 49 |
dg_processor = AutoProcessor.from_pretrained(DG_PATH)
|
| 50 |
dg_model = DiffusionGemmaForBlockDiffusion.from_pretrained(DG_PATH, dtype=torch.bfloat16).to("cuda")
|
|
|
|
|
|
|
| 51 |
print(f"loading Gemma-4 from {G4_PATH} ...")
|
| 52 |
g4_processor = AutoProcessor.from_pretrained(G4_PATH)
|
| 53 |
g4_model = AutoModelForMultimodalLM.from_pretrained(G4_PATH, dtype=torch.bfloat16).to("cuda")
|
| 54 |
-
print("
|
| 55 |
|
| 56 |
|
| 57 |
STOP_MARKERS = ("<turn|>", "<eos>", "<end_of_turn>", "<pad>")
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
def model_path(volume_path: str, model_id: str) -> str:
    """Prefer a mounted hf:// volume (see `hf spaces volumes`) over a download.

    Volume reads go over FUSE, which is sometimes slower for safetensors loading
    than a fresh download to local disk — set USE_VOLUMES=0 (Space variable) to
    force from_pretrained downloads for comparison.

    The flag is compared case-insensitively after stripping whitespace, and
    "0", "false", "no" and "off" all disable volumes, so a value such as
    " 0" or "False" is not silently ignored. Unset or any other value keeps
    volumes enabled.

    Args:
        volume_path: Directory where the volume is expected to be mounted.
        model_id: Identifier returned when the volume is disabled or absent.

    Returns:
        ``volume_path`` if volumes are enabled and it is an existing
        directory, otherwise ``model_id``.
    """
    flag = os.environ.get("USE_VOLUMES", "1").strip().lower()
    if flag in ("0", "false", "no", "off"):
        return model_id
    return volume_path if os.path.isdir(volume_path) else model_id
|
| 50 |
|
| 51 |
|
| 52 |
# Resolve where each checkpoint is loaded from via model_path().
DG_PATH = model_path("/models/dg", "google/diffusiongemma-26B-A4B-it")
G4_PATH = model_path("/models/gemma", "google/gemma-4-E4B-it")

# Load DiffusionGemma and report the wall-clock time for processor + weights
# + device transfer, so different model sources can be compared in the logs.
t0 = time.perf_counter()
print(f"loading DiffusionGemma from {DG_PATH} ...")
dg_processor = AutoProcessor.from_pretrained(DG_PATH)
dg_model = DiffusionGemmaForBlockDiffusion.from_pretrained(
    DG_PATH,
    dtype=torch.bfloat16,
)
dg_model = dg_model.to("cuda")
print(f"DiffusionGemma loaded in {time.perf_counter() - t0:.0f}s")

# Same measurement for Gemma-4; the timer is restarted so the two figures
# are independent of each other.
t0 = time.perf_counter()
print(f"loading Gemma-4 from {G4_PATH} ...")
g4_processor = AutoProcessor.from_pretrained(G4_PATH)
g4_model = AutoModelForMultimodalLM.from_pretrained(
    G4_PATH,
    dtype=torch.bfloat16,
)
g4_model = g4_model.to("cuda")
print(f"Gemma-4 loaded in {time.perf_counter() - t0:.0f}s")


STOP_MARKERS = ("<turn|>", "<eos>", "<end_of_turn>", "<pad>")
|