davanstrien HF Staff committed on
Commit
90d93b6
·
verified ·
1 Parent(s): 163634f

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +12 -2
app.py CHANGED
@@ -38,20 +38,30 @@ MAX_INPUT_CHARS = 1200 # roughly the 220-token benchmark cap
38
 
39
 
40
  def model_path(volume_path: str, model_id: str) -> str:
41
- """Prefer a mounted hf:// volume (see `hf spaces volumes`) over a download."""
 
 
 
 
 
 
 
42
  return volume_path if os.path.isdir(volume_path) else model_id
43
 
44
 
45
  DG_PATH = model_path("/models/dg", "google/diffusiongemma-26B-A4B-it")
46
  G4_PATH = model_path("/models/gemma", "google/gemma-4-E4B-it")
47
 
 
48
  print(f"loading DiffusionGemma from {DG_PATH} ...")
49
  dg_processor = AutoProcessor.from_pretrained(DG_PATH)
50
  dg_model = DiffusionGemmaForBlockDiffusion.from_pretrained(DG_PATH, dtype=torch.bfloat16).to("cuda")
 
 
51
  print(f"loading Gemma-4 from {G4_PATH} ...")
52
  g4_processor = AutoProcessor.from_pretrained(G4_PATH)
53
  g4_model = AutoModelForMultimodalLM.from_pretrained(G4_PATH, dtype=torch.bfloat16).to("cuda")
54
- print("models loaded")
55
 
56
 
57
  STOP_MARKERS = ("<turn|>", "<eos>", "<end_of_turn>", "<pad>")
 
38
 
39
 
40
  def model_path(volume_path: str, model_id: str) -> str:
41
+ """Prefer a mounted hf:// volume (see `hf spaces volumes`) over a download.
42
+
43
+ Volume reads go over FUSE, which is sometimes slower for safetensors loading
44
+ than a fresh download to local disk — set USE_VOLUMES=0 (Space variable) to
45
+ force from_pretrained downloads for comparison.
46
+ """
47
+ if os.environ.get("USE_VOLUMES", "1") == "0":
48
+ return model_id
49
  return volume_path if os.path.isdir(volume_path) else model_id
50
 
51
 
52
  DG_PATH = model_path("/models/dg", "google/diffusiongemma-26B-A4B-it")
53
  G4_PATH = model_path("/models/gemma", "google/gemma-4-E4B-it")
54
 
55
+ t0 = time.perf_counter()
56
  print(f"loading DiffusionGemma from {DG_PATH} ...")
57
  dg_processor = AutoProcessor.from_pretrained(DG_PATH)
58
  dg_model = DiffusionGemmaForBlockDiffusion.from_pretrained(DG_PATH, dtype=torch.bfloat16).to("cuda")
59
+ print(f"DiffusionGemma loaded in {time.perf_counter() - t0:.0f}s")
60
+ t0 = time.perf_counter()
61
  print(f"loading Gemma-4 from {G4_PATH} ...")
62
  g4_processor = AutoProcessor.from_pretrained(G4_PATH)
63
  g4_model = AutoModelForMultimodalLM.from_pretrained(G4_PATH, dtype=torch.bfloat16).to("cuda")
64
+ print(f"Gemma-4 loaded in {time.perf_counter() - t0:.0f}s")
65
 
66
 
67
  STOP_MARKERS = ("<turn|>", "<eos>", "<end_of_turn>", "<pad>")