GravityShares committed on
Commit
65eb000
·
verified ·
1 Parent(s): 8bd029a

Deploy Nomos ZeroGPU app

Browse files
Files changed (3) hide show
  1. README.md +9 -2
  2. app.py +112 -32
  3. requirements.txt +4 -4
README.md CHANGED
@@ -17,8 +17,15 @@ This Space runs Nomos-compatible models with ZeroGPU and tries model candidates
17
 
18
  ## Suggested Variables
19
 
20
- - `MODEL_CANDIDATES=cyankiwi/nomos-1-AWQ-8bit,cyankiwi/nomos-1-AWQ-4bit`
 
 
 
21
  - `PREFER_FULL=false`
22
- - `GPU_DURATION_SECONDS=120`
 
 
23
  - `MAX_INPUT_TOKENS=2048`
24
  - `MAX_NEW_TOKENS_DEFAULT=256`
 
 
 
17
 
18
  ## Suggested Variables
19
 
20
+ - `MODEL_CANDIDATES=cyankiwi/nomos-1-AWQ-8bit`
21
+ - `TOKENIZER_ID=NousResearch/nomos-1`
22
+ - `TORCH_DTYPE=bfloat16`
23
+ - `MODEL_DEVICE_MAP=auto`
24
  - `PREFER_FULL=false`
25
+ - `GPU_SIZE=xlarge`
26
+ - `GPU_DURATION_SECONDS=180`
27
+ - `MAX_GPU_DURATION_SECONDS=300`
28
  - `MAX_INPUT_TOKENS=2048`
29
  - `MAX_NEW_TOKENS_DEFAULT=256`
30
+ - `HF_HOME=/tmp/hf-home`
31
+ - `HF_HUB_CACHE=/tmp/hf-home/hub`
app.py CHANGED
@@ -1,35 +1,41 @@
1
  #!/usr/bin/env python3
2
  import os
3
  import threading
4
- from collections.abc import Mapping
5
  from typing import Any
6
 
7
- import gradio as gr
8
- import torch
9
- from transformers import AutoModelForCausalLM, AutoTokenizer
10
-
11
  try:
12
  import spaces
13
  except Exception:
14
  class _SpacesFallback:
15
  @staticmethod
16
- def GPU(duration: int = 60):
17
  def _decorator(fn):
18
  return fn
 
19
  return _decorator
 
20
  spaces = _SpacesFallback()
21
 
 
 
 
 
22
 
23
  DEFAULT_FULL_MODEL = "NousResearch/nomos-1"
24
- DEFAULT_MODEL_CANDIDATES = "cyankiwi/nomos-1-AWQ-8bit,cyankiwi/nomos-1-AWQ-4bit"
25
  DEFAULT_TOKENIZER_ID = DEFAULT_FULL_MODEL
26
- GPU_DURATION_SECONDS = int(os.getenv("GPU_DURATION_SECONDS", "120"))
 
 
 
27
  MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "2048"))
28
  MAX_NEW_TOKENS_DEFAULT = int(os.getenv("MAX_NEW_TOKENS_DEFAULT", "256"))
29
  TRUST_REMOTE_CODE = os.getenv("TRUST_REMOTE_CODE", "true").lower() == "true"
30
  PREFER_FULL = os.getenv("PREFER_FULL", "false").lower() == "true"
31
  TOKENIZER_ID = os.getenv("TOKENIZER_ID", DEFAULT_TOKENIZER_ID).strip() or DEFAULT_TOKENIZER_ID
32
- TORCH_DTYPE = os.getenv("TORCH_DTYPE", "float16").strip().lower()
 
33
 
34
  _MODEL_LOCK = threading.Lock()
35
  _MODEL: Any = None
@@ -46,6 +52,45 @@ def _ordered_candidates() -> list[str]:
46
  return candidates
47
 
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  def _load_model_if_needed() -> tuple[str | None, str]:
50
  global _MODEL, _TOKENIZER, _MODEL_ID
51
  if _MODEL is not None and _TOKENIZER is not None and _MODEL_ID is not None:
@@ -62,14 +107,21 @@ def _load_model_if_needed() -> tuple[str | None, str]:
62
  TOKENIZER_ID,
63
  trust_remote_code=TRUST_REMOTE_CODE,
64
  )
65
- model = AutoModelForCausalLM.from_pretrained(
66
- candidate,
67
- device_map="auto",
68
- trust_remote_code=TRUST_REMOTE_CODE,
69
- low_cpu_mem_usage=True,
70
- torch_dtype=torch.float16 if TORCH_DTYPE == "float16" else torch.bfloat16,
71
- )
 
 
 
 
 
 
72
  model.eval()
 
73
  _TOKENIZER = tokenizer
74
  _MODEL = model
75
  _MODEL_ID = candidate
@@ -88,10 +140,12 @@ def _status_text() -> str:
88
  base = (
89
  f"Loaded model: `{loaded}`\n\n"
90
  f"Tokenizer: `{TOKENIZER_ID}`\n\n"
91
- f"Torch dtype: `{TORCH_DTYPE}`\n\n"
 
 
92
  f"Candidates: `{candidates}`\n\n"
93
- f"GPU duration: `{GPU_DURATION_SECONDS}s` | "
94
- f"Max input tokens: `{MAX_INPUT_TOKENS}`"
95
  )
96
  if _LOAD_ERRORS:
97
  err = "\n".join(f"- {e}" for e in _LOAD_ERRORS[-3:])
@@ -99,7 +153,32 @@ def _status_text() -> str:
99
  return base
100
 
101
 
102
- @spaces.GPU(duration=GPU_DURATION_SECONDS)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  def generate(
104
  prompt: str,
105
  max_new_tokens: int,
@@ -120,24 +199,17 @@ def generate(
120
  model = _MODEL
121
 
122
  messages = [{"role": "user", "content": prompt}]
123
- chat_inputs = tokenizer.apply_chat_template(
124
  messages,
125
  add_generation_prompt=True,
126
- return_tensors="pt",
127
  )
 
 
128
  try:
129
  device = next(model.parameters()).device
130
  except Exception:
131
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
132
- if hasattr(chat_inputs, "to"):
133
- chat_inputs = chat_inputs.to(device)
134
-
135
- if torch.is_tensor(chat_inputs):
136
- model_inputs: dict[str, Any] = {"input_ids": chat_inputs}
137
- elif isinstance(chat_inputs, Mapping):
138
- model_inputs = dict(chat_inputs)
139
- else:
140
- raise TypeError(f"Unsupported chat template output type: {type(chat_inputs)}")
141
 
142
  for k, v in list(model_inputs.items()):
143
  if torch.is_tensor(v):
@@ -151,12 +223,20 @@ def generate(
151
  model_inputs[k] = v[:, trim:]
152
  input_ids = model_inputs["input_ids"]
153
 
 
 
 
 
 
 
154
  gen_kwargs: dict[str, Any] = {
155
  **model_inputs,
156
  "max_new_tokens": int(max_new_tokens),
157
  "do_sample": bool(do_sample),
158
- "pad_token_id": tokenizer.eos_token_id or tokenizer.pad_token_id or 0,
159
  }
 
 
160
  if do_sample:
161
  gen_kwargs.update(
162
  {
 
1
  #!/usr/bin/env python3
2
  import os
3
  import threading
 
4
  from typing import Any
5
 
6
+ # Importing spaces early is recommended for ZeroGPU runtime patching.
 
 
 
7
  try:
8
  import spaces
9
  except Exception:
10
  class _SpacesFallback:
11
  @staticmethod
12
+ def GPU(*args, **kwargs):
13
  def _decorator(fn):
14
  return fn
15
+
16
  return _decorator
17
+
18
  spaces = _SpacesFallback()
19
 
20
+ import gradio as gr
21
+ import torch
22
+ import transformers
23
+ from transformers import AutoModelForCausalLM, AutoTokenizer
24
 
25
  DEFAULT_FULL_MODEL = "NousResearch/nomos-1"
26
+ DEFAULT_MODEL_CANDIDATES = "cyankiwi/nomos-1-AWQ-8bit"
27
  DEFAULT_TOKENIZER_ID = DEFAULT_FULL_MODEL
28
+
29
+ GPU_DURATION_SECONDS = int(os.getenv("GPU_DURATION_SECONDS", "180"))
30
+ MAX_GPU_DURATION_SECONDS = int(os.getenv("MAX_GPU_DURATION_SECONDS", "300"))
31
+ GPU_SIZE = os.getenv("GPU_SIZE", "large").strip().lower() or "large"
32
  MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "2048"))
33
  MAX_NEW_TOKENS_DEFAULT = int(os.getenv("MAX_NEW_TOKENS_DEFAULT", "256"))
34
  TRUST_REMOTE_CODE = os.getenv("TRUST_REMOTE_CODE", "true").lower() == "true"
35
  PREFER_FULL = os.getenv("PREFER_FULL", "false").lower() == "true"
36
  TOKENIZER_ID = os.getenv("TOKENIZER_ID", DEFAULT_TOKENIZER_ID).strip() or DEFAULT_TOKENIZER_ID
37
+ TORCH_DTYPE = os.getenv("TORCH_DTYPE", "bfloat16").strip().lower()
38
+ MODEL_DEVICE_MAP = os.getenv("MODEL_DEVICE_MAP", "auto").strip() or "auto"
39
 
40
  _MODEL_LOCK = threading.Lock()
41
  _MODEL: Any = None
 
52
  return candidates
53
 
54
 
55
+ def _torch_dtype() -> torch.dtype | str:
56
+ if TORCH_DTYPE in {"", "auto"}:
57
+ return "auto"
58
+ if TORCH_DTYPE in {"bfloat16", "bf16"}:
59
+ return torch.bfloat16
60
+ if TORCH_DTYPE in {"float16", "fp16", "half"}:
61
+ return torch.float16
62
+ if TORCH_DTYPE in {"float32", "fp32"}:
63
+ return torch.float32
64
+ return "auto"
65
+
66
+
67
def _package_versions() -> str:
    """Render key inference-stack package versions for the status panel.

    Reports torch and transformers versions unconditionally; the optional
    ``compressed_tensors`` package (needed for the AWQ checkpoints) is
    probed at call time and reported as unavailable if it cannot be used.
    """
    parts = [
        f"torch={torch.__version__}",
        f"transformers={transformers.__version__}",
    ]
    try:
        import compressed_tensors

        parts.append(f"compressed-tensors={compressed_tensors.__version__}")
    except Exception as exc:  # pragma: no cover - environment specific
        parts.append(f"compressed-tensors=unavailable({type(exc).__name__})")
    return ", ".join(parts)
79
+
80
+
81
+ def _cuda_status() -> str:
82
+ if not torch.cuda.is_available():
83
+ return "CUDA unavailable"
84
+
85
+ try:
86
+ idx = torch.cuda.current_device()
87
+ props = torch.cuda.get_device_properties(idx)
88
+ total_gb = props.total_memory / (1024**3)
89
+ return f"{props.name} ({total_gb:.1f} GB)"
90
+ except Exception as exc: # pragma: no cover - environment specific
91
+ return f"CUDA available (details unavailable: {type(exc).__name__})"
92
+
93
+
94
  def _load_model_if_needed() -> tuple[str | None, str]:
95
  global _MODEL, _TOKENIZER, _MODEL_ID
96
  if _MODEL is not None and _TOKENIZER is not None and _MODEL_ID is not None:
 
107
  TOKENIZER_ID,
108
  trust_remote_code=TRUST_REMOTE_CODE,
109
  )
110
+ if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
111
+ tokenizer.pad_token = tokenizer.eos_token
112
+
113
+ dtype = _torch_dtype()
114
+ model_kwargs: dict[str, Any] = {
115
+ "trust_remote_code": TRUST_REMOTE_CODE,
116
+ "low_cpu_mem_usage": True,
117
+ "device_map": MODEL_DEVICE_MAP,
118
+ }
119
+ if dtype != "auto":
120
+ model_kwargs["torch_dtype"] = dtype
121
+
122
+ model = AutoModelForCausalLM.from_pretrained(candidate, **model_kwargs)
123
  model.eval()
124
+
125
  _TOKENIZER = tokenizer
126
  _MODEL = model
127
  _MODEL_ID = candidate
 
140
  base = (
141
  f"Loaded model: `{loaded}`\n\n"
142
  f"Tokenizer: `{TOKENIZER_ID}`\n\n"
143
+ f"Torch dtype: `{TORCH_DTYPE}` | Device map: `{MODEL_DEVICE_MAP}`\n\n"
144
+ f"GPU size: `{GPU_SIZE}` | Duration default: `{GPU_DURATION_SECONDS}s`\n\n"
145
+ f"Max input tokens: `{MAX_INPUT_TOKENS}`\n\n"
146
  f"Candidates: `{candidates}`\n\n"
147
+ f"Runtime: `{_cuda_status()}`\n\n"
148
+ f"Packages: `{_package_versions()}`"
149
  )
150
  if _LOAD_ERRORS:
151
  err = "\n".join(f"- {e}" for e in _LOAD_ERRORS[-3:])
 
153
  return base
154
 
155
 
156
+ def _duration_for_generate(
157
+ prompt: str,
158
+ max_new_tokens: int,
159
+ temperature: float,
160
+ top_p: float,
161
+ top_k: int,
162
+ do_sample: bool,
163
+ ) -> int:
164
+ del prompt, temperature, top_p, top_k, do_sample
165
+ try:
166
+ requested_new = int(max_new_tokens)
167
+ except Exception:
168
+ requested_new = MAX_NEW_TOKENS_DEFAULT
169
+
170
+ est = max(GPU_DURATION_SECONDS, 60 + int(0.8 * max(32, requested_new)))
171
+ return min(MAX_GPU_DURATION_SECONDS, est)
172
+
173
+
174
def _gpu_decorator():
    """Build the ``spaces.GPU`` decorator used on ``generate``.

    Prefers passing the configured ``GPU_SIZE`` hint; older ``spaces``
    releases reject the ``size`` keyword with TypeError, in which case the
    decorator is constructed with the dynamic duration callable alone.
    """
    try:
        decorator = spaces.GPU(duration=_duration_for_generate, size=GPU_SIZE)
    except TypeError:
        # `size` kwarg unsupported by this `spaces` version.
        decorator = spaces.GPU(duration=_duration_for_generate)
    return decorator
179
+
180
+
181
+ @_gpu_decorator()
182
  def generate(
183
  prompt: str,
184
  max_new_tokens: int,
 
199
  model = _MODEL
200
 
201
  messages = [{"role": "user", "content": prompt}]
202
+ chat_text = tokenizer.apply_chat_template(
203
  messages,
204
  add_generation_prompt=True,
205
+ tokenize=False,
206
  )
207
+ model_inputs = tokenizer(chat_text, return_tensors="pt")
208
+
209
  try:
210
  device = next(model.parameters()).device
211
  except Exception:
212
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 
 
 
 
 
 
 
 
213
 
214
  for k, v in list(model_inputs.items()):
215
  if torch.is_tensor(v):
 
223
  model_inputs[k] = v[:, trim:]
224
  input_ids = model_inputs["input_ids"]
225
 
226
+ generation_cfg = getattr(model, "generation_config", None)
227
+ eos_token_id = getattr(generation_cfg, "eos_token_id", None)
228
+ pad_token_id = getattr(generation_cfg, "pad_token_id", None)
229
+ if pad_token_id is None:
230
+ pad_token_id = tokenizer.pad_token_id or tokenizer.eos_token_id or 0
231
+
232
  gen_kwargs: dict[str, Any] = {
233
  **model_inputs,
234
  "max_new_tokens": int(max_new_tokens),
235
  "do_sample": bool(do_sample),
236
+ "pad_token_id": pad_token_id,
237
  }
238
+ if eos_token_id is not None:
239
+ gen_kwargs["eos_token_id"] = eos_token_id
240
  if do_sample:
241
  gen_kwargs.update(
242
  {
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
- gradio>=5.0.0
2
- spaces>=0.30.0
3
- transformers>=4.51.0
4
  accelerate>=0.34.0
5
  safetensors>=0.5.0
6
- compressed-tensors>=0.12.3
 
1
+ gradio==5.12.0
2
+ spaces>=0.32.0
3
+ transformers==4.57.3
4
  accelerate>=0.34.0
5
  safetensors>=0.5.0
6
+ compressed-tensors==0.12.3a20251110