GravityShares committed on
Commit
353f0fe
·
verified ·
1 Parent(s): dab11ec

Deploy Nomos ZeroGPU app

Browse files
Files changed (3) hide show
  1. README.md +19 -7
  2. app.py +218 -0
  3. requirements.txt +6 -0
README.md CHANGED
@@ -1,12 +1,24 @@
1
  ---
2
- title: Nomos 1 Zerogpu
3
- emoji: 🏃
4
- colorFrom: pink
5
- colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 6.5.1
 
8
  app_file: app.py
9
- pinned: false
 
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Nomos ZeroGPU Inference
3
+ colorFrom: gray
4
+ colorTo: blue
 
5
  sdk: gradio
6
+ sdk_version: 5.12.0
7
+ python_version: "3.10"
8
  app_file: app.py
9
+ startup_duration_timeout: 1h
10
+ preload_from_hub:
11
+ - cyankiwi/nomos-1-AWQ-8bit
12
  ---
13
 
14
+ # Nomos ZeroGPU Inference
15
+
16
+ This Space runs Nomos-compatible models with ZeroGPU and tries model candidates in order.
17
+
18
+ ## Suggested Variables
19
+
20
+ - `MODEL_CANDIDATES=cyankiwi/nomos-1-AWQ-8bit,cyankiwi/nomos-1-AWQ-4bit`
21
+ - `PREFER_FULL=false`
22
+ - `GPU_DURATION_SECONDS=120`
23
+ - `MAX_INPUT_TOKENS=2048`
24
+ - `MAX_NEW_TOKENS_DEFAULT=256`
app.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import threading
4
+ from typing import Any
5
+
6
+ import gradio as gr
7
+ import torch
8
+ from transformers import AutoModelForCausalLM, AutoTokenizer
9
+
10
try:
    import spaces
except Exception:
    # Running outside Hugging Face Spaces (or the package is missing):
    # provide a stand-in whose GPU decorator is a pass-through no-op.
    class _SpacesFallback:
        """Drop-in stub for the `spaces` package when it is unavailable."""

        @staticmethod
        def GPU(duration: int = 60):
            """Return a no-op decorator; `duration` is accepted and ignored."""
            return lambda fn: fn

    spaces = _SpacesFallback()
20
+
21
+
22
# Full-precision fallback repo, and the quantized candidates tried in order.
DEFAULT_FULL_MODEL = "NousResearch/nomos-1"
DEFAULT_MODEL_CANDIDATES = "cyankiwi/nomos-1-AWQ-8bit,cyankiwi/nomos-1-AWQ-4bit"
# Runtime tunables; each is overridable through Space environment variables.
GPU_DURATION_SECONDS = int(os.getenv("GPU_DURATION_SECONDS", "120"))  # ZeroGPU slot length per call
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "2048"))  # prompt-token cap before truncation
MAX_NEW_TOKENS_DEFAULT = int(os.getenv("MAX_NEW_TOKENS_DEFAULT", "256"))  # UI slider default
TRUST_REMOTE_CODE = os.getenv("TRUST_REMOTE_CODE", "true").lower() == "true"
PREFER_FULL = os.getenv("PREFER_FULL", "false").lower() == "true"  # try the full model first

# Lazily populated model state; writes happen under _MODEL_LOCK.
_MODEL_LOCK = threading.Lock()
_MODEL: Any = None  # loaded causal-LM, or None until the first generate call
_TOKENIZER: Any = None  # tokenizer paired with _MODEL
_MODEL_ID: str | None = None  # repo id of the candidate that actually loaded
_LOAD_ERRORS: list[str] = []  # per-candidate failure messages from the last attempt
35
+
36
+
37
def _ordered_candidates() -> list[str]:
    """Return model repo ids to try, highest priority first.

    Reads the comma-separated ``MODEL_CANDIDATES`` env var (falling back to
    the module default) and, when ``PREFER_FULL`` is set, puts the
    full-precision model at the front of the list.
    """
    raw = os.getenv("MODEL_CANDIDATES", DEFAULT_MODEL_CANDIDATES)
    names = [part.strip() for part in raw.split(",") if part.strip()]
    if PREFER_FULL and DEFAULT_FULL_MODEL not in names:
        names.insert(0, DEFAULT_FULL_MODEL)
    return names
43
+
44
+
45
def _load_model_if_needed() -> tuple[str | None, str]:
    """Load the first model candidate that works and cache it in module globals.

    Thread-safe: uses double-checked locking around _MODEL_LOCK so concurrent
    Gradio workers load at most once.

    Returns:
        (model_id, message): the loaded repo id and a status string, or
        (None, "load failed") when every candidate raised; in that case the
        failure reasons are stored in _LOAD_ERRORS for the status panel.
    """
    global _MODEL, _TOKENIZER, _MODEL_ID
    # Fast path without the lock; a stale read here is benign (we re-check below).
    if _MODEL is not None and _TOKENIZER is not None and _MODEL_ID is not None:
        return _MODEL_ID, "model already loaded"

    with _MODEL_LOCK:
        # Double-checked: another thread may have finished loading meanwhile.
        if _MODEL is not None and _TOKENIZER is not None and _MODEL_ID is not None:
            return _MODEL_ID, "model already loaded"

        errors: list[str] = []
        for candidate in _ordered_candidates():
            try:
                tokenizer = AutoTokenizer.from_pretrained(
                    candidate,
                    trust_remote_code=TRUST_REMOTE_CODE,
                )
                model = AutoModelForCausalLM.from_pretrained(
                    candidate,
                    device_map="auto",
                    # "auto" keeps the checkpoint's own dtype (fp16/bf16/quantized).
                    # Omitting it loads weights in float32, which can OOM the
                    # ZeroGPU slot for these AWQ checkpoints.
                    torch_dtype="auto",
                    trust_remote_code=TRUST_REMOTE_CODE,
                    low_cpu_mem_usage=True,
                )
                model.eval()
                _TOKENIZER = tokenizer
                _MODEL = model
                _MODEL_ID = candidate
                _LOAD_ERRORS.clear()
                return candidate, "loaded"
            except Exception as exc:  # keep the reason, try the next candidate
                errors.append(f"{candidate}: {type(exc).__name__}: {exc}")

        # All candidates failed; publish the errors (in-place so the module
        # global keeps its identity) and report failure to the caller.
        _LOAD_ERRORS[:] = errors
        return None, "load failed"
78
+
79
+
80
def _status_text() -> str:
    """Build the markdown status panel shown beside the output box."""
    sections = [
        f"Loaded model: `{_MODEL_ID or 'none'}`",
        f"Candidates: `{', '.join(_ordered_candidates())}`",
        f"GPU duration: `{GPU_DURATION_SECONDS}s` | "
        f"Max input tokens: `{MAX_INPUT_TOKENS}`",
    ]
    text = "\n\n".join(sections)
    if _LOAD_ERRORS:
        # Surface only the three most recent failures to keep the panel short.
        recent = "\n".join(f"- {e}" for e in _LOAD_ERRORS[-3:])
        text = text + "\n\nRecent load errors:\n" + recent
    return text
93
+
94
+
95
@spaces.GPU(duration=GPU_DURATION_SECONDS)
def generate(
    prompt: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    do_sample: bool,
) -> tuple[str, str]:
    """Run one chat-style generation on the lazily loaded model.

    Args:
        prompt: user text; blank input short-circuits with a hint message.
        max_new_tokens / temperature / top_p / top_k / do_sample: decoding
            controls wired from the Gradio sliders and checkbox.

    Returns:
        (generated text, status markdown) for the two Gradio outputs.
    """
    prompt = (prompt or "").strip()
    if not prompt:
        return "Provide a prompt.", _status_text()

    model_id, _ = _load_model_if_needed()
    if model_id is None:
        return "Model load failed. Check status and Space logs.", _status_text()

    tokenizer = _TOKENIZER
    model = _MODEL

    messages = [{"role": "user", "content": prompt}]
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    # Keep only the newest tokens when over the cap. NOTE(review): this can
    # clip the start of the chat template (BOS/system tokens) for very long
    # prompts — acceptable here, but worth confirming for strict templates.
    if input_ids.shape[-1] > MAX_INPUT_TOKENS:
        input_ids = input_ids[:, -MAX_INPUT_TOKENS:]

    # Sampling with temperature == 0 makes transformers raise, and the UI
    # slider allows 0.0 — fall back to greedy decoding in that case.
    do_sample = bool(do_sample) and float(temperature) > 0.0
    gen_kwargs: dict[str, Any] = {
        "input_ids": input_ids,
        "max_new_tokens": int(max_new_tokens),
        "do_sample": do_sample,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        gen_kwargs.update(
            {
                "temperature": float(temperature),
                "top_p": float(top_p),
                "top_k": int(top_k),
            }
        )

    with torch.no_grad():
        output_ids = model.generate(**gen_kwargs)

    # Decode only the continuation; if that is empty (e.g. immediate EOS),
    # fall back to decoding the whole sequence so the user sees something.
    generated_ids = output_ids[0][input_ids.shape[-1]:]
    text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    if not text:
        text = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()

    return text, _status_text()
150
+
151
+
152
# Gradio UI. Statement order inside the Blocks context defines the layout,
# so this section is documented rather than restructured.
with gr.Blocks(title="Nomos ZeroGPU Inference") as demo:
    gr.Markdown(
        "# Nomos Remote Inference (ZeroGPU)\n"
        "This app tries model candidates in order and keeps the first that loads."
    )
    with gr.Row():
        # Left column: prompt plus decoding controls.
        with gr.Column(scale=2):
            prompt = gr.Textbox(
                label="Prompt",
                lines=10,
                placeholder="Ask for a concise proof or solution sketch...",
            )
            with gr.Row():
                max_new_tokens = gr.Slider(
                    minimum=32,
                    maximum=1024,
                    value=MAX_NEW_TOKENS_DEFAULT,
                    step=1,
                    label="Max new tokens",
                )
                top_k = gr.Slider(
                    minimum=1,
                    maximum=100,
                    value=20,
                    step=1,
                    label="Top-k",
                )
            with gr.Row():
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.5,
                    value=0.6,
                    step=0.01,
                    label="Temperature",
                )
                top_p = gr.Slider(
                    minimum=0.05,
                    maximum=1.0,
                    value=0.95,
                    step=0.01,
                    label="Top-p",
                )
            do_sample = gr.Checkbox(value=True, label="Sample")
            run_btn = gr.Button("Generate")
        # Right column: generated text and the model-status panel.
        with gr.Column(scale=2):
            output = gr.Textbox(label="Output", lines=18)
            status = gr.Markdown(value=_status_text())

    # Wire the button to generate(); api_name exposes it as an API endpoint.
    run_btn.click(
        fn=generate,
        inputs=[prompt, max_new_tokens, temperature, top_p, top_k, do_sample],
        outputs=[output, status],
        api_name="generate",
    )

    gr.Examples(
        examples=[
            ["Solve: Find all integers n such that n^2 + n + 1 is prime."],
            ["Give a proof sketch that there are infinitely many primes."],
        ],
        inputs=prompt,
    )

# Bound the request queue so the ZeroGPU Space sheds load instead of piling up.
demo.queue(max_size=32)

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio>=5.0.0
2
+ spaces>=0.30.0
3
+ transformers>=4.51.0
4
+ accelerate>=0.34.0
5
+ safetensors>=0.5.0
6
+ compressed-tensors>=0.12.3