sovthpaw committed on
Commit
db69dc9
·
verified ·
1 Parent(s): 0e106bb

Upload scripts/run_omnistep_12a3b.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. scripts/run_omnistep_12a3b.py +256 -0
scripts/run_omnistep_12a3b.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ OmniStep 12A3B — main entry point.
4
+
5
+ A complete multimodal model: real-time ASR, text reasoning, TTS, image understanding,
6
+ and infinite background music generation. Built as a paper-exact Darwin weight-space
7
+ recombination (arXiv:2605.14386) of Qwen2.5-Omni-3B and ACE-Step v1.5 XL SFT 4B.
8
+
9
+ Usage:
10
+ python run_omnistep_12a3b.py text "Explain the Darwin Family paper."
10
+ python run_omnistep_12a3b.py music "chill lofi beats" --output ~/music/track.wav
11
+ python run_omnistep_12a3b.py music-loop "chill lofi beats" # infinite background music
12
+ python run_omnistep_12a3b.py voice # streaming voice assistant (ASR + TTS)
13
+ python run_omnistep_12a3b.py serve # start the vllm server
15
+
16
+ The model has ALL modalities. Pick a mode. They all work together.
17
+ """
18
+ import argparse
19
+ import asyncio
20
+ import json
21
+ import os
22
+ import sys
23
+ import time
24
+ from pathlib import Path
25
+
26
+ MODEL_ID = os.environ.get("OMNISTEP_MODEL", "sovthpaw/omnistep-12a3b")
27
+ HF_TOKEN = os.environ.get("HF_TOKEN")
28
+
29
+ # ============================================================================
30
+ # Text reasoning — works with vllm OR llama-server
31
+ # ============================================================================
32
def cmd_text(prompt: str, max_tokens: int = 200, server_url: str = None, timeout: float = 300.0):
    """Send one chat-completion request to the running server and print the reply.

    Args:
        prompt: User message sent as the single entry of ``messages``.
        max_tokens: Value forwarded as ``max_tokens`` in the request payload.
        server_url: Base URL of the server. Falls back to ``$OMNISTEP_SERVER``
            and then to ``http://localhost:8080``. A trailing ``/`` is ignored.
        timeout: Socket timeout in seconds for the HTTP request, so that an
            unresponsive server cannot block the CLI forever.

    Exits with status 1 (message on stderr) when the server cannot be reached,
    answers with an HTTP error, or returns a body that does not contain
    ``choices[0].message.content``.
    """
    import urllib.error
    import urllib.request

    base_url = server_url or os.environ.get("OMNISTEP_SERVER", "http://localhost:8080")
    # Strip trailing slashes so "http://host:8080/" does not become "//v1/...".
    endpoint = base_url.rstrip("/") + "/v1/chat/completions"
    payload = {
        "model": "omnistep-12a3b",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.7,
    }
    req = urllib.request.Request(
        endpoint,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            result = json.loads(resp.read())
    except urllib.error.HTTPError as exc:
        # HTTPError must be handled before OSError: it is a subclass of it.
        detail = exc.read().decode("utf-8", errors="replace").strip()
        print(f" server returned HTTP {exc.code} for {endpoint}: {detail or exc.reason}", file=sys.stderr)
        sys.exit(1)
    except OSError as exc:
        # Covers URLError (connection refused, DNS failure) and socket timeouts.
        print(f" could not reach {endpoint}: {exc}", file=sys.stderr)
        sys.exit(1)
    except ValueError as exc:
        # json.loads raises a ValueError subclass on a non-JSON body.
        print(f" server at {endpoint} did not return valid JSON: {exc}", file=sys.stderr)
        sys.exit(1)

    try:
        content = result["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        print(f" unexpected response from {endpoint}: {result!r}", file=sys.stderr)
        sys.exit(1)
    print(content)
50
+
51
+ # ============================================================================
52
+ # Music generation — uses the ACE DiT (the diffusion part, F16 / unquantized)
53
+ # ============================================================================
54
def cmd_music(prompt: str, output: str, duration: int = 60, infer_steps: int = 8, sample_rate: int = 48000):
    """Generate one music track and write it to ``output``.

    Loads ``MODEL_ID`` through ``transformers.AutoModel`` with remote code
    enabled, asks the model to expand ``prompt`` via ``craft_music_prompt``,
    generates audio via ``generate_music`` and writes it with ``soundfile``.

    Args:
        prompt: Short description of the desired music.
        output: Destination path; ``~`` is expanded and missing parent
            directories are created.
        duration: Requested track length in seconds (must be positive).
        infer_steps: Step count forwarded to ``generate_music`` (must be positive).
        sample_rate: Sample rate passed to ``soundfile.write``. The default of
            48000 is the value that was previously hard-coded.

    Raises:
        ValueError: If ``duration``, ``infer_steps`` or ``sample_rate`` is not
            positive. Checked before the heavy imports and the model load.
    """
    # Fail fast: loading the model is expensive, so reject bad arguments first.
    if duration <= 0:
        raise ValueError(f"duration must be positive, got {duration}")
    if infer_steps <= 0:
        raise ValueError(f"infer_steps must be positive, got {infer_steps}")
    if sample_rate <= 0:
        raise ValueError(f"sample_rate must be positive, got {sample_rate}")

    import torch
    from transformers import AutoModel
    import soundfile as sf

    print("🎵 OmniStep 12A3B — music generation")
    print(f" prompt: {prompt}")
    print(f" duration: {duration}s, infer_steps: {infer_steps}")
    print(f" output: {output}")

    # Load the full multimodal model (with the music head).
    # The processor that used to be loaded here was never used, so it is gone.
    model = AutoModel.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
        token=HF_TOKEN,
    )

    # Use the OmniStep text body to craft a richer prompt
    crafted = model.craft_music_prompt(prompt, duration=duration)
    print(f" crafted prompt: {crafted[:120]}...")

    # Generate via the ACE DiT
    audio = model.generate_music(
        prompt=crafted,
        duration=duration,
        infer_steps=infer_steps,
    )

    # soundfile needs a NumPy-compatible array. NumPy has no bfloat16 dtype and
    # cannot read device memory, so move a tensor result to float32 on the CPU.
    # NOTE(review): assumes the array is laid out as (frames,) or
    # (frames, channels), as soundfile expects — confirm against generate_music.
    if isinstance(audio, torch.Tensor):
        audio = audio.detach().to(device="cpu", dtype=torch.float32).numpy()

    # Save
    out_path = Path(output).expanduser()
    out_path.parent.mkdir(parents=True, exist_ok=True)
    sf.write(str(out_path), audio, sample_rate)
    print(f" ✓ wrote {out_path} ({out_path.stat().st_size/1e6:.1f}MB)")
96
+
97
+ # ============================================================================
98
+ # Infinite background music (the Evolutionary Radio)
99
+ # ============================================================================
100
def cmd_music_loop(prompt: str, queue_size: int = 5, duration: int = 60):
    """Run the Evolutionary Radio until interrupted with Ctrl+C.

    Builds ``omnistep_radio.EvolutionaryRadio`` from the arguments and drives
    its ``run()`` coroutine with ``asyncio.run``. According to the original
    author's notes, the radio runs four concurrent loops (this script cannot
    verify them):
      1. Playback (mpv)
      2. Queue fill (generate one track ahead, target 5-track queue)
      3. GEPA prompt evolution (background, every 50 generations)
      4. Darwin weight evolution (background, nightly)

    Args:
        prompt: Music description passed to ``EvolutionaryRadio``.
        queue_size: Passed to ``EvolutionaryRadio`` as ``queue_size``.
        duration: Passed to ``EvolutionaryRadio`` as ``duration``.

    Exits with status 1 when ``omnistep_radio`` cannot be imported. The
    underlying import error goes to stderr, because a missing dependency of
    that module fails here too.
    """
    print("🎵 Evolutionary Radio (OmniStep 12A3B)")
    print(f" prompt: {prompt}")
    print(f" queue: {queue_size} tracks, duration: {duration}s each")
    print(" Press Ctrl+C to stop.")
    print()

    # Lazy import — only needed for music-loop
    try:
        from omnistep_radio import EvolutionaryRadio
    except ImportError as exc:
        print(" Note: omnistep_radio.py not in the same directory.", file=sys.stderr)
        print(" Run from the OmniStep 12A3B repo root, or set PYTHONPATH.", file=sys.stderr)
        # Show the real cause; it may be a dependency rather than the file itself.
        print(f" (import error: {exc})", file=sys.stderr)
        sys.exit(1)

    radio = EvolutionaryRadio(prompt=prompt, queue_size=queue_size, duration=duration)
    try:
        asyncio.run(radio.run())
    except KeyboardInterrupt:
        print("\n ✓ stopped.")
130
+
131
+ # ============================================================================
132
+ # Streaming voice (ASR + reasoning + TTS) — needs vllm
133
+ # ============================================================================
134
def cmd_voice(server_url: str = None):
    """Streaming voice assistant: ASR (audio in) → text reasoning → TTS (audio out).

    Prints setup instructions, then builds ``omnistep_voice.VoiceAssistant``
    against the server URL and calls its blocking ``run()`` until Ctrl+C.

    Args:
        server_url: Base URL of the running server. Falls back to
            ``$OMNISTEP_SERVER`` and then to ``http://localhost:8080``.

    Exits with status 1 when ``omnistep_voice`` cannot be imported; the
    underlying import error is written to stderr.
    """
    print("🎤 OmniStep 12A3B — streaming voice assistant")
    print()
    print("This mode uses vllm with the full safetensors for streaming ASR + TTS.")
    print("If vllm is not running, start it first:")
    # The hint passes --port 8080 explicitly: vllm's own default port differs
    # from the http://localhost:8080 default this client connects to.
    print(
        f" vllm serve {MODEL_ID} --max-model-len 32768 --gpu-memory-utilization 0.85"
        " --port 8080 --trust-remote-code"
    )
    print()
    print("Then run this command. It will:")
    print(" 1. Stream audio in (your microphone)")
    print(" 2. Real-time ASR via the Whisper audio encoder")
    print(" 3. Text reasoning via the OmniStep text body")
    print(" 4. Streaming TTS via the Talker + token2wav speech-out heads")
    print(" 5. (Optional) background music via the ACE DiT, mixed in")
    print()
    print("Requires: sounddevice, numpy, vllm-client, the model running via vllm.")
    print()

    try:
        from omnistep_voice import VoiceAssistant
    except ImportError as exc:
        print(" Note: omnistep_voice.py not in the same directory.", file=sys.stderr)
        print(" Run from the OmniStep 12A3B repo root, or set PYTHONPATH.", file=sys.stderr)
        # Show the real cause; it may be a dependency rather than the file itself.
        print(f" (import error: {exc})", file=sys.stderr)
        sys.exit(1)

    target = server_url or os.environ.get("OMNISTEP_SERVER", "http://localhost:8080")
    va = VoiceAssistant(server_url=target)
    try:
        va.run()
    except KeyboardInterrupt:
        print("\n ✓ stopped.")
164
+
165
+ # ============================================================================
166
+ # Start the vllm server (the easiest way to run the model)
167
+ # ============================================================================
168
def cmd_serve(gpu_memory: float = 0.85, max_len: int = 32768, port: int = 8080):
    """Start ``vllm serve`` for ``MODEL_ID`` and wait for it to exit.

    Args:
        gpu_memory: Passed as ``--gpu-memory-utilization``; must be in ``(0, 1]``.
        max_len: Passed as ``--max-model-len``; must be positive.
        port: Passed as ``--port``; must be in ``1..65535``.

    Raises:
        ValueError: If an argument is out of range. Checked before anything is
            printed or launched.

    Exits with status 1 when the ``vllm`` executable is not on ``PATH``, and
    with the server's own exit status when vllm terminates with a non-zero code.
    """
    # Validate up front so a typo does not cost a long model load.
    if not 0.0 < gpu_memory <= 1.0:
        raise ValueError(f"gpu_memory must be in (0, 1], got {gpu_memory}")
    if max_len <= 0:
        raise ValueError(f"max_len must be positive, got {max_len}")
    if not 1 <= port <= 65535:
        raise ValueError(f"port must be in 1..65535, got {port}")

    import subprocess

    print(f"🚀 Starting vllm server for {MODEL_ID}")
    print(f" gpu_memory_utilization: {gpu_memory}, max_model_len: {max_len}, port: {port}")
    print(" vllm must be installed (pip install vllm)")
    cmd = [
        "vllm", "serve", MODEL_ID,
        "--max-model-len", str(max_len),
        "--gpu-memory-utilization", str(gpu_memory),
        "--port", str(port),
        "--trust-remote-code",
    ]
    print(f" $ {' '.join(cmd)}")
    try:
        completed = subprocess.run(cmd)
    except FileNotFoundError:
        print(" vllm executable not found on PATH. Install it with: pip install vllm", file=sys.stderr)
        sys.exit(1)
    except KeyboardInterrupt:
        # Match the other long-running modes: Ctrl+C is a normal way to stop.
        print("\n ✓ stopped.")
        return
    if completed.returncode != 0:
        # Propagate the failure so shells and supervisors can see it.
        sys.exit(completed.returncode)
183
+
184
+ # ============================================================================
185
+ # Main
186
+ # ============================================================================
187
def main():
    """Parse the command line and dispatch to the matching ``cmd_*`` function.

    Sub-commands: ``text``, ``music``, ``music-loop``, ``voice`` and ``serve``.
    A sub-command is required; argparse exits with a usage error otherwise.
    """
    # %(prog)s is expanded by argparse to the actual script name, so the
    # examples stay correct if the file is renamed.
    p = argparse.ArgumentParser(
        description="OmniStep 12A3B — fast 4o-style streaming voice assistant with infinite background music",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Text reasoning (need a server running: vllm or llama-server)
python %(prog)s text "Explain the Darwin Family paper."

# One-shot music generation (uses the ACE DiT, F16 unquantized)
python %(prog)s music "chill lofi beats" --output ~/music/track.wav

# Infinite background music
python %(prog)s music-loop "chill lofi beats"

# Streaming voice assistant (ASR + reasoning + TTS)
python %(prog)s voice

# Start the vllm server (the easiest deployment)
python %(prog)s serve

The model has ALL modalities. Pick a mode. They all work together.
""",
    )
    sub = p.add_subparsers(dest="mode", required=True)

    # text
    pt = sub.add_parser("text", help="Text reasoning (via running vllm or llama-server)")
    pt.add_argument("prompt")
    pt.add_argument("--max-tokens", type=int, default=200)
    pt.add_argument("--server", default=None, help="vllm/llama-server URL (default: $OMNISTEP_SERVER or http://localhost:8080)")

    # music
    pm = sub.add_parser("music", help="One-shot music generation via the ACE DiT (F16 unquantized)")
    pm.add_argument("prompt")
    pm.add_argument("--output", default="~/music/omnistep_track.wav")
    pm.add_argument("--duration", type=int, default=60)
    pm.add_argument("--infer-steps", type=int, default=8)

    # music-loop (infinite background music)
    pl = sub.add_parser("music-loop", help="Infinite background music (Evolutionary Radio)")
    pl.add_argument("prompt")
    pl.add_argument("--queue-size", type=int, default=5)
    pl.add_argument("--duration", type=int, default=60)

    # voice
    pv = sub.add_parser("voice", help="Streaming voice assistant (ASR + TTS)")
    pv.add_argument("--server", default=None)

    # serve
    ps = sub.add_parser("serve", help="Start the vllm server")
    ps.add_argument("--gpu-memory", type=float, default=0.85)
    ps.add_argument("--max-len", type=int, default=32768)
    ps.add_argument("--port", type=int, default=8080)

    args = p.parse_args()

    if args.mode == "text":
        cmd_text(args.prompt, args.max_tokens, args.server)
    elif args.mode == "music":
        cmd_music(args.prompt, args.output, args.duration, args.infer_steps)
    elif args.mode == "music-loop":
        cmd_music_loop(args.prompt, args.queue_size, args.duration)
    elif args.mode == "voice":
        cmd_voice(args.server)
    elif args.mode == "serve":
        cmd_serve(args.gpu_memory, args.max_len, args.port)


if __name__ == "__main__":
    main()