# Mu / app.py — Hugging Face Space by beanapologist
# Commit 549100e: Reduce GPU to 3min/15steps to fit remaining Zero GPU quota
import gradio as gr
import subprocess
import sys
import os
import spaces
def download_data():
    """Fetch the parameter-golf repo and its tokenizer on CPU, outside the GPU context.

    Runs before GPU activation so GPU quota is not spent on downloads.
    Both subprocess calls are best-effort (output captured, no check=True);
    success is judged solely by whether the tokenizer file exists afterwards.

    Returns:
        bool: True if the tokenizer model file is present after the attempts.
    """
    # Shallow-clone the parameter-golf repo if we don't already have it.
    if not os.path.exists("pg-data"):
        subprocess.run(
            ["git", "clone", "--depth", "1", "https://github.com/openai/parameter-golf", "pg-data"],
            capture_output=True, timeout=120
        )
    # Build the tokenizer if not present. Run inside the repo via cwd=
    # rather than os.chdir(): a TimeoutExpired (or any exception) in the
    # original chdir version skipped os.chdir("..") and stranded the whole
    # process inside pg-data/, breaking every later relative path.
    if not os.path.exists("pg-data/data/tokenizers/fineweb_1024_bpe.model"):
        subprocess.run(
            [sys.executable, "data/cached_challenge_fineweb.py", "--variant", "sp1024", "--train-shards", "1"],
            capture_output=True, timeout=180, cwd="pg-data"
        )
    return os.path.exists("pg-data/data/tokenizers/fineweb_1024_bpe.model")
@spaces.GPU(duration=180)  # 3-minute GPU budget (fits remaining Zero GPU quota)
def train_model_gpu():
    """Run GPU training and stream cumulative log text.

    Generator: each yield is the full log-so-far (suitable for a Gradio
    Textbox output). Assumes download_data() has already prepared the
    tokenizer and dataset under pg-data/.

    Yields:
        str: the accumulated training log after each new line/event.
    """
    log = []

    def log_step(msg):
        # Append one message and return the whole log as a single string.
        log.append(msg)
        return "\n".join(log)

    process = None
    try:
        # GPU-sized config passed to the training script via environment vars.
        env = os.environ.copy()
        env.update({
            "NUM_LAYERS": "3",        # Small but reasonable
            "MODEL_DIM": "96",        # Fits in GPU memory
            "MAX_STEPS": "15",        # Fast run for remaining quota
            "MICRO_BATCH_SIZE": "4",  # Reasonable batch
            "SEQ_LENGTH": "256",      # Moderate sequences
            "TOKENIZER_PATH": "pg-data/data/tokenizers/fineweb_1024_bpe.model",
            "DATA_PATH": "pg-data/data/datasets/fineweb10B_sp1024"
        })
        yield log_step("🚀 GPU activated - starting training")
        yield log_step("Config: 3 layers, 96-dim, 15 steps (fits remaining quota)")
        yield log_step("=" * 60)
        # Line-buffered, stderr merged into stdout so everything streams in order.
        process = subprocess.Popen(
            [sys.executable, "train_gpt_kernel.py"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,
            env=env
        )
        # Stream each output line to the UI as it arrives.
        for line in iter(process.stdout.readline, ''):
            if line:
                yield log_step(line.rstrip())
        process.wait()
        yield log_step("=" * 60)
        if process.returncode == 0:
            yield log_step("✅ Training complete!")
        else:
            yield log_step(f"⚠️ Exit code {process.returncode}")
    except Exception as e:
        yield log_step(f"❌ Error: {str(e)}")
    finally:
        # If the generator is abandoned early (client disconnect, GPU quota
        # expiry) or an exception fired mid-stream, make sure the training
        # subprocess does not keep running and burning GPU time.
        if process is not None and process.poll() is None:
            process.kill()
def train_model():
    """Main entry point: install deps, prepare data on CPU, then run GPU training.

    Generator wired to the Gradio button; each yield is the full log-so-far.

    Yields:
        str: the accumulated status/training log after each step.
    """
    log = []

    def log_step(msg):
        # Append one message and return the whole log as a single string.
        log.append(msg)
        return "\n".join(log)

    try:
        yield log_step("🔄 Installing dependencies...")
        result = subprocess.run(
            [sys.executable, "-m", "pip", "install", "-q",
             "torch", "numpy", "tiktoken", "sentencepiece", "tqdm", "requests"],
            timeout=300
        )
        # Previously a pip failure was silently ignored; surface it but keep
        # going — the packages may already be installed in the Space image.
        if result.returncode != 0:
            yield log_step(f"⚠️ pip exited with code {result.returncode} - continuing")
        yield log_step("✅ Dependencies ready")
        yield log_step("🔄 Preparing data (outside GPU to save time)...")
        data_ready = download_data()
        if not data_ready:
            yield log_step("❌ Data download failed")
            return
        yield log_step("✅ Data ready - activating GPU...")
        # Delegate to the @spaces.GPU-decorated generator for the GPU phase.
        yield from train_model_gpu()
    except Exception as e:
        yield log_step(f"❌ Error: {str(e)}")
# --- Gradio UI: a single button that streams the training log into a textbox ---
with gr.Blocks(title="μ⁸ Kernel") as demo:
    # Static header describing the architecture and the GPU budget.
    gr.Markdown("""
# μ⁸ Kernel Training - Parameter Golf
Formally verified LM architecture (464 Lean 4 proofs):
- **C(r) = 2r/(1+r²)** coherence activation
- **δ_S = 1+√2 ≈ 2.414** silver MLP expansion
- **μ⁸ = 1** eight-cycle attention
**Zero GPU enabled** - downloads data on CPU, then activates GPU for training (3L/96d/15 steps, 3 min).
""")
    start_button = gr.Button("🚀 Start Training", variant="primary", size="lg")
    log_output = gr.Textbox(label="Training Log", lines=25, max_lines=40, autoscroll=True)
    # train_model is a generator, so the textbox updates live as it yields.
    start_button.click(fn=train_model, outputs=log_output)

# Queueing is required for streaming (generator) outputs.
demo.queue()
demo.launch()