File size: 4,193 Bytes
83ad093
15771aa
 
d7c4afb
4f67ea2
83ad093
4f67ea2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
549100e
4f67ea2
 
15771aa
 
 
 
 
 
 
4f67ea2
85c4144
 
4f67ea2
 
549100e
4f67ea2
 
d7c4afb
 
85c4144
15771aa
4f67ea2
549100e
4f67ea2
 
85c4144
 
 
 
 
 
 
 
15771aa
85c4144
 
 
 
 
15771aa
85c4144
 
150301a
85c4144
150301a
85c4144
15771aa
85c4144
15771aa
4f67ea2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85c4144
15771aa
85c4144
15771aa
150301a
 
 
 
4f67ea2
549100e
15771aa
 
85c4144
150301a
15771aa
85c4144
83ad093
85c4144
83ad093
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import gradio as gr
import subprocess
import sys
import os
import spaces

def download_data():
    """Fetch the parameter-golf repo and build the tokenizer on CPU.

    Runs before the GPU is requested so network/disk work does not burn
    ZeroGPU quota.

    Returns:
        bool: True if the tokenizer model file exists after the attempt.
    """
    tokenizer_path = "pg-data/data/tokenizers/fineweb_1024_bpe.model"

    # Shallow-clone only once; later runs reuse the existing checkout.
    if not os.path.exists("pg-data"):
        subprocess.run(
            ["git", "clone", "--depth", "1",
             "https://github.com/openai/parameter-golf", "pg-data"],
            capture_output=True, timeout=120,
        )

    if not os.path.exists(tokenizer_path):
        # Use cwd= instead of os.chdir: the original chdir pair was not
        # exception-safe — a TimeoutExpired would strand the process inside
        # pg-data for the rest of the app's lifetime.
        subprocess.run(
            [sys.executable, "data/cached_challenge_fineweb.py",
             "--variant", "sp1024", "--train-shards", "1"],
            capture_output=True, timeout=180, cwd="pg-data",
        )

    return os.path.exists(tokenizer_path)

@spaces.GPU(duration=180)  # 3 minutes (fits remaining quota)
def train_model_gpu():
    """Run the training script while a ZeroGPU slot is held, streaming output.

    Yields:
        str: The full accumulated log after each new line, so the Gradio
        Textbox repaints with live progress.
    """
    log = []

    def log_step(msg):
        # Accumulate and return the whole log; each yield replaces the Textbox.
        log.append(msg)
        return "\n".join(log)

    try:
        # Hyperparameters are handed to the training script via environment
        # variables on top of the current environment.
        env = os.environ.copy()
        env.update({
            "NUM_LAYERS": "3",       # Small but reasonable
            "MODEL_DIM": "96",       # Fits in GPU memory
            "MAX_STEPS": "15",       # Fast run for remaining quota
            "MICRO_BATCH_SIZE": "4", # Reasonable batch
            "SEQ_LENGTH": "256",     # Moderate sequences
            "TOKENIZER_PATH": "pg-data/data/tokenizers/fineweb_1024_bpe.model",
            "DATA_PATH": "pg-data/data/datasets/fineweb10B_sp1024"
        })

        yield log_step("🚀 GPU activated - starting training")
        yield log_step("Config: 3 layers, 96-dim, 15 steps (fits remaining quota)")
        yield log_step("=" * 60)

        # Context manager closes the stdout pipe and reaps the child even if
        # the UI abandons this generator mid-stream (the original leaked the
        # pipe on early generator exit).
        with subprocess.Popen(
            [sys.executable, "train_gpt_kernel.py"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # interleave stderr into one stream
            text=True,
            bufsize=1,                 # line-buffered for live streaming
            env=env,
        ) as process:
            for line in iter(process.stdout.readline, ''):
                if line:
                    yield log_step(line.rstrip())
            process.wait()

        yield log_step("=" * 60)
        if process.returncode == 0:
            yield log_step("✅ Training complete!")
        else:
            yield log_step(f"⚠️ Exit code {process.returncode}")

    except Exception as e:
        # Surface any failure in the UI instead of crashing the Space.
        yield log_step(f"❌ Error: {str(e)}")

def train_model():
    """Main entry point: install deps, prepare data on CPU, then run GPU training.

    Yields:
        str: The accumulated log text after each step, for live UI updates.
    """
    log = []

    def log_step(msg):
        # Accumulate and return the whole log; each yield replaces the Textbox.
        log.append(msg)
        return "\n".join(log)

    try:
        yield log_step("🔄 Installing dependencies...")
        result = subprocess.run(
            [sys.executable, "-m", "pip", "install", "-q",
             "torch", "numpy", "tiktoken", "sentencepiece", "tqdm", "requests"],
            timeout=300,
        )
        # The original reported success unconditionally; report a pip failure
        # (training may still work if the packages were already installed).
        if result.returncode == 0:
            yield log_step("✅ Dependencies ready")
        else:
            yield log_step(f"⚠️ pip exited with code {result.returncode} - continuing anyway")

        yield log_step("🔄 Preparing data (outside GPU to save time)...")
        data_ready = download_data()

        if not data_ready:
            yield log_step("❌ Data download failed")
            return

        yield log_step("✅ Data ready - activating GPU...")

        # Delegate to the GPU-decorated generator; re-yield its log snapshots.
        yield from train_model_gpu()

    except Exception as e:
        yield log_step(f"❌ Error: {str(e)}")

# --- Gradio UI wiring ------------------------------------------------------
# `demo` is the conventional top-level Blocks name for Hugging Face Spaces.
with gr.Blocks(title="μ⁸ Kernel") as demo:
    gr.Markdown("""
    # μ⁸ Kernel Training - Parameter Golf
    
    Formally verified LM architecture (464 Lean 4 proofs):
    - **C(r) = 2r/(1+r²)** coherence activation  
    - **δ_S = 1+√2 ≈ 2.414** silver MLP expansion  
    - **μ⁸ = 1** eight-cycle attention
    
    **Zero GPU enabled** - downloads data on CPU, then activates GPU for training (3L/96d/15 steps, 3 min).
    """)
    
    start_button = gr.Button("🚀 Start Training", variant="primary", size="lg")
    log_box = gr.Textbox(label="Training Log", lines=25, max_lines=40, autoscroll=True)
    
    # train_model is a generator; each yielded string repaints the log box.
    start_button.click(fn=train_model, outputs=log_box)

# Queueing is required for generator (streaming) event handlers.
demo.queue()
demo.launch()