import gradio as gr
import subprocess
import sys
import os

import spaces

# Artifacts produced by the data-preparation step; the same paths are checked
# on disk and handed to the trainer via environment variables, so keep them in
# one place.
TOKENIZER_PATH = "pg-data/data/tokenizers/fineweb_1024_bpe.model"
DATA_PATH = "pg-data/data/datasets/fineweb10B_sp1024"


def download_data():
    """Download data outside GPU context to save GPU time.

    Clones the parameter-golf repo (shallow) and builds the tokenizer if
    either is missing. Returns True iff the tokenizer model file exists
    afterwards.
    """
    # Download parameter-golf repo if needed
    if not os.path.exists("pg-data"):
        subprocess.run(
            ["git", "clone", "--depth", "1",
             "https://github.com/openai/parameter-golf", "pg-data"],
            capture_output=True,
            timeout=120,
        )

    # Download tokenizer if not present.
    # NOTE: use cwd= rather than os.chdir()/os.chdir("..") — the original
    # chdir dance left the process CWD pointing at pg-data/ if the subprocess
    # raised (e.g. TimeoutExpired), breaking every later relative path.
    if not os.path.exists(TOKENIZER_PATH):
        subprocess.run(
            [sys.executable, "data/cached_challenge_fineweb.py",
             "--variant", "sp1024", "--train-shards", "1"],
            capture_output=True,
            timeout=180,
            cwd="pg-data",
        )

    return os.path.exists(TOKENIZER_PATH)


@spaces.GPU(duration=180)  # 3 minutes (fits remaining quota)
def train_model_gpu():
    """GPU training after data is ready.

    Generator: yields the cumulative log text after each step/output line so
    the Gradio Textbox updates live.
    """
    log = []

    def log_step(msg):
        log.append(msg)
        return "\n".join(log)

    try:
        # Run training with GPU-optimized config passed via environment.
        env = os.environ.copy()
        env.update({
            "NUM_LAYERS": "3",         # Small but reasonable
            "MODEL_DIM": "96",         # Fits in GPU memory
            "MAX_STEPS": "15",         # Fast run for remaining quota
            "MICRO_BATCH_SIZE": "4",   # Reasonable batch
            "SEQ_LENGTH": "256",       # Moderate sequences
            "TOKENIZER_PATH": TOKENIZER_PATH,
            "DATA_PATH": DATA_PATH,
        })

        yield log_step("🚀 GPU activated - starting training")
        yield log_step("Config: 3 layers, 96-dim, 15 steps (fits remaining quota)")
        yield log_step("=" * 60)

        # Line-buffered text pipe; stderr merged into stdout so everything
        # streams through one log.
        process = subprocess.Popen(
            [sys.executable, "train_gpt_kernel.py"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,
            env=env,
        )

        # Stream trainer output into the UI as it arrives.
        for line in iter(process.stdout.readline, ''):
            if line:
                yield log_step(line.rstrip())

        process.wait()
        yield log_step("=" * 60)
        if process.returncode == 0:
            yield log_step("✅ Training complete!")
        else:
            yield log_step(f"⚠️ Exit code {process.returncode}")

    except Exception as e:
        yield log_step(f"❌ Error: {str(e)}")


def train_model():
    """Main entry point - prepares data then runs GPU training.

    Generator wired to the button click: installs dependencies and downloads
    data on CPU first, then delegates to the @spaces.GPU-decorated generator
    so GPU quota is only consumed for the actual training.
    """
    log = []

    def log_step(msg):
        log.append(msg)
        return "\n".join(log)

    try:
        yield log_step("🔄 Installing dependencies...")
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "-q",
             "torch", "numpy", "tiktoken", "sentencepiece", "tqdm", "requests"],
            timeout=300,
        )
        yield log_step("✅ Dependencies ready")

        yield log_step("🔄 Preparing data (outside GPU to save time)...")
        data_ready = download_data()
        if not data_ready:
            yield log_step("❌ Data download failed")
            return
        yield log_step("✅ Data ready - activating GPU...")

        # Now run the GPU part, re-emitting its cumulative log updates.
        yield from train_model_gpu()

    except Exception as e:
        yield log_step(f"❌ Error: {str(e)}")


with gr.Blocks(title="μ⁸ Kernel") as demo:
    gr.Markdown("""
    # μ⁸ Kernel Training - Parameter Golf

    Formally verified LM architecture (464 Lean 4 proofs):
    - **C(r) = 2r/(1+r²)** coherence activation
    - **δ_S = 1+√2 ≈ 2.414** silver MLP expansion
    - **μ⁸ = 1** eight-cycle attention

    **Zero GPU enabled** - downloads data on CPU, then activates GPU for
    training (3L/96d/15 steps, 3 min).
    """)

    btn = gr.Button("🚀 Start Training", variant="primary", size="lg")
    out = gr.Textbox(label="Training Log", lines=25, max_lines=40, autoscroll=True)
    btn.click(fn=train_model, outputs=out)

demo.queue()
demo.launch()