File size: 3,734 Bytes
bfc24c4
 
 
 
 
6dc86df
 
 
 
 
d03c67a
bfc24c4
 
95b6e6d
d03c67a
 
 
95b6e6d
 
 
 
 
 
 
 
bfc24c4
6dc86df
d03c67a
6dc86df
f779211
 
95b6e6d
 
bfc24c4
 
95b6e6d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bfc24c4
95b6e6d
 
d03c67a
 
bfc24c4
95b6e6d
bfc24c4
95b6e6d
bfc24c4
 
95b6e6d
6dc86df
 
 
 
 
 
d03c67a
6dc86df
d03c67a
 
 
bfc24c4
 
 
 
dd4f5ac
3e8f1b5
ef82471
bfc24c4
 
 
6dc86df
 
bfc24c4
3f15b4c
 
bfc24c4
3f15b4c
bfc24c4
3f15b4c
 
6dc86df
f779211
bfc24c4
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import gradio as gr
import subprocess
import os
import threading
import sys
import time
from pathlib import Path

# Log file: the training subprocess's stdout/stderr is appended here and the
# UI reads it back to display progress.
TRAINING_LOG = "/tmp/training.log"
# PID file: holds the PID of the running training subprocess; it is removed
# once that process exits and is used to detect an already-running job.
TRAINING_PID_FILE = "/tmp/training.pid"

def _pid_is_running(pid):
    """Return True if a process with the given PID can be signalled by us."""
    if pid <= 0:
        # os.kill() interprets 0 and negative values as process groups, so a
        # corrupt PID file must never reach it.
        return False
    try:
        os.kill(pid, 0)  # Signal 0 only performs the existence/permission check
    except OSError:
        # ProcessLookupError: no such process. PermissionError: the PID belongs
        # to a process we do not own, so it cannot be the child we spawned.
        return False
    return True


def _remove_pid_file():
    """Delete the PID file, tolerating the case where it is already gone."""
    try:
        Path(TRAINING_PID_FILE).unlink()
    except FileNotFoundError:
        pass


def _append_to_log(message):
    """Append a message to the training log without truncating it."""
    with open(TRAINING_LOG, "a", encoding="utf-8") as log_file:
        log_file.write(message)


def start_training():
    """Start the D1337 CIPHER training script as a background subprocess.

    If the PID file points at a live process, nothing is started. Otherwise
    any stale PID file is removed, the log file is reset with a banner, and a
    non-daemon thread launches ``/app/train.py`` with the current interpreter,
    redirecting its stdout/stderr to the log file. The thread records the
    child's PID, waits for it to finish, and always removes the PID file
    afterwards.

    Returns:
        A status message for the UI: either a warning that training is
        already running or a confirmation that it was started.
    """
    pid_file = Path(TRAINING_PID_FILE)

    # Check if already running
    if pid_file.exists():
        try:
            pid = int(pid_file.read_text())
        except (OSError, ValueError):
            # Unreadable or garbage PID file: treat it as stale.
            pid = -1
        if _pid_is_running(pid):
            return "⚠️ Training already running! Check logs below..."
        # Process dead, remove PID file
        _remove_pid_file()

    # Clear log file
    Path(TRAINING_LOG).write_text("🔥 D1337 CIPHER TRAINING STARTING...\n\n", encoding="utf-8")

    # Use optimized training script for L40S x4
    runner_path = "/app/train.py"

    def run_training():
        """Launch the training process, track its PID, and wait for it."""
        try:
            # Run training script and redirect output to log
            with open(TRAINING_LOG, "a", encoding="utf-8") as log_file:
                process = subprocess.Popen(
                    [sys.executable, runner_path],
                    stdout=log_file,
                    stderr=subprocess.STDOUT,
                    cwd="/app",
                    env=os.environ.copy(),
                )

                # Write PID so later calls can detect the running job
                Path(TRAINING_PID_FILE).write_text(str(process.pid))

                # Wait for completion
                process.wait()
        except Exception as e:
            # Thread boundary: surface launch failures in the log the UI shows.
            # Path.write_text() has no append mode, so append via open().
            _append_to_log(f"\n❌ ERROR: {str(e)}\n")
        finally:
            # Whether training finished or failed, it is no longer running.
            _remove_pid_file()

    # Run in background thread; non-daemon so it is not killed mid-training
    # when the main thread finishes.
    thread = threading.Thread(target=run_training, daemon=False)
    thread.start()

    # NOTE(review): the message promises an automatic 3-second refresh; confirm
    # the UI actually wires a periodic refresh, otherwise logs only update on
    # manual refresh or page load.
    return "🔥 D1337 CIPHER TRAINING STARTED!\n\nLoading model (31B) - this may take 2-5 minutes...\n\nOutput will appear below automatically (refresh every 3 seconds)."

def get_training_log():
    """Return the current contents of the training log for display.

    The log is written concurrently by the training subprocess as raw bytes,
    so a read can land in the middle of a multi-byte character or encounter
    non-UTF-8 output. Undecodable bytes are replaced rather than raising, so
    the log stays visible while training is running.

    Returns:
        The log text if it has any non-whitespace content, a "waiting"
        placeholder if the log is missing or empty, or an error description
        if the file could not be read.
    """
    try:
        content = Path(TRAINING_LOG).read_text(encoding="utf-8", errors="replace")
    except FileNotFoundError:
        # No log yet: training has not been started.
        content = ""
    except Exception as e:
        # UI callback boundary: report the problem instead of raising.
        return f"Error reading log: {str(e)}"

    if content.strip():
        return content
    return "Waiting for training to start...\n\nClick 'START TRAINING' button above."

# UI: a single-page Gradio Blocks app with a start button, a log textbox and a
# manual refresh button. Component creation order below determines the layout.
with gr.Blocks(title="D1337 CIPHER Training") as demo:
    # Static header describing the training run.
    gr.Markdown("# 🔥 D1337 CIPHER C2 V.1 - TRAINING")
    gr.Markdown("**Hardware**: L40S x4 (192GB VRAM)")
    gr.Markdown("**Base**: Huihui-GLM-4.7-Flash-abliterated (31B)")
    gr.Markdown("**Dataset**: 92 samples | **Epochs**: 3 | **4-bit + LoRA**")
    
    with gr.Row():
        train_btn = gr.Button("🚀 START TRAINING", variant="primary")
    
    # Shared output area: shows either the status message from start_training
    # or the log text from get_training_log.
    output = gr.Textbox(label="Training Output", lines=20, value="Click 'START TRAINING' to begin...")
    
    refresh_btn = gr.Button("🔄 Refresh Logs", variant="secondary")
    
    # Both handlers take no inputs and write their returned string to `output`.
    train_btn.click(start_training, outputs=output)
    refresh_btn.click(get_training_log, outputs=output)
    
    # Auto-load on page refresh
    # NOTE(review): no periodic refresh is wired here, although the message
    # returned by start_training mentions a 3-second auto-refresh — confirm intent.
    demo.load(fn=get_training_log, outputs=output)
    
    gr.Markdown("**Expected time: ~5-10 minutes on L40S x4**")

# Starts the server at import time (no __main__ guard), listening on all
# interfaces at port 7860 with no public share link.
# NOTE(review): ssl_verify=False and show_error=True are passed through to
# Gradio as-is; confirm both are intended for this deployment.
demo.launch(server_name="0.0.0.0", server_port=7860, share=False, ssl_verify=False, show_error=True)