Ksjsjjdj committed on
Commit
33ff5b3
·
verified ·
1 Parent(s): b9cf174

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +501 -232
app.py CHANGED
@@ -1,18 +1,24 @@
1
  import os
2
- #os.system("pip install spaces-0.1.0-py3-none-any.whl")
3
- import torch
4
  import logging
5
  import multiprocessing
6
  import threading
7
- from itertools import chain
 
 
 
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
 
 
 
 
 
 
 
9
  from datasets import load_dataset, get_dataset_config_names, IterableDataset
10
  from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, TrainerCallback
11
  from peft import LoraConfig, get_peft_model, PeftModel
12
  from huggingface_hub import login, whoami, create_repo, upload_folder
13
- from IPython.display import clear_output
14
- import gradio as gr
15
- from dotenv import load_dotenv
16
  import spaces
17
 
18
  try:
@@ -20,263 +26,526 @@ try:
20
  except:
21
  pass
22
 
23
- @spaces.GPU
24
- class GradioProgressCallback(TrainerCallback):
25
- def __init__(self, progress_bar):
26
- self.progress_bar = progress_bar
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  def on_step_end(self, args, state, control, **kwargs):
29
- if state.global_step > 0:
30
- self.progress_bar(state.global_step / state.max_steps, desc=f"Paso {state.global_step}/{state.max_steps}")
 
 
 
 
 
 
31
  return control
32
 
33
- @spaces.GPU()
34
- def run_training(hf_token, model_name, new_repo_name, lora_r, lora_alpha, lora_dropout,
35
- train_steps, learning_rate, batch_size, datasets_text, progress=gr.Progress()):
36
-
37
- os.environ["WANDB_DISABLED"] = "true"
38
- os.environ["HF_TOKEN"] = hf_token
39
 
 
 
 
 
40
  try:
 
 
 
 
41
  login(token=hf_token)
42
- username = whoami()["name"]
43
- except Exception as e:
44
- return f"Error de autenticación: {str(e)}"
 
 
45
 
46
- # device = "cuda" if torch.cuda.is_available() else "cpu"
47
- num_workers = multiprocessing.cpu_count()
 
 
 
48
 
49
- if not hasattr(torch, 'xla'):
50
- class DummyXLA:
51
- def __getattr__(self, name):
52
- return lambda *args, **kwargs: None
53
- torch.xla = DummyXLA()
54
 
55
- logging.basicConfig(level=logging.INFO)
56
- logger = logging.getLogger(__name__)
 
 
 
 
57
 
58
- raw_items = datasets_text.replace('\n', ',').split(',')
59
- dataset_list = [item.strip() for item in raw_items if item.strip()]
 
 
 
 
 
 
 
 
60
 
61
- def get_sample_text(ds):
62
- try:
63
- sample = next(iter(ds))
64
- if isinstance(sample, dict):
65
- return sample.get("text", str(sample))
66
- return str(sample)
67
- except:
68
- return None
 
 
 
 
69
 
70
- def load_single(ds_name, cfg):
71
- try:
72
- ds = load_dataset(ds_name, cfg, streaming=True, trust_remote_code=True)
73
- if isinstance(ds, dict):
74
- ds = next(iter(ds.values()))
 
 
 
 
 
 
 
75
 
76
- if get_sample_text(ds):
77
- return ds
78
- return None
79
- except:
80
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
- def load_all_datasets():
83
- streams = []
84
- tasks = []
85
- progress(0.1, desc="Analizando configuraciones...")
86
 
87
- for ds_name in dataset_list:
88
- try:
89
- configs = get_dataset_config_names(ds_name)
90
- except:
91
- configs = []
92
-
93
- if not configs:
94
- tasks.append((ds_name, None))
95
- else:
96
- for c in configs:
97
- tasks.append((ds_name, c))
98
-
99
- progress(0.2, desc=f"Cargando {len(tasks)} fuentes...")
100
- with ThreadPoolExecutor(max_workers=num_workers) as executor:
101
- future_to_task = {executor.submit(load_single, d, c): (d, c) for d, c in tasks}
102
- for future in as_completed(future_to_task):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  try:
104
- ds = future.result()
105
- if ds:
106
- streams.append(ds)
 
107
  except:
108
  pass
109
- return streams
110
 
111
- loaded_streams = load_all_datasets()
112
- if not loaded_streams:
113
- return "Error: No se pudo cargar ningún dataset válido."
114
 
115
- def all_samples():
116
- return chain.from_iterable(loaded_streams)
 
 
 
 
 
 
117
 
118
- progress(0.3, desc="Cargando Tokenizer...")
119
- try:
120
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side="left", add_eos_token=True, add_bos_token=True)
121
- tokenizer.pad_token = tokenizer.eos_token
122
- except Exception as e:
123
- return f"Error cargando tokenizer: {str(e)}"
124
-
125
- def create_text_lines(sample):
126
- if isinstance(sample, dict):
127
- text = sample.get("text", "\n".join(str(v) for v in sample.values() if isinstance(v, str)))
128
- else:
129
- text = str(sample)
130
- return [line.strip() for line in text.splitlines() if line.strip()]
131
-
132
- def process_sample(sample):
133
- lines = create_text_lines(sample)
134
- results = []
135
- for line in lines:
136
- tok = tokenizer(line, truncation=False)
137
- tok["labels"] = tok["input_ids"].copy()
138
- results.append(tok)
139
- return results
140
-
141
- def processed_samples_generator():
142
- batch = []
143
- for sample in all_samples():
144
- batch.append(sample)
145
- if len(batch) >= 100:
146
- with ThreadPoolExecutor(max_workers=num_workers) as executor:
147
- futures = [executor.submit(process_sample, s) for s in batch]
148
- for future in as_completed(futures):
149
- try:
150
- res = future.result()
151
- for tok in res:
152
- yield tok
153
- except:
154
- pass
155
- batch.clear()
156
-
157
- if batch:
158
- with ThreadPoolExecutor(max_workers=num_workers) as executor:
159
- futures = [executor.submit(process_sample, s) for s in batch]
160
- for future in as_completed(futures):
161
- try:
162
- res = future.result()
163
- for tok in res:
164
- yield tok
165
- except:
166
- pass
167
-
168
- progress(0.4, desc="Cargando Modelo...")
169
- try:
170
- original_model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
171
  except Exception as e:
172
- return f"Error cargando modelo: {str(e)}"
173
-
174
- peft_config = LoraConfig(
175
- r=int(lora_r),
176
- lora_alpha=int(lora_alpha),
177
- target_modules=["q_proj", "k_proj", "v_proj", "dense"],
178
- bias="none",
179
- lora_dropout=lora_dropout,
180
- task_type="CAUSAL_LM"
181
- )
182
 
183
- peft_model = get_peft_model(original_model, peft_config)
184
- peft_model.config.use_cache = False
185
-
186
- output_dir = "/content/final-checkpoint"
187
- max_steps_val = int(train_steps)
188
- save_steps_val = max_steps_val // 2 if max_steps_val > 10 else 1
189
-
190
- training_args = TrainingArguments(
191
- output_dir=output_dir,
192
- per_device_train_batch_size=int(batch_size),
193
- gradient_accumulation_steps=1,
194
- max_steps=max_steps_val,
195
- learning_rate=learning_rate,
196
- optim="adamw_torch",
197
- logging_steps=5,
198
- save_strategy="steps",
199
- save_steps=save_steps_val,
200
- report_to="none"
201
- )
202
 
203
- processed_dataset = IterableDataset.from_generator(processed_samples_generator)
 
204
 
205
- trainer = Trainer(
206
- model=peft_model,
207
- train_dataset=processed_dataset,
208
- args=training_args,
209
- callbacks=[GradioProgressCallback(progress)]
210
  )
211
-
212
- progress(0.5, desc="Entrenando...")
213
- trainer.train()
 
 
 
 
 
 
 
 
 
 
 
214
 
215
- progress(0.8, desc="Guardando...")
216
- trainer.save_model(output_dir)
217
-
218
- progress(0.9, desc="Fusionando...")
219
- ft = PeftModel.from_pretrained(original_model, output_dir, torch_dtype=torch.float32, is_trainable=False).merge_and_unload()
220
 
221
- final_path = "/content/merged_model"
222
- ft.save_pretrained(final_path, safe_serialization=True)
223
- tokenizer.save_pretrained(final_path)
224
-
225
- progress(0.95, desc="Subiendo...")
226
- full_repo = f"{username}/{new_repo_name}"
227
- create_repo(full_repo, token=hf_token, exist_ok=True)
228
- upload_folder(folder_path=final_path, repo_id=full_repo, token=hf_token)
229
-
230
- return f"Completado: https://huggingface.co/{full_repo}"
231
-
232
- custom_css = """
233
- body {background-color: #0b0f19; color: #e0e6ed;}
234
- .gradio-container {max-width: 1200px !important; margin: 0 auto;}
235
- h1 {text-align: center; color: #00e5ff; font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; text-transform: uppercase; letter-spacing: 2px;}
236
- .primary-btn {background: linear-gradient(135deg, #00C9FF 0%, #92FE9D 100%); border: none; color: #000; font-weight: 800; font-size: 16px; padding: 12px; transition: transform 0.2s;}
237
- .primary-btn:hover {transform: scale(1.02); filter: brightness(1.1);}
238
- .input-box textarea {font-family: 'Consolas', 'Monaco', monospace; font-size: 13px; background-color: #1a202c; color: #a0aec0; border: 1px solid #2d3748;}
239
- .gr-box {border-radius: 8px; background-color: #1a202c; border: 1px solid #2d3748;}
240
- label {color: #00e5ff !important; font-weight: bold;}
241
- """
242
-
243
- with gr.Blocks(title="Entrenador LLM Ultimate") as demo:
244
- gr.HTML(f"<style>{custom_css}</style>")
245
- gr.HTML("""
246
- <div style="text-align: center; margin-bottom: 20px;">
247
- <h1 style="margin: 0;">⚡ INFINITE LLM TRAINER ⚡</h1>
248
- <p style="color: #a0aec0;">Entrenamiento Multi-Dataset con Fusión Automática y Subida a Hub</p>
249
  </div>
250
- """)
 
251
 
252
- with gr.Row():
253
- with gr.Column(scale=1):
254
- hf_token_input = gr.Textbox(label="HuggingFace Token", type="password", placeholder="hf_...", value=os.getenv("HF_TOKEN", ""))
255
- model_input = gr.Textbox(label="Modelo Base", value="", placeholder="Ej: Qwen/Qwen2.5-0.5B (Requerido)")
256
- repo_input = gr.Textbox(label="Nombre Nuevo Repo", value="multi-dataset-model-v1")
257
-
258
- with gr.Column(scale=1):
259
- with gr.Group():
260
- gr.Markdown("### 🎛️ Configuración Avanzada LoRA")
261
- r_input = gr.Slider(minimum=8, maximum=256, value=32, step=8, label="Rank (r)")
262
- alpha_input = gr.Slider(minimum=8, maximum=512, value=32, step=8, label="Alpha")
263
- dropout_input = gr.Slider(minimum=0.0, maximum=0.5, value=0.05, step=0.01, label="Dropout")
264
-
265
- with gr.Row():
266
- steps_input = gr.Number(label="Max Steps (Duración)", value=500, precision=0)
267
- lr_input = gr.Number(label="Learning Rate", value=2e-4)
268
- batch_input = gr.Number(label="Batch Size", value=1, precision=0)
269
-
270
- datasets_input = gr.Textbox(label="Fuentes de Datos (Datasets)", value="", placeholder="Pega aquí tus datasets separados por coma o salto de línea.\nEjemplo:\nSalesforce/fineweb_deduplicated\nbigcode/the-stack, v2", lines=12, elem_classes="input-box")
271
 
272
- train_btn = gr.Button("🚀 INICIAR ENTRENAMIENTO", elem_classes="primary-btn")
273
- status_output = gr.Textbox(label="Log del Sistema", interactive=False, lines=3)
274
-
275
- train_btn.click(
276
- fn=run_training,
277
- inputs=[hf_token_input, model_input, repo_input, r_input, alpha_input, dropout_input,
278
- steps_input, lr_input, batch_input, datasets_input],
279
- outputs=status_output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  )
281
 
282
- demo.launch(share=True, debug=True)
 
 
1
  import os
2
+ import json
 
3
  import logging
4
  import multiprocessing
5
  import threading
6
+ import uuid
7
+ import time
8
+ import sys
9
+ from datetime import datetime
10
  from concurrent.futures import ThreadPoolExecutor, as_completed
11
+ from itertools import chain
12
+
13
+ import torch
14
+ import gradio as gr
15
+ import transformers
16
+ import datasets
17
+ from dotenv import load_dotenv
18
  from datasets import load_dataset, get_dataset_config_names, IterableDataset
19
  from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, TrainerCallback
20
  from peft import LoraConfig, get_peft_model, PeftModel
21
  from huggingface_hub import login, whoami, create_repo, upload_folder
 
 
 
22
  import spaces
23
 
24
  try:
 
26
  except:
27
  pass
28
 
29
# Silence the verbose HF-stack loggers so only errors reach the console;
# user-facing progress goes through JobStatus.logs instead.
transformers.logging.set_verbosity_error()
datasets.logging.set_verbosity_error()
logging.basicConfig(level=logging.ERROR)

# Global in-memory registry of training sessions, keyed by JobStatus.id.
# NOTE(review): process-local and unbounded — entries are never evicted and
# vanish on restart; fine for a single-user Space, confirm for multi-user use.
JOBS = {}
34
+
35
class JobStatus:
    """In-memory record of one training session's lifecycle.

    Instances live in the module-level ``JOBS`` registry keyed by their
    short ``id``; the telemetry tab polls them for status, progress and
    timestamped log lines.
    """

    def __init__(self):
        # Short, user-visible session identifier (first 8 hex chars of a UUID4).
        self.id = uuid.uuid4().hex[:8]
        self.status = "IDLE"        # IDLE -> ACTIVE -> COMPLETED | FAILED
        self.progress = 0.0         # fraction in [0.0, 1.0]
        self.logs = []              # list of "[HH:MM:SS] message" strings
        self.result = None
        self.error = None
        self.created_at = datetime.now().strftime("%H:%M:%S")
        self.repo_url = None        # set once the merged model is uploaded

    def add_log(self, message):
        """Append *message* to the log, prefixed with the current HH:MM:SS."""
        stamp = datetime.now().strftime("%H:%M:%S")
        self.logs.append(f"[{stamp}] {message}")

    def set_progress(self, val, msg=None):
        """Set the progress fraction; when *msg* is given, log it as well."""
        self.progress = val
        if msg:
            self.add_log(msg)
+
55
class CustomTrainerCallback(TrainerCallback):
    """Trainer hook that mirrors training progress into the ``JOBS`` registry."""

    def __init__(self, job_id):
        self.job_id = job_id

    def on_step_end(self, args, state, control, **kwargs):
        """Map the step counter onto the 0.4-0.9 band of the job's progress.

        Every 5th step the latest loss from ``state.log_history`` is appended
        to the job log; missing jobs or zero ``max_steps`` are ignored.
        """
        job = JOBS.get(self.job_id)
        if job is not None and state.max_steps > 0:
            fraction = state.global_step / state.max_steps
            # Training occupies the 40%-90% slice of the overall pipeline.
            job.progress = 0.4 + fraction * 0.5
            if state.global_step % 5 == 0:
                if state.log_history:
                    loss = state.log_history[-1].get('loss', 'N/A')
                else:
                    loss = '...'
                job.add_log(f"Step {state.global_step}/{state.max_steps} | Loss: {loss}")
        return control
69
 
70
@spaces.GPU(duration=300)
def background_train_task(job_id, hf_token, model_name, new_repo_name, lora_r, lora_alpha, lora_dropout,
                          train_steps, learning_rate, batch_size, datasets_text,
                          reasoning_mode, c_conf, c_tok, c_gen):
    """End-to-end LoRA fine-tune pipeline run off the request thread.

    Steps: authenticate -> resolve streaming datasets -> tokenize lazily ->
    LoRA-train with a Trainer -> merge adapters into the base model ->
    optionally overwrite config JSONs -> upload to the Hub. All progress and
    errors are reported through the JobStatus stored in JOBS[job_id]; the
    function itself returns nothing.

    NOTE(review): this is launched via threading.Thread from
    start_training_wrapper while also decorated with @spaces.GPU — confirm
    ZeroGPU supports allocation from a non-main thread.
    """
    job = JOBS[job_id]
    job.status = "ACTIVE"
    job.add_log("Initializing Nucleus Core...")

    try:
        # Disable wandb and advisory warnings; expose the token to HF libs.
        os.environ["WANDB_DISABLED"] = "true"
        os.environ["HF_TOKEN"] = hf_token
        os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"

        login(token=hf_token)
        try:
            username = whoami()["name"]
            job.add_log(f"Authenticated: {username}")
        except:
            # Any whoami failure is surfaced as a single auth error.
            raise Exception("Authentication Failed")

        # Stub out torch.xla so libraries probing for TPU support don't crash.
        if not hasattr(torch, 'xla'):
            class DummyXLA:
                def __getattr__(self, name):
                    return lambda *args, **kwargs: None
            torch.xla = DummyXLA()

        # Dataset ids may be separated by commas and/or newlines.
        raw_items = datasets_text.replace('\n', ',').split(',')
        dataset_list = [item.strip() for item in raw_items if item.strip()]

        # "Reasoning mode" appends math/CoT datasets to the user's list.
        if reasoning_mode:
            job.add_log("Reasoning Core: ACTIVATED")
            job.add_log("Injecting Logic & CoT Datasets...")
            dataset_list.append("gsm8k")
            dataset_list.append("openai/gsm8k")
            dataset_list.append("microsoft/orca-math-word-problems-200k")

        def load_single(ds_name, cfg):
            """Open one dataset as a streaming split; return None on any failure.

            NOTE(review): cfg is always passed as None by the caller below, so
            this forces config "main" for every dataset — works for gsm8k but
            will fail for datasets without a "main" config; confirm intent.
            """
            try:
                ds = load_dataset(ds_name, cfg if cfg else "main", split="train", streaming=True, trust_remote_code=False)
                try:
                    # Pull one sample to prove the stream actually yields data.
                    next(iter(ds))
                    return ds
                except:
                    return None
            except:
                return None

        streams = []
        job.set_progress(0.1, "Analyzing Vector Streams...")

        # Probe all datasets concurrently; failures are silently dropped.
        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = []
            for ds_name in dataset_list:
                futures.append(executor.submit(load_single, ds_name, None))

            for future in as_completed(futures):
                res = future.result()
                if res:
                    streams.append(res)

        if not streams:
            raise Exception("Data Stream Failure: No valid inputs")

        job.set_progress(0.2, f"Stream Locked: {len(streams)} Sources")

        job.add_log("Tokenizing Input Stream...")
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=False, padding_side="left", add_eos_token=True, add_bos_token=True)
        tokenizer.pad_token = tokenizer.eos_token

        def process_stream_generator():
            """Yield tokenized causal-LM samples (labels == input_ids) lazily.

            Samples are buffered in groups of 50 before tokenization; Q/A
            records are flattened into "Question: ...\\nAnswer: ..." text.
            """
            iterator = chain.from_iterable(streams)
            batch_buffer = []

            for item in iterator:
                try:
                    text = ""
                    if "question" in item and "answer" in item:
                        text = f"Question: {item['question']}\nAnswer: {item['answer']}"
                    elif "text" in item:
                        text = item["text"]
                    else:
                        # Fallback: stringify unknown schemas rather than skip.
                        text = str(item)

                    batch_buffer.append(text)

                    if len(batch_buffer) >= 50:
                        for txt in batch_buffer:
                            tokens = tokenizer(txt, truncation=True, max_length=1024)
                            tokens["labels"] = tokens["input_ids"].copy()
                            yield tokens
                        batch_buffer = []
                except:
                    # Bad records are dropped; NOTE(review): this also hides
                    # tokenizer errors — consider logging a count.
                    continue

            # Flush whatever is left after the stream ends.
            for txt in batch_buffer:
                tokens = tokenizer(txt, truncation=True, max_length=1024)
                tokens["labels"] = tokens["input_ids"].copy()
                yield tokens

        job.set_progress(0.3, "Loading Neural Weights...")
        original_model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=False, device_map="auto")

        # Broad module list so LoRA attaches across common architectures
        # (phi-style "dense"/"fc*" plus llama-style projections).
        target_mods = ["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2", "o_proj"]
        if reasoning_mode:
            target_mods.extend(["gate_proj", "up_proj", "down_proj"])

        peft_config = LoraConfig(
            # Reasoning mode doubles the adapter rank for extra capacity.
            r=int(lora_r) * 2 if reasoning_mode else int(lora_r),
            lora_alpha=int(lora_alpha),
            target_modules=target_mods,
            bias="none",
            lora_dropout=lora_dropout,
            task_type="CAUSAL_LM"
        )

        peft_model = get_peft_model(original_model, peft_config)
        # KV-cache is incompatible with gradient training.
        peft_model.config.use_cache = False

        output_dir = f"checkpoints/{job_id}"

        training_args = TrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=int(batch_size),
            gradient_accumulation_steps=4,
            max_steps=int(train_steps),
            learning_rate=learning_rate,
            optim="adamw_torch",
            logging_steps=5,               # feeds the loss readout in the callback
            save_strategy="no",            # only the final save_model call persists
            report_to="none",
            fp16=True if torch.cuda.is_available() else False,
            lr_scheduler_type="cosine" if reasoning_mode else "linear",
            disable_tqdm=True
        )

        dataset_iterable = IterableDataset.from_generator(process_stream_generator)

        trainer = Trainer(
            model=peft_model,
            train_dataset=dataset_iterable,
            args=training_args,
            callbacks=[CustomTrainerCallback(job_id)]
        )

        job.set_progress(0.4, "Executing Neural Plasticity Phase...")
        trainer.train()

        job.set_progress(0.85, "Serializing Tensor Adapters...")
        trainer.save_model(output_dir)

        job.set_progress(0.9, "Fusing Tensor Layers...")
        # Free the training copies before reloading the base for the merge.
        del peft_model
        del original_model
        torch.cuda.empty_cache()

        base_reload = AutoModelForCausalLM.from_pretrained(
            model_name,
            return_dict=True,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            trust_remote_code=False,
            device_map="auto"
        )

        # Re-attach the saved adapter and bake it into the base weights.
        model_to_merge = PeftModel.from_pretrained(base_reload, output_dir)
        final_model = model_to_merge.merge_and_unload()

        final_path = f"merged/{job_id}"
        final_model.save_pretrained(final_path, safe_serialization=True)
        tokenizer.save_pretrained(final_path)

        def inject_json(content, fname):
            """Overwrite *fname* in the export dir with user-supplied JSON.

            Silently ignored when the content is empty or not valid JSON.
            """
            if content and content.strip():
                try:
                    data = json.loads(content)
                    with open(os.path.join(final_path, fname), 'w') as f:
                        json.dump(data, f, indent=2)
                    job.add_log(f"Config Injection: {fname}")
                except:
                    pass

        inject_json(c_conf, "config.json")
        inject_json(c_tok, "tokenizer_config.json")
        inject_json(c_gen, "generation_config.json")

        job.set_progress(0.95, "Uploading Artifacts to Hub...")
        full_repo = f"{username}/{new_repo_name}"
        create_repo(full_repo, token=hf_token, exist_ok=True)
        upload_folder(folder_path=final_path, repo_id=full_repo, token=hf_token)

        job.repo_url = f"https://huggingface.co/{full_repo}"
        job.status = "COMPLETED"
        job.set_progress(1.0, "Operation Successful")

    except Exception as e:
        # Any failure anywhere in the pipeline marks the job FAILED;
        # the telemetry tab shows the message via the job log.
        job.status = "FAILED"
        job.error = str(e)
        job.add_log(f"CRITICAL FAILURE: {str(e)}")
268
+
269
def start_training_wrapper(hf_token, model_name, new_repo_name, lora_r, lora_alpha, lora_dropout,
                           train_steps, learning_rate, batch_size, datasets_text,
                           reasoning_mode, c_conf, c_tok, c_gen):
    """Register a new JobStatus and launch the training pipeline off-thread.

    Returns ``(job_id, gr.update)`` for the session-id display. When the
    token or base model id is missing, returns the sentinel string
    ``"MISSING_CREDENTIALS"`` and keeps the display hidden.
    """
    # Guard clause: both credentials are mandatory before anything starts.
    if not (hf_token and model_name):
        return "MISSING_CREDENTIALS", gr.update(visible=False)

    job = JobStatus()
    JOBS[job.id] = job

    # Daemon thread so a hung training run never blocks process shutdown.
    worker = threading.Thread(
        target=background_train_task,
        args=(job.id, hf_token, model_name, new_repo_name, lora_r, lora_alpha, lora_dropout,
              train_steps, learning_rate, batch_size, datasets_text, reasoning_mode, c_conf, c_tok, c_gen),
        daemon=True,
    )
    worker.start()

    return job.id, gr.update(visible=True, value=f"SESSION ID: {job.id}")
288
+
289
+ def get_job_update(job_id):
290
+ if job_id not in JOBS:
291
+ return (
292
+ "<span style='color: #ef4444'>INVALID SESSION ID</span>",
293
+ "--:--",
294
+ "0%",
295
+ "",
296
+ gr.update(visible=False)
297
+ )
298
 
299
+ job = JOBS[job_id]
 
 
 
 
300
 
301
+ log_html = "<br>".join([f"<div class='log-line'>{l}</div>" for l in job.logs[-50:]])
302
+
303
+ progress_html = f"""
304
+ <div class="p-bar-wrapper">
305
+ <div class="p-bar-fill" style="width: {job.progress * 100}%"></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
  </div>
307
+ <div class="p-text">{int(job.progress * 100)}% COMPLETE</div>
308
+ """
309
 
310
+ status_map = {
311
+ "IDLE": "#94a3b8",
312
+ "ACTIVE": "#3b82f6",
313
+ "COMPLETED": "#10b981",
314
+ "FAILED": "#ef4444"
315
+ }
316
+
317
+ status_html = f"<span style='color: {status_map.get(job.status, '#fff')}; font-weight: 900; letter-spacing: 1px;'>{job.status}</span>"
 
 
 
 
 
 
 
 
 
 
 
318
 
319
+ result_comp = gr.update(visible=False)
320
+ if job.status == "COMPLETED" and job.repo_url:
321
+ result_comp = gr.update(visible=True, value=f"ACCESS MODEL ARTIFACT: {job.repo_url}")
322
+
323
+ return status_html, job.created_at, progress_html, log_html, result_comp
324
+
325
+ css = """
326
+ @import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@300;500;700&family=JetBrains+Mono:wght@400;700&display=swap');
327
+
328
+ :root {
329
+ --bg-dark: #0a0a0f;
330
+ --panel-dark: #13131f;
331
+ --primary: #6366f1;
332
+ --accent: #8b5cf6;
333
+ --text-main: #e2e8f0;
334
+ --text-dim: #64748b;
335
+ --border: #1e1e2e;
336
+ }
337
+
338
+ body {
339
+ background-color: var(--bg-dark) !important;
340
+ font-family: 'Space Grotesk', sans-serif !important;
341
+ }
342
+
343
+ .gradio-container {
344
+ background-color: transparent !important;
345
+ max-width: 1400px !important;
346
+ }
347
+
348
+ .header-container {
349
+ text-align: center;
350
+ padding: 3rem 0;
351
+ background: radial-gradient(circle at center, rgba(99, 102, 241, 0.05) 0%, transparent 60%);
352
+ margin-bottom: 2rem;
353
+ border-bottom: 1px solid var(--border);
354
+ }
355
+
356
+ h1 {
357
+ font-size: 3.5rem;
358
+ background: linear-gradient(135deg, #fff 0%, #94a3b8 100%);
359
+ -webkit-background-clip: text;
360
+ -webkit-text-fill-color: transparent;
361
+ text-transform: uppercase;
362
+ letter-spacing: -2px;
363
+ margin-bottom: 0.5rem;
364
+ }
365
+
366
+ .sub-header {
367
+ font-family: 'JetBrains Mono', monospace;
368
+ color: var(--primary);
369
+ font-size: 0.9rem;
370
+ letter-spacing: 2px;
371
+ text-transform: uppercase;
372
+ }
373
+
374
+ .gr-box, .gr-panel {
375
+ background: var(--panel-dark) !important;
376
+ border: 1px solid var(--border) !important;
377
+ border-radius: 4px !important;
378
+ }
379
+
380
+ .gr-input, .gr-textarea, .gr-number, .gr-dropdown {
381
+ background: #0d0d12 !important;
382
+ border: 1px solid var(--border) !important;
383
+ color: var(--text-main) !important;
384
+ font-family: 'JetBrains Mono', monospace;
385
+ font-size: 13px;
386
+ border-radius: 4px !important;
387
+ }
388
+
389
+ .gr-input:focus {
390
+ border-color: var(--primary) !important;
391
+ box-shadow: 0 0 0 1px var(--primary) !important;
392
+ }
393
+
394
+ .primary-btn {
395
+ background: var(--primary) !important;
396
+ border: none !important;
397
+ color: #fff !important;
398
+ font-family: 'JetBrains Mono', monospace !important;
399
+ text-transform: uppercase;
400
+ letter-spacing: 1px;
401
+ padding: 12px 24px !important;
402
+ border-radius: 2px !important;
403
+ transition: all 0.2s ease;
404
+ }
405
+
406
+ .primary-btn:hover {
407
+ background: var(--accent) !important;
408
+ box-shadow: 0 0 15px rgba(99, 102, 241, 0.3);
409
+ }
410
+
411
+ .p-bar-wrapper {
412
+ width: 100%;
413
+ height: 4px;
414
+ background: #1e1e2e;
415
+ margin-top: 15px;
416
+ }
417
+
418
+ .p-bar-fill {
419
+ height: 100%;
420
+ background: linear-gradient(90deg, var(--primary), var(--accent));
421
+ transition: width 0.4s cubic-bezier(0.4, 0, 0.2, 1);
422
+ }
423
+
424
+ .p-text {
425
+ font-family: 'JetBrains Mono', monospace;
426
+ font-size: 10px;
427
+ color: var(--primary);
428
+ text-align: right;
429
+ margin-top: 5px;
430
+ }
431
+
432
+ .log-line {
433
+ font-family: 'JetBrains Mono', monospace;
434
+ font-size: 11px;
435
+ color: var(--text-dim);
436
+ padding: 2px 0;
437
+ border-bottom: 1px solid rgba(255,255,255,0.03);
438
+ }
439
+
440
+ .session-box {
441
+ background: rgba(99, 102, 241, 0.1);
442
+ border: 1px solid var(--primary);
443
+ color: var(--primary);
444
+ font-family: 'JetBrains Mono', monospace;
445
+ padding: 1rem;
446
+ text-align: center;
447
+ font-size: 1.2rem;
448
+ margin: 1rem 0;
449
+ }
450
+
451
+ .label-wrap {
452
+ background: var(--panel-dark) !important;
453
+ border: 1px solid var(--border);
454
+ color: var(--text-main) !important;
455
+ }
456
+ """
457
+
458
+ with gr.Blocks(title="Nucleus Enterprise", css=css, theme=gr.themes.Base()) as demo:
459
+ with gr.Column():
460
+ gr.HTML("""
461
+ <div class="header-container">
462
+ <h1>Nucleus Enterprise</h1>
463
+ <div class="sub-header">Autonomous Neural Foundry // V.4.0</div>
464
+ </div>
465
+ """)
466
+
467
+ with gr.Tabs():
468
+ with gr.TabItem("DEPLOYMENT", id="deploy"):
469
+ with gr.Row():
470
+ with gr.Column(scale=2):
471
+ with gr.Row():
472
+ hf_token = gr.Textbox(label="HUGGINGFACE KEY", type="password", value=os.getenv("HF_TOKEN", ""))
473
+ model_name = gr.Textbox(label="BASE MODEL ID", placeholder="Qwen/Qwen2.5-0.5B")
474
+
475
+ repo_name = gr.Textbox(label="TARGET REPOSITORY", value="nucleus-build-v1")
476
+ datasets = gr.Textbox(label="DATA STREAMS (CSV)", placeholder="Salesforce/fineweb_deduplicated", lines=4)
477
+
478
+ reasoning_toggle = gr.Checkbox(label="ENABLE REASONING CORE (INJECTS LOGIC DATASETS)", value=False, elem_id="reasoning-switch")
479
+
480
+ with gr.Column(scale=1):
481
+ gr.Markdown("### HYPERPARAMETERS")
482
+ train_steps = gr.Number(label="STEPS", value=100)
483
+ lr = gr.Number(label="LEARNING RATE", value=2e-4)
484
+ batch = gr.Number(label="BATCH SIZE", value=1)
485
+
486
+ gr.Markdown("### LORA ADAPTERS")
487
+ lora_r = gr.Slider(8, 256, 32, step=8, label="RANK")
488
+ lora_a = gr.Slider(8, 512, 64, step=8, label="ALPHA")
489
+ lora_d = gr.Slider(0, 0.5, 0.05, label="DROPOUT")
490
+
491
+ with gr.Accordion("ADVANCED CONFIGURATION INJECTION", open=False):
492
+ with gr.Row():
493
+ conf_json = gr.Code(label="CONFIG.JSON", language="json")
494
+ tok_json = gr.Code(label="TOKENIZER_CONFIG.JSON", language="json")
495
+ gen_json = gr.Code(label="GENERATION_CONFIG.JSON", language="json")
496
+
497
+ launch_btn = gr.Button("INITIALIZE TRAINING SEQUENCE", elem_classes="primary-btn")
498
+
499
+ job_info_area = gr.Group(visible=False)
500
+ with job_info_area:
501
+ new_job_id_display = gr.HTML()
502
+
503
+ with gr.TabItem("TELEMETRY", id="monitor"):
504
+ with gr.Row():
505
+ input_job_id = gr.Textbox(label="SESSION ID", placeholder="ENTER 8-DIGIT ID")
506
+ refresh_btn = gr.Button("ESTABLISH UPLINK", elem_classes="primary-btn")
507
+
508
+ with gr.Row():
509
+ with gr.Column(scale=1):
510
+ status_display = gr.HTML(label="STATUS")
511
+ created_display = gr.Textbox(label="TIMESTAMP", interactive=False)
512
+ final_link = gr.Markdown(visible=False)
513
+
514
+ with gr.Column(scale=2):
515
+ progress_display = gr.HTML()
516
+ with gr.Accordion("SYSTEM LOGS", open=False):
517
+ logs_display = gr.HTML()
518
+
519
+ timer = gr.Timer(3000, active=False)
520
+
521
+ def activate_timer():
522
+ return gr.Timer(active=True)
523
+
524
+ launch_btn.click(
525
+ start_training_wrapper,
526
+ inputs=[hf_token, model_name, repo_name, lora_r, lora_a, lora_d, train_steps, lr, batch, datasets, reasoning_toggle, conf_json, tok_json, gen_json],
527
+ outputs=[new_job_id_display, job_info_area]
528
+ ).then(
529
+ fn=lambda id: f"<div class='session-box'>{id}</div>",
530
+ inputs=[new_job_id_display],
531
+ outputs=[new_job_id_display]
532
+ )
533
+
534
+ refresh_btn.click(
535
+ get_job_update,
536
+ inputs=[input_job_id],
537
+ outputs=[status_display, created_display, progress_display, logs_display, final_link]
538
+ ).then(
539
+ activate_timer,
540
+ None,
541
+ timer
542
+ )
543
+
544
+ timer.tick(
545
+ get_job_update,
546
+ inputs=[input_job_id],
547
+ outputs=[status_display, created_display, progress_display, logs_display, final_link]
548
  )
549
 
550
+ if __name__ == "__main__":
551
+ demo.launch()