Ksjsjjdj commited on
Commit
7701af4
·
verified ·
1 Parent(s): 572dfb0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -316
app.py CHANGED
@@ -41,8 +41,8 @@ JOBS = {}
41
 
42
  class JobStatus:
43
  def __init__(self):
44
- self.id = str(uuid.uuid4())[:8]
45
- self.status = "IDLE"
46
  self.progress = 0.0
47
  self.logs = []
48
  self.result = None
@@ -68,10 +68,16 @@ class CustomTrainerCallback(TrainerCallback):
68
  job = JOBS[self.job_id]
69
  if state.max_steps > 0:
70
  prog = state.global_step / state.max_steps
71
- job.progress = 0.4 + (prog * 0.5)
72
- if state.global_step % 5 == 0:
73
  loss = state.log_history[-1].get('loss', 'N/A') if state.log_history else '...'
74
- job.add_log(f"Step {state.global_step}/{state.max_steps} | Loss: {loss}")
 
 
 
 
 
 
75
  return control
76
 
77
  @spaces.GPU(duration=300)
@@ -80,24 +86,23 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, lora_r, l
80
  reasoning_mode, c_conf, c_tok, c_gen):
81
 
82
  job = JOBS[job_id]
83
- job.status = "ACTIVE"
84
- job.add_log("System: Initializing specialized environment...")
85
 
86
  try:
87
  if not hf_token.startswith("hf_"):
88
- raise ValueError("Invalid HuggingFace Token format")
89
 
90
  os.environ["WANDB_DISABLED"] = "true"
91
  os.environ["HF_TOKEN"] = hf_token
92
  os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
93
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
94
 
95
  login(token=hf_token)
96
  try:
97
  username = whoami()["name"]
98
- job.add_log(f"Auth Success: Connected as {username}")
99
  except:
100
- raise Exception("Authentication Failed: Check write permissions")
101
 
102
  if not hasattr(torch, 'xla'):
103
  class DummyXLA:
@@ -109,8 +114,8 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, lora_r, l
109
  dataset_list = [item.strip() for item in raw_items if item.strip()]
110
 
111
  if reasoning_mode:
112
- job.add_log("Mode: Reasoning Core Active (Math/Logic Injection)")
113
- dataset_list.extend(["gsm8k", "openai/gsm8k", "microsoft/orca-math-word-problems-200k"])
114
 
115
  def load_single(ds_name, cfg):
116
  try:
@@ -124,7 +129,7 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, lora_r, l
124
  return None
125
 
126
  streams = []
127
- job.set_progress(0.1, "Data: Establishing vector streams...")
128
 
129
  with ThreadPoolExecutor(max_workers=4) as executor:
130
  futures = []
@@ -137,54 +142,32 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, lora_r, l
137
  streams.append(res)
138
 
139
  if not streams:
140
- raise Exception("Data Error: No valid streams available. Check dataset names.")
141
 
142
- job.set_progress(0.2, f"Data: {len(streams)} streams locked and ready.")
143
 
144
- job.add_log("Tokenizer: Loading configuration...")
145
- try:
146
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side="left", add_eos_token=True, add_bos_token=True)
147
- if tokenizer.pad_token is None:
148
- tokenizer.pad_token = tokenizer.eos_token
149
- except Exception as e:
150
- raise Exception(f"Tokenizer Load Failed: {str(e)}")
151
 
152
  def process_stream_generator():
153
  iterator = chain.from_iterable(streams)
154
  batch_buffer = []
155
-
156
  for item in iterator:
157
  try:
158
- text = ""
159
- if "question" in item and "answer" in item:
160
- text = f"Question: {item['question']}\nAnswer: {item['answer']}"
161
- elif "text" in item:
162
- text = item["text"]
163
- elif "content" in item:
164
- text = item["content"]
165
- else:
166
- text = str(item)
167
-
168
- if len(text) < 10:
169
- continue
170
-
171
  batch_buffer.append(text)
172
-
173
- if len(batch_buffer) >= 50:
174
  for txt in batch_buffer:
175
- tokens = tokenizer(txt, truncation=True, max_length=2048)
176
  tokens["labels"] = tokens["input_ids"].copy()
177
  yield tokens
178
  batch_buffer = []
179
  except:
180
  continue
181
-
182
- for txt in batch_buffer:
183
- tokens = tokenizer(txt, truncation=True, max_length=2048)
184
- tokens["labels"] = tokens["input_ids"].copy()
185
- yield tokens
186
 
187
- job.set_progress(0.3, "Model: Loading base weights (4-bit/8-bit optimized)...")
188
 
189
  torch.cuda.empty_cache()
190
  gc.collect()
@@ -196,14 +179,10 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, lora_r, l
196
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
197
  )
198
 
199
- target_mods = ["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2", "o_proj"]
200
- if reasoning_mode:
201
- target_mods.extend(["gate_proj", "up_proj", "down_proj"])
202
-
203
  peft_config = LoraConfig(
204
- r=int(lora_r) * 2 if reasoning_mode else int(lora_r),
205
  lora_alpha=int(lora_alpha),
206
- target_modules=target_mods,
207
  bias="none",
208
  lora_dropout=lora_dropout,
209
  task_type="CAUSAL_LM"
@@ -214,25 +193,20 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, lora_r, l
214
 
215
  output_dir = f"checkpoints/{job_id}"
216
 
217
- total_steps = int(train_steps)
218
- save_interval = max(10, int(total_steps * 0.2))
219
-
220
  training_args = TrainingArguments(
221
  output_dir=output_dir,
222
  per_device_train_batch_size=int(batch_size),
223
  gradient_accumulation_steps=4,
224
- max_steps=total_steps,
225
  learning_rate=learning_rate,
226
  optim="adamw_torch",
227
- logging_steps=5,
228
  save_strategy="steps",
229
- save_steps=save_interval,
230
  save_total_limit=2,
231
  report_to="none",
232
  fp16=True if torch.cuda.is_available() else False,
233
- lr_scheduler_type="cosine" if reasoning_mode else "linear",
234
- disable_tqdm=True,
235
- dataloader_pin_memory=False
236
  )
237
 
238
  dataset_iterable = IterableDataset.from_generator(process_stream_generator)
@@ -244,23 +218,19 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, lora_r, l
244
  callbacks=[CustomTrainerCallback(job_id)]
245
  )
246
 
247
- job.set_progress(0.4, f"Training: Matrix adaptation started (Checkpointing every {save_interval} steps)...")
248
  trainer.train()
249
 
250
- job.set_progress(0.85, "Saving: Serializing adapters...")
251
- trainer.save_model(output_dir)
252
-
253
- job.set_progress(0.9, "Merging: Fusing weights and cleanup...")
254
  del peft_model
255
  del original_model
256
- del trainer
257
  torch.cuda.empty_cache()
258
  gc.collect()
259
 
260
  base_reload = AutoModelForCausalLM.from_pretrained(
261
  model_name,
262
  return_dict=True,
263
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
264
  trust_remote_code=True,
265
  device_map="auto"
266
  )
@@ -278,36 +248,34 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, lora_r, l
278
  data = json.loads(content)
279
  with open(os.path.join(final_path, fname), 'w') as f:
280
  json.dump(data, f, indent=2)
281
- job.add_log(f"Config: Injected {fname}")
282
  except:
283
- job.add_log(f"Config: Failed to inject {fname} (Invalid JSON)")
284
 
285
  inject_json(c_conf, "config.json")
286
  inject_json(c_tok, "tokenizer_config.json")
287
  inject_json(c_gen, "generation_config.json")
288
 
289
- job.set_progress(0.95, "Upload: Pushing artifacts to Hub...")
290
  full_repo = f"{username}/{new_repo_name}"
291
  create_repo(full_repo, token=hf_token, exist_ok=True)
292
  upload_folder(folder_path=final_path, repo_id=full_repo, token=hf_token)
293
 
294
  job.repo_url = f"https://huggingface.co/{full_repo}"
295
  job.status = "COMPLETED"
296
- job.set_progress(1.0, "System: Operation Finalized Successfully")
297
 
298
  except Exception as e:
299
  job.status = "FAILED"
300
  job.error = str(e)
301
  job.add_log(f"CRITICAL ERROR: {str(e)}")
302
  torch.cuda.empty_cache()
303
- gc.collect()
304
 
305
  def start_training_wrapper(hf_token, model_name, new_repo_name, lora_r, lora_alpha, lora_dropout,
306
  train_steps, learning_rate, batch_size, datasets_text,
307
  reasoning_mode, c_conf, c_tok, c_gen):
308
 
309
  if not hf_token or not model_name:
310
- return "ERROR: Missing Credentials", gr.update(visible=False), gr.update(visible=False)
311
 
312
  new_job = JobStatus()
313
  JOBS[new_job.id] = new_job
@@ -319,44 +287,25 @@ def start_training_wrapper(hf_token, model_name, new_repo_name, lora_r, lora_alp
319
  )
320
  thread.daemon = True
321
  thread.start()
322
-
323
- return new_job.id, gr.update(visible=True, value=f"SESSION ID: {new_job.id}"), gr.update(visible=True)
324
 
325
  def get_job_update(job_id):
 
 
 
326
  if job_id not in JOBS:
327
- return (
328
- "<span style='color: #ef4444'>INVALID SESSION ID</span>",
329
- "--:--",
330
- "0%",
331
- "",
332
- gr.update(visible=False)
333
- )
334
 
335
  job = JOBS[job_id]
336
 
337
- log_html = "<br>".join([f"<div class='log-line'>{l}</div>" for l in job.logs[-50:]])
338
-
339
- progress_html = f"""
340
- <div class="p-bar-wrapper">
341
- <div class="p-bar-fill" style="width: {job.progress * 100}%"></div>
342
- </div>
343
- <div class="p-text">{int(job.progress * 100)}% COMPLETE</div>
344
- """
345
-
346
- status_map = {
347
- "IDLE": "#94a3b8",
348
- "ACTIVE": "#3b82f6",
349
- "COMPLETED": "#10b981",
350
- "FAILED": "#ef4444"
351
- }
352
-
353
- status_html = f"<span style='color: {status_map.get(job.status, '#fff')}; font-weight: 900; letter-spacing: 1px;'>{job.status}</span>"
354
 
355
  result_comp = gr.update(visible=False)
356
  if job.status == "COMPLETED" and job.repo_url:
357
- result_comp = gr.update(visible=True, value=f"ACCESS MODEL ARTIFACT: {job.repo_url}")
358
 
359
- return status_html, job.created_at, progress_html, log_html, result_comp
360
 
361
  def load_from_url(request: gr.Request):
362
  try:
@@ -369,243 +318,79 @@ def load_from_url(request: gr.Request):
369
  return gr.update(selected="launch_tab"), ""
370
 
371
  css = """
372
- @import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@300;500;700&family=JetBrains+Mono:wght@400;700&display=swap');
373
-
374
- :root {
375
- --bg-dark: #0a0a0f;
376
- --panel-dark: #13131f;
377
- --primary: #6366f1;
378
- --accent: #8b5cf6;
379
- --text-main: #e2e8f0;
380
- --text-dim: #64748b;
381
- --border: #1e1e2e;
382
- }
383
-
384
- body {
385
- background-color: var(--bg-dark) !important;
386
- font-family: 'Space Grotesk', sans-serif !important;
387
- }
388
-
389
- .gradio-container {
390
- background-color: transparent !important;
391
- max-width: 1400px !important;
392
- }
393
-
394
- .header-container {
395
- text-align: center;
396
- padding: 3rem 0;
397
- background: radial-gradient(circle at center, rgba(99, 102, 241, 0.05) 0%, transparent 60%);
398
- margin-bottom: 2rem;
399
- border-bottom: 1px solid var(--border);
400
- }
401
-
402
- h1 {
403
- font-size: 3.5rem;
404
- background: linear-gradient(135deg, #fff 0%, #94a3b8 100%);
405
- -webkit-background-clip: text;
406
- -webkit-text-fill-color: transparent;
407
- text-transform: uppercase;
408
- letter-spacing: -2px;
409
- margin-bottom: 0.5rem;
410
- }
411
-
412
- .sub-header {
413
- font-family: 'JetBrains Mono', monospace;
414
- color: var(--primary);
415
- font-size: 0.9rem;
416
- letter-spacing: 2px;
417
- text-transform: uppercase;
418
- }
419
-
420
- .gr-box, .gr-panel {
421
- background: var(--panel-dark) !important;
422
- border: 1px solid var(--border) !important;
423
- border-radius: 4px !important;
424
- }
425
-
426
- .gr-input, .gr-textarea, .gr-number, .gr-dropdown {
427
- background: #0d0d12 !important;
428
- border: 1px solid var(--border) !important;
429
- color: var(--text-main) !important;
430
- font-family: 'JetBrains Mono', monospace;
431
- font-size: 13px;
432
- border-radius: 4px !important;
433
- }
434
-
435
- .gr-input:focus {
436
- border-color: var(--primary) !important;
437
- box-shadow: 0 0 0 1px var(--primary) !important;
438
- }
439
-
440
- .primary-btn {
441
- background: var(--primary) !important;
442
- border: none !important;
443
- color: #fff !important;
444
- font-family: 'JetBrains Mono', monospace !important;
445
- text-transform: uppercase;
446
- letter-spacing: 1px;
447
- padding: 12px 24px !important;
448
- border-radius: 2px !important;
449
- transition: all 0.2s ease;
450
- }
451
-
452
- .primary-btn:hover {
453
- background: var(--accent) !important;
454
- box-shadow: 0 0 15px rgba(99, 102, 241, 0.3);
455
- }
456
-
457
- .p-bar-wrapper {
458
- width: 100%;
459
- height: 4px;
460
- background: #1e1e2e;
461
- margin-top: 15px;
462
- }
463
-
464
- .p-bar-fill {
465
- height: 100%;
466
- background: linear-gradient(90deg, var(--primary), var(--accent));
467
- transition: width 0.4s cubic-bezier(0.4, 0, 0.2, 1);
468
- }
469
-
470
- .p-text {
471
- font-family: 'JetBrains Mono', monospace;
472
- font-size: 10px;
473
- color: var(--primary);
474
- text-align: right;
475
- margin-top: 5px;
476
- }
477
-
478
- .log-line {
479
- font-family: 'JetBrains Mono', monospace;
480
- font-size: 11px;
481
- color: var(--text-dim);
482
- padding: 2px 0;
483
- border-bottom: 1px solid rgba(255,255,255,0.03);
484
- }
485
-
486
- .session-box {
487
- background: rgba(99, 102, 241, 0.1);
488
- border: 1px solid var(--primary);
489
- color: var(--primary);
490
- font-family: 'JetBrains Mono', monospace;
491
- padding: 1rem;
492
- text-align: center;
493
- font-size: 1.2rem;
494
- margin: 1rem 0;
495
- }
496
-
497
- .label-wrap {
498
- background: var(--panel-dark) !important;
499
- border: 1px solid var(--border);
500
- color: var(--text-main) !important;
501
- }
502
  """
503
 
504
- with gr.Blocks(title="Nucleus Enterprise") as demo:
505
- gr.HTML(f"<style>{css}</style>")
506
  with gr.Column():
507
- gr.HTML("""
508
- <div class="header-container">
509
- <h1>Nucleus Enterprise</h1>
510
- <div class="sub-header">Autonomous Neural Foundry // V.4.0</div>
511
- </div>
512
- """)
513
 
514
  with gr.Tabs() as main_tabs:
515
- with gr.TabItem("DEPLOYMENT", id="launch_tab"):
516
  with gr.Row():
517
  with gr.Column(scale=2):
518
  with gr.Row():
519
- hf_token = gr.Textbox(label="HUGGINGFACE KEY", type="password", value=os.getenv("HF_TOKEN", ""))
520
- model_name = gr.Textbox(label="BASE MODEL ID", placeholder="Qwen/Qwen2.5-0.5B")
521
 
522
- repo_name = gr.Textbox(label="TARGET REPOSITORY", value="nucleus-build-v1")
523
- datasets = gr.Textbox(label="DATA STREAMS (CSV)", placeholder="Salesforce/fineweb_deduplicated", lines=4)
524
-
525
- reasoning_toggle = gr.Checkbox(label="ENABLE REASONING CORE (INJECTS LOGIC DATASETS)", value=False, elem_id="reasoning-switch")
526
 
527
  with gr.Column(scale=1):
528
- gr.Markdown("### HYPERPARAMETERS")
529
- train_steps = gr.Number(label="STEPS", value=100)
530
- lr = gr.Number(label="LEARNING RATE", value=2e-4)
531
- batch = gr.Number(label="BATCH SIZE", value=1)
532
-
533
- gr.Markdown("### LORA ADAPTERS")
534
- lora_r = gr.Slider(8, 256, 32, step=8, label="RANK")
535
- lora_a = gr.Slider(8, 512, 64, step=8, label="ALPHA")
536
- lora_d = gr.Slider(0, 0.5, 0.05, label="DROPOUT")
537
-
538
- with gr.Accordion("ADVANCED CONFIGURATION INJECTION", open=False):
539
- with gr.Row():
540
- conf_json = gr.Code(label="CONFIG.JSON", language="json")
541
- tok_json = gr.Code(label="TOKENIZER_CONFIG.JSON", language="json")
542
- gen_json = gr.Code(label="GENERATION_CONFIG.JSON", language="json")
543
-
544
- launch_btn = gr.Button("INITIALIZE TRAINING SEQUENCE", elem_classes="primary-btn")
545
-
546
- job_info_area = gr.Group(visible=False)
547
- with job_info_area:
548
- new_job_id_display = gr.HTML()
549
- share_link_display = gr.Textbox(label="DIRECT MONITOR UPLINK", interactive=True)
550
- hidden_job_id = gr.Textbox(visible=False)
551
 
552
- with gr.TabItem("TELEMETRY", id="monitor_tab"):
 
 
553
  with gr.Row():
554
- input_job_id = gr.Textbox(label="SESSION ID", placeholder="ENTER 8-DIGIT ID")
555
- refresh_btn = gr.Button("ESTABLISH UPLINK", elem_classes="primary-btn")
556
 
557
  with gr.Row():
558
- with gr.Column(scale=1):
559
- status_display = gr.HTML(label="STATUS")
560
- created_display = gr.Textbox(label="TIMESTAMP", interactive=False)
561
- final_link = gr.Markdown(visible=False)
562
-
563
- with gr.Column(scale=2):
564
- progress_display = gr.HTML()
565
- with gr.Accordion("SYSTEM LOGS", open=False):
566
- logs_display = gr.HTML()
567
-
568
- timer = gr.Timer(3000, active=False)
569
 
570
- def activate_timer():
571
- return gr.Timer(active=True)
572
 
573
- demo.load(
574
- load_from_url,
575
- None,
576
- [main_tabs, input_job_id]
577
- )
578
 
579
- launch_btn.click(
580
  start_training_wrapper,
581
- inputs=[hf_token, model_name, repo_name, lora_r, lora_a, lora_d, train_steps, lr, batch, datasets, reasoning_toggle, conf_json, tok_json, gen_json],
582
- outputs=[hidden_job_id, new_job_id_display, job_info_area]
583
  ).then(
584
- fn=None,
585
- inputs=[hidden_job_id],
586
- outputs=[share_link_display],
587
- js="(id) => { return window.location.protocol + '//' + window.location.host + window.location.pathname + '?job_id=' + id; }"
588
  ).then(
589
- fn=lambda id: f"<div class='session-box'>{id}</div>",
590
- inputs=[hidden_job_id],
591
- outputs=[new_job_id_display]
592
  )
593
 
594
- refresh_btn.click(
595
- get_job_update,
596
- inputs=[input_job_id],
597
- outputs=[status_display, created_display, progress_display, logs_display, final_link]
598
- ).then(
599
- activate_timer,
600
- None,
601
- timer
602
- )
603
-
604
- timer.tick(
605
- get_job_update,
606
- inputs=[input_job_id],
607
- outputs=[status_display, created_display, progress_display, logs_display, final_link]
608
- )
609
 
610
  if __name__ == "__main__":
611
  demo.launch(ssr_mode=False)
 
41
 
42
  class JobStatus:
43
  def __init__(self):
44
+ self.id = str(uuid.uuid4())
45
+ self.status = "INITIALIZING"
46
  self.progress = 0.0
47
  self.logs = []
48
  self.result = None
 
68
  job = JOBS[self.job_id]
69
  if state.max_steps > 0:
70
  prog = state.global_step / state.max_steps
71
+ job.progress = 0.1 + (prog * 0.8)
72
+ if state.global_step % 1 == 0:
73
  loss = state.log_history[-1].get('loss', 'N/A') if state.log_history else '...'
74
+ job.add_log(f"Training Step {state.global_step}/{state.max_steps} | Loss: {loss}")
75
+ return control
76
+
77
+ def on_save(self, args, state, control, **kwargs):
78
+ if self.job_id in JOBS:
79
+ job = JOBS[self.job_id]
80
+ job.add_log(f"System: Checkpoint saved at step {state.global_step}")
81
  return control
82
 
83
  @spaces.GPU(duration=300)
 
86
  reasoning_mode, c_conf, c_tok, c_gen):
87
 
88
  job = JOBS[job_id]
89
+ job.status = "RUNNING"
90
+ job.add_log("System: Starting Neural Forge Engine...")
91
 
92
  try:
93
  if not hf_token.startswith("hf_"):
94
+ raise ValueError("Invalid HuggingFace Token")
95
 
96
  os.environ["WANDB_DISABLED"] = "true"
97
  os.environ["HF_TOKEN"] = hf_token
98
  os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
 
99
 
100
  login(token=hf_token)
101
  try:
102
  username = whoami()["name"]
103
+ job.add_log(f"Auth: Verified as {username}")
104
  except:
105
+ raise Exception("Authentication Failed")
106
 
107
  if not hasattr(torch, 'xla'):
108
  class DummyXLA:
 
114
  dataset_list = [item.strip() for item in raw_items if item.strip()]
115
 
116
  if reasoning_mode:
117
+ job.add_log("Config: Reasoning Core Active")
118
+ dataset_list.extend(["gsm8k", "openai/gsm8k"])
119
 
120
  def load_single(ds_name, cfg):
121
  try:
 
129
  return None
130
 
131
  streams = []
132
+ job.set_progress(0.05, "Data: Connecting streams...")
133
 
134
  with ThreadPoolExecutor(max_workers=4) as executor:
135
  futures = []
 
142
  streams.append(res)
143
 
144
  if not streams:
145
+ raise Exception("No valid datasets found")
146
 
147
+ job.set_progress(0.1, f"Data: {len(streams)} sources active.")
148
 
149
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side="left", add_eos_token=True, add_bos_token=True)
150
+ if tokenizer.pad_token is None:
151
+ tokenizer.pad_token = tokenizer.eos_token
 
 
 
 
152
 
153
  def process_stream_generator():
154
  iterator = chain.from_iterable(streams)
155
  batch_buffer = []
 
156
  for item in iterator:
157
  try:
158
+ text = str(item.get("text", item.get("content", str(item))))
159
+ if len(text) < 10: continue
 
 
 
 
 
 
 
 
 
 
 
160
  batch_buffer.append(text)
161
+ if len(batch_buffer) >= 20:
 
162
  for txt in batch_buffer:
163
+ tokens = tokenizer(txt, truncation=True, max_length=1024)
164
  tokens["labels"] = tokens["input_ids"].copy()
165
  yield tokens
166
  batch_buffer = []
167
  except:
168
  continue
 
 
 
 
 
169
 
170
+ job.set_progress(0.15, "Model: Loading weights...")
171
 
172
  torch.cuda.empty_cache()
173
  gc.collect()
 
179
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
180
  )
181
 
 
 
 
 
182
  peft_config = LoraConfig(
183
+ r=int(lora_r),
184
  lora_alpha=int(lora_alpha),
185
+ target_modules=["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2", "o_proj"],
186
  bias="none",
187
  lora_dropout=lora_dropout,
188
  task_type="CAUSAL_LM"
 
193
 
194
  output_dir = f"checkpoints/{job_id}"
195
 
 
 
 
196
  training_args = TrainingArguments(
197
  output_dir=output_dir,
198
  per_device_train_batch_size=int(batch_size),
199
  gradient_accumulation_steps=4,
200
+ max_steps=int(train_steps),
201
  learning_rate=learning_rate,
202
  optim="adamw_torch",
203
+ logging_steps=1,
204
  save_strategy="steps",
205
+ save_steps=max(10, int(int(train_steps)/5)),
206
  save_total_limit=2,
207
  report_to="none",
208
  fp16=True if torch.cuda.is_available() else False,
209
+ disable_tqdm=True
 
 
210
  )
211
 
212
  dataset_iterable = IterableDataset.from_generator(process_stream_generator)
 
218
  callbacks=[CustomTrainerCallback(job_id)]
219
  )
220
 
221
+ job.set_progress(0.2, "Training: Phase initiated...")
222
  trainer.train()
223
 
224
+ job.set_progress(0.9, "Processing: Merging tensors...")
 
 
 
225
  del peft_model
226
  del original_model
 
227
  torch.cuda.empty_cache()
228
  gc.collect()
229
 
230
  base_reload = AutoModelForCausalLM.from_pretrained(
231
  model_name,
232
  return_dict=True,
233
+ torch_dtype=torch.float16,
234
  trust_remote_code=True,
235
  device_map="auto"
236
  )
 
248
  data = json.loads(content)
249
  with open(os.path.join(final_path, fname), 'w') as f:
250
  json.dump(data, f, indent=2)
 
251
  except:
252
+ pass
253
 
254
  inject_json(c_conf, "config.json")
255
  inject_json(c_tok, "tokenizer_config.json")
256
  inject_json(c_gen, "generation_config.json")
257
 
258
+ job.set_progress(0.95, "Network: Uploading to HuggingFace...")
259
  full_repo = f"{username}/{new_repo_name}"
260
  create_repo(full_repo, token=hf_token, exist_ok=True)
261
  upload_folder(folder_path=final_path, repo_id=full_repo, token=hf_token)
262
 
263
  job.repo_url = f"https://huggingface.co/{full_repo}"
264
  job.status = "COMPLETED"
265
+ job.set_progress(1.0, "System: Mission Accomplished")
266
 
267
  except Exception as e:
268
  job.status = "FAILED"
269
  job.error = str(e)
270
  job.add_log(f"CRITICAL ERROR: {str(e)}")
271
  torch.cuda.empty_cache()
 
272
 
273
  def start_training_wrapper(hf_token, model_name, new_repo_name, lora_r, lora_alpha, lora_dropout,
274
  train_steps, learning_rate, batch_size, datasets_text,
275
  reasoning_mode, c_conf, c_tok, c_gen):
276
 
277
  if not hf_token or not model_name:
278
+ return None, gr.update(selected="launch_tab")
279
 
280
  new_job = JobStatus()
281
  JOBS[new_job.id] = new_job
 
287
  )
288
  thread.daemon = True
289
  thread.start()
290
+
291
+ return new_job.id, gr.update(selected="monitor_tab")
292
 
293
  def get_job_update(job_id):
294
+ if not job_id:
295
+ return "Waiting for Job ID...", "", 0, "", gr.update(visible=False)
296
+
297
  if job_id not in JOBS:
298
+ return "Job ID not found in memory.", "", 0, "", gr.update(visible=False)
 
 
 
 
 
 
299
 
300
  job = JOBS[job_id]
301
 
302
+ log_text = "\n".join(job.logs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
 
304
  result_comp = gr.update(visible=False)
305
  if job.status == "COMPLETED" and job.repo_url:
306
+ result_comp = gr.update(visible=True, value=f" Model Published: {job.repo_url}")
307
 
308
+ return job.status, job.created_at, job.progress, log_text, result_comp
309
 
310
  def load_from_url(request: gr.Request):
311
  try:
 
318
  return gr.update(selected="launch_tab"), ""
319
 
320
  css = """
321
+ @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;600&family=Inter:wght@400;700&display=swap');
322
+ body { background: #0b0f19; color: #fff; font-family: 'Inter', sans-serif; }
323
+ .gradio-container { border: 1px solid #2d3748; border-radius: 8px; background: #111827; }
324
+ h1 { color: #6366f1; text-align: center; font-weight: 800; text-transform: uppercase; letter-spacing: 2px; }
325
+ .gr-button.primary { background: #4f46e5; border: none; color: white; font-weight: bold; }
326
+ .gr-button.primary:hover { background: #4338ca; }
327
+ .gr-input, .gr-textarea, .gr-box { background: #1f2937 !important; border-color: #374151 !important; color: #e5e7eb !important; }
328
+ .gr-code { background: #000 !important; color: #0f0 !important; font-family: 'IBM Plex Mono', monospace; border: 1px solid #333; }
329
+ #status-badge { font-weight: bold; padding: 4px 8px; border-radius: 4px; background: #374151; display: inline-block; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
  """
331
 
332
+ with gr.Blocks(title="Nucleus Enterprise", css=css, theme=gr.themes.Base()) as demo:
 
333
  with gr.Column():
334
+ gr.Markdown("# ⚛️ NUCLEUS ENTERPRISE")
335
+ gr.Markdown("Autonomous LLM Foundry | V5.0 Stable")
 
 
 
 
336
 
337
  with gr.Tabs() as main_tabs:
338
+ with gr.TabItem("🚀 LAUNCHPAD", id="launch_tab"):
339
  with gr.Row():
340
  with gr.Column(scale=2):
341
  with gr.Row():
342
+ hf_token = gr.Textbox(label="HuggingFace Token", type="password", value=os.getenv("HF_TOKEN", ""))
343
+ model_name = gr.Textbox(label="Base Model", value="Qwen/Qwen2.5-0.5B")
344
 
345
+ repo_name = gr.Textbox(label="Output Repository", value="nucleus-model-v1")
346
+ datasets = gr.Textbox(label="Datasets (CSV)", value="Salesforce/fineweb_deduplicated", lines=3)
347
+ reasoning = gr.Checkbox(label="Inject Reasoning (CoT/Math)", value=False)
 
348
 
349
  with gr.Column(scale=1):
350
+ steps = gr.Number(label="Steps", value=100)
351
+ lr = gr.Number(label="Learning Rate", value=2e-4)
352
+ batch = gr.Number(label="Batch Size", value=1)
353
+ r = gr.Slider(8, 256, 32, step=8, label="LoRA Rank")
354
+ a = gr.Slider(8, 512, 64, step=8, label="LoRA Alpha")
355
+ d = gr.Slider(0, 0.5, 0.05, label="Dropout")
356
+
357
+ with gr.Accordion("Advanced Config", open=False):
358
+ c_conf = gr.Code(label="config.json", language="json")
359
+ c_tok = gr.Code(label="tokenizer_config.json", language="json")
360
+ c_gen = gr.Code(label="generation_config.json", language="json")
 
 
 
 
 
 
 
 
 
 
 
 
361
 
362
+ btn_launch = gr.Button("INITIALIZE SYSTEM", variant="primary", size="lg")
363
+
364
+ with gr.TabItem("📡 TELEMETRY", id="monitor_tab"):
365
  with gr.Row():
366
+ job_id_input = gr.Textbox(label="Active Job ID", interactive=True)
367
+ btn_refresh = gr.Button("Refresh Stream")
368
 
369
  with gr.Row():
370
+ status_out = gr.Textbox(label="Status", interactive=False)
371
+ time_out = gr.Textbox(label="Start Time", interactive=False)
372
+ progress_out = gr.Slider(label="Progress", minimum=0, maximum=1)
373
+
374
+ final_link = gr.Markdown(visible=False)
375
+ logs_out = gr.Code(label="Real-time Kernel Logs", language="shell", interactive=False, lines=15)
 
 
 
 
 
376
 
377
+ timer = gr.Timer(2000, active=False)
 
378
 
379
+ demo.load(load_from_url, None, [main_tabs, job_id_input]).then(lambda: gr.Timer(active=True), None, timer)
 
 
 
 
380
 
381
+ btn_launch.click(
382
  start_training_wrapper,
383
+ inputs=[hf_token, model_name, repo_name, r, a, d, steps, lr, batch, datasets, reasoning, c_conf, c_tok, c_gen],
384
+ outputs=[job_id_input, main_tabs]
385
  ).then(
386
+ None, [job_id_input], None,
387
+ js="(id) => { if (id) { const url = new URL(window.location); url.searchParams.set('job_id', id); window.history.pushState({}, '', url); } return id; }"
 
 
388
  ).then(
389
+ lambda: gr.Timer(active=True), None, timer
 
 
390
  )
391
 
392
+ btn_refresh.click(get_job_update, job_id_input, [status_out, time_out, progress_out, logs_out, final_link])
393
+ timer.tick(get_job_update, job_id_input, [status_out, time_out, progress_out, logs_out, final_link])
 
 
 
 
 
 
 
 
 
 
 
 
 
394
 
395
  if __name__ == "__main__":
396
  demo.launch(ssr_mode=False)