Ksjsjjdj committed
Commit 572dfb0 · verified · Parent: 887625f

Update app.py

Files changed (1)
  1. app.py +24 -16
app.py CHANGED
@@ -28,7 +28,10 @@ except:
 
 transformers.logging.set_verbosity_error()
 datasets.logging.set_verbosity_error()
-logging.basicConfig(level=logging.ERROR)
+logging.getLogger("transformers").setLevel(logging.CRITICAL)
+logging.getLogger("datasets").setLevel(logging.CRITICAL)
+logging.getLogger("torch").setLevel(logging.CRITICAL)
+logging.basicConfig(level=logging.CRITICAL, stream=sys.stderr)
 
 if torch.cuda.is_available():
     torch.backends.cuda.matmul.allow_tf32 = True
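
Note: logging.basicConfig only configures the root logger, and an explicitly set level on a named logger takes precedence over the root level for that library's whole subtree, which is why the commit sets both. A minimal sketch of the equivalent silencing, assuming app.py already imports sys:

import logging
import sys

# Root logger: only CRITICAL records are emitted, to stderr.
logging.basicConfig(level=logging.CRITICAL, stream=sys.stderr)

# Named library loggers: an explicit level here overrides the root level
# for everything under "transformers", "datasets" and "torch".
for name in ("transformers", "datasets", "torch"):
    logging.getLogger(name).setLevel(logging.CRITICAL)
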
@@ -211,15 +214,20 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, lora_r, l
 
     output_dir = f"checkpoints/{job_id}"
 
+    total_steps = int(train_steps)
+    save_interval = max(10, int(total_steps * 0.2))
+
     training_args = TrainingArguments(
         output_dir=output_dir,
         per_device_train_batch_size=int(batch_size),
         gradient_accumulation_steps=4,
-        max_steps=int(train_steps),
+        max_steps=total_steps,
         learning_rate=learning_rate,
         optim="adamw_torch",
         logging_steps=5,
-        save_strategy="no",
+        save_strategy="steps",
+        save_steps=save_interval,
+        save_total_limit=2,
         report_to="none",
         fp16=True if torch.cuda.is_available() else False,
         lr_scheduler_type="cosine" if reasoning_mode else "linear",
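
Note: the save_* arguments above enable periodic checkpointing: one checkpoint every max(10, 20% of total_steps) steps, keeping only the two most recent. A useful consequence is that an interrupted run can be resumed from the newest checkpoint-* directory in output_dir. A minimal sketch using the stock Trainer API (this resume helper is illustrative, not part of the commit):

import os

def train_with_resume(trainer, output_dir):
    # resume_from_checkpoint=True makes Trainer pick up the newest
    # "checkpoint-*" directory under output_dir; passing None starts fresh.
    has_ckpt = os.path.isdir(output_dir) and any(
        d.startswith("checkpoint-") for d in os.listdir(output_dir)
    )
    trainer.train(resume_from_checkpoint=True if has_ckpt else None)
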
@@ -236,7 +244,7 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, lora_r, l
         callbacks=[CustomTrainerCallback(job_id)]
     )
 
-    job.set_progress(0.4, "Training: Matrix adaptation started...")
+    job.set_progress(0.4, f"Training: Matrix adaptation started (Checkpointing every {save_interval} steps)...")
     trainer.train()
 
     job.set_progress(0.85, "Saving: Serializing adapters...")
@@ -296,7 +304,7 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, lora_r, l
 
 def start_training_wrapper(hf_token, model_name, new_repo_name, lora_r, lora_alpha, lora_dropout,
                            train_steps, learning_rate, batch_size, datasets_text,
-                           reasoning_mode, c_conf, c_tok, c_gen, request: gr.Request):
+                           reasoning_mode, c_conf, c_tok, c_gen):
 
     if not hf_token or not model_name:
         return "ERROR: Missing Credentials", gr.update(visible=False), gr.update(visible=False)
@@ -311,14 +319,8 @@ def start_training_wrapper(hf_token, model_name, new_repo_name, lora_r, lora_alp
     )
     thread.daemon = True
     thread.start()
-
-    try:
-        base_url = str(request.request.url).split('?')[0]
-        share_url = f"{base_url}?job_id={new_job.id}"
-    except:
-        share_url = f"Job ID: {new_job.id}"
 
-    return new_job.id, gr.update(visible=True, value=f"SESSION ID: {new_job.id}"), gr.update(visible=True, value=share_url)
+    return new_job.id, gr.update(visible=True, value=f"SESSION ID: {new_job.id}"), gr.update(visible=True)
 
 def get_job_update(job_id):
     if job_id not in JOBS:
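
Note: with the gr.Request parameter gone, the wrapper only registers the job, spawns the daemon thread, and returns the job id; link building moves to the browser (see the js step in the last hunk). The surrounding pattern looks roughly like this, with JOBS and Job assumed from elsewhere in app.py and the helper names here purely illustrative:

import threading
import uuid

JOBS = {}  # assumed global registry in app.py: job_id -> Job

class Job:
    # Hypothetical minimal job record matching what the diff implies.
    def __init__(self):
        self.id = uuid.uuid4().hex[:8]
        self.progress, self.message = 0.0, "queued"

    def set_progress(self, progress, message):
        self.progress, self.message = progress, message

def launch_background_job(target, *args):
    job = Job()
    JOBS[job.id] = job
    thread = threading.Thread(target=target, args=(job.id, *args))
    thread.daemon = True  # never block interpreter shutdown on a stuck run
    thread.start()
    return job.id  # the UI polls get_job_update(job_id) with this
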
@@ -514,7 +516,7 @@ with gr.Blocks(title="Nucleus Enterprise") as demo:
             with gr.Row():
                 with gr.Column(scale=2):
                     with gr.Row():
-                        hf_token = gr.Textbox(label="HUGGIGFACE KEY", type="password", value=os.getenv("HF_TOKEN", ""))
+                        hf_token = gr.Textbox(label="HUGGINGFACE KEY", type="password", value=os.getenv("HF_TOKEN", ""))
                         model_name = gr.Textbox(label="BASE MODEL ID", placeholder="Qwen/Qwen2.5-0.5B")
 
                     repo_name = gr.Textbox(label="TARGET REPOSITORY", value="nucleus-build-v1")
@@ -544,7 +546,8 @@ with gr.Blocks(title="Nucleus Enterprise") as demo:
                     job_info_area = gr.Group(visible=False)
                     with job_info_area:
                         new_job_id_display = gr.HTML()
-                        share_link_display = gr.Textbox(label="DIRECT MONITOR UPLINK", interactive=False)
+                        share_link_display = gr.Textbox(label="DIRECT MONITOR UPLINK", interactive=True)
+                        hidden_job_id = gr.Textbox(visible=False)
 
             with gr.TabItem("TELEMETRY", id="monitor_tab"):
                 with gr.Row():
@@ -576,10 +579,15 @@ with gr.Blocks(title="Nucleus Enterprise") as demo:
    launch_btn.click(
        start_training_wrapper,
        inputs=[hf_token, model_name, repo_name, lora_r, lora_a, lora_d, train_steps, lr, batch, datasets, reasoning_toggle, conf_json, tok_json, gen_json],
-        outputs=[new_job_id_display, job_info_area, share_link_display]
+        outputs=[hidden_job_id, new_job_id_display, job_info_area]
+    ).then(
+        fn=None,
+        inputs=[hidden_job_id],
+        outputs=[share_link_display],
+        js="(id) => { return window.location.protocol + '//' + window.location.host + window.location.pathname + '?job_id=' + id; }"
    ).then(
        fn=lambda id: f"<div class='session-box'>{id}</div>",
-        inputs=[new_job_id_display],
+        inputs=[hidden_job_id],
        outputs=[new_job_id_display]
    )
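
Note: the inserted .then() step replaces the deleted server-side URL reconstruction. With fn=None plus a js string (the Gradio 4 event API), the function runs entirely in the browser, so the share link reflects the host and path the user actually loaded, presumably more robust behind the Spaces proxy than rebuilding the URL from the server-side request (the old code's try/except fallback suggests it failed there). A standalone sketch of the same pattern, with illustrative component names:

import gradio as gr

with gr.Blocks() as demo:
    job_id_box = gr.Textbox(visible=False)  # hidden carrier for the job id
    link_box = gr.Textbox(label="Share link")
    btn = gr.Button("Make link")

    # Server step yields the id; the chained client-side step turns it into
    # a URL using the browser's own window.location.
    btn.click(fn=lambda: "demo-job-123", inputs=None, outputs=[job_id_box]).then(
        fn=None,
        inputs=[job_id_box],
        outputs=[link_box],
        js="(id) => window.location.origin + window.location.pathname + '?job_id=' + id",
    )

demo.launch()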
 
 
593