Train

Sleeping

App Files Files Community

Ksjsjjdj committed on Nov 30, 2025

Commit

572dfb0

verified ·

1 Parent(s): 887625f

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -16

app.py CHANGED Viewed

@@ -28,7 +28,10 @@ except:
 transformers.logging.set_verbosity_error()
 datasets.logging.set_verbosity_error()
-logging.basicConfig(level=logging.ERROR)
 if torch.cuda.is_available():
     torch.backends.cuda.matmul.allow_tf32 = True
@@ -211,15 +214,20 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, lora_r, l
         output_dir = f"checkpoints/{job_id}"
         training_args = TrainingArguments(
             output_dir=output_dir,
             per_device_train_batch_size=int(batch_size),
             gradient_accumulation_steps=4,
-            max_steps=int(train_steps),
             learning_rate=learning_rate,
             optim="adamw_torch",
             logging_steps=5,
-            save_strategy="no",
             report_to="none",
             fp16=True if torch.cuda.is_available() else False,
             lr_scheduler_type="cosine" if reasoning_mode else "linear",
@@ -236,7 +244,7 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, lora_r, l
             callbacks=[CustomTrainerCallback(job_id)]
         )
-        job.set_progress(0.4, "Training: Matrix adaptation started...")
         trainer.train()
         job.set_progress(0.85, "Saving: Serializing adapters...")
@@ -296,7 +304,7 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, lora_r, l
 def start_training_wrapper(hf_token, model_name, new_repo_name, lora_r, lora_alpha, lora_dropout,
                            train_steps, learning_rate, batch_size, datasets_text,
-                           reasoning_mode, c_conf, c_tok, c_gen, request: gr.Request):
     if not hf_token or not model_name:
         return "ERROR: Missing Credentials", gr.update(visible=False), gr.update(visible=False)
@@ -311,14 +319,8 @@ def start_training_wrapper(hf_token, model_name, new_repo_name, lora_r, lora_alp
     )
     thread.daemon = True
     thread.start()
-    try:
-        base_url = str(request.request.url).split('?')[0]
-        share_url = f"{base_url}?job_id={new_job.id}"
-    except:
-        share_url = f"Job ID: {new_job.id}"
-    return new_job.id, gr.update(visible=True, value=f"SESSION ID: {new_job.id}"), gr.update(visible=True, value=share_url)
 def get_job_update(job_id):
     if job_id not in JOBS:
@@ -514,7 +516,7 @@ with gr.Blocks(title="Nucleus Enterprise") as demo:
                 with gr.Row():
                     with gr.Column(scale=2):
                         with gr.Row():
-                            hf_token = gr.Textbox(label="HUGGIGFACE KEY", type="password", value=os.getenv("HF_TOKEN", ""))
                             model_name = gr.Textbox(label="BASE MODEL ID", placeholder="Qwen/Qwen2.5-0.5B")
                         repo_name = gr.Textbox(label="TARGET REPOSITORY", value="nucleus-build-v1")
@@ -544,7 +546,8 @@ with gr.Blocks(title="Nucleus Enterprise") as demo:
                 job_info_area = gr.Group(visible=False)
                 with job_info_area:
                     new_job_id_display = gr.HTML()
-                    share_link_display = gr.Textbox(label="DIRECT MONITOR UPLINK", interactive=False)
             with gr.TabItem("TELEMETRY", id="monitor_tab"):
                 with gr.Row():
@@ -576,10 +579,15 @@ with gr.Blocks(title="Nucleus Enterprise") as demo:
     launch_btn.click(
         start_training_wrapper,
         inputs=[hf_token, model_name, repo_name, lora_r, lora_a, lora_d, train_steps, lr, batch, datasets, reasoning_toggle, conf_json, tok_json, gen_json],
-        outputs=[new_job_id_display, job_info_area, share_link_display]
     ).then(
         fn=lambda id: f"<div class='session-box'>{id}</div>",
-        inputs=[new_job_id_display],
         outputs=[new_job_id_display]
     )

 transformers.logging.set_verbosity_error()
 datasets.logging.set_verbosity_error()
+logging.getLogger("transformers").setLevel(logging.CRITICAL)
+logging.getLogger("datasets").setLevel(logging.CRITICAL)
+logging.getLogger("torch").setLevel(logging.CRITICAL)
+logging.basicConfig(level=logging.CRITICAL, stream=sys.stderr)
 if torch.cuda.is_available():
     torch.backends.cuda.matmul.allow_tf32 = True
         output_dir = f"checkpoints/{job_id}"
+        total_steps = int(train_steps)
+        save_interval = max(10, int(total_steps * 0.2))
         training_args = TrainingArguments(
             output_dir=output_dir,
             per_device_train_batch_size=int(batch_size),
             gradient_accumulation_steps=4,
+            max_steps=total_steps,
             learning_rate=learning_rate,
             optim="adamw_torch",
             logging_steps=5,
+            save_strategy="steps",
+            save_steps=save_interval,
+            save_total_limit=2,
             report_to="none",
             fp16=True if torch.cuda.is_available() else False,
             lr_scheduler_type="cosine" if reasoning_mode else "linear",
             callbacks=[CustomTrainerCallback(job_id)]
         )
+        job.set_progress(0.4, f"Training: Matrix adaptation started (Checkpointing every {save_interval} steps)...")
         trainer.train()
         job.set_progress(0.85, "Saving: Serializing adapters...")
 def start_training_wrapper(hf_token, model_name, new_repo_name, lora_r, lora_alpha, lora_dropout,
                            train_steps, learning_rate, batch_size, datasets_text,
+                           reasoning_mode, c_conf, c_tok, c_gen):
     if not hf_token or not model_name:
         return "ERROR: Missing Credentials", gr.update(visible=False), gr.update(visible=False)
     )
     thread.daemon = True
     thread.start()
+    return new_job.id, gr.update(visible=True, value=f"SESSION ID: {new_job.id}"), gr.update(visible=True)
 def get_job_update(job_id):
     if job_id not in JOBS:
                 with gr.Row():
                     with gr.Column(scale=2):
                         with gr.Row():
+                            hf_token = gr.Textbox(label="HUGGINGFACE KEY", type="password", value=os.getenv("HF_TOKEN", ""))
                             model_name = gr.Textbox(label="BASE MODEL ID", placeholder="Qwen/Qwen2.5-0.5B")
                         repo_name = gr.Textbox(label="TARGET REPOSITORY", value="nucleus-build-v1")
                 job_info_area = gr.Group(visible=False)
                 with job_info_area:
                     new_job_id_display = gr.HTML()
+                    share_link_display = gr.Textbox(label="DIRECT MONITOR UPLINK", interactive=True)
+                    hidden_job_id = gr.Textbox(visible=False)
             with gr.TabItem("TELEMETRY", id="monitor_tab"):
                 with gr.Row():
     launch_btn.click(
         start_training_wrapper,
         inputs=[hf_token, model_name, repo_name, lora_r, lora_a, lora_d, train_steps, lr, batch, datasets, reasoning_toggle, conf_json, tok_json, gen_json],
+        outputs=[hidden_job_id, new_job_id_display, job_info_area]
+    ).then(
+        fn=None,
+        inputs=[hidden_job_id],
+        outputs=[share_link_display],
+        js="(id) => { return window.location.protocol + '//' + window.location.host + window.location.pathname + '?job_id=' + id; }"
     ).then(
         fn=lambda id: f"<div class='session-box'>{id}</div>",
+        inputs=[hidden_job_id],
         outputs=[new_job_id_display]
     )