Ksjsjjdj committed on
Commit
6d6218a
·
verified ·
1 Parent(s): 7701af4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -22
app.py CHANGED
@@ -60,8 +60,10 @@ class JobStatus:
60
  self.add_log(msg)
61
 
62
  class CustomTrainerCallback(TrainerCallback):
63
- def __init__(self, job_id):
64
  self.job_id = job_id
 
 
65
 
66
  def on_step_end(self, args, state, control, **kwargs):
67
  if self.job_id in JOBS:
@@ -77,7 +79,25 @@ class CustomTrainerCallback(TrainerCallback):
77
  def on_save(self, args, state, control, **kwargs):
78
  if self.job_id in JOBS:
79
  job = JOBS[self.job_id]
80
- job.add_log(f"System: Checkpoint saved at step {state.global_step}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  return control
82
 
83
  @spaces.GPU(duration=300)
@@ -100,7 +120,9 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, lora_r, l
100
  login(token=hf_token)
101
  try:
102
  username = whoami()["name"]
103
- job.add_log(f"Auth: Verified as {username}")
 
 
104
  except:
105
  raise Exception("Authentication Failed")
106
 
@@ -215,7 +237,7 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, lora_r, l
215
  model=peft_model,
216
  train_dataset=dataset_iterable,
217
  args=training_args,
218
- callbacks=[CustomTrainerCallback(job_id)]
219
  )
220
 
221
  job.set_progress(0.2, "Training: Phase initiated...")
@@ -255,12 +277,10 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, lora_r, l
255
  inject_json(c_tok, "tokenizer_config.json")
256
  inject_json(c_gen, "generation_config.json")
257
 
258
- job.set_progress(0.95, "Network: Uploading to HuggingFace...")
259
- full_repo = f"{username}/{new_repo_name}"
260
- create_repo(full_repo, token=hf_token, exist_ok=True)
261
- upload_folder(folder_path=final_path, repo_id=full_repo, token=hf_token)
262
 
263
- job.repo_url = f"https://huggingface.co/{full_repo}"
264
  job.status = "COMPLETED"
265
  job.set_progress(1.0, "System: Mission Accomplished")
266
 
@@ -317,19 +337,7 @@ def load_from_url(request: gr.Request):
317
  pass
318
  return gr.update(selected="launch_tab"), ""
319
 
320
- css = """
321
- @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;600&family=Inter:wght@400;700&display=swap');
322
- body { background: #0b0f19; color: #fff; font-family: 'Inter', sans-serif; }
323
- .gradio-container { border: 1px solid #2d3748; border-radius: 8px; background: #111827; }
324
- h1 { color: #6366f1; text-align: center; font-weight: 800; text-transform: uppercase; letter-spacing: 2px; }
325
- .gr-button.primary { background: #4f46e5; border: none; color: white; font-weight: bold; }
326
- .gr-button.primary:hover { background: #4338ca; }
327
- .gr-input, .gr-textarea, .gr-box { background: #1f2937 !important; border-color: #374151 !important; color: #e5e7eb !important; }
328
- .gr-code { background: #000 !important; color: #0f0 !important; font-family: 'IBM Plex Mono', monospace; border: 1px solid #333; }
329
- #status-badge { font-weight: bold; padding: 4px 8px; border-radius: 4px; background: #374151; display: inline-block; }
330
- """
331
-
332
- with gr.Blocks(title="Nucleus Enterprise", css=css, theme=gr.themes.Base()) as demo:
333
  with gr.Column():
334
  gr.Markdown("# ⚛️ NUCLEUS ENTERPRISE")
335
  gr.Markdown("Autonomous LLM Foundry | V5.0 Stable")
 
60
  self.add_log(msg)
61
 
62
  class CustomTrainerCallback(TrainerCallback):
63
+ def __init__(self, job_id, hf_token, repo_id):
64
  self.job_id = job_id
65
+ self.hf_token = hf_token
66
+ self.repo_id = repo_id
67
 
68
  def on_step_end(self, args, state, control, **kwargs):
69
  if self.job_id in JOBS:
 
79
  def on_save(self, args, state, control, **kwargs):
80
  if self.job_id in JOBS:
81
  job = JOBS[self.job_id]
82
+ step = state.global_step
83
+ ckpt_name = f"checkpoint-{step}"
84
+ ckpt_path = os.path.join(args.output_dir, ckpt_name)
85
+
86
+ job.add_log(f"System: Local checkpoint saved ({ckpt_name})")
87
+
88
+ def _upload_bg():
89
+ try:
90
+ upload_folder(
91
+ folder_path=ckpt_path,
92
+ path_in_repo=ckpt_name,
93
+ repo_id=self.repo_id,
94
+ token=self.hf_token
95
+ )
96
+ job.add_log(f"Cloud: Checkpoint {step} synced to Hub")
97
+ except Exception:
98
+ pass
99
+
100
+ threading.Thread(target=_upload_bg, daemon=True).start()
101
  return control
102
 
103
  @spaces.GPU(duration=300)
 
120
  login(token=hf_token)
121
  try:
122
  username = whoami()["name"]
123
+ full_repo_id = f"{username}/{new_repo_name}"
124
+ create_repo(full_repo_id, token=hf_token, exist_ok=True)
125
+ job.add_log(f"Auth: Verified as {username}. Target: {full_repo_id}")
126
  except:
127
  raise Exception("Authentication Failed")
128
 
 
237
  model=peft_model,
238
  train_dataset=dataset_iterable,
239
  args=training_args,
240
+ callbacks=[CustomTrainerCallback(job_id, hf_token, full_repo_id)]
241
  )
242
 
243
  job.set_progress(0.2, "Training: Phase initiated...")
 
277
  inject_json(c_tok, "tokenizer_config.json")
278
  inject_json(c_gen, "generation_config.json")
279
 
280
+ job.set_progress(0.95, "Network: Uploading final model...")
281
+ upload_folder(folder_path=final_path, repo_id=full_repo_id, token=hf_token)
 
 
282
 
283
+ job.repo_url = f"https://huggingface.co/{full_repo_id}"
284
  job.status = "COMPLETED"
285
  job.set_progress(1.0, "System: Mission Accomplished")
286
 
 
337
  pass
338
  return gr.update(selected="launch_tab"), ""
339
 
340
+ with gr.Blocks(title="Nucleus Enterprise", theme=gr.themes.Base()) as demo:
 
 
 
 
 
 
 
 
 
 
 
 
341
  with gr.Column():
342
  gr.Markdown("# ⚛️ NUCLEUS ENTERPRISE")
343
  gr.Markdown("Autonomous LLM Foundry | V5.0 Stable")