Update app.py
Browse files
app.py
CHANGED
|
@@ -28,7 +28,10 @@ except:
|
|
| 28 |
|
| 29 |
transformers.logging.set_verbosity_error()
|
| 30 |
datasets.logging.set_verbosity_error()
|
| 31 |
-
logging.
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
if torch.cuda.is_available():
|
| 34 |
torch.backends.cuda.matmul.allow_tf32 = True
|
|
@@ -211,15 +214,20 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, lora_r, l
|
|
| 211 |
|
| 212 |
output_dir = f"checkpoints/{job_id}"
|
| 213 |
|
|
|
|
|
|
|
|
|
|
| 214 |
training_args = TrainingArguments(
|
| 215 |
output_dir=output_dir,
|
| 216 |
per_device_train_batch_size=int(batch_size),
|
| 217 |
gradient_accumulation_steps=4,
|
| 218 |
-
max_steps=
|
| 219 |
learning_rate=learning_rate,
|
| 220 |
optim="adamw_torch",
|
| 221 |
logging_steps=5,
|
| 222 |
-
save_strategy="
|
|
|
|
|
|
|
| 223 |
report_to="none",
|
| 224 |
fp16=True if torch.cuda.is_available() else False,
|
| 225 |
lr_scheduler_type="cosine" if reasoning_mode else "linear",
|
|
@@ -236,7 +244,7 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, lora_r, l
|
|
| 236 |
callbacks=[CustomTrainerCallback(job_id)]
|
| 237 |
)
|
| 238 |
|
| 239 |
-
job.set_progress(0.4, "Training: Matrix adaptation started...")
|
| 240 |
trainer.train()
|
| 241 |
|
| 242 |
job.set_progress(0.85, "Saving: Serializing adapters...")
|
|
@@ -296,7 +304,7 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, lora_r, l
|
|
| 296 |
|
| 297 |
def start_training_wrapper(hf_token, model_name, new_repo_name, lora_r, lora_alpha, lora_dropout,
|
| 298 |
train_steps, learning_rate, batch_size, datasets_text,
|
| 299 |
-
reasoning_mode, c_conf, c_tok, c_gen
|
| 300 |
|
| 301 |
if not hf_token or not model_name:
|
| 302 |
return "ERROR: Missing Credentials", gr.update(visible=False), gr.update(visible=False)
|
|
@@ -311,14 +319,8 @@ def start_training_wrapper(hf_token, model_name, new_repo_name, lora_r, lora_alp
|
|
| 311 |
)
|
| 312 |
thread.daemon = True
|
| 313 |
thread.start()
|
| 314 |
-
|
| 315 |
-
try:
|
| 316 |
-
base_url = str(request.request.url).split('?')[0]
|
| 317 |
-
share_url = f"{base_url}?job_id={new_job.id}"
|
| 318 |
-
except:
|
| 319 |
-
share_url = f"Job ID: {new_job.id}"
|
| 320 |
|
| 321 |
-
return new_job.id, gr.update(visible=True, value=f"SESSION ID: {new_job.id}"), gr.update(visible=True
|
| 322 |
|
| 323 |
def get_job_update(job_id):
|
| 324 |
if job_id not in JOBS:
|
|
@@ -514,7 +516,7 @@ with gr.Blocks(title="Nucleus Enterprise") as demo:
|
|
| 514 |
with gr.Row():
|
| 515 |
with gr.Column(scale=2):
|
| 516 |
with gr.Row():
|
| 517 |
-
hf_token = gr.Textbox(label="
|
| 518 |
model_name = gr.Textbox(label="BASE MODEL ID", placeholder="Qwen/Qwen2.5-0.5B")
|
| 519 |
|
| 520 |
repo_name = gr.Textbox(label="TARGET REPOSITORY", value="nucleus-build-v1")
|
|
@@ -544,7 +546,8 @@ with gr.Blocks(title="Nucleus Enterprise") as demo:
|
|
| 544 |
job_info_area = gr.Group(visible=False)
|
| 545 |
with job_info_area:
|
| 546 |
new_job_id_display = gr.HTML()
|
| 547 |
-
share_link_display = gr.Textbox(label="DIRECT MONITOR UPLINK", interactive=
|
|
|
|
| 548 |
|
| 549 |
with gr.TabItem("TELEMETRY", id="monitor_tab"):
|
| 550 |
with gr.Row():
|
|
@@ -576,10 +579,15 @@ with gr.Blocks(title="Nucleus Enterprise") as demo:
|
|
| 576 |
launch_btn.click(
|
| 577 |
start_training_wrapper,
|
| 578 |
inputs=[hf_token, model_name, repo_name, lora_r, lora_a, lora_d, train_steps, lr, batch, datasets, reasoning_toggle, conf_json, tok_json, gen_json],
|
| 579 |
-
outputs=[new_job_id_display, job_info_area
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 580 |
).then(
|
| 581 |
fn=lambda id: f"<div class='session-box'>{id}</div>",
|
| 582 |
-
inputs=[
|
| 583 |
outputs=[new_job_id_display]
|
| 584 |
)
|
| 585 |
|
|
|
|
| 28 |
|
| 29 |
transformers.logging.set_verbosity_error()
|
| 30 |
datasets.logging.set_verbosity_error()
|
| 31 |
+
logging.getLogger("transformers").setLevel(logging.CRITICAL)
|
| 32 |
+
logging.getLogger("datasets").setLevel(logging.CRITICAL)
|
| 33 |
+
logging.getLogger("torch").setLevel(logging.CRITICAL)
|
| 34 |
+
logging.basicConfig(level=logging.CRITICAL, stream=sys.stderr)
|
| 35 |
|
| 36 |
if torch.cuda.is_available():
|
| 37 |
torch.backends.cuda.matmul.allow_tf32 = True
|
|
|
|
| 214 |
|
| 215 |
output_dir = f"checkpoints/{job_id}"
|
| 216 |
|
| 217 |
+
total_steps = int(train_steps)
|
| 218 |
+
save_interval = max(10, int(total_steps * 0.2))
|
| 219 |
+
|
| 220 |
training_args = TrainingArguments(
|
| 221 |
output_dir=output_dir,
|
| 222 |
per_device_train_batch_size=int(batch_size),
|
| 223 |
gradient_accumulation_steps=4,
|
| 224 |
+
max_steps=total_steps,
|
| 225 |
learning_rate=learning_rate,
|
| 226 |
optim="adamw_torch",
|
| 227 |
logging_steps=5,
|
| 228 |
+
save_strategy="steps",
|
| 229 |
+
save_steps=save_interval,
|
| 230 |
+
save_total_limit=2,
|
| 231 |
report_to="none",
|
| 232 |
fp16=True if torch.cuda.is_available() else False,
|
| 233 |
lr_scheduler_type="cosine" if reasoning_mode else "linear",
|
|
|
|
| 244 |
callbacks=[CustomTrainerCallback(job_id)]
|
| 245 |
)
|
| 246 |
|
| 247 |
+
job.set_progress(0.4, f"Training: Matrix adaptation started (Checkpointing every {save_interval} steps)...")
|
| 248 |
trainer.train()
|
| 249 |
|
| 250 |
job.set_progress(0.85, "Saving: Serializing adapters...")
|
|
|
|
| 304 |
|
| 305 |
def start_training_wrapper(hf_token, model_name, new_repo_name, lora_r, lora_alpha, lora_dropout,
|
| 306 |
train_steps, learning_rate, batch_size, datasets_text,
|
| 307 |
+
reasoning_mode, c_conf, c_tok, c_gen):
|
| 308 |
|
| 309 |
if not hf_token or not model_name:
|
| 310 |
return "ERROR: Missing Credentials", gr.update(visible=False), gr.update(visible=False)
|
|
|
|
| 319 |
)
|
| 320 |
thread.daemon = True
|
| 321 |
thread.start()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 322 |
|
| 323 |
+
return new_job.id, gr.update(visible=True, value=f"SESSION ID: {new_job.id}"), gr.update(visible=True)
|
| 324 |
|
| 325 |
def get_job_update(job_id):
|
| 326 |
if job_id not in JOBS:
|
|
|
|
| 516 |
with gr.Row():
|
| 517 |
with gr.Column(scale=2):
|
| 518 |
with gr.Row():
|
| 519 |
+
hf_token = gr.Textbox(label="HUGGINGFACE KEY", type="password", value=os.getenv("HF_TOKEN", ""))
|
| 520 |
model_name = gr.Textbox(label="BASE MODEL ID", placeholder="Qwen/Qwen2.5-0.5B")
|
| 521 |
|
| 522 |
repo_name = gr.Textbox(label="TARGET REPOSITORY", value="nucleus-build-v1")
|
|
|
|
| 546 |
job_info_area = gr.Group(visible=False)
|
| 547 |
with job_info_area:
|
| 548 |
new_job_id_display = gr.HTML()
|
| 549 |
+
share_link_display = gr.Textbox(label="DIRECT MONITOR UPLINK", interactive=True)
|
| 550 |
+
hidden_job_id = gr.Textbox(visible=False)
|
| 551 |
|
| 552 |
with gr.TabItem("TELEMETRY", id="monitor_tab"):
|
| 553 |
with gr.Row():
|
|
|
|
| 579 |
launch_btn.click(
|
| 580 |
start_training_wrapper,
|
| 581 |
inputs=[hf_token, model_name, repo_name, lora_r, lora_a, lora_d, train_steps, lr, batch, datasets, reasoning_toggle, conf_json, tok_json, gen_json],
|
| 582 |
+
outputs=[hidden_job_id, new_job_id_display, job_info_area]
|
| 583 |
+
).then(
|
| 584 |
+
fn=None,
|
| 585 |
+
inputs=[hidden_job_id],
|
| 586 |
+
outputs=[share_link_display],
|
| 587 |
+
js="(id) => { return window.location.protocol + '//' + window.location.host + window.location.pathname + '?job_id=' + id; }"
|
| 588 |
).then(
|
| 589 |
fn=lambda id: f"<div class='session-box'>{id}</div>",
|
| 590 |
+
inputs=[hidden_job_id],
|
| 591 |
outputs=[new_job_id_display]
|
| 592 |
)
|
| 593 |
|