Ksjsjjdj committed on
Commit
a2dde7c
·
verified ·
1 Parent(s): 40f8e1a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -42
app.py CHANGED
@@ -18,7 +18,7 @@ import transformers
18
  import datasets
19
  from dotenv import load_dotenv
20
  from datasets import load_dataset, get_dataset_config_names, IterableDataset
21
- from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, TrainerCallback, DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
22
  from huggingface_hub import login, whoami, create_repo, upload_folder
23
  import spaces
24
 
@@ -104,13 +104,13 @@ class CustomTrainerCallback(TrainerCallback):
104
  return control
105
 
106
  @spaces.GPU(duration=300)
107
- def background_train_task(job_id, hf_token, model_name, new_repo_name, training_mode,
108
  train_steps, learning_rate, batch_size, datasets_text,
109
  reasoning_mode, c_conf, c_tok, c_gen):
110
 
111
  job = JOBS[job_id]
112
  job.status = "RUNNING"
113
- job.add_log(f"System: initializing Full-Parameter Training ({training_mode})...")
114
 
115
  try:
116
  if not hf_token.startswith("hf_"):
@@ -177,27 +177,13 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, training_
177
  if tokenizer.pad_token is None:
178
  tokenizer.pad_token = tokenizer.eos_token
179
 
180
- is_sft = "SFT" in training_mode
181
-
182
  def process_stream_generator():
183
  iterator = chain.from_iterable(streams)
184
  batch_buffer = []
185
 
186
  for item in iterator:
187
  try:
188
- if is_sft:
189
- if "messages" in item:
190
- text = tokenizer.apply_chat_template(item["messages"], tokenize=False, add_generation_prompt=False)
191
- elif "conversation" in item:
192
- text = tokenizer.apply_chat_template(item["conversation"], tokenize=False, add_generation_prompt=False)
193
- elif "instruction" in item and "output" in item:
194
- msg = [{"role": "user", "content": item["instruction"]}, {"role": "assistant", "content": item["output"]}]
195
- text = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=False)
196
- else:
197
- text = str(item)
198
- else:
199
- text = str(item.get("text", item.get("content", str(item))))
200
-
201
  if len(text) < 5: continue
202
  batch_buffer.append(text)
203
 
@@ -209,20 +195,20 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, training_
209
  except:
210
  continue
211
 
212
- job.set_progress(0.15, "Model: Loading Full Weights...")
213
 
214
  torch.cuda.empty_cache()
215
  gc.collect()
216
 
217
- original_model = AutoModelForCausalLM.from_pretrained(
218
- model_name,
219
- trust_remote_code=True,
220
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
221
- use_cache=False
222
  )
223
 
224
  if torch.cuda.is_available():
225
- original_model = original_model.cuda()
226
 
227
  output_dir = f"checkpoints/{job_id}"
228
 
@@ -297,7 +283,7 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, training_
297
  path_in_repo=".",
298
  repo_id=full_repo_id,
299
  token=hf_token,
300
- commit_message=f"Full Fine-Tuned Model ({training_mode})"
301
  )
302
 
303
  job.repo_url = f"https://huggingface.co/{full_repo_id}"
@@ -310,7 +296,7 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, training_
310
  job.add_log(f"FATAL ERROR: {str(e)}")
311
  torch.cuda.empty_cache()
312
 
313
- def start_training_wrapper(hf_token, model_name, new_repo_name, training_mode,
314
  train_steps, learning_rate, batch_size, datasets_text,
315
  reasoning_mode, c_conf, c_tok, c_gen):
316
 
@@ -322,7 +308,7 @@ def start_training_wrapper(hf_token, model_name, new_repo_name, training_mode,
322
 
323
  thread = threading.Thread(
324
  target=background_train_task,
325
- args=(new_job.id, hf_token, model_name, new_repo_name, training_mode,
326
  train_steps, learning_rate, batch_size, datasets_text, reasoning_mode, c_conf, c_tok, c_gen)
327
  )
328
  thread.daemon = True
@@ -357,10 +343,10 @@ def load_from_url(request: gr.Request):
357
  pass
358
  return gr.update(selected="launch_tab"), ""
359
 
360
- with gr.Blocks(title="Nucleus Enterprise", theme=gr.themes.Base()) as demo:
361
  with gr.Column():
362
  gr.Markdown("# ⚛️ NUCLEUS ENTERPRISE")
363
- gr.Markdown("Autonomous LLM Foundry | V6.0 Full Fine-Tune")
364
 
365
  with gr.Tabs() as main_tabs:
366
  with gr.TabItem("🚀 LAUNCHPAD", id="launch_tab"):
@@ -368,22 +354,16 @@ with gr.Blocks(title="Nucleus Enterprise", theme=gr.themes.Base()) as demo:
368
  with gr.Column(scale=2):
369
  with gr.Row():
370
  hf_token = gr.Textbox(label="HuggingFace Token", type="password", value=os.getenv("HF_TOKEN", ""))
371
- model_name = gr.Textbox(label="Base Model", value="Qwen/Qwen2.5-0.5B")
372
 
373
- repo_name = gr.Textbox(label="Output Repository", value="nucleus-model-v1")
374
  datasets = gr.Textbox(label="Datasets (CSV)", value="Salesforce/fineweb_deduplicated", lines=3)
375
 
376
- with gr.Row():
377
- training_mode = gr.Dropdown(
378
- choices=["Base Pre-Training", "Post-Training", "Base SFT", "Post-Training SFT"],
379
- value="Base Pre-Training",
380
- label="Training Strategy"
381
- )
382
- reasoning = gr.Checkbox(label="Inject Reasoning (CoT/Math)", value=False)
383
 
384
  with gr.Column(scale=1):
385
  steps = gr.Number(label="Steps", value=100)
386
- lr = gr.Number(label="Learning Rate", value=2e-5)
387
  batch = gr.Number(label="Batch Size", value=1)
388
 
389
  with gr.Accordion("Advanced Config", open=False):
@@ -391,7 +371,7 @@ with gr.Blocks(title="Nucleus Enterprise", theme=gr.themes.Base()) as demo:
391
  c_tok = gr.Code(label="tokenizer_config.json", language="json")
392
  c_gen = gr.Code(label="generation_config.json", language="json")
393
 
394
- btn_launch = gr.Button("INITIALIZE FULL TRAINING", variant="primary", size="lg")
395
 
396
  with gr.TabItem("📡 TELEMETRY", id="monitor_tab"):
397
  with gr.Row():
@@ -412,7 +392,7 @@ with gr.Blocks(title="Nucleus Enterprise", theme=gr.themes.Base()) as demo:
412
 
413
  btn_launch.click(
414
  start_training_wrapper,
415
- inputs=[hf_token, model_name, repo_name, training_mode, steps, lr, batch, datasets, reasoning, c_conf, c_tok, c_gen],
416
  outputs=[job_id_input, main_tabs]
417
  ).then(
418
  None, [job_id_input], None,
 
18
  import datasets
19
  from dotenv import load_dotenv
20
  from datasets import load_dataset, get_dataset_config_names, IterableDataset
21
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, TrainerCallback, AutoConfig
22
  from huggingface_hub import login, whoami, create_repo, upload_folder
23
  import spaces
24
 
 
104
  return control
105
 
106
  @spaces.GPU(duration=300)
107
+ def background_train_task(job_id, hf_token, model_name, new_repo_name,
108
  train_steps, learning_rate, batch_size, datasets_text,
109
  reasoning_mode, c_conf, c_tok, c_gen):
110
 
111
  job = JOBS[job_id]
112
  job.status = "RUNNING"
113
+ job.add_log("System: initializing Scratch Training Protocol...")
114
 
115
  try:
116
  if not hf_token.startswith("hf_"):
 
177
  if tokenizer.pad_token is None:
178
  tokenizer.pad_token = tokenizer.eos_token
179
 
 
 
180
  def process_stream_generator():
181
  iterator = chain.from_iterable(streams)
182
  batch_buffer = []
183
 
184
  for item in iterator:
185
  try:
186
+ text = str(item.get("text", item.get("content", str(item))))
 
 
 
 
 
 
 
 
 
 
 
 
187
  if len(text) < 5: continue
188
  batch_buffer.append(text)
189
 
 
195
  except:
196
  continue
197
 
198
+ job.set_progress(0.15, "Model: Initializing Architecture from Scratch...")
199
 
200
  torch.cuda.empty_cache()
201
  gc.collect()
202
 
203
+ config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
204
+
205
+ original_model = AutoModelForCausalLM.from_config(
206
+ config,
207
+ trust_remote_code=True,
208
  )
209
 
210
  if torch.cuda.is_available():
211
+ original_model = original_model.to(torch.float16).cuda()
212
 
213
  output_dir = f"checkpoints/{job_id}"
214
 
 
283
  path_in_repo=".",
284
  repo_id=full_repo_id,
285
  token=hf_token,
286
+ commit_message="Scratch Trained Model"
287
  )
288
 
289
  job.repo_url = f"https://huggingface.co/{full_repo_id}"
 
296
  job.add_log(f"FATAL ERROR: {str(e)}")
297
  torch.cuda.empty_cache()
298
 
299
+ def start_training_wrapper(hf_token, model_name, new_repo_name,
300
  train_steps, learning_rate, batch_size, datasets_text,
301
  reasoning_mode, c_conf, c_tok, c_gen):
302
 
 
308
 
309
  thread = threading.Thread(
310
  target=background_train_task,
311
+ args=(new_job.id, hf_token, model_name, new_repo_name,
312
  train_steps, learning_rate, batch_size, datasets_text, reasoning_mode, c_conf, c_tok, c_gen)
313
  )
314
  thread.daemon = True
 
343
  pass
344
  return gr.update(selected="launch_tab"), ""
345
 
346
+ with gr.Blocks(title="Nucleus Enterprise") as demo:
347
  with gr.Column():
348
  gr.Markdown("# ⚛️ NUCLEUS ENTERPRISE")
349
+ gr.Markdown("Autonomous LLM Foundry | V7.0 Scratch Edition")
350
 
351
  with gr.Tabs() as main_tabs:
352
  with gr.TabItem("🚀 LAUNCHPAD", id="launch_tab"):
 
354
  with gr.Column(scale=2):
355
  with gr.Row():
356
  hf_token = gr.Textbox(label="HuggingFace Token", type="password", value=os.getenv("HF_TOKEN", ""))
357
+ model_name = gr.Textbox(label="Architecture Config Source", value="Qwen/Qwen2.5-0.5B")
358
 
359
+ repo_name = gr.Textbox(label="Output Repository", value="nucleus-scratch-v1")
360
  datasets = gr.Textbox(label="Datasets (CSV)", value="Salesforce/fineweb_deduplicated", lines=3)
361
 
362
+ reasoning = gr.Checkbox(label="Inject Reasoning (CoT/Math)", value=False)
 
 
 
 
 
 
363
 
364
  with gr.Column(scale=1):
365
  steps = gr.Number(label="Steps", value=100)
366
+ lr = gr.Number(label="Learning Rate", value=1e-4)
367
  batch = gr.Number(label="Batch Size", value=1)
368
 
369
  with gr.Accordion("Advanced Config", open=False):
 
371
  c_tok = gr.Code(label="tokenizer_config.json", language="json")
372
  c_gen = gr.Code(label="generation_config.json", language="json")
373
 
374
+ btn_launch = gr.Button("INITIALIZE SCRATCH TRAINING", variant="primary", size="lg")
375
 
376
  with gr.TabItem("📡 TELEMETRY", id="monitor_tab"):
377
  with gr.Row():
 
392
 
393
  btn_launch.click(
394
  start_training_wrapper,
395
+ inputs=[hf_token, model_name, repo_name, steps, lr, batch, datasets, reasoning, c_conf, c_tok, c_gen],
396
  outputs=[job_id_input, main_tabs]
397
  ).then(
398
  None, [job_id_input], None,