Train

Sleeping

App Files Files Community

Ksjsjjdj committed on Dec 1, 2025

Commit

a2dde7c

verified ·

1 Parent(s): 40f8e1a

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -42

app.py CHANGED Viewed

@@ -18,7 +18,7 @@ import transformers
 import datasets
 from dotenv import load_dotenv
 from datasets import load_dataset, get_dataset_config_names, IterableDataset
-from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, TrainerCallback, DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
 from huggingface_hub import login, whoami, create_repo, upload_folder
 import spaces
@@ -104,13 +104,13 @@ class CustomTrainerCallback(TrainerCallback):
         return control
 @spaces.GPU(duration=300)
-def background_train_task(job_id, hf_token, model_name, new_repo_name, training_mode,
                           train_steps, learning_rate, batch_size, datasets_text,
                           reasoning_mode, c_conf, c_tok, c_gen):
     job = JOBS[job_id]
     job.status = "RUNNING"
-    job.add_log(f"System: initializing Full-Parameter Training ({training_mode})...")
     try:
         if not hf_token.startswith("hf_"):
@@ -177,27 +177,13 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, training_
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
-        is_sft = "SFT" in training_mode
         def process_stream_generator():
             iterator = chain.from_iterable(streams)
             batch_buffer = []
             for item in iterator:
                 try:
-                    if is_sft:
-                        if "messages" in item:
-                            text = tokenizer.apply_chat_template(item["messages"], tokenize=False, add_generation_prompt=False)
-                        elif "conversation" in item:
-                            text = tokenizer.apply_chat_template(item["conversation"], tokenize=False, add_generation_prompt=False)
-                        elif "instruction" in item and "output" in item:
-                            msg = [{"role": "user", "content": item["instruction"]}, {"role": "assistant", "content": item["output"]}]
-                            text = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=False)
-                        else:
-                            text = str(item)
-                    else:
-                        text = str(item.get("text", item.get("content", str(item))))
                     if len(text) < 5: continue
                     batch_buffer.append(text)
@@ -209,20 +195,20 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, training_
                 except:
                     continue
-        job.set_progress(0.15, "Model: Loading Full Weights...")
         torch.cuda.empty_cache()
         gc.collect()
-        original_model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            trust_remote_code=True,
-            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            use_cache=False
         )
         if torch.cuda.is_available():
-            original_model = original_model.cuda()
         output_dir = f"checkpoints/{job_id}"
@@ -297,7 +283,7 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, training_
             path_in_repo=".",
             repo_id=full_repo_id,
             token=hf_token,
-            commit_message=f"Full Fine-Tuned Model ({training_mode})"
         )
         job.repo_url = f"https://huggingface.co/{full_repo_id}"
@@ -310,7 +296,7 @@ def background_train_task(job_id, hf_token, model_name, new_repo_name, training_
         job.add_log(f"FATAL ERROR: {str(e)}")
         torch.cuda.empty_cache()
-def start_training_wrapper(hf_token, model_name, new_repo_name, training_mode,
                            train_steps, learning_rate, batch_size, datasets_text,
                            reasoning_mode, c_conf, c_tok, c_gen):
@@ -322,7 +308,7 @@ def start_training_wrapper(hf_token, model_name, new_repo_name, training_mode,
     thread = threading.Thread(
         target=background_train_task,
-        args=(new_job.id, hf_token, model_name, new_repo_name, training_mode,
               train_steps, learning_rate, batch_size, datasets_text, reasoning_mode, c_conf, c_tok, c_gen)
     )
     thread.daemon = True
@@ -357,10 +343,10 @@ def load_from_url(request: gr.Request):
         pass
     return gr.update(selected="launch_tab"), ""
-with gr.Blocks(title="Nucleus Enterprise", theme=gr.themes.Base()) as demo:
     with gr.Column():
         gr.Markdown("# ⚛️ NUCLEUS ENTERPRISE")
-        gr.Markdown("Autonomous LLM Foundry | V6.0 Full Fine-Tune")
         with gr.Tabs() as main_tabs:
             with gr.TabItem("🚀 LAUNCHPAD", id="launch_tab"):
@@ -368,22 +354,16 @@ with gr.Blocks(title="Nucleus Enterprise", theme=gr.themes.Base()) as demo:
                     with gr.Column(scale=2):
                         with gr.Row():
                             hf_token = gr.Textbox(label="HuggingFace Token", type="password", value=os.getenv("HF_TOKEN", ""))
-                            model_name = gr.Textbox(label="Base Model", value="Qwen/Qwen2.5-0.5B")
-                        repo_name = gr.Textbox(label="Output Repository", value="nucleus-model-v1")
                         datasets = gr.Textbox(label="Datasets (CSV)", value="Salesforce/fineweb_deduplicated", lines=3)
-                        with gr.Row():
-                            training_mode = gr.Dropdown(
-                                choices=["Base Pre-Training", "Post-Training", "Base SFT", "Post-Training SFT"],
-                                value="Base Pre-Training",
-                                label="Training Strategy"
-                            )
-                            reasoning = gr.Checkbox(label="Inject Reasoning (CoT/Math)", value=False)
                     with gr.Column(scale=1):
                         steps = gr.Number(label="Steps", value=100)
-                        lr = gr.Number(label="Learning Rate", value=2e-5)
                         batch = gr.Number(label="Batch Size", value=1)
                 with gr.Accordion("Advanced Config", open=False):
@@ -391,7 +371,7 @@ with gr.Blocks(title="Nucleus Enterprise", theme=gr.themes.Base()) as demo:
                     c_tok = gr.Code(label="tokenizer_config.json", language="json")
                     c_gen = gr.Code(label="generation_config.json", language="json")
-                btn_launch = gr.Button("INITIALIZE FULL TRAINING", variant="primary", size="lg")
             with gr.TabItem("📡 TELEMETRY", id="monitor_tab"):
                 with gr.Row():
@@ -412,7 +392,7 @@ with gr.Blocks(title="Nucleus Enterprise", theme=gr.themes.Base()) as demo:
     btn_launch.click(
         start_training_wrapper,
-        inputs=[hf_token, model_name, repo_name, training_mode, steps, lr, batch, datasets, reasoning, c_conf, c_tok, c_gen],
         outputs=[job_id_input, main_tabs]
     ).then(
         None, [job_id_input], None,

 import datasets
 from dotenv import load_dotenv
 from datasets import load_dataset, get_dataset_config_names, IterableDataset
+from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, TrainerCallback, AutoConfig
 from huggingface_hub import login, whoami, create_repo, upload_folder
 import spaces
         return control
 @spaces.GPU(duration=300)
+def background_train_task(job_id, hf_token, model_name, new_repo_name,
                           train_steps, learning_rate, batch_size, datasets_text,
                           reasoning_mode, c_conf, c_tok, c_gen):
     job = JOBS[job_id]
     job.status = "RUNNING"
+    job.add_log("System: initializing Scratch Training Protocol...")
     try:
         if not hf_token.startswith("hf_"):
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
         def process_stream_generator():
             iterator = chain.from_iterable(streams)
             batch_buffer = []
             for item in iterator:
                 try:
+                    text = str(item.get("text", item.get("content", str(item))))
                     if len(text) < 5: continue
                     batch_buffer.append(text)
                 except:
                     continue
+        job.set_progress(0.15, "Model: Initializing Architecture from Scratch...")
         torch.cuda.empty_cache()
         gc.collect()
+        config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+        original_model = AutoModelForCausalLM.from_config(
+            config,
+            trust_remote_code=True,
         )
         if torch.cuda.is_available():
+            original_model = original_model.to(torch.float16).cuda()
         output_dir = f"checkpoints/{job_id}"
             path_in_repo=".",
             repo_id=full_repo_id,
             token=hf_token,
+            commit_message="Scratch Trained Model"
         )
         job.repo_url = f"https://huggingface.co/{full_repo_id}"
         job.add_log(f"FATAL ERROR: {str(e)}")
         torch.cuda.empty_cache()
+def start_training_wrapper(hf_token, model_name, new_repo_name,
                            train_steps, learning_rate, batch_size, datasets_text,
                            reasoning_mode, c_conf, c_tok, c_gen):
     thread = threading.Thread(
         target=background_train_task,
+        args=(new_job.id, hf_token, model_name, new_repo_name,
               train_steps, learning_rate, batch_size, datasets_text, reasoning_mode, c_conf, c_tok, c_gen)
     )
     thread.daemon = True
         pass
     return gr.update(selected="launch_tab"), ""
+with gr.Blocks(title="Nucleus Enterprise") as demo:
     with gr.Column():
         gr.Markdown("# ⚛️ NUCLEUS ENTERPRISE")
+        gr.Markdown("Autonomous LLM Foundry | V7.0 Scratch Edition")
         with gr.Tabs() as main_tabs:
             with gr.TabItem("🚀 LAUNCHPAD", id="launch_tab"):
                     with gr.Column(scale=2):
                         with gr.Row():
                             hf_token = gr.Textbox(label="HuggingFace Token", type="password", value=os.getenv("HF_TOKEN", ""))
+                            model_name = gr.Textbox(label="Architecture Config Source", value="Qwen/Qwen2.5-0.5B")
+                        repo_name = gr.Textbox(label="Output Repository", value="nucleus-scratch-v1")
                         datasets = gr.Textbox(label="Datasets (CSV)", value="Salesforce/fineweb_deduplicated", lines=3)
+                        reasoning = gr.Checkbox(label="Inject Reasoning (CoT/Math)", value=False)
                     with gr.Column(scale=1):
                         steps = gr.Number(label="Steps", value=100)
+                        lr = gr.Number(label="Learning Rate", value=1e-4)
                         batch = gr.Number(label="Batch Size", value=1)
                 with gr.Accordion("Advanced Config", open=False):
                     c_tok = gr.Code(label="tokenizer_config.json", language="json")
                     c_gen = gr.Code(label="generation_config.json", language="json")
+                btn_launch = gr.Button("INITIALIZE SCRATCH TRAINING", variant="primary", size="lg")
             with gr.TabItem("📡 TELEMETRY", id="monitor_tab"):
                 with gr.Row():
     btn_launch.click(
         start_training_wrapper,
+        inputs=[hf_token, model_name, repo_name, steps, lr, batch, datasets, reasoning, c_conf, c_tok, c_gen],
         outputs=[job_id_input, main_tabs]
     ).then(
         None, [job_id_input], None,