FlameF0X committed on
Commit
32c80ed
·
verified ·
1 Parent(s): 35f317c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +116 -168
app.py CHANGED
@@ -1,5 +1,10 @@
1
  import gradio as gr
2
  import torch
 
 
 
 
 
3
  from transformers import (
4
  GPT2Config,
5
  GPT2LMHeadModel,
@@ -10,32 +15,7 @@ from transformers import (
10
  TrainerCallback
11
  )
12
  from datasets import load_dataset
13
- from huggingface_hub import whoami
14
- import os
15
- import threading
16
- import queue
17
- import time
18
- import json
19
-
20
- # --- Custom Code Templates ---
21
-
22
- CONFIGURATION_CODE = """
23
- from transformers import GPT2Config
24
-
25
- class CustomTinyConfig(GPT2Config):
26
- model_type = "custom_tiny"
27
- """
28
-
29
- MODELING_CODE = """
30
- from transformers import GPT2LMHeadModel
31
- from .configuration_custom import CustomTinyConfig
32
-
33
- class CustomTinyModel(GPT2LMHeadModel):
34
- config_class = CustomTinyConfig
35
-
36
- def __init__(self, config):
37
- super().__init__(config)
38
- """
39
 
40
  # --- Helper Classes ---
41
 
@@ -61,46 +41,50 @@ def get_user_info(token):
61
  return None
62
 
63
  def train_thread_target(
 
64
  dataset_id,
65
  model_name,
66
  num_layers,
67
  n_embd,
 
 
68
  epochs,
69
  lr,
 
 
 
 
70
  sample_limit,
71
- token,
72
  log_queue,
73
  result_queue
74
  ):
75
  """
76
- Function to be run in a separate thread.
77
- Handles the heavy lifting of training and pushing.
78
  """
79
  try:
80
  username = get_user_info(token)
81
  if not username:
82
- raise ValueError("Could not authenticate user.")
83
 
84
  full_repo_id = f"{username}/{model_name}"
85
- log_queue.put(f"🚀 Starting process for {full_repo_id}...\n")
86
 
87
  # 1. Load Dataset
88
- log_queue.put(f"📚 Loading dataset: {dataset_id}...\n")
89
  try:
90
  dataset = load_dataset(dataset_id, split=f"train[:{int(sample_limit)}]")
91
  except Exception as e:
92
  raise ValueError(f"Error loading dataset: {e}")
93
 
94
- # Find text column
95
  text_column = "text"
96
  if "text" not in dataset.column_names:
97
- for col, dtype in zip(dataset.column_names, dataset.features.values()):
98
- if hasattr(dtype, 'dtype') and dtype.dtype == 'string':
99
  text_column = col
100
  break
101
 
102
- if text_column not in dataset.column_names:
103
- raise ValueError("Could not find a text column in this dataset.")
104
 
105
  # 2. Tokenize
106
  log_queue.put("✂️ Tokenizing data...\n")
@@ -108,25 +92,25 @@ def train_thread_target(
108
  tokenizer.pad_token = tokenizer.eos_token
109
 
110
  def tokenize_function(examples):
111
- return tokenizer(examples[text_column], padding="max_length", truncation=True, max_length=128)
 
 
 
 
 
112
 
113
- tokenized_datasets = dataset.map(tokenize_function, batched=True)
114
 
115
  # 3. Initialize Model
116
- log_queue.put("🏗️ Initializing Custom Nano Model...\n")
117
-
118
- # We use GPT2Config but will modify it before push to look like "CustomTinyConfig"
119
  config = GPT2Config(
120
  vocab_size=len(tokenizer),
121
- n_positions=128,
122
- n_ctx=128,
123
  n_embd=int(n_embd),
124
  n_layer=int(num_layers),
125
- n_head=4,
126
  )
127
-
128
- # We train using standard GPT2 implementation for stability,
129
- # but will wrap it in custom code files on upload.
130
  model = GPT2LMHeadModel(config)
131
 
132
  # 4. Train
@@ -136,14 +120,17 @@ def train_thread_target(
136
  output_dir="./results",
137
  overwrite_output_dir=True,
138
  num_train_epochs=epochs,
139
- per_device_train_batch_size=8,
140
- save_steps=1000, # Don't save intermediate checkpoints to save time/space
141
- save_total_limit=1,
142
- prediction_loss_only=True,
143
  learning_rate=lr,
144
- logging_steps=5, # Log frequently for the UI
 
 
 
 
145
  report_to="none",
146
  use_cpu=not torch.cuda.is_available(),
 
147
  )
148
 
149
  trainer = Trainer(
@@ -156,166 +143,127 @@ def train_thread_target(
156
 
157
  trainer.train()
158
 
159
- # 5. Prepare Custom Code Files
160
- log_queue.put("📝 Generating Custom Code files (modeling_custom.py)...\n")
161
-
162
- # Write the python files locally
163
- with open("configuration_custom.py", "w") as f:
164
- f.write(CONFIGURATION_CODE)
165
-
166
- with open("modeling_custom.py", "w") as f:
167
- f.write(MODELING_CODE)
168
 
169
- # Update config to point to custom code
170
- # This makes it a "Custom Code" model on the Hub
171
- model.config.auto_map = {
172
- "AutoConfig": "configuration_custom.CustomTinyConfig",
173
- "AutoModelForCausalLM": "modeling_custom.CustomTinyModel"
174
- }
175
- # We also need to change the architecture name in config so it matches the class name
176
- model.config.architectures = ["CustomTinyModel"]
177
-
178
- # 6. Push to Hub
179
- log_queue.put(f"☁️ Pushing to {full_repo_id} (this includes custom python files)...\n")
180
-
181
- # Push model weights and config
182
- model.push_to_hub(full_repo_id, token=token, private=True)
183
- tokenizer.push_to_hub(full_repo_id, token=token, private=True)
184
-
185
- # Upload the custom python files explicitly
186
- api = gr.HuggingFaceHub(token=token) # wrapper or use HfApi
187
- from huggingface_hub import HfApi
188
- hf_api = HfApi(token=token)
189
-
190
- hf_api.upload_file(
191
- path_or_fileobj="configuration_custom.py",
192
- path_in_repo="configuration_custom.py",
193
- repo_id=full_repo_id,
194
- )
195
- hf_api.upload_file(
196
- path_or_fileobj="modeling_custom.py",
197
- path_in_repo="modeling_custom.py",
198
- repo_id=full_repo_id,
199
- )
200
-
201
- result_queue.put(f"🎉 Done! Model available at: https://huggingface.co/{full_repo_id}")
202
 
203
  except Exception as e:
204
  log_queue.put(f"❌ Error: {str(e)}\n")
205
- result_queue.put(None) # Signal failure
206
 
207
  # --- Main Generator Function ---
208
 
209
  def train_and_push_generator(
210
- dataset_id,
211
- model_name,
212
- num_layers,
213
- n_embd,
214
- epochs,
215
- lr,
216
- sample_limit,
217
- oauth_token: gr.OAuthToken
218
  ):
219
- if oauth_token is None or oauth_token.token is None:
220
- yield "You must be logged in to train a model!", ""
221
  return
222
 
223
- token = oauth_token.token
224
-
225
- # queues for communication between threads
226
  log_queue = queue.Queue()
227
  result_queue = queue.Queue()
228
 
229
- # Start training in background thread
230
  t = threading.Thread(target=train_thread_target, args=(
231
- dataset_id, model_name, num_layers, n_embd, epochs, lr, sample_limit, token, log_queue, result_queue
 
 
 
 
232
  ))
233
  t.start()
234
 
235
- # Main loop: yield logs as they come in
236
  logs_history = ""
237
-
238
  while t.is_alive():
239
- # Drain queue
240
  while not log_queue.empty():
241
- new_log = log_queue.get()
242
- logs_history += new_log
243
- yield logs_history, "Training..."
244
  time.sleep(0.5)
245
 
246
- # Drain remaining logs after thread finishes
247
  while not log_queue.empty():
248
- new_log = log_queue.get()
249
- logs_history += new_log
250
 
251
- # Get final result
252
  if not result_queue.empty():
253
  result = result_queue.get()
254
- if result:
255
- yield logs_history, result
256
- else:
257
- yield logs_history, "Failed. Check logs."
258
  else:
259
  yield logs_history, "Process finished unexpectedly."
260
 
261
  # --- UI Layout ---
262
 
263
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
264
- gr.Markdown(
265
- """
266
- # Auto-PreTrain
267
- Login, pick a dataset, and train a **Custom Code** language model.
268
- We will generate `modeling_custom.py` and `configuration_custom.py` and upload them to your repo!
269
- """
270
- )
271
-
272
- with gr.Row():
273
- login_btn = gr.LoginButton(value="Sign in with Hugging Face to Train")
274
 
275
  with gr.Row():
276
- with gr.Column():
277
- gr.Markdown("### 1. Data Configuration")
278
- dataset_input = gr.Textbox(
279
- label="Dataset Name",
280
- value="roneneldan/TinyStories",
281
- placeholder="e.g. wikitext, roneneldan/TinyStories"
282
- )
283
- sample_limit = gr.Slider(
284
- minimum=100, maximum=5000, value=500, step=100,
285
- label="Sample Size"
286
- )
287
 
288
- with gr.Column():
289
- gr.Markdown("### 2. Hyperparameters")
290
- model_name_input = gr.Textbox(
291
- label="Model Name",
292
- value="my-custom-tiny-model",
 
 
 
 
 
 
 
 
 
 
 
 
293
  )
294
-
 
295
  with gr.Row():
296
- layers = gr.Slider(minimum=1, maximum=6, value=2, step=1, label="Layers")
297
- embd = gr.Slider(minimum=32, maximum=256, value=64, step=32, label="Embed Dim")
298
-
 
 
 
 
 
 
 
 
 
 
299
  with gr.Row():
300
- epochs = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="Epochs")
301
- lr = gr.Number(label="Learning Rate", value=5e-4)
302
 
303
- train_btn = gr.Button("🚀 Train Custom Model", variant="primary")
304
 
305
  with gr.Row():
306
- log_output = gr.Code(label="Training Logs", language="json", lines=10)
307
- status_output = gr.Textbox(label="Final Status")
308
 
309
  train_btn.click(
310
  fn=train_and_push_generator,
311
  inputs=[
312
- dataset_input,
313
- model_name_input,
314
- layers,
315
- embd,
316
- epochs,
317
- lr,
318
- sample_limit
319
  ],
320
  outputs=[log_output, status_output]
321
  )
 
1
  import gradio as gr
2
  import torch
3
+ import os
4
+ import threading
5
+ import queue
6
+ import time
7
+ import json
8
  from transformers import (
9
  GPT2Config,
10
  GPT2LMHeadModel,
 
15
  TrainerCallback
16
  )
17
  from datasets import load_dataset
18
+ from huggingface_hub import whoami, HfApi
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  # --- Helper Classes ---
21
 
 
41
  return None
42
 
43
  def train_thread_target(
44
+ token,
45
  dataset_id,
46
  model_name,
47
  num_layers,
48
  n_embd,
49
+ n_head,
50
+ context_length,
51
  epochs,
52
  lr,
53
+ weight_decay,
54
+ warmup_steps,
55
+ batch_size,
56
+ grad_accumulation,
57
  sample_limit,
 
58
  log_queue,
59
  result_queue
60
  ):
61
  """
62
+ Background thread for training.
 
63
  """
64
  try:
65
  username = get_user_info(token)
66
  if not username:
67
+ raise ValueError("Invalid Hugging Face Token. Could not authenticate.")
68
 
69
  full_repo_id = f"{username}/{model_name}"
70
+ log_queue.put(f"🚀 Initializing for {full_repo_id}...\n")
71
 
72
  # 1. Load Dataset
73
+ log_queue.put(f"📚 Loading dataset: {dataset_id} (Limit: {sample_limit})...\n")
74
  try:
75
  dataset = load_dataset(dataset_id, split=f"train[:{int(sample_limit)}]")
76
  except Exception as e:
77
  raise ValueError(f"Error loading dataset: {e}")
78
 
79
+ # Auto-detect text column
80
  text_column = "text"
81
  if "text" not in dataset.column_names:
82
+ for col in dataset.column_names:
83
+ if isinstance(dataset[0][col], str):
84
  text_column = col
85
  break
86
 
87
+ log_queue.put(f"🔍 Using text column: '{text_column}'\n")
 
88
 
89
  # 2. Tokenize
90
  log_queue.put("✂️ Tokenizing data...\n")
 
92
  tokenizer.pad_token = tokenizer.eos_token
93
 
94
  def tokenize_function(examples):
95
+ return tokenizer(
96
+ examples[text_column],
97
+ padding="max_length",
98
+ truncation=True,
99
+ max_length=int(context_length)
100
+ )
101
 
102
+ tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)
103
 
104
  # 3. Initialize Model
105
+ log_queue.put("🏗️ Building GPT-2 Architecture...\n")
 
 
106
  config = GPT2Config(
107
  vocab_size=len(tokenizer),
108
+ n_positions=int(context_length),
109
+ n_ctx=int(context_length),
110
  n_embd=int(n_embd),
111
  n_layer=int(num_layers),
112
+ n_head=int(n_head),
113
  )
 
 
 
114
  model = GPT2LMHeadModel(config)
115
 
116
  # 4. Train
 
120
  output_dir="./results",
121
  overwrite_output_dir=True,
122
  num_train_epochs=epochs,
123
+ per_device_train_batch_size=int(batch_size),
124
+ gradient_accumulation_steps=int(grad_accumulation),
 
 
125
  learning_rate=lr,
126
+ weight_decay=weight_decay,
127
+ warmup_steps=int(warmup_steps),
128
+ logging_steps=10,
129
+ save_strategy="no", # Save only at the end
130
+ push_to_hub=False,
131
  report_to="none",
132
  use_cpu=not torch.cuda.is_available(),
133
+ fp16=torch.cuda.is_available(),
134
  )
135
 
136
  trainer = Trainer(
 
143
 
144
  trainer.train()
145
 
146
+ # 5. Push to Hub
147
+ log_queue.put(f"☁️ Pushing weights to https://huggingface.co/{full_repo_id}...\n")
148
+ model.push_to_hub(full_repo_id, token=token)
149
+ tokenizer.push_to_hub(full_repo_id, token=token)
 
 
 
 
 
150
 
151
+ result_queue.put(f"🎉 Success! Model published to: https://huggingface.co/{full_repo_id}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
 
153
  except Exception as e:
154
  log_queue.put(f"❌ Error: {str(e)}\n")
155
+ result_queue.put(None)
156
 
157
  # --- Main Generator Function ---
158
 
159
  def train_and_push_generator(
160
+ token, dataset_id, model_name,
161
+ num_layers, n_embd, n_head, context_length,
162
+ epochs, lr, weight_decay, warmup_steps,
163
+ batch_size, grad_accumulation, sample_limit
 
 
 
 
164
  ):
165
+ if not token:
166
+ yield "Error: Hugging Face Token is required.", ""
167
  return
168
 
 
 
 
169
  log_queue = queue.Queue()
170
  result_queue = queue.Queue()
171
 
 
172
  t = threading.Thread(target=train_thread_target, args=(
173
+ token, dataset_id, model_name,
174
+ num_layers, n_embd, n_head, context_length,
175
+ epochs, lr, weight_decay, warmup_steps,
176
+ batch_size, grad_accumulation, sample_limit,
177
+ log_queue, result_queue
178
  ))
179
  t.start()
180
 
 
181
  logs_history = ""
 
182
  while t.is_alive():
 
183
  while not log_queue.empty():
184
+ logs_history += log_queue.get()
185
+ yield logs_history, "Training in progress..."
 
186
  time.sleep(0.5)
187
 
 
188
  while not log_queue.empty():
189
+ logs_history += log_queue.get()
 
190
 
 
191
  if not result_queue.empty():
192
  result = result_queue.get()
193
+ yield logs_history, result or "Failed. Check logs for errors."
 
 
 
194
  else:
195
  yield logs_history, "Process finished unexpectedly."
196
 
197
  # --- UI Layout ---
198
 
199
+ with gr.Blocks(theme=gr.themes.Default(primary_hue="orange", secondary_hue="gray")) as demo:
200
+ gr.Markdown("# 🔥 Advanced Auto-PreTrain")
201
+ gr.Markdown("Configure your transformer architecture and train it directly to your Hugging Face account.")
 
 
 
 
 
 
 
 
202
 
203
  with gr.Row():
204
+ hf_token = gr.Textbox(
205
+ label="Hugging Face Write Token",
206
+ placeholder="hf_...",
207
+ type="password",
208
+ info="Get your token at huggingface.co/settings/tokens (must have 'Write' access)"
209
+ )
210
+ model_name_input = gr.Textbox(
211
+ label="Model Repository Name",
212
+ value="my-tiny-gpt2",
213
+ placeholder="e.g. tiny-coder-v1"
214
+ )
215
 
216
+ with gr.Tabs():
217
+ with gr.TabItem("1. Dataset & Data"):
218
+ with gr.Row():
219
+ dataset_input = gr.Textbox(
220
+ label="Dataset ID",
221
+ value="roneneldan/TinyStories",
222
+ placeholder="e.g. wikitext"
223
+ )
224
+ sample_limit = gr.Number(
225
+ label="Sample Limit",
226
+ value=1000,
227
+ precision=0,
228
+ info="Number of rows to use for training"
229
+ )
230
+ context_length = gr.Slider(
231
+ minimum=64, maximum=1024, value=128, step=64,
232
+ label="Max Context Length (Sequence Length)"
233
  )
234
+
235
+ with gr.TabItem("2. Model Architecture"):
236
  with gr.Row():
237
+ layers = gr.Slider(minimum=1, maximum=24, value=4, step=1, label="Number of Layers")
238
+ embd = gr.Slider(minimum=64, maximum=1024, value=256, step=64, label="Embedding Dimension")
239
+ with gr.Row():
240
+ heads = gr.Slider(minimum=2, maximum=16, value=8, step=2, label="Attention Heads")
241
+ gr.Markdown("Note: Embedding dimension must be divisible by attention heads.")
242
+
243
+ with gr.TabItem("3. Training Hyperparameters"):
244
+ with gr.Row():
245
+ epochs = gr.Slider(minimum=1, maximum=50, value=1, step=1, label="Epochs")
246
+ lr = gr.Number(label="Learning Rate", value=5e-4, format="%.1e")
247
+ with gr.Row():
248
+ batch_size = gr.Slider(minimum=1, maximum=64, value=8, step=1, label="Batch Size (per device)")
249
+ grad_accumulation = gr.Slider(minimum=1, maximum=32, value=1, step=1, label="Gradient Accumulation Steps")
250
  with gr.Row():
251
+ weight_decay = gr.Slider(minimum=0.0, maximum=0.1, value=0.01, step=0.01, label="Weight Decay")
252
+ warmup_steps = gr.Number(label="Warmup Steps", value=100, precision=0)
253
 
254
+ train_btn = gr.Button("🚀 Start Pre-Training", variant="primary")
255
 
256
  with gr.Row():
257
+ log_output = gr.Code(label="Live Training Logs", language="json", lines=15)
258
+ status_output = gr.Textbox(label="Status & Hub Link", interactive=False)
259
 
260
  train_btn.click(
261
  fn=train_and_push_generator,
262
  inputs=[
263
+ hf_token, dataset_input, model_name_input,
264
+ layers, embd, heads, context_length,
265
+ epochs, lr, weight_decay, warmup_steps,
266
+ batch_size, grad_accumulation, sample_limit
 
 
 
267
  ],
268
  outputs=[log_output, status_output]
269
  )