FlameF0X committed on
Commit
0bde4c8
·
verified ·
1 Parent(s): 553a5e7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +188 -79
app.py CHANGED
@@ -6,13 +6,49 @@ from transformers import (
6
  GPT2Tokenizer,
7
  Trainer,
8
  TrainingArguments,
9
- DataCollatorForLanguageModeling
 
10
  )
11
  from datasets import load_dataset
12
  from huggingface_hub import whoami
13
  import os
 
 
 
 
14
 
15
- # --- Helper Functions ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  def get_user_info(token):
18
  """Retrieves the username from the HF token."""
@@ -24,7 +60,7 @@ def get_user_info(token):
24
  except Exception:
25
  return None
26
 
27
- def train_and_push(
28
  dataset_id,
29
  model_name,
30
  num_layers,
@@ -32,58 +68,42 @@ def train_and_push(
32
  epochs,
33
  lr,
34
  sample_limit,
35
- oauth_token: gr.OAuthToken
 
 
36
  ):
37
  """
38
- Main Logic:
39
- 1. Authenticate
40
- 2. Load & Prepare Data
41
- 3. Initialize Tiny Model
42
- 4. Train
43
- 5. Push to Hub
44
  """
45
-
46
- # 1. Authentication Check
47
- if oauth_token is None or oauth_token.token is None:
48
- raise gr.Error("You must be logged in to train a model!")
49
-
50
- token = oauth_token.token
51
- username = get_user_info(token)
52
-
53
- if not username:
54
- raise gr.Error("Could not retrieve user info. Please try logging in again.")
55
-
56
- full_repo_id = f"{username}/{model_name}"
57
-
58
- progress = gr.Progress()
59
-
60
  try:
61
- # 2. Load Dataset
62
- progress(0.1, desc=f"Loading dataset: {dataset_id}...")
63
-
64
- # We try to load the dataset. We'll default to the 'train' split.
65
- # We only take a small slice to keep it fast for this demo.
 
 
 
 
66
  try:
67
- # Try loading just the first 'sample_limit' rows to save bandwidth/memory
68
  dataset = load_dataset(dataset_id, split=f"train[:{int(sample_limit)}]")
69
  except Exception as e:
70
- raise gr.Error(f"Error loading dataset: {str(e)}. Make sure it exists and has a 'train' split.")
71
 
72
- # Heuristic: Find the text column (first string column)
73
  text_column = "text"
74
  if "text" not in dataset.column_names:
75
- # simple fallback: look for the first string column
76
  for col, dtype in zip(dataset.column_names, dataset.features.values()):
77
  if hasattr(dtype, 'dtype') and dtype.dtype == 'string':
78
  text_column = col
79
  break
80
 
81
  if text_column not in dataset.column_names:
82
- raise gr.Error("Could not find a text column in this dataset. Please use a dataset with a 'text' column.")
83
-
84
- progress(0.2, desc="Tokenizing data...")
85
-
86
- # We use the standard GPT-2 tokenizer for convenience
87
  tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
88
  tokenizer.pad_token = tokenizer.eos_token
89
 
@@ -91,76 +111,164 @@ def train_and_push(
91
  return tokenizer(examples[text_column], padding="max_length", truncation=True, max_length=128)
92
 
93
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
94
-
95
  # 3. Initialize Model
96
- progress(0.3, desc="Initializing Nano Model...")
97
 
98
- # We create a custom configuration based on user inputs (Constrained for speed)
99
  config = GPT2Config(
100
  vocab_size=len(tokenizer),
101
- n_positions=128, # Short context window for speed
102
  n_ctx=128,
103
- n_embd=int(n_embd), # Small embedding size
104
- n_layer=int(num_layers), # Few layers
105
  n_head=4,
106
  )
107
 
 
 
108
  model = GPT2LMHeadModel(config)
109
 
110
- # 4. Training
111
- progress(0.4, desc="Starting Training (this might take a minute)...")
112
 
113
  training_args = TrainingArguments(
114
  output_dir="./results",
115
  overwrite_output_dir=True,
116
  num_train_epochs=epochs,
117
  per_device_train_batch_size=8,
118
- save_steps=500,
119
  save_total_limit=1,
120
  prediction_loss_only=True,
121
  learning_rate=lr,
122
- logging_steps=10,
123
- report_to="none", # Don't log to wandb/tensorboard
124
- use_cpu=not torch.cuda.is_available(), # Force CPU if no GPU available
125
- )
126
-
127
- data_collator = DataCollatorForLanguageModeling(
128
- tokenizer=tokenizer, mlm=False
129
  )
130
 
131
  trainer = Trainer(
132
  model=model,
133
  args=training_args,
134
- data_collator=data_collator,
135
  train_dataset=tokenized_datasets,
 
136
  )
137
 
138
  trainer.train()
 
 
 
139
 
140
- # 5. Push to Hub
141
- progress(0.9, desc=f"Pushing to {full_repo_id}...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
- # We push both model and tokenizer
144
- model.push_to_hub(full_repo_id, token=token, private=True) # Default to private for safety
145
  tokenizer.push_to_hub(full_repo_id, token=token, private=True)
146
 
147
- return f"🎉 Success! Model trained and pushed to: https://huggingface.co/{full_repo_id}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
 
 
149
  except Exception as e:
150
- raise gr.Error(f"An error occurred: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
  # --- UI Layout ---
153
 
154
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
155
  gr.Markdown(
156
  """
157
- # 🚂 Tiny AutoTrain Space
158
- Login with your Hugging Face account, pick a dataset, and train a tiny language model from scratch!
159
- The model will be automatically uploaded to your profile.
160
  """
161
  )
162
 
163
- # Login Button (Native HF Integration)
164
  with gr.Row():
165
  login_btn = gr.LoginButton(value="Sign in with Hugging Face to Train")
166
 
@@ -168,37 +276,38 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
168
  with gr.Column():
169
  gr.Markdown("### 1. Data Configuration")
170
  dataset_input = gr.Textbox(
171
- label="Dataset Name (from Hub)",
172
  value="roneneldan/TinyStories",
173
  placeholder="e.g. wikitext, roneneldan/TinyStories"
174
  )
175
  sample_limit = gr.Slider(
176
  minimum=100, maximum=5000, value=500, step=100,
177
- label="Training Sample Size (Keep small for speed)"
178
  )
179
 
180
  with gr.Column():
181
- gr.Markdown("### 2. Model Hyperparameters")
182
  model_name_input = gr.Textbox(
183
- label="New Model Name",
184
- value="my-tiny-model",
185
- placeholder="Name of the repo to create"
186
  )
187
 
188
  with gr.Row():
189
- layers = gr.Slider(minimum=1, maximum=6, value=2, step=1, label="Layers (Depth)")
190
- embd = gr.Slider(minimum=32, maximum=256, value=64, step=32, label="Embedding Size (Width)")
191
 
192
  with gr.Row():
193
  epochs = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="Epochs")
194
  lr = gr.Number(label="Learning Rate", value=5e-4)
195
 
196
- train_btn = gr.Button("🚀 Train & Publish", variant="primary")
197
- output_text = gr.Textbox(label="Status", interactive=False)
 
 
 
198
 
199
- # Wire up the button
200
  train_btn.click(
201
- fn=train_and_push,
202
  inputs=[
203
  dataset_input,
204
  model_name_input,
@@ -208,7 +317,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
208
  lr,
209
  sample_limit
210
  ],
211
- outputs=output_text
212
  )
213
 
214
  if __name__ == "__main__":
 
6
  GPT2Tokenizer,
7
  Trainer,
8
  TrainingArguments,
9
+ DataCollatorForLanguageModeling,
10
+ TrainerCallback
11
  )
12
  from datasets import load_dataset
13
  from huggingface_hub import whoami
14
  import os
15
+ import threading
16
+ import queue
17
+ import time
18
+ import json
19
 
20
+ # --- Custom Code Templates ---
21
+
22
+ CONFIGURATION_CODE = """
23
+ from transformers import GPT2Config
24
+
25
+ class CustomTinyConfig(GPT2Config):
26
+ model_type = "custom_tiny"
27
+ """
28
+
29
+ MODELING_CODE = """
30
+ from transformers import GPT2LMHeadModel
31
+ from .configuration_custom import CustomTinyConfig
32
+
33
+ class CustomTinyModel(GPT2LMHeadModel):
34
+ config_class = CustomTinyConfig
35
+
36
+ def __init__(self, config):
37
+ super().__init__(config)
38
+ """
39
+
40
+ # --- Helper Classes ---
41
+
42
+ class LogQueueCallback(TrainerCallback):
43
+ """A custom callback that pushes logs to a queue for the UI."""
44
+ def __init__(self, log_queue):
45
+ self.log_queue = log_queue
46
+
47
+ def on_log(self, args, state, control, logs=None, **kwargs):
48
+ if logs:
49
+ # Format log dictionary nicely
50
+ log_str = f"Step {state.global_step}: {json.dumps(logs)}\n"
51
+ self.log_queue.put(log_str)
52
 
53
  def get_user_info(token):
54
  """Retrieves the username from the HF token."""
 
60
  except Exception:
61
  return None
62
 
63
+ def train_thread_target(
64
  dataset_id,
65
  model_name,
66
  num_layers,
 
68
  epochs,
69
  lr,
70
  sample_limit,
71
+ token,
72
+ log_queue,
73
+ result_queue
74
  ):
75
  """
76
+ Function to be run in a separate thread.
77
+ Handles the heavy lifting of training and pushing.
 
 
 
 
78
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  try:
80
+ username = get_user_info(token)
81
+ if not username:
82
+ raise ValueError("Could not authenticate user.")
83
+
84
+ full_repo_id = f"{username}/{model_name}"
85
+ log_queue.put(f"🚀 Starting process for {full_repo_id}...\n")
86
+
87
+ # 1. Load Dataset
88
+ log_queue.put(f"📚 Loading dataset: {dataset_id}...\n")
89
  try:
 
90
  dataset = load_dataset(dataset_id, split=f"train[:{int(sample_limit)}]")
91
  except Exception as e:
92
+ raise ValueError(f"Error loading dataset: {e}")
93
 
94
+ # Find text column
95
  text_column = "text"
96
  if "text" not in dataset.column_names:
 
97
  for col, dtype in zip(dataset.column_names, dataset.features.values()):
98
  if hasattr(dtype, 'dtype') and dtype.dtype == 'string':
99
  text_column = col
100
  break
101
 
102
  if text_column not in dataset.column_names:
103
+ raise ValueError("Could not find a text column in this dataset.")
104
+
105
+ # 2. Tokenize
106
+ log_queue.put("✂️ Tokenizing data...\n")
 
107
  tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
108
  tokenizer.pad_token = tokenizer.eos_token
109
 
 
111
  return tokenizer(examples[text_column], padding="max_length", truncation=True, max_length=128)
112
 
113
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
114
+
115
  # 3. Initialize Model
116
+ log_queue.put("๐Ÿ—๏ธ Initializing Custom Nano Model...\n")
117
 
118
+ # We use GPT2Config but will modify it before push to look like "CustomTinyConfig"
119
  config = GPT2Config(
120
  vocab_size=len(tokenizer),
121
+ n_positions=128,
122
  n_ctx=128,
123
+ n_embd=int(n_embd),
124
+ n_layer=int(num_layers),
125
  n_head=4,
126
  )
127
 
128
+ # We train using standard GPT2 implementation for stability,
129
+ # but will wrap it in custom code files on upload.
130
  model = GPT2LMHeadModel(config)
131
 
132
+ # 4. Train
133
+ log_queue.put("๐Ÿ‹๏ธ Starting Training Loop...\n")
134
 
135
  training_args = TrainingArguments(
136
  output_dir="./results",
137
  overwrite_output_dir=True,
138
  num_train_epochs=epochs,
139
  per_device_train_batch_size=8,
140
+ save_steps=1000, # Don't save intermediate checkpoints to save time/space
141
  save_total_limit=1,
142
  prediction_loss_only=True,
143
  learning_rate=lr,
144
+ logging_steps=5, # Log frequently for the UI
145
+ report_to="none",
146
+ use_cpu=not torch.cuda.is_available(),
 
 
 
 
147
  )
148
 
149
  trainer = Trainer(
150
  model=model,
151
  args=training_args,
152
+ data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
153
  train_dataset=tokenized_datasets,
154
+ callbacks=[LogQueueCallback(log_queue)]
155
  )
156
 
157
  trainer.train()
158
+
159
+ # 5. Prepare Custom Code Files
160
+ log_queue.put("๐Ÿ“ Generating Custom Code files (modeling_custom.py)...\n")
161
 
162
+ # Write the python files locally
163
+ with open("configuration_custom.py", "w") as f:
164
+ f.write(CONFIGURATION_CODE)
165
+
166
+ with open("modeling_custom.py", "w") as f:
167
+ f.write(MODELING_CODE)
168
+
169
+ # Update config to point to custom code
170
+ # This makes it a "Custom Code" model on the Hub
171
+ model.config.auto_map = {
172
+ "AutoConfig": "configuration_custom.CustomTinyConfig",
173
+ "AutoModelForCausalLM": "modeling_custom.CustomTinyModel"
174
+ }
175
+ # We also need to change the architecture name in config so it matches the class name
176
+ model.config.architectures = ["CustomTinyModel"]
177
+
178
+ # 6. Push to Hub
179
+ log_queue.put(f"โ˜๏ธ Pushing to {full_repo_id} (this includes custom python files)...\n")
180
 
181
+ # Push model weights and config
182
+ model.push_to_hub(full_repo_id, token=token, private=True)
183
  tokenizer.push_to_hub(full_repo_id, token=token, private=True)
184
 
185
+ # Upload the custom python files explicitly
186
+ api = gr.HuggingFaceHub(token=token) # wrapper or use HfApi
187
+ from huggingface_hub import HfApi
188
+ hf_api = HfApi(token=token)
189
+
190
+ hf_api.upload_file(
191
+ path_or_fileobj="configuration_custom.py",
192
+ path_in_repo="configuration_custom.py",
193
+ repo_id=full_repo_id,
194
+ )
195
+ hf_api.upload_file(
196
+ path_or_fileobj="modeling_custom.py",
197
+ path_in_repo="modeling_custom.py",
198
+ repo_id=full_repo_id,
199
+ )
200
 
201
+ result_queue.put(f"🎉 Done! Model available at: https://huggingface.co/{full_repo_id}")
202
+
203
  except Exception as e:
204
+ log_queue.put(f"โŒ Error: {str(e)}\n")
205
+ result_queue.put(None) # Signal failure
206
+
207
+ # --- Main Generator Function ---
208
+
209
+ def train_and_push_generator(
210
+ dataset_id,
211
+ model_name,
212
+ num_layers,
213
+ n_embd,
214
+ epochs,
215
+ lr,
216
+ sample_limit,
217
+ oauth_token: gr.OAuthToken
218
+ ):
219
+ if oauth_token is None or oauth_token.token is None:
220
+ yield "You must be logged in to train a model!", ""
221
+ return
222
+
223
+ token = oauth_token.token
224
+
225
+ # queues for communication between threads
226
+ log_queue = queue.Queue()
227
+ result_queue = queue.Queue()
228
+
229
+ # Start training in background thread
230
+ t = threading.Thread(target=train_thread_target, args=(
231
+ dataset_id, model_name, num_layers, n_embd, epochs, lr, sample_limit, token, log_queue, result_queue
232
+ ))
233
+ t.start()
234
+
235
+ # Main loop: yield logs as they come in
236
+ logs_history = ""
237
+
238
+ while t.is_alive():
239
+ # Drain queue
240
+ while not log_queue.empty():
241
+ new_log = log_queue.get()
242
+ logs_history += new_log
243
+ yield logs_history, "Training..."
244
+ time.sleep(0.5)
245
+
246
+ # Drain remaining logs after thread finishes
247
+ while not log_queue.empty():
248
+ new_log = log_queue.get()
249
+ logs_history += new_log
250
+
251
+ # Get final result
252
+ if not result_queue.empty():
253
+ result = result_queue.get()
254
+ if result:
255
+ yield logs_history, result
256
+ else:
257
+ yield logs_history, "Failed. Check logs."
258
+ else:
259
+ yield logs_history, "Process finished unexpectedly."
260
 
261
  # --- UI Layout ---
262
 
263
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
264
  gr.Markdown(
265
  """
266
+ # 🚂 Tiny AutoTrain Space (Custom Code Edition)
267
+ Login, pick a dataset, and train a **Custom Code** language model.
268
+ We will generate `modeling_custom.py` and `configuration_custom.py` and upload them to your repo!
269
  """
270
  )
271
 
 
272
  with gr.Row():
273
  login_btn = gr.LoginButton(value="Sign in with Hugging Face to Train")
274
 
 
276
  with gr.Column():
277
  gr.Markdown("### 1. Data Configuration")
278
  dataset_input = gr.Textbox(
279
+ label="Dataset Name",
280
  value="roneneldan/TinyStories",
281
  placeholder="e.g. wikitext, roneneldan/TinyStories"
282
  )
283
  sample_limit = gr.Slider(
284
  minimum=100, maximum=5000, value=500, step=100,
285
+ label="Sample Size"
286
  )
287
 
288
  with gr.Column():
289
+ gr.Markdown("### 2. Hyperparameters")
290
  model_name_input = gr.Textbox(
291
+ label="Model Name",
292
+ value="my-custom-tiny-model",
 
293
  )
294
 
295
  with gr.Row():
296
+ layers = gr.Slider(minimum=1, maximum=6, value=2, step=1, label="Layers")
297
+ embd = gr.Slider(minimum=32, maximum=256, value=64, step=32, label="Embed Dim")
298
 
299
  with gr.Row():
300
  epochs = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="Epochs")
301
  lr = gr.Number(label="Learning Rate", value=5e-4)
302
 
303
+ train_btn = gr.Button("🚀 Train Custom Model", variant="primary")
304
+
305
+ with gr.Row():
306
+ log_output = gr.Code(label="Training Logs", language="json", lines=10)
307
+ status_output = gr.Textbox(label="Final Status")
308
 
 
309
  train_btn.click(
310
+ fn=train_and_push_generator,
311
  inputs=[
312
  dataset_input,
313
  model_name_input,
 
317
  lr,
318
  sample_limit
319
  ],
320
+ outputs=[log_output, status_output]
321
  )
322
 
323
  if __name__ == "__main__":