"""Gradio app: configure, train, and publish a small GPT-2 model to the user's HF profile."""

import gradio as gr
import torch
import os
import threading
import queue
import time
import json
from transformers import (
    GPT2Config,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    TrainerCallback
)
from datasets import load_dataset
from huggingface_hub import whoami, HfApi


# --- Helper Classes ---

class LogQueueCallback(TrainerCallback):
    """A custom callback that pushes logs to a queue for the UI."""

    def __init__(self, log_queue):
        self.log_queue = log_queue

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Forward each Trainer log dict to the UI queue as a formatted line."""
        if logs:
            # Format log dictionary nicely
            log_str = f"Step {state.global_step}: {json.dumps(logs)}\n"
            self.log_queue.put(log_str)


def get_username(token):
    """Retrieves the username from the HF token, or None if the token is invalid/missing."""
    if not token:
        return None
    try:
        info = whoami(token=token)
        return info['name']
    except Exception:
        # Any auth/network failure is treated as "no identity"; caller raises a clear error.
        return None


def _drain_queue(q):
    """Drain all currently-available items from *q* without blocking.

    Uses get_nowait()/queue.Empty instead of the race-prone
    ``while not q.empty(): q.get()`` pattern (the producer thread may
    consume or add items between the check and the get).
    """
    items = []
    while True:
        try:
            items.append(q.get_nowait())
        except queue.Empty:
            return "".join(items)


def train_thread_target(
    token, dataset_id, model_name, num_layers, n_embd, n_head,
    context_length, epochs, lr, weight_decay, warmup_steps,
    batch_size, grad_accumulation, sample_limit,
    log_queue, result_queue
):
    """Background thread for training and pushing to the user's profile.

    Progress text is pushed to ``log_queue``; the final outcome (success
    message or None on failure) is pushed to ``result_queue``.
    """
    try:
        # 0. Auth & Identity
        final_token = token or os.environ.get("HF_TOKEN")
        username = get_username(final_token)
        if not username:
            raise ValueError("Invalid or missing Hugging Face Token. Ensure the token is provided or set as HF_TOKEN secret.")

        # Target path is now the USER'S profile
        full_repo_id = f"{username}/{model_name}"
        log_queue.put(f"🚀 Initializing for user: {username}\n")
        log_queue.put(f"📦 Target Repository: https://huggingface.co/{full_repo_id}\n")

        # Validation for Transformer logic: multi-head attention splits the
        # embedding evenly across heads.
        if n_embd % n_head != 0:
            raise ValueError(f"Embedding dimension ({n_embd}) must be divisible by number of heads ({n_head}).")

        # 1. Load Dataset
        log_queue.put(f"📚 Loading dataset: {dataset_id} (Limit: {sample_limit})...\n")
        try:
            # We use the train split; user can specify limit
            dataset = load_dataset(dataset_id, split=f"train[:{int(sample_limit)}]")
        except Exception as e:
            raise ValueError(f"Error loading dataset: {e}")

        # Auto-detect text column: fall back to the first string-valued column.
        text_column = "text"
        if "text" not in dataset.column_names:
            if len(dataset) == 0:
                # dataset[0] below would raise an opaque IndexError otherwise.
                raise ValueError("Dataset is empty; cannot auto-detect a text column.")
            for col in dataset.column_names:
                if isinstance(dataset[0][col], str):
                    text_column = col
                    break
        log_queue.put(f"🔍 Using text column: '{text_column}'\n")

        # 2. Tokenize
        log_queue.put("✂️ Tokenizing data...\n")
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        # GPT-2 has no pad token; reuse EOS so padding="max_length" works.
        tokenizer.pad_token = tokenizer.eos_token

        def tokenize_function(examples):
            return tokenizer(
                examples[text_column],
                padding="max_length",
                truncation=True,
                max_length=int(context_length)
            )

        tokenized_datasets = dataset.map(
            tokenize_function, batched=True, remove_columns=dataset.column_names
        )

        # 3. Initialize Model (trained from scratch — no pretrained weights)
        log_queue.put("🏗️ Building GPT-2 Architecture...\n")
        config = GPT2Config(
            vocab_size=len(tokenizer),
            n_positions=int(context_length),
            n_ctx=int(context_length),
            n_embd=int(n_embd),
            n_layer=int(num_layers),
            n_head=int(n_head),
        )
        model = GPT2LMHeadModel(config)

        # 4. Train
        log_queue.put("🏋️ Starting Training Loop...\n")
        training_args = TrainingArguments(
            output_dir="./local_results",
            overwrite_output_dir=True,
            num_train_epochs=epochs,
            per_device_train_batch_size=int(batch_size),
            gradient_accumulation_steps=int(grad_accumulation),
            learning_rate=lr,
            weight_decay=weight_decay,
            warmup_steps=int(warmup_steps),
            logging_steps=10,
            save_strategy="no",        # no local checkpoints; we push manually below
            push_to_hub=False,
            report_to="none",
            use_cpu=not torch.cuda.is_available(),
            fp16=torch.cuda.is_available(),
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
            train_dataset=tokenized_datasets,
            callbacks=[LogQueueCallback(log_queue)]
        )
        trainer.train()

        # 5. Push to User's Personal Hub
        log_queue.put("☁️ Uploading model to your profile...\n")
        model.push_to_hub(full_repo_id, token=final_token)
        tokenizer.push_to_hub(full_repo_id, token=final_token)

        result_queue.put(f"🎉 Success! Published to: https://huggingface.co/{full_repo_id}")

    except Exception as e:
        log_queue.put(f"❌ Error: {str(e)}\n")
        result_queue.put(None)


# --- Generator for UI updates ---

def train_and_push_generator(
    token, dataset_id, model_name, num_layers, n_embd, n_head,
    context_length, epochs, lr, weight_decay, warmup_steps,
    batch_size, grad_accumulation, sample_limit
):
    """Gradio generator: runs training in a background thread and streams logs.

    Yields (logs_text, status_text) tuples until the worker thread finishes.
    """
    effective_token = token or os.environ.get("HF_TOKEN")
    if not effective_token:
        yield "Error: No Hugging Face Token found. Please enter a 'Write' token below.", ""
        return

    log_queue = queue.Queue()
    result_queue = queue.Queue()

    t = threading.Thread(target=train_thread_target, args=(
        effective_token, dataset_id, model_name, num_layers, n_embd, n_head,
        context_length, epochs, lr, weight_decay, warmup_steps,
        batch_size, grad_accumulation, sample_limit,
        log_queue, result_queue
    ))
    t.start()

    logs_history = ""
    while t.is_alive():
        logs_history += _drain_queue(log_queue)
        yield logs_history, "Training in progress..."
        time.sleep(0.5)

    # Thread has exited: reap it and flush any remaining log lines.
    t.join()
    logs_history += _drain_queue(log_queue)

    try:
        result = result_queue.get_nowait()
        yield logs_history, result or "Training failed. See logs."
    except queue.Empty:
        # Worker died without posting an outcome (e.g. killed externally).
        yield logs_history, "Process interrupted."


# --- UI Layout ---

with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="slate")) as demo:
    gr.Markdown("# 🚀 Personal Auto-PreTrain")
    gr.Markdown("Configure a custom GPT-2 architecture and train it directly to **your personal** Hugging Face profile.")

    with gr.Row():
        hf_token = gr.Textbox(
            label="HF Write Token",
            placeholder="hf_...",
            type="password",
            info="Required to create the repo on your profile. Must have 'Write' permissions."
        )
        model_name_input = gr.Textbox(
            label="Model Name",
            value="my-custom-gpt2",
            placeholder="e.g. tiny-stories-v1"
        )

    with gr.Tabs():
        with gr.TabItem("1. Data Selection"):
            with gr.Row():
                dataset_input = gr.Textbox(
                    label="Dataset ID",
                    value="roneneldan/TinyStories",
                    placeholder="e.g. wikitext"
                )
                sample_limit = gr.Number(
                    label="Training Samples",
                    value=500,
                    precision=0
                )
            context_length = gr.Slider(
                minimum=64, maximum=1024, value=128, step=64,
                label="Max Context Length"
            )

        with gr.TabItem("2. Architecture"):
            with gr.Row():
                layers = gr.Slider(minimum=1, maximum=12, value=2, step=1, label="Layers")
                embd = gr.Slider(minimum=64, maximum=1024, value=128, step=64, label="Embedding Dim")
            with gr.Row():
                heads = gr.Slider(minimum=2, maximum=16, value=4, step=2, label="Attention Heads")
            gr.Markdown("_Note: Embedding Dim must be divisible by Attention Heads._")

        with gr.TabItem("3. Training Settings"):
            with gr.Row():
                epochs = gr.Slider(minimum=1, maximum=20, value=1, step=1, label="Epochs")
                lr = gr.Number(label="Learning Rate", value=5e-4)
            with gr.Row():
                batch_size = gr.Slider(minimum=1, maximum=32, value=4, step=1, label="Batch Size")
                grad_accumulation = gr.Slider(minimum=1, maximum=16, value=1, step=1, label="Grad Accumulation")
            with gr.Row():
                weight_decay = gr.Slider(minimum=0.0, maximum=0.1, value=0.01, step=0.01, label="Weight Decay")
                warmup_steps = gr.Number(label="Warmup Steps", value=50, precision=0)

    train_btn = gr.Button("🔥 Start Training & Push to My Profile", variant="primary")

    with gr.Row():
        log_output = gr.Code(label="Training Progress", language="json", lines=12)
        status_output = gr.Textbox(label="Final Status", interactive=False)

    train_btn.click(
        fn=train_and_push_generator,
        inputs=[
            hf_token, dataset_input, model_name_input,
            layers, embd, heads, context_length,
            epochs, lr, weight_decay, warmup_steps,
            batch_size, grad_accumulation, sample_limit
        ],
        outputs=[log_output, status_output]
    )

if __name__ == "__main__":
    demo.launch()