# Hugging Face Spaces app: configure, train, and publish a custom GPT-2 model.
import json
import os
import queue
import threading
import time

import gradio as gr
import torch
from datasets import load_dataset
from huggingface_hub import HfApi, whoami
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2Config,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)
# --- Helper Classes ---
class LogQueueCallback(TrainerCallback):
    """Trainer callback that forwards each log event to a queue read by the UI."""

    def __init__(self, log_queue):
        # Queue shared with the UI thread; entries are pre-formatted strings.
        self.log_queue = log_queue

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Push a one-line JSON summary of the Trainer's log dict."""
        if not logs:
            return
        self.log_queue.put(f"Step {state.global_step}: {json.dumps(logs)}\n")
def get_username(token):
    """Resolve the Hugging Face account name for *token*; None when unavailable."""
    if not token:
        return None
    try:
        return whoami(token=token)['name']
    except Exception:
        # Invalid/expired token or a network failure — treat all as "unknown user".
        return None
def train_thread_target(
    token,
    dataset_id,
    model_name,
    num_layers,
    n_embd,
    n_head,
    context_length,
    epochs,
    lr,
    weight_decay,
    warmup_steps,
    batch_size,
    grad_accumulation,
    sample_limit,
    log_queue,
    result_queue
):
    """Background worker: build, train, and publish a GPT-2 model.

    Runs the full pipeline (auth -> dataset -> tokenize -> train -> push) and
    streams human-readable progress lines into ``log_queue``. On completion it
    puts the final status string into ``result_queue`` (or ``None`` on
    failure) so the UI generator can report the outcome.

    Args:
        token: HF access token (falls back to the ``HF_TOKEN`` env var).
        dataset_id: Hub dataset repo id to train on.
        model_name: Repo name to create under the token owner's profile.
        num_layers, n_embd, n_head, context_length: GPT-2 architecture knobs.
        epochs, lr, weight_decay, warmup_steps: optimizer schedule settings.
        batch_size, grad_accumulation: effective batch configuration.
        sample_limit: maximum number of training samples to load.
        log_queue: ``queue.Queue`` receiving progress strings.
        result_queue: ``queue.Queue`` receiving the final status (str or None).
    """
    try:
        # 0. Auth & identity — the model is pushed to the token owner's profile.
        final_token = token or os.environ.get("HF_TOKEN")
        username = get_username(final_token)
        if not username:
            raise ValueError("Invalid or missing Hugging Face Token. Ensure the token is provided or set as HF_TOKEN secret.")

        full_repo_id = f"{username}/{model_name}"
        log_queue.put(f"🚀 Initializing for user: {username}\n")
        log_queue.put(f"📦 Target Repository: https://huggingface.co/{full_repo_id}\n")

        # Multi-head attention splits the embedding evenly across heads.
        if n_embd % n_head != 0:
            raise ValueError(f"Embedding dimension ({n_embd}) must be divisible by number of heads ({n_head}).")

        # 1. Load dataset (train split only, capped at sample_limit rows).
        log_queue.put(f"📚 Loading dataset: {dataset_id} (Limit: {sample_limit})...\n")
        try:
            dataset = load_dataset(dataset_id, split=f"train[:{int(sample_limit)}]")
        except Exception as e:
            raise ValueError(f"Error loading dataset: {e}")

        # Auto-detect the text column. Fail fast with a clear message instead of
        # letting tokenization crash with an opaque KeyError: the old code kept
        # the missing "text" default when no string column existed.
        text_column = "text"
        if text_column not in dataset.column_names:
            string_columns = [
                col for col in dataset.column_names
                if isinstance(dataset[0][col], str)
            ]
            if not string_columns:
                raise ValueError(
                    f"No text column found in dataset '{dataset_id}'. "
                    f"Available columns: {dataset.column_names}"
                )
            text_column = string_columns[0]
        log_queue.put(f"🔍 Using text column: '{text_column}'\n")

        # 2. Tokenize with the stock GPT-2 vocabulary.
        log_queue.put("✂️ Tokenizing data...\n")
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        # GPT-2 ships without a pad token; reuse EOS so fixed-length padding works.
        tokenizer.pad_token = tokenizer.eos_token

        def tokenize_function(examples):
            return tokenizer(
                examples[text_column],
                padding="max_length",
                truncation=True,
                max_length=int(context_length)
            )

        tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

        # 3. Build a fresh (randomly initialized) GPT-2 with the requested shape.
        log_queue.put("🏗️ Building GPT-2 Architecture...\n")
        config = GPT2Config(
            vocab_size=len(tokenizer),
            n_positions=int(context_length),
            n_ctx=int(context_length),
            n_embd=int(n_embd),
            n_layer=int(num_layers),
            n_head=int(n_head),
        )
        model = GPT2LMHeadModel(config)

        # 4. Train. mlm=False in the collator => causal language modeling.
        log_queue.put("🏋️ Starting Training Loop...\n")
        training_args = TrainingArguments(
            output_dir="./local_results",
            overwrite_output_dir=True,
            num_train_epochs=epochs,
            per_device_train_batch_size=int(batch_size),
            gradient_accumulation_steps=int(grad_accumulation),
            learning_rate=lr,
            weight_decay=weight_decay,
            warmup_steps=int(warmup_steps),
            logging_steps=10,
            save_strategy="no",   # no local checkpoints; we push once at the end
            push_to_hub=False,    # pushed manually below with an explicit token
            report_to="none",
            use_cpu=not torch.cuda.is_available(),
            fp16=torch.cuda.is_available(),
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
            train_dataset=tokenized_datasets,
            callbacks=[LogQueueCallback(log_queue)]
        )
        trainer.train()

        # 5. Push model + tokenizer to the user's personal Hub repo.
        log_queue.put("☁️ Uploading model to your profile...\n")
        model.push_to_hub(full_repo_id, token=final_token)
        tokenizer.push_to_hub(full_repo_id, token=final_token)
        result_queue.put(f"🎉 Success! Published to: https://huggingface.co/{full_repo_id}")
    except Exception as e:
        # Surface any failure to the UI; None signals "failed" to the consumer.
        log_queue.put(f"❌ Error: {str(e)}\n")
        result_queue.put(None)
# --- Generator for UI updates ---
def train_and_push_generator(
    token, dataset_id, model_name,
    num_layers, n_embd, n_head, context_length,
    epochs, lr, weight_decay, warmup_steps,
    batch_size, grad_accumulation, sample_limit
):
    """Launch training in a background thread and stream (logs, status) pairs
    to the Gradio UI until the run finishes."""
    resolved_token = token or os.environ.get("HF_TOKEN")
    if not resolved_token:
        yield "Error: No Hugging Face Token found. Please enter a 'Write' token below.", ""
        return

    logs = queue.Queue()
    results = queue.Queue()
    worker = threading.Thread(target=train_thread_target, args=(
        resolved_token, dataset_id, model_name,
        num_layers, n_embd, n_head, context_length,
        epochs, lr, weight_decay, warmup_steps,
        batch_size, grad_accumulation, sample_limit,
        logs, results
    ))
    worker.start()

    transcript = ""
    # Poll while the worker runs, yielding an update for every new log line.
    while worker.is_alive():
        while not logs.empty():
            transcript += logs.get()
            yield transcript, "Training in progress..."
        time.sleep(0.5)

    # Drain anything the worker pushed just before exiting.
    while not logs.empty():
        transcript += logs.get()

    if results.empty():
        yield transcript, "Process interrupted."
    else:
        outcome = results.get()
        yield transcript, outcome if outcome else "Training failed. See logs."
# --- UI Layout ---
# Assemble the Gradio interface: credential/config inputs, a launch button,
# and live panes for the streamed training logs and final status.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="slate")) as demo:
    gr.Markdown("# 🚀 Personal Auto-PreTrain")
    gr.Markdown("Configure a custom GPT-2 architecture and train it directly to **your personal** Hugging Face profile.")

    # Credentials and target repo name.
    with gr.Row():
        token_box = gr.Textbox(
            label="HF Write Token",
            placeholder="hf_...",
            type="password",
            info="Required to create the repo on your profile. Must have 'Write' permissions."
        )
        repo_name_box = gr.Textbox(
            label="Model Name",
            value="my-custom-gpt2",
            placeholder="e.g. tiny-stories-v1"
        )

    with gr.Tabs():
        with gr.TabItem("1. Data Selection"):
            with gr.Row():
                dataset_box = gr.Textbox(
                    label="Dataset ID",
                    value="roneneldan/TinyStories",
                    placeholder="e.g. wikitext"
                )
                sample_limit_box = gr.Number(
                    label="Training Samples",
                    value=500,
                    precision=0
                )
                ctx_len_slider = gr.Slider(
                    minimum=64, maximum=1024, value=128, step=64,
                    label="Max Context Length"
                )
        with gr.TabItem("2. Architecture"):
            with gr.Row():
                layer_slider = gr.Slider(minimum=1, maximum=12, value=2, step=1, label="Layers")
                embd_slider = gr.Slider(minimum=64, maximum=1024, value=128, step=64, label="Embedding Dim")
            with gr.Row():
                head_slider = gr.Slider(minimum=2, maximum=16, value=4, step=2, label="Attention Heads")
            gr.Markdown("_Note: Embedding Dim must be divisible by Attention Heads._")
        with gr.TabItem("3. Training Settings"):
            with gr.Row():
                epoch_slider = gr.Slider(minimum=1, maximum=20, value=1, step=1, label="Epochs")
                lr_box = gr.Number(label="Learning Rate", value=5e-4)
            with gr.Row():
                batch_slider = gr.Slider(minimum=1, maximum=32, value=4, step=1, label="Batch Size")
                accum_slider = gr.Slider(minimum=1, maximum=16, value=1, step=1, label="Grad Accumulation")
            with gr.Row():
                decay_slider = gr.Slider(minimum=0.0, maximum=0.1, value=0.01, step=0.01, label="Weight Decay")
                warmup_box = gr.Number(label="Warmup Steps", value=50, precision=0)

    launch_btn = gr.Button("🔥 Start Training & Push to My Profile", variant="primary")

    with gr.Row():
        log_pane = gr.Code(label="Training Progress", language="json", lines=12)
        status_pane = gr.Textbox(label="Final Status", interactive=False)

    # Wire the button to the streaming generator; outputs update live.
    launch_btn.click(
        fn=train_and_push_generator,
        inputs=[
            token_box, dataset_box, repo_name_box,
            layer_slider, embd_slider, head_slider, ctx_len_slider,
            epoch_slider, lr_box, decay_slider, warmup_box,
            batch_slider, accum_slider, sample_limit_box
        ],
        outputs=[log_pane, status_pane]
    )

if __name__ == "__main__":
    demo.launch()