# APT-product / app.py — Gradio Space by FlameF0X
# (Hub listing residue from the original paste: "Update app.py", commit 11c83cf verified)
import gradio as gr
import torch
import os
import threading
import queue
import time
import json
from transformers import (
GPT2Config,
GPT2LMHeadModel,
GPT2Tokenizer,
Trainer,
TrainingArguments,
DataCollatorForLanguageModeling,
TrainerCallback
)
from datasets import load_dataset
from huggingface_hub import whoami, HfApi
# --- Helper Classes ---
class LogQueueCallback(TrainerCallback):
    """Trainer callback that forwards each logging event to a queue read by the UI."""

    def __init__(self, log_queue):
        self.log_queue = log_queue

    def on_log(self, args, state, control, logs=None, **kwargs):
        # Trainer occasionally fires on_log with no payload; skip those.
        if not logs:
            return
        # One line per event, tagged with the global training step.
        self.log_queue.put(f"Step {state.global_step}: {json.dumps(logs)}\n")
def get_username(token):
    """Return the Hugging Face username owning *token*, or None if missing/invalid."""
    if not token:
        return None
    try:
        return whoami(token=token)['name']
    except Exception:
        # Any lookup failure (bad token, network error) is treated as "no user".
        return None
def train_thread_target(
    token,
    dataset_id,
    model_name,
    num_layers,
    n_embd,
    n_head,
    context_length,
    epochs,
    lr,
    weight_decay,
    warmup_steps,
    batch_size,
    grad_accumulation,
    sample_limit,
    log_queue,
    result_queue
):
    """
    Background thread for training and pushing to user profile.

    Builds a from-scratch GPT-2 with the requested architecture, trains it on
    ``dataset_id``, then uploads model + tokenizer to ``<username>/<model_name>``
    on the Hugging Face Hub.

    Communication with the UI thread is queue-based:
      - ``log_queue`` receives human-readable progress strings as they happen.
      - ``result_queue`` receives exactly one final item: the success message,
        or ``None`` if any step raised.
    """
    try:
        # 0. Auth & Identity — resolve the effective token, then the username
        # that will own the destination repository.
        final_token = token or os.environ.get("HF_TOKEN")
        username = get_username(final_token)
        if not username:
            raise ValueError("Invalid or missing Hugging Face Token. Ensure the token is provided or set as HF_TOKEN secret.")
        # Target path is now the USER'S profile
        full_repo_id = f"{username}/{model_name}"
        log_queue.put(f"🚀 Initializing for user: {username}\n")
        log_queue.put(f"📦 Target Repository: https://huggingface.co/{full_repo_id}\n")
        # Validation for Transformer logic: multi-head attention splits the
        # embedding evenly across heads, so this must divide cleanly.
        if n_embd % n_head != 0:
            raise ValueError(f"Embedding dimension ({n_embd}) must be divisible by number of heads ({n_head}).")
        # 1. Load Dataset
        log_queue.put(f"📚 Loading dataset: {dataset_id} (Limit: {sample_limit})...\n")
        try:
            # We use the train split; user can specify limit
            dataset = load_dataset(dataset_id, split=f"train[:{int(sample_limit)}]")
        except Exception as e:
            raise ValueError(f"Error loading dataset: {e}")
        # Auto-detect text column: fall back to the first column whose row-0
        # value is a string when the conventional "text" column is absent.
        # NOTE(review): if no string column exists, this still tries "text"
        # and the map below will fail — confirm acceptable for this UI.
        text_column = "text"
        if "text" not in dataset.column_names:
            for col in dataset.column_names:
                if isinstance(dataset[0][col], str):
                    text_column = col
                    break
        log_queue.put(f"🔍 Using text column: '{text_column}'\n")
        # 2. Tokenize
        log_queue.put("✂️ Tokenizing data...\n")
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        # GPT-2 ships without a pad token; reuse EOS so padding works.
        tokenizer.pad_token = tokenizer.eos_token
        def tokenize_function(examples):
            # Pad/truncate every example to the fixed context length.
            return tokenizer(
                examples[text_column],
                padding="max_length",
                truncation=True,
                max_length=int(context_length)
            )
        tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)
        # 3. Initialize Model — weights are randomly initialized from the
        # config (pre-training from scratch, not fine-tuning).
        log_queue.put("🏗️ Building GPT-2 Architecture...\n")
        config = GPT2Config(
            vocab_size=len(tokenizer),
            n_positions=int(context_length),
            n_ctx=int(context_length),
            n_embd=int(n_embd),
            n_layer=int(num_layers),
            n_head=int(n_head),
        )
        model = GPT2LMHeadModel(config)
        # 4. Train
        log_queue.put("🏋️ Starting Training Loop...\n")
        training_args = TrainingArguments(
            output_dir="./local_results",
            overwrite_output_dir=True,
            num_train_epochs=epochs,
            per_device_train_batch_size=int(batch_size),
            gradient_accumulation_steps=int(grad_accumulation),
            learning_rate=lr,
            weight_decay=weight_decay,
            warmup_steps=int(warmup_steps),
            logging_steps=10,
            # No local checkpoints; the trained model is pushed manually below.
            save_strategy="no",
            push_to_hub=False,
            report_to="none",
            use_cpu=not torch.cuda.is_available(),
            fp16=torch.cuda.is_available(),
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            # mlm=False selects causal-LM labels (inputs shifted as targets).
            data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
            train_dataset=tokenized_datasets,
            # Streams per-step metrics back to the UI via log_queue.
            callbacks=[LogQueueCallback(log_queue)]
        )
        trainer.train()
        # 5. Push to User's Personal Hub
        log_queue.put(f"☁️ Uploading model to your profile...\n")
        model.push_to_hub(full_repo_id, token=final_token)
        tokenizer.push_to_hub(full_repo_id, token=final_token)
        result_queue.put(f"🎉 Success! Published to: https://huggingface.co/{full_repo_id}")
    except Exception as e:
        # Surface the failure in the log pane and unblock the UI poller,
        # which treats a None result as "failed; see logs".
        log_queue.put(f"❌ Error: {str(e)}\n")
        result_queue.put(None)
# --- Generator for UI updates ---
def train_and_push_generator(
    token, dataset_id, model_name,
    num_layers, n_embd, n_head, context_length,
    epochs, lr, weight_decay, warmup_steps,
    batch_size, grad_accumulation, sample_limit
):
    """Launch the training worker thread and stream (logs, status) pairs to Gradio."""
    effective_token = token or os.environ.get("HF_TOKEN")
    if not effective_token:
        yield "Error: No Hugging Face Token found. Please enter a 'Write' token below.", ""
        return

    log_queue = queue.Queue()
    result_queue = queue.Queue()
    worker = threading.Thread(
        target=train_thread_target,
        args=(
            effective_token, dataset_id, model_name,
            num_layers, n_embd, n_head, context_length,
            epochs, lr, weight_decay, warmup_steps,
            batch_size, grad_accumulation, sample_limit,
            log_queue, result_queue,
        ),
    )
    worker.start()

    accumulated = ""
    # Poll twice a second while the worker runs; emit a UI update per log line.
    while worker.is_alive():
        while not log_queue.empty():
            accumulated += log_queue.get()
            yield accumulated, "Training in progress..."
        time.sleep(0.5)

    # Drain anything emitted between the final poll and thread exit.
    while not log_queue.empty():
        accumulated += log_queue.get()

    if result_queue.empty():
        yield accumulated, "Process interrupted."
    else:
        outcome = result_queue.get()
        yield accumulated, outcome or "Training failed. See logs."
# --- UI Layout ---
# Declarative Gradio UI: credentials row, three config tabs, launch button,
# and a live log pane fed by train_and_push_generator.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="slate")) as demo:
    gr.Markdown("# 🚀 Personal Auto-PreTrain")
    gr.Markdown("Configure a custom GPT-2 architecture and train it directly to **your personal** Hugging Face profile.")
    # Credentials and destination repository name.
    with gr.Row():
        hf_token = gr.Textbox(
            label="HF Write Token",
            placeholder="hf_...",
            type="password",
            info="Required to create the repo on your profile. Must have 'Write' permissions."
        )
        model_name_input = gr.Textbox(
            label="Model Name",
            value="my-custom-gpt2",
            placeholder="e.g. tiny-stories-v1"
        )
    with gr.Tabs():
        # Tab 1: dataset choice and how much of it to train on.
        with gr.TabItem("1. Data Selection"):
            with gr.Row():
                dataset_input = gr.Textbox(
                    label="Dataset ID",
                    value="roneneldan/TinyStories",
                    placeholder="e.g. wikitext"
                )
                sample_limit = gr.Number(
                    label="Training Samples",
                    value=500,
                    precision=0
                )
            context_length = gr.Slider(
                minimum=64, maximum=1024, value=128, step=64,
                label="Max Context Length"
            )
        # Tab 2: GPT-2 architecture hyperparameters.
        with gr.TabItem("2. Architecture"):
            with gr.Row():
                layers = gr.Slider(minimum=1, maximum=12, value=2, step=1, label="Layers")
                embd = gr.Slider(minimum=64, maximum=1024, value=128, step=64, label="Embedding Dim")
            with gr.Row():
                heads = gr.Slider(minimum=2, maximum=16, value=4, step=2, label="Attention Heads")
                gr.Markdown("_Note: Embedding Dim must be divisible by Attention Heads._")
        # Tab 3: optimizer / training-loop settings.
        with gr.TabItem("3. Training Settings"):
            with gr.Row():
                epochs = gr.Slider(minimum=1, maximum=20, value=1, step=1, label="Epochs")
                lr = gr.Number(label="Learning Rate", value=5e-4)
            with gr.Row():
                batch_size = gr.Slider(minimum=1, maximum=32, value=4, step=1, label="Batch Size")
                grad_accumulation = gr.Slider(minimum=1, maximum=16, value=1, step=1, label="Grad Accumulation")
            with gr.Row():
                weight_decay = gr.Slider(minimum=0.0, maximum=0.1, value=0.01, step=0.01, label="Weight Decay")
                warmup_steps = gr.Number(label="Warmup Steps", value=50, precision=0)
    train_btn = gr.Button("🔥 Start Training & Push to My Profile", variant="primary")
    # Output pane: streaming logs on the left, final status on the right.
    with gr.Row():
        log_output = gr.Code(label="Training Progress", language="json", lines=12)
        status_output = gr.Textbox(label="Final Status", interactive=False)
    # Generator handler: yields (logs, status) pairs so the UI updates live.
    train_btn.click(
        fn=train_and_push_generator,
        inputs=[
            hf_token, dataset_input, model_name_input,
            layers, embd, heads, context_length,
            epochs, lr, weight_decay, warmup_steps,
            batch_size, grad_accumulation, sample_limit
        ],
        outputs=[log_output, status_output]
    )
if __name__ == "__main__":
    demo.launch()