# Hugging Face Spaces app: configure, train, and publish a custom GPT-2 model.
import json
import os
import queue
import threading
import time

import gradio as gr
import torch
from datasets import load_dataset
from huggingface_hub import HfApi, whoami
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2Config,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)
# --- Helper Classes ---
class LogQueueCallback(TrainerCallback):
    """Trainer callback that forwards each log event to a queue read by the UI."""

    def __init__(self, log_queue):
        # Queue shared with the UI thread; entries are pre-formatted strings.
        self.log_queue = log_queue

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Push a one-line JSON summary of the Trainer's log dict."""
        if not logs:
            return
        self.log_queue.put(f"Step {state.global_step}: {json.dumps(logs)}\n")
def get_username(token):
    """Resolve the Hugging Face account name for *token*; None when unavailable."""
    if not token:
        return None
    try:
        return whoami(token=token)['name']
    except Exception:
        # Invalid/expired token or a network failure — treat all as "unknown user".
        return None
def train_thread_target(
    token,
    dataset_id,
    model_name,
    num_layers,
    n_embd,
    n_head,
    context_length,
    epochs,
    lr,
    weight_decay,
    warmup_steps,
    batch_size,
    grad_accumulation,
    sample_limit,
    log_queue,
    result_queue
):
    """Background worker: build, train, and publish a GPT-2 model.

    Runs the full pipeline (auth -> dataset -> tokenize -> train -> push) and
    streams human-readable progress lines into ``log_queue``. On completion it
    puts the final status string into ``result_queue`` (or ``None`` on
    failure) so the UI generator can report the outcome.

    Args:
        token: HF access token (falls back to the ``HF_TOKEN`` env var).
        dataset_id: Hub dataset repo id to train on.
        model_name: Repo name to create under the token owner's profile.
        num_layers, n_embd, n_head, context_length: GPT-2 architecture knobs.
        epochs, lr, weight_decay, warmup_steps: optimizer schedule settings.
        batch_size, grad_accumulation: effective batch configuration.
        sample_limit: maximum number of training samples to load.
        log_queue: ``queue.Queue`` receiving progress strings.
        result_queue: ``queue.Queue`` receiving the final status (str or None).
    """
    try:
        # 0. Auth & identity — the model is pushed to the token owner's profile.
        final_token = token or os.environ.get("HF_TOKEN")
        username = get_username(final_token)
        if not username:
            raise ValueError("Invalid or missing Hugging Face Token. Ensure the token is provided or set as HF_TOKEN secret.")

        full_repo_id = f"{username}/{model_name}"
        log_queue.put(f"🚀 Initializing for user: {username}\n")
        log_queue.put(f"📦 Target Repository: https://huggingface.co/{full_repo_id}\n")

        # Multi-head attention splits the embedding evenly across heads.
        if n_embd % n_head != 0:
            raise ValueError(f"Embedding dimension ({n_embd}) must be divisible by number of heads ({n_head}).")

        # 1. Load dataset (train split only, capped at sample_limit rows).
        log_queue.put(f"📚 Loading dataset: {dataset_id} (Limit: {sample_limit})...\n")
        try:
            dataset = load_dataset(dataset_id, split=f"train[:{int(sample_limit)}]")
        except Exception as e:
            raise ValueError(f"Error loading dataset: {e}")

        # Auto-detect the text column. Fail fast with a clear message instead of
        # letting tokenization crash with an opaque KeyError: the old code kept
        # the missing "text" default when no string column existed.
        text_column = "text"
        if text_column not in dataset.column_names:
            string_columns = [
                col for col in dataset.column_names
                if isinstance(dataset[0][col], str)
            ]
            if not string_columns:
                raise ValueError(
                    f"No text column found in dataset '{dataset_id}'. "
                    f"Available columns: {dataset.column_names}"
                )
            text_column = string_columns[0]
        log_queue.put(f"🔍 Using text column: '{text_column}'\n")

        # 2. Tokenize with the stock GPT-2 vocabulary.
        log_queue.put("✂️ Tokenizing data...\n")
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        # GPT-2 ships without a pad token; reuse EOS so fixed-length padding works.
        tokenizer.pad_token = tokenizer.eos_token

        def tokenize_function(examples):
            return tokenizer(
                examples[text_column],
                padding="max_length",
                truncation=True,
                max_length=int(context_length)
            )

        tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

        # 3. Build a fresh (randomly initialized) GPT-2 with the requested shape.
        log_queue.put("🏗️ Building GPT-2 Architecture...\n")
        config = GPT2Config(
            vocab_size=len(tokenizer),
            n_positions=int(context_length),
            n_ctx=int(context_length),
            n_embd=int(n_embd),
            n_layer=int(num_layers),
            n_head=int(n_head),
        )
        model = GPT2LMHeadModel(config)

        # 4. Train. mlm=False in the collator => causal language modeling.
        log_queue.put("🏋️ Starting Training Loop...\n")
        training_args = TrainingArguments(
            output_dir="./local_results",
            overwrite_output_dir=True,
            num_train_epochs=epochs,
            per_device_train_batch_size=int(batch_size),
            gradient_accumulation_steps=int(grad_accumulation),
            learning_rate=lr,
            weight_decay=weight_decay,
            warmup_steps=int(warmup_steps),
            logging_steps=10,
            save_strategy="no",   # no local checkpoints; we push once at the end
            push_to_hub=False,    # pushed manually below with an explicit token
            report_to="none",
            use_cpu=not torch.cuda.is_available(),
            fp16=torch.cuda.is_available(),
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
            train_dataset=tokenized_datasets,
            callbacks=[LogQueueCallback(log_queue)]
        )
        trainer.train()

        # 5. Push model + tokenizer to the user's personal Hub repo.
        log_queue.put("☁️ Uploading model to your profile...\n")
        model.push_to_hub(full_repo_id, token=final_token)
        tokenizer.push_to_hub(full_repo_id, token=final_token)
        result_queue.put(f"🎉 Success! Published to: https://huggingface.co/{full_repo_id}")
    except Exception as e:
        # Surface any failure to the UI; None signals "failed" to the consumer.
        log_queue.put(f"❌ Error: {str(e)}\n")
        result_queue.put(None)
# --- Generator for UI updates ---
def train_and_push_generator(
    token, dataset_id, model_name,
    num_layers, n_embd, n_head, context_length,
    epochs, lr, weight_decay, warmup_steps,
    batch_size, grad_accumulation, sample_limit
):
    """Launch training in a background thread and stream (logs, status) pairs
    to the Gradio UI until the run finishes."""
    resolved_token = token or os.environ.get("HF_TOKEN")
    if not resolved_token:
        yield "Error: No Hugging Face Token found. Please enter a 'Write' token below.", ""
        return

    logs = queue.Queue()
    results = queue.Queue()
    worker = threading.Thread(target=train_thread_target, args=(
        resolved_token, dataset_id, model_name,
        num_layers, n_embd, n_head, context_length,
        epochs, lr, weight_decay, warmup_steps,
        batch_size, grad_accumulation, sample_limit,
        logs, results
    ))
    worker.start()

    transcript = ""
    # Poll while the worker runs, yielding an update for every new log line.
    while worker.is_alive():
        while not logs.empty():
            transcript += logs.get()
            yield transcript, "Training in progress..."
        time.sleep(0.5)

    # Drain anything the worker pushed just before exiting.
    while not logs.empty():
        transcript += logs.get()

    if results.empty():
        yield transcript, "Process interrupted."
    else:
        outcome = results.get()
        yield transcript, outcome if outcome else "Training failed. See logs."
# --- UI Layout ---
# Assemble the Gradio interface: credential/config inputs, a launch button,
# and live panes for the streamed training logs and final status.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="slate")) as demo:
    gr.Markdown("# 🚀 Personal Auto-PreTrain")
    gr.Markdown("Configure a custom GPT-2 architecture and train it directly to **your personal** Hugging Face profile.")

    # Credentials and target repo name.
    with gr.Row():
        token_box = gr.Textbox(
            label="HF Write Token",
            placeholder="hf_...",
            type="password",
            info="Required to create the repo on your profile. Must have 'Write' permissions."
        )
        repo_name_box = gr.Textbox(
            label="Model Name",
            value="my-custom-gpt2",
            placeholder="e.g. tiny-stories-v1"
        )

    with gr.Tabs():
        with gr.TabItem("1. Data Selection"):
            with gr.Row():
                dataset_box = gr.Textbox(
                    label="Dataset ID",
                    value="roneneldan/TinyStories",
                    placeholder="e.g. wikitext"
                )
                sample_limit_box = gr.Number(
                    label="Training Samples",
                    value=500,
                    precision=0
                )
                ctx_len_slider = gr.Slider(
                    minimum=64, maximum=1024, value=128, step=64,
                    label="Max Context Length"
                )
        with gr.TabItem("2. Architecture"):
            with gr.Row():
                layer_slider = gr.Slider(minimum=1, maximum=12, value=2, step=1, label="Layers")
                embd_slider = gr.Slider(minimum=64, maximum=1024, value=128, step=64, label="Embedding Dim")
            with gr.Row():
                head_slider = gr.Slider(minimum=2, maximum=16, value=4, step=2, label="Attention Heads")
            gr.Markdown("_Note: Embedding Dim must be divisible by Attention Heads._")
        with gr.TabItem("3. Training Settings"):
            with gr.Row():
                epoch_slider = gr.Slider(minimum=1, maximum=20, value=1, step=1, label="Epochs")
                lr_box = gr.Number(label="Learning Rate", value=5e-4)
            with gr.Row():
                batch_slider = gr.Slider(minimum=1, maximum=32, value=4, step=1, label="Batch Size")
                accum_slider = gr.Slider(minimum=1, maximum=16, value=1, step=1, label="Grad Accumulation")
            with gr.Row():
                decay_slider = gr.Slider(minimum=0.0, maximum=0.1, value=0.01, step=0.01, label="Weight Decay")
                warmup_box = gr.Number(label="Warmup Steps", value=50, precision=0)

    launch_btn = gr.Button("🔥 Start Training & Push to My Profile", variant="primary")

    with gr.Row():
        log_pane = gr.Code(label="Training Progress", language="json", lines=12)
        status_pane = gr.Textbox(label="Final Status", interactive=False)

    # Wire the button to the streaming generator; outputs update live.
    launch_btn.click(
        fn=train_and_push_generator,
        inputs=[
            token_box, dataset_box, repo_name_box,
            layer_slider, embd_slider, head_slider, ctx_len_slider,
            epoch_slider, lr_box, decay_slider, warmup_box,
            batch_slider, accum_slider, sample_limit_box
        ],
        outputs=[log_pane, status_pane]
    )

if __name__ == "__main__":
    demo.launch()