"""Gradio app: configure, train, and publish a small GPT-2 model to the user's HF profile."""

import gradio as gr
import torch
import os
import threading
import queue
import time
import json
from transformers import (
    GPT2Config,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    TrainerCallback
)
from datasets import load_dataset
from huggingface_hub import whoami, HfApi


# --- Helper Classes ---

class LogQueueCallback(TrainerCallback):
    """A custom callback that pushes logs to a queue for the UI."""

    def __init__(self, log_queue):
        self.log_queue = log_queue

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Forward each Trainer log dict to the UI queue as a formatted line."""
        if logs:
            # Format log dictionary nicely
            log_str = f"Step {state.global_step}: {json.dumps(logs)}\n"
            self.log_queue.put(log_str)


def get_username(token):
    """Retrieves the username from the HF token, or None if the token is invalid/missing."""
    if not token:
        return None
    try:
        info = whoami(token=token)
        return info['name']
    except Exception:
        # Any auth/network failure is treated as "no identity"; caller raises a clear error.
        return None


def _drain_queue(q):
    """Drain all currently-available items from *q* without blocking.

    Uses get_nowait()/queue.Empty instead of the race-prone
    ``while not q.empty(): q.get()`` pattern (the producer thread may
    consume or add items between the check and the get).
    """
    items = []
    while True:
        try:
            items.append(q.get_nowait())
        except queue.Empty:
            return "".join(items)


def train_thread_target(
    token, dataset_id, model_name, num_layers, n_embd, n_head,
    context_length, epochs, lr, weight_decay, warmup_steps,
    batch_size, grad_accumulation, sample_limit,
    log_queue, result_queue
):
    """Background thread for training and pushing to the user's profile.

    Progress text is pushed to ``log_queue``; the final outcome (success
    message or None on failure) is pushed to ``result_queue``.
    """
    try:
        # 0. Auth & Identity
        final_token = token or os.environ.get("HF_TOKEN")
        username = get_username(final_token)
        if not username:
            raise ValueError("Invalid or missing Hugging Face Token. Ensure the token is provided or set as HF_TOKEN secret.")

        # Target path is now the USER'S profile
        full_repo_id = f"{username}/{model_name}"
        log_queue.put(f"🚀 Initializing for user: {username}\n")
        log_queue.put(f"📦 Target Repository: https://huggingface.co/{full_repo_id}\n")

        # Validation for Transformer logic: multi-head attention splits the
        # embedding evenly across heads.
        if n_embd % n_head != 0:
            raise ValueError(f"Embedding dimension ({n_embd}) must be divisible by number of heads ({n_head}).")

        # 1. Load Dataset
        log_queue.put(f"📚 Loading dataset: {dataset_id} (Limit: {sample_limit})...\n")
        try:
            # We use the train split; user can specify limit
            dataset = load_dataset(dataset_id, split=f"train[:{int(sample_limit)}]")
        except Exception as e:
            raise ValueError(f"Error loading dataset: {e}")

        # Auto-detect text column: fall back to the first string-valued column.
        text_column = "text"
        if "text" not in dataset.column_names:
            if len(dataset) == 0:
                # dataset[0] below would raise an opaque IndexError otherwise.
                raise ValueError("Dataset is empty; cannot auto-detect a text column.")
            for col in dataset.column_names:
                if isinstance(dataset[0][col], str):
                    text_column = col
                    break
        log_queue.put(f"🔍 Using text column: '{text_column}'\n")

        # 2. Tokenize
        log_queue.put("✂️ Tokenizing data...\n")
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        # GPT-2 has no pad token; reuse EOS so padding="max_length" works.
        tokenizer.pad_token = tokenizer.eos_token

        def tokenize_function(examples):
            return tokenizer(
                examples[text_column],
                padding="max_length",
                truncation=True,
                max_length=int(context_length)
            )

        tokenized_datasets = dataset.map(
            tokenize_function, batched=True, remove_columns=dataset.column_names
        )

        # 3. Initialize Model (trained from scratch — no pretrained weights)
        log_queue.put("🏗️ Building GPT-2 Architecture...\n")
        config = GPT2Config(
            vocab_size=len(tokenizer),
            n_positions=int(context_length),
            n_ctx=int(context_length),
            n_embd=int(n_embd),
            n_layer=int(num_layers),
            n_head=int(n_head),
        )
        model = GPT2LMHeadModel(config)

        # 4. Train
        log_queue.put("🏋️ Starting Training Loop...\n")
        training_args = TrainingArguments(
            output_dir="./local_results",
            overwrite_output_dir=True,
            num_train_epochs=epochs,
            per_device_train_batch_size=int(batch_size),
            gradient_accumulation_steps=int(grad_accumulation),
            learning_rate=lr,
            weight_decay=weight_decay,
            warmup_steps=int(warmup_steps),
            logging_steps=10,
            save_strategy="no",        # no local checkpoints; we push manually below
            push_to_hub=False,
            report_to="none",
            use_cpu=not torch.cuda.is_available(),
            fp16=torch.cuda.is_available(),
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
            train_dataset=tokenized_datasets,
            callbacks=[LogQueueCallback(log_queue)]
        )
        trainer.train()

        # 5. Push to User's Personal Hub
        log_queue.put("☁️ Uploading model to your profile...\n")
        model.push_to_hub(full_repo_id, token=final_token)
        tokenizer.push_to_hub(full_repo_id, token=final_token)

        result_queue.put(f"🎉 Success! Published to: https://huggingface.co/{full_repo_id}")

    except Exception as e:
        log_queue.put(f"❌ Error: {str(e)}\n")
        result_queue.put(None)


# --- Generator for UI updates ---

def train_and_push_generator(
    token, dataset_id, model_name, num_layers, n_embd, n_head,
    context_length, epochs, lr, weight_decay, warmup_steps,
    batch_size, grad_accumulation, sample_limit
):
    """Gradio generator: runs training in a background thread and streams logs.

    Yields (logs_text, status_text) tuples until the worker thread finishes.
    """
    effective_token = token or os.environ.get("HF_TOKEN")
    if not effective_token:
        yield "Error: No Hugging Face Token found. Please enter a 'Write' token below.", ""
        return

    log_queue = queue.Queue()
    result_queue = queue.Queue()

    t = threading.Thread(target=train_thread_target, args=(
        effective_token, dataset_id, model_name, num_layers, n_embd, n_head,
        context_length, epochs, lr, weight_decay, warmup_steps,
        batch_size, grad_accumulation, sample_limit,
        log_queue, result_queue
    ))
    t.start()

    logs_history = ""
    while t.is_alive():
        logs_history += _drain_queue(log_queue)
        yield logs_history, "Training in progress..."
        time.sleep(0.5)

    # Thread has exited: reap it and flush any remaining log lines.
    t.join()
    logs_history += _drain_queue(log_queue)

    try:
        result = result_queue.get_nowait()
        yield logs_history, result or "Training failed. See logs."
    except queue.Empty:
        # Worker died without posting an outcome (e.g. killed externally).
        yield logs_history, "Process interrupted."


# --- UI Layout ---

with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="slate")) as demo:
    gr.Markdown("# 🚀 Personal Auto-PreTrain")
    gr.Markdown("Configure a custom GPT-2 architecture and train it directly to **your personal** Hugging Face profile.")

    with gr.Row():
        hf_token = gr.Textbox(
            label="HF Write Token",
            placeholder="hf_...",
            type="password",
            info="Required to create the repo on your profile. Must have 'Write' permissions."
        )
        model_name_input = gr.Textbox(
            label="Model Name",
            value="my-custom-gpt2",
            placeholder="e.g. tiny-stories-v1"
        )

    with gr.Tabs():
        with gr.TabItem("1. Data Selection"):
            with gr.Row():
                dataset_input = gr.Textbox(
                    label="Dataset ID",
                    value="roneneldan/TinyStories",
                    placeholder="e.g. wikitext"
                )
                sample_limit = gr.Number(
                    label="Training Samples",
                    value=500,
                    precision=0
                )
            context_length = gr.Slider(
                minimum=64, maximum=1024, value=128, step=64,
                label="Max Context Length"
            )

        with gr.TabItem("2. Architecture"):
            with gr.Row():
                layers = gr.Slider(minimum=1, maximum=12, value=2, step=1, label="Layers")
                embd = gr.Slider(minimum=64, maximum=1024, value=128, step=64, label="Embedding Dim")
            with gr.Row():
                heads = gr.Slider(minimum=2, maximum=16, value=4, step=2, label="Attention Heads")
            gr.Markdown("_Note: Embedding Dim must be divisible by Attention Heads._")

        with gr.TabItem("3. Training Settings"):
            with gr.Row():
                epochs = gr.Slider(minimum=1, maximum=20, value=1, step=1, label="Epochs")
                lr = gr.Number(label="Learning Rate", value=5e-4)
            with gr.Row():
                batch_size = gr.Slider(minimum=1, maximum=32, value=4, step=1, label="Batch Size")
                grad_accumulation = gr.Slider(minimum=1, maximum=16, value=1, step=1, label="Grad Accumulation")
            with gr.Row():
                weight_decay = gr.Slider(minimum=0.0, maximum=0.1, value=0.01, step=0.01, label="Weight Decay")
                warmup_steps = gr.Number(label="Warmup Steps", value=50, precision=0)

    train_btn = gr.Button("🔥 Start Training & Push to My Profile", variant="primary")

    with gr.Row():
        log_output = gr.Code(label="Training Progress", language="json", lines=12)
        status_output = gr.Textbox(label="Final Status", interactive=False)

    train_btn.click(
        fn=train_and_push_generator,
        inputs=[
            hf_token, dataset_input, model_name_input,
            layers, embd, heads, context_length,
            epochs, lr, weight_decay, warmup_steps,
            batch_size, grad_accumulation, sample_limit
        ],
        outputs=[log_output, status_output]
    )

if __name__ == "__main__":
    demo.launch()