APT-product

Sleeping

App Files Files Community

FlameF0X committed on Jan 16

Commit

553a5e7

verified ·

1 Parent(s): 63fe5cf

Create app.py

Browse files

Files changed (1) hide show

app.py +215 -0

app.py ADDED Viewed

	@@ -0,0 +1,215 @@

+import gradio as gr
+import torch
+from transformers import (
+    GPT2Config,
+    GPT2LMHeadModel,
+    GPT2Tokenizer,
+    Trainer,
+    TrainingArguments,
+    DataCollatorForLanguageModeling
+)
+from datasets import load_dataset
+from huggingface_hub import whoami
+import os
+# --- Helper Functions ---
+def get_user_info(token):
+    """Retrieves the username from the HF token."""
+    if not token:
+        return None
+    try:
+        info = whoami(token=token)
+        return info['name']
+    except Exception:
+        return None
+def train_and_push(
+    dataset_id,
+    model_name,
+    num_layers,
+    n_embd,
+    epochs,
+    lr,
+    sample_limit,
+    oauth_token: gr.OAuthToken
+):
+    """
+    Main Logic:
+    1. Authenticate
+    2. Load & Prepare Data
+    3. Initialize Tiny Model
+    4. Train
+    5. Push to Hub
+    """
+    # 1. Authentication Check
+    if oauth_token is None or oauth_token.token is None:
+        raise gr.Error("You must be logged in to train a model!")
+    token = oauth_token.token
+    username = get_user_info(token)
+    if not username:
+        raise gr.Error("Could not retrieve user info. Please try logging in again.")
+    full_repo_id = f"{username}/{model_name}"
+    progress = gr.Progress()
+    try:
+        # 2. Load Dataset
+        progress(0.1, desc=f"Loading dataset: {dataset_id}...")
+        # We try to load the dataset. We'll default to the 'train' split.
+        # We only take a small slice to keep it fast for this demo.
+        try:
+            # Try loading just the first 'sample_limit' rows to save bandwidth/memory
+            dataset = load_dataset(dataset_id, split=f"train[:{int(sample_limit)}]")
+        except Exception as e:
+            raise gr.Error(f"Error loading dataset: {str(e)}. Make sure it exists and has a 'train' split.")
+        # Heuristic: Find the text column (first string column)
+        text_column = "text"
+        if "text" not in dataset.column_names:
+            # simple fallback: look for the first string column
+            for col, dtype in zip(dataset.column_names, dataset.features.values()):
+                if hasattr(dtype, 'dtype') and dtype.dtype == 'string':
+                    text_column = col
+                    break
+        if text_column not in dataset.column_names:
+            raise gr.Error("Could not find a text column in this dataset. Please use a dataset with a 'text' column.")
+        progress(0.2, desc="Tokenizing data...")
+        # We use the standard GPT-2 tokenizer for convenience
+        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        tokenizer.pad_token = tokenizer.eos_token
+        def tokenize_function(examples):
+            return tokenizer(examples[text_column], padding="max_length", truncation=True, max_length=128)
+        tokenized_datasets = dataset.map(tokenize_function, batched=True)
+        # 3. Initialize Model
+        progress(0.3, desc="Initializing Nano Model...")
+        # We create a custom configuration based on user inputs (Constrained for speed)
+        config = GPT2Config(
+            vocab_size=len(tokenizer),
+            n_positions=128,  # Short context window for speed
+            n_ctx=128,
+            n_embd=int(n_embd),     # Small embedding size
+            n_layer=int(num_layers), # Few layers
+            n_head=4,
+        )
+        model = GPT2LMHeadModel(config)
+        # 4. Training
+        progress(0.4, desc="Starting Training (this might take a minute)...")
+        training_args = TrainingArguments(
+            output_dir="./results",
+            overwrite_output_dir=True,
+            num_train_epochs=epochs,
+            per_device_train_batch_size=8,
+            save_steps=500,
+            save_total_limit=1,
+            prediction_loss_only=True,
+            learning_rate=lr,
+            logging_steps=10,
+            report_to="none", # Don't log to wandb/tensorboard
+            use_cpu=not torch.cuda.is_available(), # Force CPU if no GPU available
+        )
+        data_collator = DataCollatorForLanguageModeling(
+            tokenizer=tokenizer, mlm=False
+        )
+        trainer = Trainer(
+            model=model,
+            args=training_args,
+            data_collator=data_collator,
+            train_dataset=tokenized_datasets,
+        )
+        trainer.train()
+        # 5. Push to Hub
+        progress(0.9, desc=f"Pushing to {full_repo_id}...")
+        # We push both model and tokenizer
+        model.push_to_hub(full_repo_id, token=token, private=True) # Default to private for safety
+        tokenizer.push_to_hub(full_repo_id, token=token, private=True)
+        return f"🎉 Success! Model trained and pushed to: https://huggingface.co/{full_repo_id}"
+    except Exception as e:
+        raise gr.Error(f"An error occurred: {str(e)}")
+# --- UI Layout ---
+# Build the Gradio interface: a header, a login button, two columns of inputs
+# (data settings and model hyperparameters), a train button and a status box.
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        """
+        # 🚂 Tiny AutoTrain Space
+        Login with your Hugging Face account, pick a dataset, and train a tiny language model from scratch!
+        The model will be automatically uploaded to your profile.
+        """
+    )
+    # Login Button (Native HF Integration)
+    with gr.Row():
+        login_btn = gr.LoginButton(value="Sign in with Hugging Face to Train")
+    with gr.Row():
+        # Left column: which dataset to train on and how many rows to use
+        with gr.Column():
+            gr.Markdown("### 1. Data Configuration")
+            dataset_input = gr.Textbox(
+                label="Dataset Name (from Hub)",
+                value="roneneldan/TinyStories",
+                placeholder="e.g. wikitext, roneneldan/TinyStories"
+            )
+            # Row count is capped at 5000 in steps of 100
+            sample_limit = gr.Slider(
+                minimum=100, maximum=5000, value=500, step=100,
+                label="Training Sample Size (Keep small for speed)"
+            )
+        # Right column: target repo name and model/training hyperparameters
+        with gr.Column():
+            gr.Markdown("### 2. Model Hyperparameters")
+            model_name_input = gr.Textbox(
+                label="New Model Name",
+                value="my-tiny-model",
+                placeholder="Name of the repo to create"
+            )
+            with gr.Row():
+                layers = gr.Slider(minimum=1, maximum=6, value=2, step=1, label="Layers (Depth)")
+                # Step of 32 keeps every selectable width divisible by the 4 heads
+                # that train_and_push hard-codes
+                embd = gr.Slider(minimum=32, maximum=256, value=64, step=32, label="Embedding Size (Width)")
+            with gr.Row():
+                epochs = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="Epochs")
+                lr = gr.Number(label="Learning Rate", value=5e-4)
+    train_btn = gr.Button("🚀 Train & Publish", variant="primary")
+    output_text = gr.Textbox(label="Status", interactive=False)
+    # Wire up the button
+    # The inputs are listed in the same order as train_and_push's first seven
+    # parameters. Its oauth_token parameter is not listed here; presumably
+    # Gradio supplies it from the login session via the gr.OAuthToken type
+    # hint - confirm against the Gradio OAuth docs.
+    train_btn.click(
+        fn=train_and_push,
+        inputs=[
+            dataset_input,
+            model_name_input,
+            layers,
+            embd,
+            epochs,
+            lr,
+            sample_limit
+        ],
+        outputs=output_text
+    )
+# Start the app only when this file is run as a script
+if __name__ == "__main__":
+    demo.launch()