jeromekenny committed on
Commit
afbea16
Β·
verified Β·
1 Parent(s): 39a1fe7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -96
app.py CHANGED
@@ -1,105 +1,88 @@
1
  import gradio as gr
2
- import os
 
 
 
 
 
 
 
3
 
4
def test_environment():
    """Verify that the core ML libraries import and report their versions.

    Returns a human-readable status string for the Gradio textbox:
    a version report on success, or an error message on failure.
    """
    try:
        import torch
        import transformers
        import datasets

        # Assemble the report line by line; `not cuda.is_available()` is the
        # idiomatic form of the CPU check (True on a CPU-only Space).
        report_lines = [
            "✅ Environment Test Passed!",
            "",
            "Versions:",
            f"• PyTorch: {torch.__version__}",
            f"• Transformers: {transformers.__version__}",
            f"• Datasets: {datasets.__version__}",
            "",
            f"CPU Available: {not torch.cuda.is_available()}",
            "Memory info: Basic setup working",
        ]
        return "\n".join(report_lines)

    except Exception as e:
        return f"❌ Environment Error: {str(e)}"
19
 
20
def simple_training_test():
    """Sanity-check that a tokenizer can be downloaded and run.

    Returns a status string for the Gradio textbox: token statistics on
    success, or an error message on failure.
    """
    try:
        from transformers import AutoTokenizer

        # Smallest readily available checkpoint — only 4.4M parameters!
        checkpoint = "prajjwal1/bert-tiny"
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)

        # Run a single tokenization to prove the tokenizer works end to end.
        sample = "This is a test"
        encoded = tokenizer(sample, return_tensors="pt")
        token_count = len(encoded['input_ids'][0])

        return f"✅ Basic Model Test Passed!\n\nModel: {checkpoint}\nTest text: '{sample}'\nTokens created: {token_count} tokens\n\nNext step: Try the actual training!"

    except Exception as e:
        return f"❌ Model Loading Error: {str(e)}\n\nThis might be a memory or dependency issue."
37
 
38
def start_minimal_training():
    """Smoke-test the full pre-training pipeline: model, dataset, tokenization.

    Builds a 10-sample synthetic sentiment dataset and tokenizes it with the
    tiniest BERT checkpoint, proving the workflow fits on CPU Basic.
    Returns a status string for the Gradio textbox.
    """
    try:
        from transformers import AutoTokenizer, AutoModelForSequenceClassification
        from datasets import Dataset
        import torch

        # Smallest readily available BERT checkpoint
        checkpoint = "prajjwal1/bert-tiny"
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

        # Tiny synthetic dataset: 5 positive + 5 negative samples
        samples = {
            'text': ['good movie'] * 5 + ['bad movie'] * 5,
            'label': [1] * 5 + [0] * 5,
        }
        dataset = Dataset.from_dict(samples)

        def encode(batch):
            # Short max_length keeps tokenization cheap on CPU
            return tokenizer(batch['text'], padding="max_length", truncation=True, max_length=64)

        # Map purely to prove tokenization succeeds; result is not reused.
        dataset.map(encode, batched=True)

        param_count = sum(p.numel() for p in model.parameters())
        return f"✅ Minimal Training Setup Complete!\n\nModel: {checkpoint} ({param_count} parameters)\nDataset: {len(samples['text'])} samples\nTokenization: Complete\n\n🎉 This proves the workflow works! You can now try larger datasets."

    except Exception as e:
        return f"❌ Training Error: {str(e)}\n\nDetailed error for debugging."
 
 
 
 
 
 
 
 
 
 
 
68
 
69
# Debugging UI: one tab per diagnostic step, plus a guidance tab.
with gr.Blocks(title="Debug Training Space") as demo:
    gr.Markdown("# 🔧 Debug Training Space")
    gr.Markdown("Let's debug the training issue step by step!")

    with gr.Tab("🧪 Step 1: Test Environment"):
        env_button = gr.Button("Test Environment Setup")
        env_box = gr.Textbox(label="Environment Test Results", lines=10)
        env_button.click(test_environment, outputs=env_box)

    with gr.Tab("🤖 Step 2: Test Model Loading"):
        model_button = gr.Button("Test Basic Model Loading")
        model_box = gr.Textbox(label="Model Test Results", lines=10)
        model_button.click(simple_training_test, outputs=model_box)

    with gr.Tab("⚡ Step 3: Minimal Training"):
        minimal_button = gr.Button("Run Minimal Training Test")
        minimal_box = gr.Textbox(label="Training Test Results", lines=10)
        minimal_button.click(start_minimal_training, outputs=minimal_box)

    with gr.Tab("💡 Next Steps"):
        gr.Markdown("""
        ## If All Tests Pass:
        1. Your environment is working correctly
        2. The original error was likely due to memory/resource limits on CPU Basic
        3. Try the **AutoTrain** approach instead (no-code solution)

        ## If Tests Fail:
        - Check the specific error messages
        - The requirements.txt might need adjustment
        - Consider using a different Space configuration

        ## Recommended Next Step:
        **Use AutoTrain directly**: Go to https://huggingface.co/autotrain for the no-code approach your mentor suggested!
        """)

demo.launch()
 
1
  import gradio as gr
2
+ import torch
3
+ from datasets import load_dataset
4
+ from transformers import (
5
+ AutoTokenizer,
6
+ AutoModelForSequenceClassification,
7
+ TrainingArguments,
8
+ Trainer
9
+ )
10
 
11
def train_cpu_optimized():
    """Fine-tune TinyBERT for binary sentiment classification on a small IMDB slice.

    Dataset sizes, batch sizes, and sequence length are deliberately small so
    the whole run fits on a CPU Basic Space. The trained model and tokenizer
    are saved to ./final_tinybert_model, where `test_model` loads them from.

    Returns:
        A human-readable status string for the Gradio "Training Progress"
        textbox: a success message on completion, or an error message on
        failure (same convention as `test_model`, so the UI never shows an
        opaque traceback).
    """
    try:
        model_name = "huawei-noah/TinyBERT_General_4L_312D"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

        # Load IMDB dataset
        raw_dataset = load_dataset("imdb")

        def tokenize_function(examples):
            # max_length=64 truncates reviews aggressively to keep CPU cost low
            return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=64)

        # CPU-friendly dataset sizes: 500 train / 200 eval samples,
        # shuffled with a fixed seed for reproducibility.
        train_ds = raw_dataset["train"].shuffle(seed=42).select(range(500))
        eval_ds = raw_dataset["test"].shuffle(seed=42).select(range(200))

        train_dataset = train_ds.map(tokenize_function, batched=True)
        eval_dataset = eval_ds.map(tokenize_function, batched=True)

        # CPU-optimized training arguments.
        # NOTE(review): learning_rate=3e-4 is high for BERT-style fine-tuning
        # (2e-5..5e-5 is typical); kept as-is to preserve behavior — consider
        # lowering if accuracy is poor.
        training_args = TrainingArguments(
            output_dir="./results",
            num_train_epochs=3,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=16,
            learning_rate=3e-4,
            # NOTE(review): renamed to `eval_strategy` in newer transformers
            # releases — adjust if upgrading the pinned version.
            evaluation_strategy="epoch",
            save_strategy="epoch",
            logging_steps=25,
            dataloader_num_workers=0,  # no worker processes on CPU Basic
            report_to="none",  # disable wandb/tensorboard reporting
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
        )

        # Start training
        trainer.train()

        # Save the final model and tokenizer together so `test_model`'s
        # pipeline() call can load the directory directly.
        trainer.save_model("./final_tinybert_model")
        tokenizer.save_pretrained("./final_tinybert_model")

        return "✅ Training complete! Model saved to ./final_tinybert_model"

    except Exception as e:
        # Surface failures in the Gradio textbox instead of an opaque UI error,
        # matching the error-string convention used by `test_model`.
        return f"❌ Training Error: {str(e)}"
+
60
+ def test_model(text):
61
+ """Test your trained model"""
62
  try:
63
+ from transformers import pipeline
64
+ pipe = pipeline("sentiment-analysis", model="./final_tinybert_model")
65
+ result = pipe(text)
66
+ return f"Prediction: {result[0]['label']} (Confidence: {result[0]['score']:.3f})"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  except Exception as e:
68
+ return f"Error: {str(e)}. Please train the model first!"
69
+
70
# Two-tab Gradio UI: one tab kicks off training, the other queries the
# trained model.
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 TinyBERT CPU-Optimized Training")
    gr.Markdown("**Complete ML workflow on CPU Basic - perfectly optimized for your hardware!**")

    with gr.Tab("🚀 Train Model"):
        gr.Markdown("This will train TinyBERT on 500 IMDB samples (15-20 minutes)")
        start_button = gr.Button("Start CPU-Optimized Training")
        progress_box = gr.Textbox(label="Training Progress", lines=5)
        start_button.click(train_cpu_optimized, outputs=progress_box)

    with gr.Tab("🧪 Test Model"):
        gr.Markdown("Test your trained sentiment analysis model:")
        sentence_box = gr.Textbox(label="Enter text to analyze", placeholder="This movie was fantastic!")
        analyze_button = gr.Button("Analyze Sentiment")
        result_box = gr.Textbox(label="Prediction Result")
        analyze_button.click(test_model, inputs=sentence_box, outputs=result_box)

demo.launch()