Spaces: Sleeping
Update app.py
app.py CHANGED
@@ -1,105 +1,88 @@
 import gradio as gr
-import torch
-
-def test_environment():
-    """Test the environment setup"""
-    try:
-        import torch
-        import transformers
-        import datasets
-
-        torch_version = torch.__version__
-        transformers_version = transformers.__version__
-        datasets_version = datasets.__version__
-
-        return f"✅ Environment Test Passed!\n\nVersions:\n• PyTorch: {torch_version}\n• Transformers: {transformers_version}\n• Datasets: {datasets_version}\n\nCPU Available: {torch.cuda.is_available() == False}\nMemory info: Basic setup working"
-
-    except Exception as e:
-        return f"❌ Environment Error: {str(e)}"
-def simple_training_test():
-    """Test basic model loading"""
-    try:
-        ...
-
-        return f"✅ Basic Model Test Passed!\n\nModel: {model_name}\nTest text: '{test_text}'\nTokens created: {len(tokens['input_ids'][0])} tokens\n\nNext step: Try the actual training!"
-
-    except Exception as e:
-        return f"❌ Model Loading Error: {str(e)}\n\nThis might be a memory or dependency issue."

-def start_minimal_training():
     try:
-        from transformers import AutoTokenizer, AutoModelForSequenceClassification
-        from datasets import Dataset
-
-        # Use the smallest model available
-        model_name = "prajjwal1/bert-tiny"
-
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
-
-        # Create tiny fake dataset (just 10 samples)
-        fake_data = {
-            'text': ['good movie'] * 5 + ['bad movie'] * 5,
-            'label': [1] * 5 + [0] * 5
-        }
-
-        dataset = Dataset.from_dict(fake_data)
-
-        def tokenize(examples):
-            return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=64)
-
-        tokenized = dataset.map(tokenize, batched=True)
-
-        return f"✅ Minimal Training Setup Complete!\n\nModel: {model_name} ({sum(p.numel() for p in model.parameters())} parameters)\nDataset: {len(fake_data['text'])} samples\nTokenization: Complete\n\n🎉 This proves the workflow works! You can now try larger datasets."
-
     except Exception as e:
-        return f"❌ Training Error: {str(e)}"

-with gr.Blocks() as demo:
-
-    with gr.Tab("Step 1: Test Environment"):
-        test_env_btn = gr.Button("Test Environment Setup")
-        env_output = gr.Textbox(label="Environment Test Results", lines=10)
-        test_env_btn.click(test_environment, outputs=env_output)
-
-    with gr.Tab("🤖 Step 2: Test Model Loading"):
-        test_model_btn = gr.Button("Test Basic Model Loading")
-        model_output = gr.Textbox(label="Model Test Results", lines=10)
-        test_model_btn.click(simple_training_test, outputs=model_output)
-
-    with gr.Tab("⚡ Step 3: Minimal Training"):
-        minimal_train_btn = gr.Button("Run Minimal Training Test")
-        train_output = gr.Textbox(label="Training Test Results", lines=10)
-        minimal_train_btn.click(start_minimal_training, outputs=train_output)
-
-    with gr.Tab("💡 Next Steps"):
-        gr.Markdown("""
-        ## If All Tests Pass:
-        1. Your environment is working correctly
-        2. The original error was likely due to memory/resource limits on CPU Basic
-        3. Try the **AutoTrain** approach instead (no-code solution)
-
-        ## If Tests Fail:
-        - Check the specific error messages
-        - The requirements.txt might need adjustment
-        - Consider using a different Space configuration
-
-        ## Recommended Next Step:
-        **Use AutoTrain directly**: Go to https://huggingface.co/autotrain for the no-code approach your mentor suggested!
-        """)

 demo.launch()
 import gradio as gr
+import torch
+from datasets import load_dataset
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+    TrainingArguments,
+    Trainer
+)

+def train_cpu_optimized():
+    """Train TinyBERT with CPU Basic optimized settings"""
+    model_name = "huawei-noah/TinyBERT_General_4L_312D"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

+    # Load IMDB dataset
+    raw_dataset = load_dataset("imdb")
+
+    def tokenize_function(examples):
+        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=64)
+
+    # CPU-friendly dataset sizes
+    train_ds = raw_dataset["train"].shuffle(seed=42).select(range(500))
+    eval_ds = raw_dataset["test"].shuffle(seed=42).select(range(200))
+
+    train_dataset = train_ds.map(tokenize_function, batched=True)
+    eval_dataset = eval_ds.map(tokenize_function, batched=True)

+    # CPU-optimized training arguments
+    training_args = TrainingArguments(
+        output_dir="./results",
+        num_train_epochs=3,
+        per_device_train_batch_size=8,
+        per_device_eval_batch_size=16,
+        learning_rate=3e-4,
+        evaluation_strategy="epoch",
+        save_strategy="epoch",
+        logging_steps=25,
+        dataloader_num_workers=0,
+        report_to="none",
+    )
+
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset
+    )
+
+    # Start training
+    trainer.train()
+
+    # Save the final model
+    trainer.save_model("./final_tinybert_model")
+    tokenizer.save_pretrained("./final_tinybert_model")
+
+    return "✅ Training complete! Model saved to ./final_tinybert_model"
+
+def test_model(text):
+    """Test your trained model"""
     try:
+        from transformers import pipeline
+        pipe = pipeline("sentiment-analysis", model="./final_tinybert_model")
+        result = pipe(text)
+        return f"Prediction: {result[0]['label']} (Confidence: {result[0]['score']:.3f})"
     except Exception as e:
+        return f"Error: {str(e)}. Please train the model first!"
+
+# Create Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown("# 🤖 TinyBERT CPU-Optimized Training")
+    gr.Markdown("**Complete ML workflow on CPU Basic - perfectly optimized for your hardware!**")
+
+    with gr.Tab("🚀 Train Model"):
+        gr.Markdown("This will train TinyBERT on 500 IMDB samples (15-20 minutes)")
+        train_btn = gr.Button("Start CPU-Optimized Training")
+        train_output = gr.Textbox(label="Training Progress", lines=5)
+        train_btn.click(train_cpu_optimized, outputs=train_output)

+    with gr.Tab("🧪 Test Model"):
+        gr.Markdown("Test your trained sentiment analysis model:")
+        test_input = gr.Textbox(label="Enter text to analyze", placeholder="This movie was fantastic!")
+        test_btn = gr.Button("Analyze Sentiment")
+        test_output = gr.Textbox(label="Prediction Result")
+        test_btn.click(test_model, inputs=test_input, outputs=test_output)

 demo.launch()
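
Note on the new training loop: as committed, train_cpu_optimized reports only the loss during each epoch's evaluation. If accuracy on the 200 held-out IMDB samples is wanted, Trainer also accepts a compute_metrics callback. A minimal sketch, not part of this commit, assuming only numpy (already a transformers dependency):

import numpy as np

def compute_metrics(eval_pred):
    # eval_pred bundles the model's raw logits with the true labels
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # higher-scoring class wins
    return {"accuracy": (predictions == labels).mean()}

# Passed in when constructing the trainer, e.g.:
# trainer = Trainer(model=model, args=training_args,
#                   train_dataset=train_dataset, eval_dataset=eval_dataset,
#                   compute_metrics=compute_metrics)

One caveat: recent transformers releases renamed the evaluation_strategy argument to eval_strategy, so on an up-to-date Space the TrainingArguments call above may need the new name.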