jeromekenny committed on
Commit
afbea16
Β·
verified Β·
1 Parent(s): 39a1fe7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -96
app.py CHANGED
@@ -1,105 +1,88 @@
1
  import gradio as gr
2
- import os
 
 
 
 
 
 
 
3
 
4
def test_environment():
    """Verify that the core ML libraries import and report their versions.

    Returns a human-readable status string for the Gradio textbox:
    a version report on success, or an error message on failure.
    """
    try:
        import torch
        import transformers
        import datasets

        # Assemble the report line by line; `not cuda.is_available()` is the
        # idiomatic form of the CPU check (True on a CPU-only Space).
        report_lines = [
            "✅ Environment Test Passed!",
            "",
            "Versions:",
            f"• PyTorch: {torch.__version__}",
            f"• Transformers: {transformers.__version__}",
            f"• Datasets: {datasets.__version__}",
            "",
            f"CPU Available: {not torch.cuda.is_available()}",
            "Memory info: Basic setup working",
        ]
        return "\n".join(report_lines)

    except Exception as e:
        return f"❌ Environment Error: {str(e)}"
19
 
20
def simple_training_test():
    """Sanity-check that a tokenizer can be downloaded and run.

    Returns a status string for the Gradio textbox: token statistics on
    success, or an error message on failure.
    """
    try:
        from transformers import AutoTokenizer

        # Smallest readily available checkpoint — only 4.4M parameters!
        checkpoint = "prajjwal1/bert-tiny"
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)

        # Run a single tokenization to prove the tokenizer works end to end.
        sample = "This is a test"
        encoded = tokenizer(sample, return_tensors="pt")
        token_count = len(encoded['input_ids'][0])

        return f"✅ Basic Model Test Passed!\n\nModel: {checkpoint}\nTest text: '{sample}'\nTokens created: {token_count} tokens\n\nNext step: Try the actual training!"

    except Exception as e:
        return f"❌ Model Loading Error: {str(e)}\n\nThis might be a memory or dependency issue."
37
 
38
def start_minimal_training():
    """Smoke-test the full pre-training pipeline: model, dataset, tokenization.

    Builds a 10-sample synthetic sentiment dataset and tokenizes it with the
    tiniest BERT checkpoint, proving the workflow fits on CPU Basic.
    Returns a status string for the Gradio textbox.
    """
    try:
        from transformers import AutoTokenizer, AutoModelForSequenceClassification
        from datasets import Dataset
        import torch

        # Smallest readily available BERT checkpoint
        checkpoint = "prajjwal1/bert-tiny"
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

        # Tiny synthetic dataset: 5 positive + 5 negative samples
        samples = {
            'text': ['good movie'] * 5 + ['bad movie'] * 5,
            'label': [1] * 5 + [0] * 5,
        }
        dataset = Dataset.from_dict(samples)

        def encode(batch):
            # Short max_length keeps tokenization cheap on CPU
            return tokenizer(batch['text'], padding="max_length", truncation=True, max_length=64)

        # Map purely to prove tokenization succeeds; result is not reused.
        dataset.map(encode, batched=True)

        param_count = sum(p.numel() for p in model.parameters())
        return f"✅ Minimal Training Setup Complete!\n\nModel: {checkpoint} ({param_count} parameters)\nDataset: {len(samples['text'])} samples\nTokenization: Complete\n\n🎉 This proves the workflow works! You can now try larger datasets."

    except Exception as e:
        return f"❌ Training Error: {str(e)}\n\nDetailed error for debugging."
 
 
 
 
 
 
 
 
 
 
 
68
 
69
# Debugging UI: one tab per diagnostic step, plus a guidance tab.
with gr.Blocks(title="Debug Training Space") as demo:
    gr.Markdown("# 🔧 Debug Training Space")
    gr.Markdown("Let's debug the training issue step by step!")

    with gr.Tab("🧪 Step 1: Test Environment"):
        env_button = gr.Button("Test Environment Setup")
        env_box = gr.Textbox(label="Environment Test Results", lines=10)
        env_button.click(test_environment, outputs=env_box)

    with gr.Tab("🤖 Step 2: Test Model Loading"):
        model_button = gr.Button("Test Basic Model Loading")
        model_box = gr.Textbox(label="Model Test Results", lines=10)
        model_button.click(simple_training_test, outputs=model_box)

    with gr.Tab("⚡ Step 3: Minimal Training"):
        minimal_button = gr.Button("Run Minimal Training Test")
        minimal_box = gr.Textbox(label="Training Test Results", lines=10)
        minimal_button.click(start_minimal_training, outputs=minimal_box)

    with gr.Tab("💡 Next Steps"):
        gr.Markdown("""
        ## If All Tests Pass:
        1. Your environment is working correctly
        2. The original error was likely due to memory/resource limits on CPU Basic
        3. Try the **AutoTrain** approach instead (no-code solution)

        ## If Tests Fail:
        - Check the specific error messages
        - The requirements.txt might need adjustment
        - Consider using a different Space configuration

        ## Recommended Next Step:
        **Use AutoTrain directly**: Go to https://huggingface.co/autotrain for the no-code approach your mentor suggested!
        """)

demo.launch()
 
1
  import gradio as gr
2
+ import torch
3
+ from datasets import load_dataset
4
+ from transformers import (
5
+ AutoTokenizer,
6
+ AutoModelForSequenceClassification,
7
+ TrainingArguments,
8
+ Trainer
9
+ )
10
 
11
def train_cpu_optimized():
    """Fine-tune TinyBERT for binary sentiment classification on a small IMDB slice.

    Dataset sizes, batch sizes, and sequence length are deliberately small so
    the whole run fits on a CPU Basic Space. The trained model and tokenizer
    are saved to ./final_tinybert_model, where `test_model` loads them from.

    Returns:
        A human-readable status string for the Gradio "Training Progress"
        textbox: a success message on completion, or an error message on
        failure (same convention as `test_model`, so the UI never shows an
        opaque traceback).
    """
    try:
        model_name = "huawei-noah/TinyBERT_General_4L_312D"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

        # Load IMDB dataset
        raw_dataset = load_dataset("imdb")

        def tokenize_function(examples):
            # max_length=64 truncates reviews aggressively to keep CPU cost low
            return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=64)

        # CPU-friendly dataset sizes: 500 train / 200 eval samples,
        # shuffled with a fixed seed for reproducibility.
        train_ds = raw_dataset["train"].shuffle(seed=42).select(range(500))
        eval_ds = raw_dataset["test"].shuffle(seed=42).select(range(200))

        train_dataset = train_ds.map(tokenize_function, batched=True)
        eval_dataset = eval_ds.map(tokenize_function, batched=True)

        # CPU-optimized training arguments.
        # NOTE(review): learning_rate=3e-4 is high for BERT-style fine-tuning
        # (2e-5..5e-5 is typical); kept as-is to preserve behavior — consider
        # lowering if accuracy is poor.
        training_args = TrainingArguments(
            output_dir="./results",
            num_train_epochs=3,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=16,
            learning_rate=3e-4,
            # NOTE(review): renamed to `eval_strategy` in newer transformers
            # releases — adjust if upgrading the pinned version.
            evaluation_strategy="epoch",
            save_strategy="epoch",
            logging_steps=25,
            dataloader_num_workers=0,  # no worker processes on CPU Basic
            report_to="none",  # disable wandb/tensorboard reporting
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
        )

        # Start training
        trainer.train()

        # Save the final model and tokenizer together so `test_model`'s
        # pipeline() call can load the directory directly.
        trainer.save_model("./final_tinybert_model")
        tokenizer.save_pretrained("./final_tinybert_model")

        return "✅ Training complete! Model saved to ./final_tinybert_model"

    except Exception as e:
        # Surface failures in the Gradio textbox instead of an opaque UI error,
        # matching the error-string convention used by `test_model`.
        return f"❌ Training Error: {str(e)}"
+
60
+ def test_model(text):
61
+ """Test your trained model"""
62
  try:
63
+ from transformers import pipeline
64
+ pipe = pipeline("sentiment-analysis", model="./final_tinybert_model")
65
+ result = pipe(text)
66
+ return f"Prediction: {result[0]['label']} (Confidence: {result[0]['score']:.3f})"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  except Exception as e:
68
+ return f"Error: {str(e)}. Please train the model first!"
69
+
70
# Two-tab Gradio UI: one tab kicks off training, the other queries the
# trained model.
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 TinyBERT CPU-Optimized Training")
    gr.Markdown("**Complete ML workflow on CPU Basic - perfectly optimized for your hardware!**")

    with gr.Tab("🚀 Train Model"):
        gr.Markdown("This will train TinyBERT on 500 IMDB samples (15-20 minutes)")
        start_button = gr.Button("Start CPU-Optimized Training")
        progress_box = gr.Textbox(label="Training Progress", lines=5)
        start_button.click(train_cpu_optimized, outputs=progress_box)

    with gr.Tab("🧪 Test Model"):
        gr.Markdown("Test your trained sentiment analysis model:")
        sentence_box = gr.Textbox(label="Enter text to analyze", placeholder="This movie was fantastic!")
        analyze_button = gr.Button("Analyze Sentiment")
        result_box = gr.Textbox(label="Prediction Result")
        analyze_button.click(test_model, inputs=sentence_box, outputs=result_box)

demo.launch()