prasenjeet099 committed on
Commit
3ca2f47
·
verified ·
1 Parent(s): badc9b4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +181 -109
app.py CHANGED
@@ -1,11 +1,18 @@
1
  import streamlit as st
2
  import torch
3
- import time
4
- import os
5
- import pandas as pd
6
- from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
7
  from datasets import load_dataset, Dataset
 
 
 
 
8
  import matplotlib.pyplot as plt
 
 
 
 
 
 
9
 
10
  # Set up Streamlit page
11
  st.set_page_config(page_title="AutoTrain AI", page_icon="🚀", layout="wide")
@@ -14,95 +21,136 @@ st.subheader("Train AI models using PyTorch & Hugging Face Transformers")
14
 
15
  # Sidebar Configuration
16
  st.sidebar.header("Configuration")
17
- hf_user = st.sidebar.selectbox("Hugging Face User", ["hennings1984"])
18
- task = st.sidebar.selectbox("Select Task", ["Text Classification", "Sentiment Analysis"])
19
  hardware = st.sidebar.selectbox("Hardware", ["CPU", "Single GPU", "Multi-GPU", "TPU"])
20
- model_choice = st.sidebar.selectbox("Choose Model", ["bert-base-uncased", "distilbert-base-uncased", "roberta-base", "None (Custom Model)"])
21
- dataset_source = st.sidebar.selectbox("Dataset Source", ["glue/sst2", "imdb", "ag_news", "Custom"])
22
 
23
- # Custom Dataset or Predefined Dataset
24
  custom_dataset = None
25
  if dataset_source == "Custom":
26
- file = st.sidebar.file_uploader("Upload Custom Dataset", type=["csv", "json"])
27
- if file is not None:
28
- custom_dataset = pd.read_csv(file) if file.name.endswith(".csv") else pd.read_json(file)
29
- st.sidebar.write(f"Dataset uploaded with {len(custom_dataset)} rows")
30
-
31
- # Training Parameters
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  epochs = st.sidebar.slider("Number of Epochs", 1, 10, 3)
33
  batch_size = st.sidebar.selectbox("Batch Size", [8, 16, 32, 64], index=1)
34
  learning_rate = st.sidebar.slider("Learning Rate", 1e-6, 1e-3, 2e-5, format="%.6f")
 
35
 
36
  # Check if GPU/TPU is available
37
- device = "cpu" # Default to CPU
38
- if torch.cuda.is_available() and hardware in ["Single GPU", "Multi-GPU"]:
39
- device = "cuda"
40
- elif os.environ.get('COLAB_TPU_ADDR'): # Check if on Google Colab with TPU
41
- try:
42
- import torch_xla
43
- import torch_xla.core.xla_model as xm
44
- device = xm.xla_device() # Set the device to TPU
45
- except ImportError:
46
- st.error("TPU support is available only with 'torch_xla'. Please install it.")
47
- elif hardware == "TPU":
48
- st.error("TPU is not available in this environment. Please use GPU or CPU.")
49
 
50
  st.sidebar.write(f"**Using Device:** {device.upper()}")
51
 
52
- # Checkpoint Handling
53
- resume_training = st.sidebar.checkbox("Resume Training from Checkpoint")
54
- checkpoint_path = "checkpoint.pth" if resume_training else None
 
 
 
55
 
56
- # File Paths
57
- log_file = "train_log.txt"
58
- metrics_file = "metrics.csv"
59
 
60
- # Training Buttons
61
- st.write("### Model Training Control")
62
- start_train = st.button("Start Training 🚀")
63
- stop_train = st.button("Stop Training ")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
- # Live Logs Display
66
- st.write("### Training Logs (Live Updates)")
67
- log_area = st.empty()
68
 
69
- # Live Training Metrics
70
- st.write("### Training Metrics 📊")
71
 
72
- # Training Function
73
- def train_model():
74
- st.success(f"Training started for {task} with {model_choice} on {device.upper()}")
75
 
76
- # Load model & tokenizer
77
- if model_choice != "None (Custom Model)":
78
- tokenizer = AutoTokenizer.from_pretrained(model_choice)
79
- model = AutoModelForSequenceClassification.from_pretrained(model_choice, num_labels=2)
80
- else:
81
- # For custom model, assume user will upload a pre-trained model or enter model code
82
- st.error("Custom model support not yet implemented. Please use a base model.")
83
- return
84
 
85
- # Load dataset
86
- if dataset_source != "Custom":
87
- dataset = load_dataset(dataset_source)
88
- else:
89
- # Assuming custom dataset is a CSV
90
- dataset = Dataset.from_pandas(custom_dataset)
91
 
92
- # Tokenization function
93
  def tokenize_function(examples):
94
- return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)
95
 
96
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
97
- train_dataset = tokenized_datasets["train"]
98
- eval_dataset = tokenized_datasets.get("validation", tokenized_datasets["test"])
99
 
100
  # Checkpoint Handling
101
- if resume_training and os.path.exists(checkpoint_path):
 
102
  model.load_state_dict(torch.load(checkpoint_path))
 
103
 
104
  # Move model to device
105
- model.to(device)
106
 
107
  # Training arguments
108
  training_args = TrainingArguments(
@@ -114,7 +162,7 @@ def train_model():
114
  per_device_eval_batch_size=batch_size,
115
  num_train_epochs=epochs,
116
  save_strategy="epoch",
117
- learning_rate=learning_rate
118
  )
119
 
120
  # Trainer setup
@@ -125,51 +173,75 @@ def train_model():
125
  eval_dataset=eval_dataset,
126
  )
127
 
128
- # Progress bar for training
129
  progress_bar = st.progress(0)
130
 
131
- # Training Loop
132
- metrics = []
133
- with open(log_file, "w") as log_file_handle:
134
- log_file_handle.write("Starting training...\n")
135
- log_file_handle.flush()
136
-
137
- for epoch in range(epochs):
138
- trainer.train()
139
- results = trainer.evaluate()
140
-
141
- # Save Checkpoint
142
- torch.save(model.state_dict(), f"checkpoint_epoch_{epoch+1}.pth")
143
-
144
- # Log results
145
- log_text = f"Epoch {epoch+1}: Loss = {results['eval_loss']:.4f}, Accuracy = {results.get('eval_accuracy', 0):.4f}\n"
146
- log_file_handle.write(log_text)
147
- log_file_handle.flush()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
- # Save metrics
150
- metrics.append({"epoch": epoch+1, "loss": results["eval_loss"], "accuracy": results.get("eval_accuracy", 0)})
151
- pd.DataFrame(metrics).to_csv(metrics_file, index=False)
152
-
153
- # Update logs & metrics in UI
154
- log_area.text(log_text)
155
- st.line_chart(pd.DataFrame(metrics).set_index("epoch"))
156
-
157
- # Update progress bar
158
- progress = (epoch + 1) / epochs
159
- progress_bar.progress(progress)
160
-
161
- time.sleep(2)
162
-
163
- # Display final results
164
- st.write("### Final Results 📈")
165
- final_metrics = pd.DataFrame(metrics)
166
- st.line_chart(final_metrics.set_index("epoch"))
167
- st.write(final_metrics)
168
-
169
- # Start Training
170
- if start_train:
171
  train_model()
172
 
173
- # Stop Training
174
- if stop_train:
175
- st.warning("Training stopped manually.")
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import torch
3
+ from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoModelForTokenClassification, AutoModelForSeq2SeqLM
 
 
 
4
  from datasets import load_dataset, Dataset
5
+ import pandas as pd
6
+ import numpy as np
7
+ import os
8
+ import time
9
  import matplotlib.pyplot as plt
10
+ from sklearn.metrics import classification_report, confusion_matrix
11
+ import optuna # Hyperparameter tuning
12
+ from sklearn.metrics import precision_recall_curve
13
+ import seaborn as sns
14
+ from torch.utils.data import DataLoader
15
+ import shutil
16
 
17
  # Set up Streamlit page
18
  st.set_page_config(page_title="AutoTrain AI", page_icon="🚀", layout="wide")
 
21
 
22
# Sidebar Configuration
# All training options are read from these module-level widgets by
# train_model() and the Optuna objective below.
st.sidebar.header("Configuration")
hf_user = st.sidebar.selectbox("Hugging Face User", ["hennings1984", "custom_model"])
task = st.sidebar.selectbox("Select Task", ["Text Classification", "Sentiment Analysis", "Question Answering", "Named Entity Recognition (NER)", "Text Generation", "Text Summarization"])
hardware = st.sidebar.selectbox("Hardware", ["CPU", "Single GPU", "Multi-GPU", "TPU"])
model_choice = st.sidebar.selectbox("Choose Model", ["bert-base-uncased", "distilbert-base-uncased", "roberta-base", "t5-small", "bert-large-uncased", "custom_model"])
dataset_source = st.sidebar.selectbox("Dataset Source", ["glue/sst2", "imdb", "ag_news", "squad", "conll2003", "Custom"])

# Custom Dataset Upload
custom_dataset = None
if dataset_source == "Custom":
    custom_dataset_file = st.sidebar.file_uploader("Upload Custom Dataset", type=["csv", "json"])
    if custom_dataset_file:
        # File extension picks the parser: names ending in 'csv' go to
        # read_csv, everything else is treated as JSON.
        custom_dataset = pd.read_csv(custom_dataset_file) if custom_dataset_file.name.endswith('csv') else pd.read_json(custom_dataset_file)

# Column Mapping and Split
# Maps each task to the dataset column holding the model input / label.
# NOTE(review): "Text Generation" and "Text Summarization" have no entries in
# either mapping, so selecting them raises KeyError downstream — confirm the
# intended columns/splits for those tasks.
column_mapping = {
    "Text Classification": {"input": "sentence", "label": "label"},
    "Sentiment Analysis": {"input": "text", "label": "label"},
    "Question Answering": {"input": "question", "context": "context", "label": "answer"},
    "Named Entity Recognition (NER)": {"input": "tokens", "label": "labels"},
}

# Per-task [train split, eval split] names as exposed by the HF dataset.
split_mapping = {
    "Text Classification": ["train", "validation"],
    "Sentiment Analysis": ["train", "test"],
    "Question Answering": ["train", "validation"],
    "Named Entity Recognition (NER)": ["train", "validation"],
}

# Hyperparameters and Training Configuration
epochs = st.sidebar.slider("Number of Epochs", 1, 10, 3)
batch_size = st.sidebar.selectbox("Batch Size", [8, 16, 32, 64], index=1)
learning_rate = st.sidebar.slider("Learning Rate", 1e-6, 1e-3, 2e-5, format="%.6f")
# NOTE(review): optimizer_choice is collected but never consumed — Trainer
# uses its own default optimizer. Wire it up or drop the widget.
optimizer_choice = st.sidebar.selectbox("Optimizer", ["AdamW", "SGD"])
57
 
58
# Check if GPU/TPU is available
# CUDA is used only when the user picked a GPU option AND a GPU is present.
device = "cuda" if torch.cuda.is_available() and hardware in ["Single GPU", "Multi-GPU"] else "cpu"
if hardware == "TPU":
    # NOTE(review): "tpu" is not a valid torch device string —
    # torch.device("tpu") raises when train_model() calls model.to(...).
    # Real TPU support needs torch_xla (xm.xla_device()); confirm intent.
    device = "tpu"

st.sidebar.write(f"**Using Device:** {device.upper()}")
64
 
65
# Hyperparameter Tuning with Optuna
study = None
if st.sidebar.button("Start Hyperparameter Tuning"):
    def objective(trial):
        """One Optuna trial: train with a sampled LR/batch size, return eval loss."""
        # NOTE(review): suggest_loguniform is deprecated in recent Optuna
        # (use trial.suggest_float(..., log=True)); kept as-is here.
        learning_rate = trial.suggest_loguniform("learning_rate", 1e-6, 1e-3)
        batch_size = trial.suggest_int("batch_size", 8, 64, step=8)

        # Load dataset and model
        tokenizer = AutoTokenizer.from_pretrained(model_choice)
        # NOTE(review): always builds a 2-label sequence classifier, even when
        # the sidebar task is QA/NER/seq2seq — tuning appears to be
        # classification-only; confirm.
        model = AutoModelForSequenceClassification.from_pretrained(model_choice, num_labels=2)

        # Load dataset and tokenize
        dataset = load_dataset(dataset_source)
        def tokenize_function(examples):
            return tokenizer(examples[column_mapping[task]["input"]], truncation=True, padding="max_length")
        tokenized_datasets = dataset.map(tokenize_function, batched=True)
        train_dataset = tokenized_datasets[split_mapping[task][0]]
        eval_dataset = tokenized_datasets[split_mapping[task][1]]

        # Training arguments (epochs/batch sizes come from the trial + sidebar)
        training_args = TrainingArguments(
            output_dir="./results",
            evaluation_strategy="epoch",
            logging_dir="./logs",
            logging_steps=5,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=epochs,
            save_strategy="epoch",
            learning_rate=learning_rate,
        )

        # Trainer setup
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
        )

        trainer.train()
        results = trainer.evaluate()
        # Optuna minimizes this value (direction="minimize" below).
        return results["eval_loss"]

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=10)

    # Display Best Hyperparameters
    st.write("Best Hyperparameters found: ", study.best_params)
114
+
115
# Model Training Function with Checkpoints and Saving
def train_model():
    """Fine-tune the selected model on the selected dataset and report metrics.

    Reads all configuration from the module-level sidebar widgets (task,
    model_choice, dataset_source, epochs, batch_size, learning_rate, device)
    and streams progress, per-epoch loss, and evaluation plots into the
    Streamlit UI. Saves a state-dict checkpoint after every epoch.
    """
    # Load tokenizer and model based on task
    tokenizer = AutoTokenizer.from_pretrained(model_choice)

    # Select Model Type Based on Task
    if task in ("Text Classification", "Sentiment Analysis"):
        model = AutoModelForSequenceClassification.from_pretrained(model_choice, num_labels=2)
    elif task == "Question Answering":
        model = AutoModelForQuestionAnswering.from_pretrained(model_choice)
    elif task == "Named Entity Recognition (NER)":
        # NOTE(review): num_labels=9 matches CoNLL-2003 — confirm for other datasets.
        model = AutoModelForTokenClassification.from_pretrained(model_choice, num_labels=9)
    elif task in ("Text Generation", "Text Summarization"):
        model = AutoModelForSeq2SeqLM.from_pretrained(model_choice)
    else:
        # Original left `model` undefined here and crashed later with NameError.
        st.error(f"Unsupported task: {task}")
        return

    # Load dataset and tokenize
    dataset = load_dataset(dataset_source)

    def tokenize_function(examples):
        # NOTE(review): assumes column_mapping has an entry for the selected
        # task and that the mapped input column exists in this dataset.
        return tokenizer(examples[column_mapping[task]["input"]], truncation=True, padding="max_length")

    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    train_dataset = tokenized_datasets[split_mapping[task][0]]
    eval_dataset = tokenized_datasets[split_mapping[task][1]]

    # Checkpoint Handling: resume only if the saved weights fit this model.
    checkpoint_path = "checkpoint.pth"
    if os.path.exists(checkpoint_path):
        try:
            model.load_state_dict(torch.load(checkpoint_path))
            st.write("Resuming from checkpoint...")
        except RuntimeError as err:
            # A checkpoint from a different architecture would otherwise crash.
            st.warning(f"Checkpoint incompatible with {model_choice}; starting fresh. ({err})")

    # Move model to device. torch has no "tpu" device string, so fall back to
    # CPU instead of crashing when the user selected TPU hardware.
    if device in ("cuda", "cpu"):
        model.to(torch.device(device))
    else:
        st.warning("TPU execution requires torch_xla; running on CPU instead.")
        model.to(torch.device("cpu"))

    # Training arguments. num_train_epochs=1 because the loop below drives one
    # epoch per iteration — the original passed `epochs` here AND looped over
    # `epochs`, training epochs**2 epochs in total.
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        logging_dir="./logs",
        logging_steps=5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=1,
        save_strategy="epoch",
        learning_rate=learning_rate,
    )

    # Trainer setup
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    # Progress Bar Setup
    progress_bar = st.progress(0)

    # Training Loop with Progress Bar (one Trainer epoch per iteration)
    history = []
    for epoch in range(epochs):
        trainer.train()
        results = trainer.evaluate()

        # Save Checkpoint after each epoch
        torch.save(model.state_dict(), f"checkpoint_epoch_{epoch+1}.pth")

        # Update Progress Bar
        progress_bar.progress((epoch + 1) / epochs)

        # Display Results
        st.write(f"Epoch {epoch+1}/{epochs} - Loss: {results['eval_loss']:.4f}")

        # Accumulate metrics so the chart shows the full loss curve — the
        # original re-plotted a single point each epoch.
        history.append({"Epoch": epoch + 1, "Loss": results["eval_loss"]})
        st.line_chart(pd.DataFrame(history).set_index("Epoch"))

    # Enhanced Model Evaluation — classification tasks only: the argmax over
    # the last axis and the binary precision-recall curve below do not fit the
    # logit shapes produced by QA/NER/seq2seq heads (original crashed there).
    if task in ("Text Classification", "Sentiment Analysis"):
        predictions, labels, _ = trainer.predict(eval_dataset)
        pred_labels = np.argmax(predictions, axis=-1)

        # Classification Report
        report = classification_report(labels, pred_labels, output_dict=True)
        st.write("Classification Report:")
        st.write(report)

        # Confusion Matrix
        cm = confusion_matrix(labels, pred_labels)
        fig, ax = plt.subplots(figsize=(6, 6))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                    xticklabels=np.unique(labels), yticklabels=np.unique(labels))
        st.pyplot(fig)

        # Precision-Recall Curve (positive-class logit used as the score)
        precision, recall, _ = precision_recall_curve(labels, predictions[:, 1])
        plt.figure(figsize=(6, 6))
        plt.plot(recall, precision, marker=".", label="Precision-Recall Curve")
        plt.xlabel("Recall")
        plt.ylabel("Precision")
        plt.title("Precision-Recall Curve")
        st.pyplot(plt)
222
+
223
# Save Model Function
def save_model(model, model_name="trained_model", tokenizer=None):
    """Persist a fine-tuned model (and optionally its tokenizer) under ./models/.

    The original body referenced a module-level ``tokenizer`` that only exists
    as a local inside train_model(), so every call raised NameError. The
    tokenizer is now an explicit, optional parameter (backward-compatible:
    existing ``save_model(model)`` calls still work and simply skip it).

    Args:
        model: a Hugging Face model exposing ``save_pretrained``.
        model_name: subdirectory name under ``./models``.
        tokenizer: optional tokenizer exposing ``save_pretrained``.
    """
    output_dir = f"./models/{model_name}"
    # save_pretrained creates the leaf dir, but make the parents explicit.
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    if tokenizer is not None:
        tokenizer.save_pretrained(output_dir)
    st.write(f"Model saved to {output_dir}")
229
+
230
# Stop Training Button
# NOTE(review): Streamlit reruns the whole script on every interaction; this
# button only shows a warning — it cannot actually interrupt a Trainer that is
# blocking inside train_model(). Confirm whether real cancellation is needed.
if st.sidebar.button("Stop Training"):
    st.warning("Training stopped manually.")

# Training Buttons
# Kicks off a full (blocking) fine-tuning run with the sidebar configuration.
if st.sidebar.button("Start Training"):
    train_model()
237
 
238
# Model Inference Interface
# The original referenced module-level `model`/`tokenizer`, which only exist
# as locals of train_model(), so clicking the button always raised NameError;
# it also called .to(device) with the invalid "tpu" string. Load the saved
# fine-tuned model when one exists, otherwise fall back to the base checkpoint.
if st.sidebar.button("Test Model Inference"):
    # NOTE(review): a button click triggers a rerun, so this text_area's value
    # is read on the same run it is created — consider st.session_state for a
    # sturdier input flow.
    input_text = st.text_area("Input Text for Inference", "Enter text here to get predictions")
    if input_text:
        saved_dir = "./models/trained_model"
        source = saved_dir if os.path.isdir(saved_dir) else model_choice
        infer_tokenizer = AutoTokenizer.from_pretrained(source)
        # Inference assumes a 2-label classification head, matching training.
        infer_model = AutoModelForSequenceClassification.from_pretrained(source, num_labels=2)
        infer_device = torch.device(device if device in ("cuda", "cpu") else "cpu")
        infer_model.to(infer_device)
        infer_model.eval()
        inputs = infer_tokenizer(input_text, return_tensors="pt").to(infer_device)
        with torch.no_grad():
            outputs = infer_model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=-1)
        st.write(f"Predicted Label: {prediction.item()}")