Files changed (1) hide show
  1. app.py +49 -88
app.py CHANGED
@@ -5,9 +5,7 @@ import os
5
  import pandas as pd
6
  from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
7
  from datasets import load_dataset, Dataset
8
- from sklearn.metrics import confusion_matrix
9
- from sklearn.model_selection import train_test_split
10
- from tqdm import tqdm # For progress bar during training
11
 
12
  # Set up Streamlit page
13
  st.set_page_config(page_title="AutoTrain AI", page_icon="🚀", layout="wide")
@@ -19,12 +17,16 @@ st.sidebar.header("Configuration")
19
  hf_user = st.sidebar.selectbox("Hugging Face User", ["hennings1984"])
20
  task = st.sidebar.selectbox("Select Task", ["Text Classification", "Sentiment Analysis"])
21
  hardware = st.sidebar.selectbox("Hardware", ["CPU", "Single GPU", "Multi-GPU", "TPU"])
22
- model_choice = st.sidebar.selectbox("Choose Model", ["bert-base-uncased", "distilbert-base-uncased", "roberta-base"])
23
  dataset_source = st.sidebar.selectbox("Dataset Source", ["glue/sst2", "imdb", "ag_news", "Custom"])
24
 
25
- # Column Mapping for custom datasets
26
- text_column = st.sidebar.text_input("Text Column", "text")
27
- label_column = st.sidebar.text_input("Label Column", "label")
 
 
 
 
28
 
29
  # Training Parameters
30
  epochs = st.sidebar.slider("Number of Epochs", 1, 10, 3)
@@ -32,9 +34,18 @@ batch_size = st.sidebar.selectbox("Batch Size", [8, 16, 32, 64], index=1)
32
  learning_rate = st.sidebar.slider("Learning Rate", 1e-6, 1e-3, 2e-5, format="%.6f")
33
 
34
  # Check if GPU/TPU is available
35
- device = "cuda" if torch.cuda.is_available() and hardware in ["Single GPU", "Multi-GPU"] else "cpu"
36
- if hardware == "TPU":
37
- device = "tpu"
 
 
 
 
 
 
 
 
 
38
 
39
  st.sidebar.write(f"**Using Device:** {device.upper()}")
40
 
@@ -57,54 +68,41 @@ log_area = st.empty()
57
 
58
  # Live Training Metrics
59
  st.write("### Training Metrics 📊")
60
- progress_bar = st.progress(0) # Initialize progress bar
61
 
62
  # Training Function
63
  def train_model():
64
  st.success(f"Training started for {task} with {model_choice} on {device.upper()}")
65
 
66
  # Load model & tokenizer
67
- tokenizer = AutoTokenizer.from_pretrained(model_choice)
68
- model = AutoModelForSequenceClassification.from_pretrained(model_choice, num_labels=2) # Adjust num_labels as necessary
 
 
 
 
 
69
 
70
  # Load dataset
71
- if dataset_source.lower() != "custom":
72
  dataset = load_dataset(dataset_source)
73
  else:
74
- # Handle Custom Dataset
75
- uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
76
- if uploaded_file is not None:
77
- dataset_df = pd.read_csv(uploaded_file)
78
- dataset = Dataset.from_pandas(dataset_df)
79
 
80
  # Tokenization function
81
  def tokenize_function(examples):
82
- return tokenizer(examples[text_column], truncation=True, padding="max_length")
83
 
84
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
85
-
86
- # Handle missing or non-standard splits
87
- if "train" in tokenized_datasets:
88
- train_dataset = tokenized_datasets["train"]
89
- else:
90
- # Create a custom split if no train split exists
91
- train_dataset = tokenized_datasets
92
- train_dataset, eval_dataset = train_test_split(train_dataset, test_size=0.1)
93
-
94
- # Check for validation or test split
95
- if "validation" in tokenized_datasets:
96
- eval_dataset = tokenized_datasets["validation"]
97
- elif "test" in tokenized_datasets:
98
- eval_dataset = tokenized_datasets["test"]
99
- else:
100
- raise ValueError("Dataset does not have a 'validation' or 'test' split.")
101
 
102
  # Checkpoint Handling
103
  if resume_training and os.path.exists(checkpoint_path):
104
  model.load_state_dict(torch.load(checkpoint_path))
105
 
106
  # Move model to device
107
- model.to(torch.device(device))
108
 
109
  # Training arguments
110
  training_args = TrainingArguments(
@@ -127,27 +125,17 @@ def train_model():
127
  eval_dataset=eval_dataset,
128
  )
129
 
130
- # Training Loop with Progress Bar
131
- metrics = []
132
- loss_values = [] # To store loss values for plotting
133
- accuracy_values = [] # To store accuracy values for plotting
134
- all_preds = [] # To store predictions for confusion matrix
135
- all_labels = [] # To store true labels for confusion matrix
136
 
 
 
137
  with open(log_file, "w") as log_file_handle:
138
  log_file_handle.write("Starting training...\n")
139
  log_file_handle.flush()
140
 
141
  for epoch in range(epochs):
142
- # Initialize progress bar for this epoch
143
- progress_bar.progress(0) # Reset progress bar at the start of each epoch
144
-
145
- # Training with tqdm for real-time progress bar
146
- for step, batch in enumerate(trainer.get_train_dataloader()):
147
- trainer.training_step(model, batch) # Perform a training step
148
- progress_bar.progress((step + 1) / len(trainer.get_train_dataloader())) # Update progress bar
149
-
150
- # Evaluate the model at the end of each epoch
151
  results = trainer.evaluate()
152
 
153
  # Save Checkpoint
@@ -162,48 +150,21 @@ def train_model():
162
  metrics.append({"epoch": epoch+1, "loss": results["eval_loss"], "accuracy": results.get("eval_accuracy", 0)})
163
  pd.DataFrame(metrics).to_csv(metrics_file, index=False)
164
 
165
- loss_values.append(results["eval_loss"])
166
- accuracy_values.append(results.get("eval_accuracy", 0))
167
-
168
- # Collect predictions and labels for confusion matrix
169
- all_preds.extend(results.get("logits", []))
170
- all_labels.extend(eval_dataset["label"])
171
-
172
  # Update logs & metrics in UI
173
  log_area.text(log_text)
174
  st.line_chart(pd.DataFrame(metrics).set_index("epoch"))
175
 
 
 
 
 
176
  time.sleep(2)
177
 
178
- # After training, plot charts for loss, accuracy, and confusion matrix
179
- plot_metrics(loss_values, accuracy_values)
180
- plot_confusion_matrix(all_labels, all_preds)
181
-
182
- def plot_metrics(loss_values, accuracy_values):
183
- # Plot Loss Curve using Streamlit chart
184
- metrics_df = pd.DataFrame({
185
- "Epoch": range(1, len(loss_values) + 1),
186
- "Loss": loss_values,
187
- "Accuracy": accuracy_values
188
- })
189
-
190
- st.write("### Training Loss and Accuracy Curve")
191
- st.line_chart(metrics_df.set_index("Epoch"))
192
-
193
- def plot_confusion_matrix(true_labels, preds):
194
- # Convert logits to predicted class labels
195
- pred_labels = torch.argmax(torch.tensor(preds), axis=1).numpy()
196
-
197
- # Compute confusion matrix
198
- cm = confusion_matrix(true_labels, pred_labels)
199
-
200
- # Plot confusion matrix using Streamlit chart
201
- fig, ax = plt.subplots(figsize=(8, 6))
202
- ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Class 0", "Class 1"], yticklabels=["Class 0", "Class 1"])
203
- ax.set_title("Confusion Matrix")
204
- ax.set_xlabel("Predicted Label")
205
- ax.set_ylabel("True Label")
206
- st.pyplot(fig)
207
 
208
  # Start Training
209
  if start_train:
 
5
  import pandas as pd
6
  from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
7
  from datasets import load_dataset, Dataset
8
+ import matplotlib.pyplot as plt
 
 
9
 
10
  # Set up Streamlit page
11
  st.set_page_config(page_title="AutoTrain AI", page_icon="🚀", layout="wide")
 
17
  hf_user = st.sidebar.selectbox("Hugging Face User", ["hennings1984"])
18
  task = st.sidebar.selectbox("Select Task", ["Text Classification", "Sentiment Analysis"])
19
  hardware = st.sidebar.selectbox("Hardware", ["CPU", "Single GPU", "Multi-GPU", "TPU"])
20
+ model_choice = st.sidebar.selectbox("Choose Model", ["bert-base-uncased", "distilbert-base-uncased", "roberta-base", "None (Custom Model)"])
21
  dataset_source = st.sidebar.selectbox("Dataset Source", ["glue/sst2", "imdb", "ag_news", "Custom"])
22
 
23
+ # Custom Dataset or Predefined Dataset
24
+ custom_dataset = None
25
+ if dataset_source == "Custom":
26
+ file = st.sidebar.file_uploader("Upload Custom Dataset", type=["csv", "json"])
27
+ if file is not None:
28
+ custom_dataset = pd.read_csv(file) if file.name.endswith(".csv") else pd.read_json(file)
29
+ st.sidebar.write(f"Dataset uploaded with {len(custom_dataset)} rows")
30
 
31
  # Training Parameters
32
  epochs = st.sidebar.slider("Number of Epochs", 1, 10, 3)
 
34
  learning_rate = st.sidebar.slider("Learning Rate", 1e-6, 1e-3, 2e-5, format="%.6f")
35
 
36
  # Check if GPU/TPU is available
37
+ device = "cpu" # Default to CPU
38
+ if torch.cuda.is_available() and hardware in ["Single GPU", "Multi-GPU"]:
39
+ device = "cuda"
40
+ elif os.environ.get('COLAB_TPU_ADDR'): # Check if on Google Colab with TPU
41
+ try:
42
+ import torch_xla
43
+ import torch_xla.core.xla_model as xm
44
+ device = xm.xla_device() # Set the device to TPU
45
+ except ImportError:
46
+ st.error("TPU support is available only with 'torch_xla'. Please install it.")
47
+ elif hardware == "TPU":
48
+ st.error("TPU is not available in this environment. Please use GPU or CPU.")
49
 
50
  st.sidebar.write(f"**Using Device:** {device.upper()}")
51
 
 
68
 
69
  # Live Training Metrics
70
  st.write("### Training Metrics 📊")
 
71
 
72
  # Training Function
73
  def train_model():
74
  st.success(f"Training started for {task} with {model_choice} on {device.upper()}")
75
 
76
  # Load model & tokenizer
77
+ if model_choice != "None (Custom Model)":
78
+ tokenizer = AutoTokenizer.from_pretrained(model_choice)
79
+ model = AutoModelForSequenceClassification.from_pretrained(model_choice, num_labels=2)
80
+ else:
81
+ # For custom model, assume user will upload a pre-trained model or enter model code
82
+ st.error("Custom model support not yet implemented. Please use a base model.")
83
+ return
84
 
85
  # Load dataset
86
+ if dataset_source != "Custom":
87
  dataset = load_dataset(dataset_source)
88
  else:
89
+ # Assuming custom dataset is a CSV
90
+ dataset = Dataset.from_pandas(custom_dataset)
 
 
 
91
 
92
  # Tokenization function
93
  def tokenize_function(examples):
94
+ return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)
95
 
96
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
97
+ train_dataset = tokenized_datasets["train"]
98
+ eval_dataset = tokenized_datasets.get("validation", tokenized_datasets["test"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
  # Checkpoint Handling
101
  if resume_training and os.path.exists(checkpoint_path):
102
  model.load_state_dict(torch.load(checkpoint_path))
103
 
104
  # Move model to device
105
+ model.to(device)
106
 
107
  # Training arguments
108
  training_args = TrainingArguments(
 
125
  eval_dataset=eval_dataset,
126
  )
127
 
128
+ # Progress bar for training
129
+ progress_bar = st.progress(0)
 
 
 
 
130
 
131
+ # Training Loop
132
+ metrics = []
133
  with open(log_file, "w") as log_file_handle:
134
  log_file_handle.write("Starting training...\n")
135
  log_file_handle.flush()
136
 
137
  for epoch in range(epochs):
138
+ trainer.train()
 
 
 
 
 
 
 
 
139
  results = trainer.evaluate()
140
 
141
  # Save Checkpoint
 
150
  metrics.append({"epoch": epoch+1, "loss": results["eval_loss"], "accuracy": results.get("eval_accuracy", 0)})
151
  pd.DataFrame(metrics).to_csv(metrics_file, index=False)
152
 
 
 
 
 
 
 
 
153
  # Update logs & metrics in UI
154
  log_area.text(log_text)
155
  st.line_chart(pd.DataFrame(metrics).set_index("epoch"))
156
 
157
+ # Update progress bar
158
+ progress = (epoch + 1) / epochs
159
+ progress_bar.progress(progress)
160
+
161
  time.sleep(2)
162
 
163
+ # Display final results
164
+ st.write("### Final Results 📈")
165
+ final_metrics = pd.DataFrame(metrics)
166
+ st.line_chart(final_metrics.set_index("epoch"))
167
+ st.write(final_metrics)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
 
169
  # Start Training
170
  if start_train: