prasenjeet099 committed on
Commit
254a40b
·
verified ·
1 Parent(s): 2417ba9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -178
app.py CHANGED
@@ -1,18 +1,10 @@
1
  import streamlit as st
2
  import torch
3
- from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoModelForTokenClassification, AutoModelForSeq2SeqLM
4
- from datasets import load_dataset, Dataset
5
- import pandas as pd
6
- import numpy as np
7
  import os
 
8
  import time
9
- import matplotlib.pyplot as plt
10
- from sklearn.metrics import classification_report, confusion_matrix
11
- import optuna # Hyperparameter tuning
12
- from sklearn.metrics import precision_recall_curve
13
- import seaborn as sns
14
- from torch.utils.data import DataLoader
15
- import shutil
16
 
17
  # Set up Streamlit page
18
  st.set_page_config(page_title="AutoTrain AI", page_icon="🚀", layout="wide")
@@ -21,39 +13,16 @@ st.subheader("Train AI models using PyTorch & Hugging Face Transformers")
21
 
22
  # Sidebar Configuration
23
  st.sidebar.header("Configuration")
24
- hf_user = st.sidebar.selectbox("Hugging Face User", ["hennings1984", "custom_model"])
25
- task = st.sidebar.selectbox("Select Task", ["Text Classification", "Sentiment Analysis", "Question Answering", "Named Entity Recognition (NER)", "Text Generation", "Text Summarization"])
26
  hardware = st.sidebar.selectbox("Hardware", ["CPU", "Single GPU", "Multi-GPU", "TPU"])
27
- model_choice = st.sidebar.selectbox("Choose Model", ["bert-base-uncased", "distilbert-base-uncased", "roberta-base", "t5-small", "bert-large-uncased", "custom_model"])
28
- dataset_source = st.sidebar.selectbox("Dataset Source", ["glue/sst2", "imdb", "ag_news", "squad", "conll2003", "Custom"])
29
-
30
- # Custom Dataset Upload
31
- custom_dataset = None
32
- if dataset_source == "Custom":
33
- custom_dataset_file = st.sidebar.file_uploader("Upload Custom Dataset", type=["csv", "json"])
34
- if custom_dataset_file:
35
- custom_dataset = pd.read_csv(custom_dataset_file) if custom_dataset_file.name.endswith('csv') else pd.read_json(custom_dataset_file)
36
-
37
- # Column Mapping and Split
38
- column_mapping = {
39
- "Text Classification": {"input": "sentence", "label": "label"},
40
- "Sentiment Analysis": {"input": "text", "label": "label"},
41
- "Question Answering": {"input": "question", "context": "context", "label": "answer"},
42
- "Named Entity Recognition (NER)": {"input": "tokens", "label": "labels"},
43
- }
44
-
45
- split_mapping = {
46
- "Text Classification": ["train", "validation"],
47
- "Sentiment Analysis": ["train", "test"],
48
- "Question Answering": ["train", "validation"],
49
- "Named Entity Recognition (NER)": ["train", "validation"],
50
- }
51
-
52
- # Hyperparameters and Training Configuration
53
  epochs = st.sidebar.slider("Number of Epochs", 1, 10, 3)
54
  batch_size = st.sidebar.selectbox("Batch Size", [8, 16, 32, 64], index=1)
55
  learning_rate = st.sidebar.slider("Learning Rate", 1e-6, 1e-3, 2e-5, format="%.6f")
56
- optimizer_choice = st.sidebar.selectbox("Optimizer", ["AdamW", "SGD"])
57
 
58
  # Check if GPU/TPU is available
59
  device = "cuda" if torch.cuda.is_available() and hardware in ["Single GPU", "Multi-GPU"] else "cpu"
@@ -62,92 +31,65 @@ if hardware == "TPU":
62
 
63
  st.sidebar.write(f"**Using Device:** {device.upper()}")
64
 
65
- # Hyperparameter Tuning with Optuna
66
- study = None
67
- if st.sidebar.button("Start Hyperparameter Tuning"):
68
- def objective(trial):
69
- learning_rate = trial.suggest_loguniform("learning_rate", 1e-6, 1e-3)
70
- batch_size = trial.suggest_int("batch_size", 8, 64, step=8)
71
 
72
- # Load dataset and model
73
- tokenizer = AutoTokenizer.from_pretrained(model_choice)
74
- model = AutoModelForSequenceClassification.from_pretrained(model_choice, num_labels=2)
75
 
76
- # Load dataset and tokenize
77
- dataset = load_dataset(dataset_source)
78
- def tokenize_function(examples):
79
- return tokenizer(examples[column_mapping[task]["input"]], truncation=True, padding="max_length")
80
- tokenized_datasets = dataset.map(tokenize_function, batched=True)
81
- train_dataset = tokenized_datasets[split_mapping[task][0]]
82
- eval_dataset = tokenized_datasets[split_mapping[task][1]]
83
-
84
- # Training arguments
85
- training_args = TrainingArguments(
86
- output_dir="./results",
87
- evaluation_strategy="epoch",
88
- logging_dir="./logs",
89
- logging_steps=5,
90
- per_device_train_batch_size=batch_size,
91
- per_device_eval_batch_size=batch_size,
92
- num_train_epochs=epochs,
93
- save_strategy="epoch",
94
- learning_rate=learning_rate,
95
- )
96
-
97
- # Trainer setup
98
- trainer = Trainer(
99
- model=model,
100
- args=training_args,
101
- train_dataset=train_dataset,
102
- eval_dataset=eval_dataset,
103
- )
104
-
105
- trainer.train()
106
- results = trainer.evaluate()
107
- return results["eval_loss"]
108
-
109
- study = optuna.create_study(direction="minimize")
110
- study.optimize(objective, n_trials=10)
111
-
112
- # Display Best Hyperparameters
113
- st.write("Best Hyperparameters found: ", study.best_params)
114
-
115
- # Model Training Function with Checkpoints and Saving
116
- def train_model():
117
- # Load tokenizer and model based on task
118
- tokenizer = AutoTokenizer.from_pretrained(model_choice)
119
-
120
- # Select Model Type Based on Task
121
- if task == "Text Classification" or task == "Sentiment Analysis":
122
- model = AutoModelForSequenceClassification.from_pretrained(model_choice, num_labels=2)
123
-
124
- elif task == "Question Answering":
125
- model = AutoModelForQuestionAnswering.from_pretrained(model_choice)
126
-
127
- elif task == "Named Entity Recognition (NER)":
128
- model = AutoModelForTokenClassification.from_pretrained(model_choice, num_labels=9)
129
 
130
- elif task == "Text Generation":
131
- model = AutoModelForSeq2SeqLM.from_pretrained(model_choice)
 
132
 
133
- elif task == "Text Summarization":
134
- model = AutoModelForSeq2SeqLM.from_pretrained(model_choice)
135
 
136
- # Load dataset and tokenize
137
- dataset = load_dataset(dataset_source)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
 
 
 
 
 
 
 
 
 
 
 
139
  def tokenize_function(examples):
140
- return tokenizer(examples[column_mapping[task]["input"]], truncation=True, padding="max_length")
141
 
 
142
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
143
- train_dataset = tokenized_datasets[split_mapping[task][0]]
144
- eval_dataset = tokenized_datasets[split_mapping[task][1]]
145
 
146
  # Checkpoint Handling
147
- checkpoint_path = "checkpoint.pth"
148
- if os.path.exists(checkpoint_path):
149
  model.load_state_dict(torch.load(checkpoint_path))
150
- st.write("Resuming from checkpoint...")
151
 
152
  # Move model to device
153
  model.to(torch.device(device))
@@ -162,7 +104,7 @@ def train_model():
162
  per_device_eval_batch_size=batch_size,
163
  num_train_epochs=epochs,
164
  save_strategy="epoch",
165
- learning_rate=learning_rate,
166
  )
167
 
168
  # Trainer setup
@@ -173,75 +115,38 @@ def train_model():
173
  eval_dataset=eval_dataset,
174
  )
175
 
176
- # Progress Bar Setup
177
- progress_bar = st.progress(0)
178
-
179
- # Training Loop with Progress Bar
180
- for epoch in range(epochs):
181
- trainer.train()
182
- results = trainer.evaluate()
183
-
184
- # Save Checkpoint after each epoch
185
- torch.save(model.state_dict(), f"checkpoint_epoch_{epoch+1}.pth")
186
-
187
- # Update Progress Bar
188
- progress_bar.progress((epoch + 1) / epochs)
189
 
190
- # Display Results
191
- st.write(f"Epoch {epoch+1}/{epochs} - Loss: {results['eval_loss']:.4f}")
 
192
 
193
- # Show training metrics chart
194
- metrics = {"Epoch": epoch + 1, "Loss": results['eval_loss']}
195
- st.line_chart(pd.DataFrame([metrics]).set_index("Epoch"))
196
 
197
- time.sleep(2)
 
 
 
198
 
199
- # Enhanced Model Evaluation with Confusion Matrix and Precision-Recall Curve
200
- predictions, labels, _ = trainer.predict(eval_dataset)
201
- pred_labels = np.argmax(predictions, axis=-1)
202
 
203
- # Classification Report
204
- report = classification_report(labels, pred_labels, output_dict=True)
205
- st.write("Classification Report:")
206
- st.write(report)
207
 
208
- # Confusion Matrix
209
- cm = confusion_matrix(labels, pred_labels)
210
- fig, ax = plt.subplots(figsize=(6, 6))
211
- sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=np.unique(labels), yticklabels=np.unique(labels))
212
- st.pyplot(fig)
213
 
214
- # Precision-Recall Curve
215
- precision, recall, _ = precision_recall_curve(labels, predictions[:, 1])
216
- plt.figure(figsize=(6, 6))
217
- plt.plot(recall, precision, marker=".", label="Precision-Recall Curve")
218
- plt.xlabel("Recall")
219
- plt.ylabel("Precision")
220
- plt.title("Precision-Recall Curve")
221
- st.pyplot(plt)
222
-
223
- # Save Model Function
224
- def save_model(model, model_name="trained_model"):
225
- output_dir = f"./models/{model_name}"
226
- model.save_pretrained(output_dir)
227
- tokenizer.save_pretrained(output_dir)
228
- st.write(f"Model saved to {output_dir}")
229
-
230
- # Stop Training Button
231
- if st.sidebar.button("Stop Training"):
232
- st.warning("Training stopped manually.")
233
-
234
- # Training Buttons
235
- if st.sidebar.button("Start Training"):
236
  train_model()
237
 
238
- # Model Inference Interface
239
- if st.sidebar.button("Test Model Inference"):
240
- input_text = st.text_area("Input Text for Inference", "Enter text here to get predictions")
241
- if input_text:
242
- inputs = tokenizer(input_text, return_tensors="pt").to(device)
243
- with torch.no_grad():
244
- model.eval()
245
- outputs = model(**inputs)
246
- prediction = torch.argmax(outputs.logits, dim=-1)
247
- st.write(f"Predicted Label: {prediction.item()}")
 
1
  import streamlit as st
2
  import torch
3
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
4
+ from datasets import load_dataset
 
 
5
  import os
6
+ import pandas as pd
7
  import time
 
 
 
 
 
 
 
8
 
9
  # Set up Streamlit page
10
  st.set_page_config(page_title="AutoTrain AI", page_icon="🚀", layout="wide")
 
13
 
14
  # Sidebar Configuration
15
  st.sidebar.header("Configuration")
16
+ hf_user = st.sidebar.selectbox("Hugging Face User", ["hennings1984"])
17
+ task = st.sidebar.selectbox("Select Task", ["Text Classification", "Sentiment Analysis", "Text Generation", "Translation"])
18
  hardware = st.sidebar.selectbox("Hardware", ["CPU", "Single GPU", "Multi-GPU", "TPU"])
19
+ model_choice = st.sidebar.selectbox("Choose Model", ["bert-base-uncased", "distilbert-base-uncased", "roberta-base", "Custom Model"])
20
+ dataset_source = st.sidebar.selectbox("Dataset Source", ["glue/sst2", "imdb", "ag_news", "Custom"])
21
+
22
+ # Training Parameters
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  epochs = st.sidebar.slider("Number of Epochs", 1, 10, 3)
24
  batch_size = st.sidebar.selectbox("Batch Size", [8, 16, 32, 64], index=1)
25
  learning_rate = st.sidebar.slider("Learning Rate", 1e-6, 1e-3, 2e-5, format="%.6f")
 
26
 
27
  # Check if GPU/TPU is available
28
  device = "cuda" if torch.cuda.is_available() and hardware in ["Single GPU", "Multi-GPU"] else "cpu"
 
31
 
32
  st.sidebar.write(f"**Using Device:** {device.upper()}")
33
 
34
+ # Checkpoint Handling
35
+ resume_training = st.sidebar.checkbox("Resume Training from Checkpoint")
36
+ checkpoint_path = "checkpoint.pth" if resume_training else None
 
 
 
37
 
38
+ # File Paths
39
+ log_file = "train_log.txt"
40
+ metrics_file = "metrics.csv"
41
 
42
+ # Training Buttons
43
+ st.write("### Model Training Control")
44
+ start_train = st.button("Start Training 🚀")
45
+ stop_train = st.button("Stop Training ")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
+ # Live Logs Display
48
+ st.write("### Training Logs (Live Updates)")
49
+ log_area = st.empty()
50
 
51
+ # Live Training Metrics
52
+ st.write("### Training Metrics 📊")
53
 
54
+ # Training Function
55
+ def train_model():
56
+ st.success(f"Training started for {task} with {model_choice} on {device.upper()}")
57
+
58
+ # Load model & tokenizer
59
+ tokenizer = AutoTokenizer.from_pretrained(model_choice) if model_choice != "Custom Model" else None
60
+ model = AutoModelForSequenceClassification.from_pretrained(model_choice, num_labels=2) if model_choice != "Custom Model" else None
61
+
62
+ # Load dataset
63
+ if dataset_source == "Custom":
64
+ uploaded_file = st.sidebar.file_uploader("Upload your dataset", type=["csv", "json"])
65
+ if uploaded_file is not None:
66
+ dataset = pd.read_csv(uploaded_file) if uploaded_file.name.endswith(".csv") else pd.read_json(uploaded_file)
67
+ dataset = dataset.to_dict(orient="records")
68
+ else:
69
+ dataset = load_dataset(dataset_source)
70
 
71
+ # Check available columns and ask user for the column name to tokenize
72
+ if "train" in dataset:
73
+ train_data = dataset["train"]
74
+ columns = train_data.features.keys()
75
+ text_column = st.sidebar.selectbox("Select Text Column", list(columns))
76
+ label_column = st.sidebar.selectbox("Select Label Column", list(columns))
77
+ else:
78
+ st.error("Dataset not in correct format. Ensure it has a 'train' split.")
79
+ return
80
+
81
+ # Tokenization function
82
  def tokenize_function(examples):
83
+ return tokenizer(examples[text_column], truncation=True, padding="max_length")
84
 
85
+ # Apply tokenization
86
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
87
+ train_dataset = tokenized_datasets["train"]
88
+ eval_dataset = tokenized_datasets["test"] if "test" in dataset else tokenized_datasets["validation"]
89
 
90
  # Checkpoint Handling
91
+ if resume_training and os.path.exists(checkpoint_path):
 
92
  model.load_state_dict(torch.load(checkpoint_path))
 
93
 
94
  # Move model to device
95
  model.to(torch.device(device))
 
104
  per_device_eval_batch_size=batch_size,
105
  num_train_epochs=epochs,
106
  save_strategy="epoch",
107
+ learning_rate=learning_rate
108
  )
109
 
110
  # Trainer setup
 
115
  eval_dataset=eval_dataset,
116
  )
117
 
118
+ # Training Loop
119
+ metrics = []
120
+ with open(log_file, "w") as log_file_handle:
121
+ log_file_handle.write("Starting training...\n")
122
+ log_file_handle.flush()
 
 
 
 
 
 
 
 
123
 
124
+ for epoch in range(epochs):
125
+ trainer.train()
126
+ results = trainer.evaluate()
127
 
128
+ # Save Checkpoint
129
+ torch.save(model.state_dict(), f"checkpoint_epoch_{epoch+1}.pth")
 
130
 
131
+ # Log results
132
+ log_text = f"Epoch {epoch+1}: Loss = {results['eval_loss']:.4f}, Accuracy = {results.get('eval_accuracy', 0):.4f}\n"
133
+ log_file_handle.write(log_text)
134
+ log_file_handle.flush()
135
 
136
+ # Save metrics
137
+ metrics.append({"epoch": epoch+1, "loss": results["eval_loss"], "accuracy": results.get("eval_accuracy", 0)})
138
+ pd.DataFrame(metrics).to_csv(metrics_file, index=False)
139
 
140
+ # Update logs & metrics in UI
141
+ log_area.text(log_text)
142
+ st.line_chart(pd.DataFrame(metrics).set_index("epoch"))
 
143
 
144
+ time.sleep(2)
 
 
 
 
145
 
146
+ # Start Training
147
+ if start_train:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  train_model()
149
 
150
+ # Stop Training
151
+ if stop_train:
152
+ st.warning("Training stopped manually.")