Spaces:

zltd
/

LLM-Tuner

Sleeping

App Files Community

Update app.py

by prasenjeet099 - opened Mar 1, 2025

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+49

-88

Files changed (1) hide show

app.py +49 -88

app.py CHANGED Viewed

@@ -5,9 +5,7 @@ import os
 import pandas as pd
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
 from datasets import load_dataset, Dataset
-from sklearn.metrics import confusion_matrix
-from sklearn.model_selection import train_test_split
-from tqdm import tqdm  # For progress bar during training
 # Set up Streamlit page
 st.set_page_config(page_title="AutoTrain AI", page_icon="🚀", layout="wide")
@@ -19,12 +17,16 @@ st.sidebar.header("Configuration")
 hf_user = st.sidebar.selectbox("Hugging Face User", ["hennings1984"])
 task = st.sidebar.selectbox("Select Task", ["Text Classification", "Sentiment Analysis"])
 hardware = st.sidebar.selectbox("Hardware", ["CPU", "Single GPU", "Multi-GPU", "TPU"])
-model_choice = st.sidebar.selectbox("Choose Model", ["bert-base-uncased", "distilbert-base-uncased", "roberta-base"])
 dataset_source = st.sidebar.selectbox("Dataset Source", ["glue/sst2", "imdb", "ag_news", "Custom"])
-# Column Mapping for custom datasets
-text_column = st.sidebar.text_input("Text Column", "text")
-label_column = st.sidebar.text_input("Label Column", "label")
 # Training Parameters
 epochs = st.sidebar.slider("Number of Epochs", 1, 10, 3)
@@ -32,9 +34,18 @@ batch_size = st.sidebar.selectbox("Batch Size", [8, 16, 32, 64], index=1)
 learning_rate = st.sidebar.slider("Learning Rate", 1e-6, 1e-3, 2e-5, format="%.6f")
 # Check if GPU/TPU is available
-device = "cuda" if torch.cuda.is_available() and hardware in ["Single GPU", "Multi-GPU"] else "cpu"
-if hardware == "TPU":
-    device = "tpu"
 st.sidebar.write(f"**Using Device:** {device.upper()}")
@@ -57,54 +68,41 @@ log_area = st.empty()
 # Live Training Metrics
 st.write("### Training Metrics 📊")
-progress_bar = st.progress(0)  # Initialize progress bar
 # Training Function
 def train_model():
     st.success(f"Training started for {task} with {model_choice} on {device.upper()}")
     # Load model & tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(model_choice)
-    model = AutoModelForSequenceClassification.from_pretrained(model_choice, num_labels=2)  # Adjust num_labels as necessary
     # Load dataset
-    if dataset_source.lower() != "custom":
         dataset = load_dataset(dataset_source)
     else:
-        # Handle Custom Dataset
-        uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
-        if uploaded_file is not None:
-            dataset_df = pd.read_csv(uploaded_file)
-            dataset = Dataset.from_pandas(dataset_df)
     # Tokenization function
     def tokenize_function(examples):
-        return tokenizer(examples[text_column], truncation=True, padding="max_length")
     tokenized_datasets = dataset.map(tokenize_function, batched=True)
-    # Handle missing or non-standard splits
-    if "train" in tokenized_datasets:
-        train_dataset = tokenized_datasets["train"]
-    else:
-        # Create a custom split if no train split exists
-        train_dataset = tokenized_datasets
-        train_dataset, eval_dataset = train_test_split(train_dataset, test_size=0.1)
-    # Check for validation or test split
-    if "validation" in tokenized_datasets:
-        eval_dataset = tokenized_datasets["validation"]
-    elif "test" in tokenized_datasets:
-        eval_dataset = tokenized_datasets["test"]
-    else:
-        raise ValueError("Dataset does not have a 'validation' or 'test' split.")
     # Checkpoint Handling
     if resume_training and os.path.exists(checkpoint_path):
         model.load_state_dict(torch.load(checkpoint_path))
     # Move model to device
-    model.to(torch.device(device))
     # Training arguments
     training_args = TrainingArguments(
@@ -127,27 +125,17 @@ def train_model():
         eval_dataset=eval_dataset,
     )
-    # Training Loop with Progress Bar
-    metrics = []
-    loss_values = []  # To store loss values for plotting
-    accuracy_values = []  # To store accuracy values for plotting
-    all_preds = []  # To store predictions for confusion matrix
-    all_labels = []  # To store true labels for confusion matrix
     with open(log_file, "w") as log_file_handle:
         log_file_handle.write("Starting training...\n")
         log_file_handle.flush()
         for epoch in range(epochs):
-            # Initialize progress bar for this epoch
-            progress_bar.progress(0)  # Reset progress bar at the start of each epoch
-            # Manual training loop that updates the Streamlit progress bar after every step
-            for step, batch in enumerate(trainer.get_train_dataloader()):
-                trainer.training_step(model, batch)  # Perform a training step
-                progress_bar.progress((step + 1) / len(trainer.get_train_dataloader()))  # Update progress bar
-            # Evaluate the model at the end of each epoch
             results = trainer.evaluate()
             # Save Checkpoint
@@ -162,48 +150,21 @@ def train_model():
             metrics.append({"epoch": epoch+1, "loss": results["eval_loss"], "accuracy": results.get("eval_accuracy", 0)})
             pd.DataFrame(metrics).to_csv(metrics_file, index=False)
-            loss_values.append(results["eval_loss"])
-            accuracy_values.append(results.get("eval_accuracy", 0))
-            # Collect predictions and labels for confusion matrix
-            all_preds.extend(results.get("logits", []))
-            all_labels.extend(eval_dataset["label"])
             # Update logs & metrics in UI
             log_area.text(log_text)
             st.line_chart(pd.DataFrame(metrics).set_index("epoch"))
             time.sleep(2)
-    # After training, plot charts for loss, accuracy, and confusion matrix
-    plot_metrics(loss_values, accuracy_values)
-    plot_confusion_matrix(all_labels, all_preds)
-def plot_metrics(loss_values, accuracy_values):
-    # Plot Loss Curve using Streamlit chart
-    metrics_df = pd.DataFrame({
-        "Epoch": range(1, len(loss_values) + 1),
-        "Loss": loss_values,
-        "Accuracy": accuracy_values
-    })
-    st.write("### Training Loss and Accuracy Curve")
-    st.line_chart(metrics_df.set_index("Epoch"))
-def plot_confusion_matrix(true_labels, preds):
-    # Convert logits to predicted class labels
-    pred_labels = torch.argmax(torch.tensor(preds), axis=1).numpy()
-    # Compute confusion matrix
-    cm = confusion_matrix(true_labels, pred_labels)
-    # Draw the confusion matrix as a seaborn heatmap and render the figure with st.pyplot
-    fig, ax = plt.subplots(figsize=(8, 6))
-    ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Class 0", "Class 1"], yticklabels=["Class 0", "Class 1"])
-    ax.set_title("Confusion Matrix")
-    ax.set_xlabel("Predicted Label")
-    ax.set_ylabel("True Label")
-    st.pyplot(fig)
 # Start Training
 if start_train:

 import pandas as pd
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
 from datasets import load_dataset, Dataset
+import matplotlib.pyplot as plt
 # Set up Streamlit page
 st.set_page_config(page_title="AutoTrain AI", page_icon="🚀", layout="wide")
 hf_user = st.sidebar.selectbox("Hugging Face User", ["hennings1984"])
 task = st.sidebar.selectbox("Select Task", ["Text Classification", "Sentiment Analysis"])
 hardware = st.sidebar.selectbox("Hardware", ["CPU", "Single GPU", "Multi-GPU", "TPU"])
+model_choice = st.sidebar.selectbox("Choose Model", ["bert-base-uncased", "distilbert-base-uncased", "roberta-base", "None (Custom Model)"])
 dataset_source = st.sidebar.selectbox("Dataset Source", ["glue/sst2", "imdb", "ag_news", "Custom"])
+# Custom Dataset or Predefined Dataset
+custom_dataset = None
+if dataset_source == "Custom":
+    file = st.sidebar.file_uploader("Upload Custom Dataset", type=["csv", "json"])
+    if file is not None:
+        custom_dataset = pd.read_csv(file) if file.name.endswith(".csv") else pd.read_json(file)
+        st.sidebar.write(f"Dataset uploaded with {len(custom_dataset)} rows")
 # Training Parameters
 epochs = st.sidebar.slider("Number of Epochs", 1, 10, 3)
 learning_rate = st.sidebar.slider("Learning Rate", 1e-6, 1e-3, 2e-5, format="%.6f")
 # Check if GPU/TPU is available
+device = "cpu"  # Default to CPU
+if torch.cuda.is_available() and hardware in ["Single GPU", "Multi-GPU"]:
+    device = "cuda"
+elif os.environ.get('COLAB_TPU_ADDR'):  # Check if on Google Colab with TPU
+    try:
+        import torch_xla
+        import torch_xla.core.xla_model as xm
+        device = xm.xla_device()  # Set the device to TPU
+    except ImportError:
+        st.error("TPU support is available only with 'torch_xla'. Please install it.")
+elif hardware == "TPU":
+    st.error("TPU is not available in this environment. Please use GPU or CPU.")
 st.sidebar.write(f"**Using Device:** {device.upper()}")
 # Live Training Metrics
 st.write("### Training Metrics 📊")
 # Training Function
 def train_model():
     st.success(f"Training started for {task} with {model_choice} on {device.upper()}")
     # Load model & tokenizer
+    if model_choice != "None (Custom Model)":
+        tokenizer = AutoTokenizer.from_pretrained(model_choice)
+        model = AutoModelForSequenceClassification.from_pretrained(model_choice, num_labels=2)
+    else:
+        # Custom models are not supported yet: show an error and abort training
+        st.error("Custom model support not yet implemented. Please use a base model.")
+        return
     # Load dataset
+    if dataset_source != "Custom":
         dataset = load_dataset(dataset_source)
     else:
+        # Custom dataset: the DataFrame loaded from the uploaded CSV or JSON file
+        dataset = Dataset.from_pandas(custom_dataset)
     # Tokenization function
     def tokenize_function(examples):
+        return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)
     tokenized_datasets = dataset.map(tokenize_function, batched=True)
+    train_dataset = tokenized_datasets["train"]
+    eval_dataset = tokenized_datasets.get("validation", tokenized_datasets["test"])
     # Checkpoint Handling
     if resume_training and os.path.exists(checkpoint_path):
         model.load_state_dict(torch.load(checkpoint_path))
     # Move model to device
+    model.to(device)
     # Training arguments
     training_args = TrainingArguments(
         eval_dataset=eval_dataset,
     )
+    # Progress bar for training
+    progress_bar = st.progress(0)
+    # Training Loop
+    metrics = []
     with open(log_file, "w") as log_file_handle:
         log_file_handle.write("Starting training...\n")
         log_file_handle.flush()
         for epoch in range(epochs):
+            trainer.train()
             results = trainer.evaluate()
             # Save Checkpoint
             metrics.append({"epoch": epoch+1, "loss": results["eval_loss"], "accuracy": results.get("eval_accuracy", 0)})
             pd.DataFrame(metrics).to_csv(metrics_file, index=False)
             # Update logs & metrics in UI
             log_area.text(log_text)
             st.line_chart(pd.DataFrame(metrics).set_index("epoch"))
+            # Update progress bar
+            progress = (epoch + 1) / epochs
+            progress_bar.progress(progress)
             time.sleep(2)
+    # Display final results
+    st.write("### Final Results 📈")
+    final_metrics = pd.DataFrame(metrics)
+    st.line_chart(final_metrics.set_index("epoch"))
+    st.write(final_metrics)
 # Start Training
 if start_train: