Update app.py
app.py CHANGED
@@ -1,11 +1,18 @@
 import streamlit as st
 import torch
-import time
-import os
-import pandas as pd
-from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
+from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoModelForTokenClassification, AutoModelForSeq2SeqLM
 from datasets import load_dataset, Dataset
+import pandas as pd
+import numpy as np
+import os
+import time
 import matplotlib.pyplot as plt
+from sklearn.metrics import classification_report, confusion_matrix
+import optuna  # Hyperparameter tuning
+from sklearn.metrics import precision_recall_curve
+import seaborn as sns
+from torch.utils.data import DataLoader
+import shutil

 # Set up Streamlit page
 st.set_page_config(page_title="AutoTrain AI", page_icon="🚀", layout="wide")
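A note on the import block above: besides widening the transformers imports to cover the new tasks, the commit pulls in several extra third-party dependencies (numpy, scikit-learn, optuna, seaborn), while DataLoader and shutil are imported but never used anywhere else in the diff. For the Space to build, the new packages presumably also have to appear in its requirements file; a sketch of what that could look like (the commit itself pins no versions, so the exact list is an assumption):

    # requirements.txt (hypothetical) for this Space after the commit
    streamlit
    torch
    transformers
    datasets
    pandas
    numpy
    matplotlib
    scikit-learn
    seaborn
    optuna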
@@ -14,95 +21,136 @@ st.subheader("Train AI models using PyTorch & Hugging Face Transformers")

 # Sidebar Configuration
 st.sidebar.header("Configuration")
-hf_user = st.sidebar.selectbox("Hugging Face User", ["hennings1984"])
-task = st.sidebar.selectbox("Select Task", ["Text Classification", "Sentiment Analysis"])
+hf_user = st.sidebar.selectbox("Hugging Face User", ["hennings1984", "custom_model"])
+task = st.sidebar.selectbox("Select Task", ["Text Classification", "Sentiment Analysis", "Question Answering", "Named Entity Recognition (NER)", "Text Generation", "Text Summarization"])
 hardware = st.sidebar.selectbox("Hardware", ["CPU", "Single GPU", "Multi-GPU", "TPU"])
-model_choice = st.sidebar.selectbox("Choose Model", ["bert-base-uncased", "distilbert-base-uncased", "roberta-base", "
-dataset_source = st.sidebar.selectbox("Dataset Source", ["glue/sst2", "imdb", "ag_news", "Custom"])
+model_choice = st.sidebar.selectbox("Choose Model", ["bert-base-uncased", "distilbert-base-uncased", "roberta-base", "t5-small", "bert-large-uncased", "custom_model"])
+dataset_source = st.sidebar.selectbox("Dataset Source", ["glue/sst2", "imdb", "ag_news", "squad", "conll2003", "Custom"])

-# Custom Dataset
+# Custom Dataset Upload
 custom_dataset = None
 if dataset_source == "Custom":
-    if
-    custom_dataset = pd.read_csv(
+    custom_dataset_file = st.sidebar.file_uploader("Upload Custom Dataset", type=["csv", "json"])
+    if custom_dataset_file:
+        custom_dataset = pd.read_csv(custom_dataset_file) if custom_dataset_file.name.endswith('csv') else pd.read_json(custom_dataset_file)
+
+# Column Mapping and Split
+column_mapping = {
+    "Text Classification": {"input": "sentence", "label": "label"},
+    "Sentiment Analysis": {"input": "text", "label": "label"},
+    "Question Answering": {"input": "question", "context": "context", "label": "answer"},
+    "Named Entity Recognition (NER)": {"input": "tokens", "label": "labels"},
+}
+
+split_mapping = {
+    "Text Classification": ["train", "validation"],
+    "Sentiment Analysis": ["train", "test"],
+    "Question Answering": ["train", "validation"],
+    "Named Entity Recognition (NER)": ["train", "validation"],
+}
+
+# Hyperparameters and Training Configuration
 epochs = st.sidebar.slider("Number of Epochs", 1, 10, 3)
 batch_size = st.sidebar.selectbox("Batch Size", [8, 16, 32, 64], index=1)
 learning_rate = st.sidebar.slider("Learning Rate", 1e-6, 1e-3, 2e-5, format="%.6f")
+optimizer_choice = st.sidebar.selectbox("Optimizer", ["AdamW", "SGD"])

 # Check if GPU/TPU is available
-device = "
-if
-    device = "
-elif os.environ.get('COLAB_TPU_ADDR'):  # Check if on Google Colab with TPU
-    try:
-        import torch_xla
-        import torch_xla.core.xla_model as xm
-        device = xm.xla_device()  # Set the device to TPU
-    except ImportError:
-        st.error("TPU support is available only with 'torch_xla'. Please install it.")
-elif hardware == "TPU":
-    st.error("TPU is not available in this environment. Please use GPU or CPU.")
+device = "cuda" if torch.cuda.is_available() and hardware in ["Single GPU", "Multi-GPU"] else "cpu"
+if hardware == "TPU":
+    device = "tpu"

 st.sidebar.write(f"**Using Device:** {device.upper()}")

+# Hyperparameter Tuning with Optuna
+study = None
+if st.sidebar.button("Start Hyperparameter Tuning"):
+    def objective(trial):
+        learning_rate = trial.suggest_loguniform("learning_rate", 1e-6, 1e-3)
+        batch_size = trial.suggest_int("batch_size", 8, 64, step=8)

+        # Load dataset and model
+        tokenizer = AutoTokenizer.from_pretrained(model_choice)
+        model = AutoModelForSequenceClassification.from_pretrained(model_choice, num_labels=2)

+        # Load dataset and tokenize
+        dataset = load_dataset(dataset_source)
+        def tokenize_function(examples):
+            return tokenizer(examples[column_mapping[task]["input"]], truncation=True, padding="max_length")
+        tokenized_datasets = dataset.map(tokenize_function, batched=True)
+        train_dataset = tokenized_datasets[split_mapping[task][0]]
+        eval_dataset = tokenized_datasets[split_mapping[task][1]]
+
+        # Training arguments
+        training_args = TrainingArguments(
+            output_dir="./results",
+            evaluation_strategy="epoch",
+            logging_dir="./logs",
+            logging_steps=5,
+            per_device_train_batch_size=batch_size,
+            per_device_eval_batch_size=batch_size,
+            num_train_epochs=epochs,
+            save_strategy="epoch",
+            learning_rate=learning_rate,
+        )
+
+        # Trainer setup
+        trainer = Trainer(
+            model=model,
+            args=training_args,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+        )
+
+        trainer.train()
+        results = trainer.evaluate()
+        return results["eval_loss"]
+
+    study = optuna.create_study(direction="minimize")
+    study.optimize(objective, n_trials=10)
+
+    # Display Best Hyperparameters
+    st.write("Best Hyperparameters found: ", study.best_params)
+
+# Model Training Function with Checkpoints and Saving
+def train_model():
+    # Load tokenizer and model based on task
+    tokenizer = AutoTokenizer.from_pretrained(model_choice)

+    # Select Model Type Based on Task
+    if task == "Text Classification" or task == "Sentiment Analysis":
+        model = AutoModelForSequenceClassification.from_pretrained(model_choice, num_labels=2)

+    elif task == "Question Answering":
+        model = AutoModelForQuestionAnswering.from_pretrained(model_choice)

-    st.success(f"Training started for {task} with {model_choice} on {device.upper()}")
+    elif task == "Named Entity Recognition (NER)":
+        model = AutoModelForTokenClassification.from_pretrained(model_choice, num_labels=9)

-    tokenizer = AutoTokenizer.from_pretrained(model_choice)
-    model = AutoModelForSequenceClassification.from_pretrained(model_choice, num_labels=2)
-    else:
-        # For custom model, assume user will upload a pre-trained model or enter model code
-        st.error("Custom model support not yet implemented. Please use a base model.")
-        return
+    elif task == "Text Generation":
+        model = AutoModelForSeq2SeqLM.from_pretrained(model_choice)

-    dataset = Dataset.from_pandas(custom_dataset)
+    elif task == "Text Summarization":
+        model = AutoModelForSeq2SeqLM.from_pretrained(model_choice)
+
+    # Load dataset and tokenize
+    dataset = load_dataset(dataset_source)

-    # Tokenization function
     def tokenize_function(examples):
-        return tokenizer(examples["
+        return tokenizer(examples[column_mapping[task]["input"]], truncation=True, padding="max_length")

     tokenized_datasets = dataset.map(tokenize_function, batched=True)
-    train_dataset = tokenized_datasets[
-    eval_dataset = tokenized_datasets
+    train_dataset = tokenized_datasets[split_mapping[task][0]]
+    eval_dataset = tokenized_datasets[split_mapping[task][1]]

     # Checkpoint Handling
+    checkpoint_path = "checkpoint.pth"
+    if os.path.exists(checkpoint_path):
         model.load_state_dict(torch.load(checkpoint_path))
+        st.write("Resuming from checkpoint...")

     # Move model to device
-    model.to(device)
+    model.to(torch.device(device))

     # Training arguments
     training_args = TrainingArguments(
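One caveat about the device logic in this hunk: the new code replaces the removed torch_xla branch with a plain string, but "tpu" is not a valid PyTorch device type, so model.to(torch.device(device)) later in the diff will raise as soon as the TPU option is selected. A minimal sketch of a helper that keeps the new CUDA/CPU fallback while restoring the removed torch_xla path (resolve_device is a hypothetical name, not part of the commit):

    import torch

    def resolve_device(hardware: str):
        """Map the sidebar hardware choice to a device that model.to() accepts."""
        if hardware == "TPU":
            try:
                # TPU devices are exposed through torch_xla, as in the removed code
                import torch_xla.core.xla_model as xm
                return xm.xla_device()
            except ImportError:
                pass  # fall through to GPU/CPU if torch_xla is missing
        if hardware in ("Single GPU", "Multi-GPU") and torch.cuda.is_available():
            return torch.device("cuda")
        return torch.device("cpu")

With this, the sidebar line would need str(device) rather than device.upper(), since a torch.device object has no upper() method.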
@@ -114,7 +162,7 @@ def train_model():
         per_device_eval_batch_size=batch_size,
         num_train_epochs=epochs,
         save_strategy="epoch",
-        learning_rate=learning_rate
+        learning_rate=learning_rate,
     )

     # Trainer setup
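On the Optuna objective defined earlier in this diff: trial.suggest_loguniform still works but has been deprecated in Optuna for some time; trial.suggest_float(..., log=True) is the current spelling of the same log-uniform search. A sketch of the equivalent search space (the training and evaluation body is elided; the placeholder return stands in for the real eval_loss):

    import optuna

    def objective(trial):
        # Same ranges as the objective in the diff, via the non-deprecated API
        learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-3, log=True)
        batch_size = trial.suggest_int("batch_size", 8, 64, step=8)
        # ... build tokenizer/model, train and evaluate as in the diff ...
        return 0.0  # placeholder for results["eval_loss"]

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=10)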
@@ -125,51 +173,75 @@ def train_model():
         eval_dataset=eval_dataset,
     )

-    # Progress
+    # Progress Bar Setup
     progress_bar = st.progress(0)

-    # Training Loop
+    # Training Loop with Progress Bar
+    for epoch in range(epochs):
+        trainer.train()
+        results = trainer.evaluate()
+
+        # Save Checkpoint after each epoch
+        torch.save(model.state_dict(), f"checkpoint_epoch_{epoch+1}.pth")
+
+        # Update Progress Bar
+        progress_bar.progress((epoch + 1) / epochs)
+
+        # Display Results
+        st.write(f"Epoch {epoch+1}/{epochs} - Loss: {results['eval_loss']:.4f}")
+
+        # Show training metrics chart
+        metrics = {"Epoch": epoch + 1, "Loss": results['eval_loss']}
+        st.line_chart(pd.DataFrame([metrics]).set_index("Epoch"))
+
+        time.sleep(2)
+
+    # Enhanced Model Evaluation with Confusion Matrix and Precision-Recall Curve
+    predictions, labels, _ = trainer.predict(eval_dataset)
+    pred_labels = np.argmax(predictions, axis=-1)
+
+    # Classification Report
+    report = classification_report(labels, pred_labels, output_dict=True)
+    st.write("Classification Report:")
+    st.write(report)
+
+    # Confusion Matrix
+    cm = confusion_matrix(labels, pred_labels)
+    fig, ax = plt.subplots(figsize=(6, 6))
+    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=np.unique(labels), yticklabels=np.unique(labels))
+    st.pyplot(fig)
+
+    # Precision-Recall Curve
+    precision, recall, _ = precision_recall_curve(labels, predictions[:, 1])
+    plt.figure(figsize=(6, 6))
+    plt.plot(recall, precision, marker=".", label="Precision-Recall Curve")
+    plt.xlabel("Recall")
+    plt.ylabel("Precision")
+    plt.title("Precision-Recall Curve")
+    st.pyplot(plt)
+
+# Save Model Function
+def save_model(model, model_name="trained_model"):
+    output_dir = f"./models/{model_name}"
+    model.save_pretrained(output_dir)
+    tokenizer.save_pretrained(output_dir)
+    st.write(f"Model saved to {output_dir}")
+
+# Stop Training Button
+if st.sidebar.button("Stop Training"):
+    st.warning("Training stopped manually.")

-        pd.DataFrame(metrics).to_csv(metrics_file, index=False)
-
-        # Update logs & metrics in UI
-        log_area.text(log_text)
-        st.line_chart(pd.DataFrame(metrics).set_index("epoch"))
-
-        # Update progress bar
-        progress = (epoch + 1) / epochs
-        progress_bar.progress(progress)
-
-        time.sleep(2)
-
-    # Display final results
-    st.write("### Final Results 📈")
-    final_metrics = pd.DataFrame(metrics)
-    st.line_chart(final_metrics.set_index("epoch"))
-    st.write(final_metrics)
-
-# Start Training
-if start_train:
+# Training Buttons
+if st.sidebar.button("Start Training"):
     train_model()

+# Model Inference Interface
+if st.sidebar.button("Test Model Inference"):
+    input_text = st.text_area("Input Text for Inference", "Enter text here to get predictions")
+    if input_text:
+        inputs = tokenizer(input_text, return_tensors="pt").to(device)
+        with torch.no_grad():
+            model.eval()
+            outputs = model(**inputs)
+            prediction = torch.argmax(outputs.logits, dim=-1)
+        st.write(f"Predicted Label: {prediction.item()}")
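A caution on the training loop in the hunk above: trainer.train() already runs the full num_train_epochs=epochs schedule, so wrapping it in for epoch in range(epochs) restarts and re-runs the whole schedule on every iteration rather than advancing one epoch at a time. One way to keep the per-epoch Streamlit updates with a single trainer.train() call is a TrainerCallback; a sketch under that assumption (the callback class and its wiring are not part of the commit):

    from transformers import TrainerCallback

    class StreamlitProgressCallback(TrainerCallback):
        """Advance a st.progress bar at the end of every epoch."""
        def __init__(self, progress_bar, total_epochs):
            self.progress_bar = progress_bar
            self.total_epochs = total_epochs

        def on_epoch_end(self, args, state, control, **kwargs):
            # state.epoch counts completed epochs as a float
            self.progress_bar.progress(min(state.epoch / self.total_epochs, 1.0))

    # trainer = Trainer(..., callbacks=[StreamlitProgressCallback(progress_bar, epochs)])
    # trainer.train()  # one call; the callback drives the per-epoch UI updates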
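Finally, two things to note about the inference block as committed: tokenizer and model are local to train_model, so they are undefined at module level where this block runs, and a st.text_area nested under a st.sidebar.button rarely survives Streamlit's rerun model (a button is only True on the rerun its click triggers). A self-contained sketch that instead reloads whatever save_model wrote (the ./models/trained_model path mirrors save_model's default; st.cache_resource assumes a reasonably recent Streamlit; everything else here is an assumption, not the commit's code):

    import torch
    import streamlit as st
    from transformers import AutoTokenizer, AutoModelForSequenceClassification

    @st.cache_resource  # cache the reload across Streamlit reruns
    def load_trained(model_dir="./models/trained_model"):
        tokenizer = AutoTokenizer.from_pretrained(model_dir)
        model = AutoModelForSequenceClassification.from_pretrained(model_dir)
        model.eval()
        return tokenizer, model

    input_text = st.text_area("Input Text for Inference")
    if input_text:
        tokenizer, model = load_trained()
        inputs = tokenizer(input_text, return_tensors="pt")
        with torch.no_grad():
            logits = model(**inputs).logits
        st.write(f"Predicted Label: {logits.argmax(dim=-1).item()}")

Run locally with streamlit run app.py.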