prasenjeet099 committed on
Commit
254a40b
·
verified ·
1 Parent(s): 2417ba9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -178
app.py CHANGED
@@ -1,18 +1,10 @@
1
  import streamlit as st
2
  import torch
3
- from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoModelForTokenClassification, AutoModelForSeq2SeqLM
4
- from datasets import load_dataset, Dataset
5
- import pandas as pd
6
- import numpy as np
7
  import os
 
8
  import time
9
- import matplotlib.pyplot as plt
10
- from sklearn.metrics import classification_report, confusion_matrix
11
- import optuna # Hyperparameter tuning
12
- from sklearn.metrics import precision_recall_curve
13
- import seaborn as sns
14
- from torch.utils.data import DataLoader
15
- import shutil
16
 
17
  # Set up Streamlit page
18
  st.set_page_config(page_title="AutoTrain AI", page_icon="🚀", layout="wide")
@@ -21,39 +13,16 @@ st.subheader("Train AI models using PyTorch & Hugging Face Transformers")
21
 
22
  # Sidebar Configuration
23
  st.sidebar.header("Configuration")
24
- hf_user = st.sidebar.selectbox("Hugging Face User", ["hennings1984", "custom_model"])
25
- task = st.sidebar.selectbox("Select Task", ["Text Classification", "Sentiment Analysis", "Question Answering", "Named Entity Recognition (NER)", "Text Generation", "Text Summarization"])
26
  hardware = st.sidebar.selectbox("Hardware", ["CPU", "Single GPU", "Multi-GPU", "TPU"])
27
- model_choice = st.sidebar.selectbox("Choose Model", ["bert-base-uncased", "distilbert-base-uncased", "roberta-base", "t5-small", "bert-large-uncased", "custom_model"])
28
- dataset_source = st.sidebar.selectbox("Dataset Source", ["glue/sst2", "imdb", "ag_news", "squad", "conll2003", "Custom"])
29
-
30
- # Custom Dataset Upload
31
- custom_dataset = None
32
- if dataset_source == "Custom":
33
- custom_dataset_file = st.sidebar.file_uploader("Upload Custom Dataset", type=["csv", "json"])
34
- if custom_dataset_file:
35
- custom_dataset = pd.read_csv(custom_dataset_file) if custom_dataset_file.name.endswith('csv') else pd.read_json(custom_dataset_file)
36
-
37
- # Column Mapping and Split
38
- column_mapping = {
39
- "Text Classification": {"input": "sentence", "label": "label"},
40
- "Sentiment Analysis": {"input": "text", "label": "label"},
41
- "Question Answering": {"input": "question", "context": "context", "label": "answer"},
42
- "Named Entity Recognition (NER)": {"input": "tokens", "label": "labels"},
43
- }
44
-
45
- split_mapping = {
46
- "Text Classification": ["train", "validation"],
47
- "Sentiment Analysis": ["train", "test"],
48
- "Question Answering": ["train", "validation"],
49
- "Named Entity Recognition (NER)": ["train", "validation"],
50
- }
51
-
52
- # Hyperparameters and Training Configuration
53
  epochs = st.sidebar.slider("Number of Epochs", 1, 10, 3)
54
  batch_size = st.sidebar.selectbox("Batch Size", [8, 16, 32, 64], index=1)
55
  learning_rate = st.sidebar.slider("Learning Rate", 1e-6, 1e-3, 2e-5, format="%.6f")
56
- optimizer_choice = st.sidebar.selectbox("Optimizer", ["AdamW", "SGD"])
57
 
58
  # Check if GPU/TPU is available
59
  device = "cuda" if torch.cuda.is_available() and hardware in ["Single GPU", "Multi-GPU"] else "cpu"
@@ -62,92 +31,65 @@ if hardware == "TPU":
62
 
63
  st.sidebar.write(f"**Using Device:** {device.upper()}")
64
 
65
- # Hyperparameter Tuning with Optuna
66
- study = None
67
- if st.sidebar.button("Start Hyperparameter Tuning"):
68
- def objective(trial):
69
- learning_rate = trial.suggest_loguniform("learning_rate", 1e-6, 1e-3)
70
- batch_size = trial.suggest_int("batch_size", 8, 64, step=8)
71
 
72
- # Load dataset and model
73
- tokenizer = AutoTokenizer.from_pretrained(model_choice)
74
- model = AutoModelForSequenceClassification.from_pretrained(model_choice, num_labels=2)
75
 
76
- # Load dataset and tokenize
77
- dataset = load_dataset(dataset_source)
78
- def tokenize_function(examples):
79
- return tokenizer(examples[column_mapping[task]["input"]], truncation=True, padding="max_length")
80
- tokenized_datasets = dataset.map(tokenize_function, batched=True)
81
- train_dataset = tokenized_datasets[split_mapping[task][0]]
82
- eval_dataset = tokenized_datasets[split_mapping[task][1]]
83
-
84
- # Training arguments
85
- training_args = TrainingArguments(
86
- output_dir="./results",
87
- evaluation_strategy="epoch",
88
- logging_dir="./logs",
89
- logging_steps=5,
90
- per_device_train_batch_size=batch_size,
91
- per_device_eval_batch_size=batch_size,
92
- num_train_epochs=epochs,
93
- save_strategy="epoch",
94
- learning_rate=learning_rate,
95
- )
96
-
97
- # Trainer setup
98
- trainer = Trainer(
99
- model=model,
100
- args=training_args,
101
- train_dataset=train_dataset,
102
- eval_dataset=eval_dataset,
103
- )
104
-
105
- trainer.train()
106
- results = trainer.evaluate()
107
- return results["eval_loss"]
108
-
109
- study = optuna.create_study(direction="minimize")
110
- study.optimize(objective, n_trials=10)
111
-
112
- # Display Best Hyperparameters
113
- st.write("Best Hyperparameters found: ", study.best_params)
114
-
115
- # Model Training Function with Checkpoints and Saving
116
- def train_model():
117
- # Load tokenizer and model based on task
118
- tokenizer = AutoTokenizer.from_pretrained(model_choice)
119
-
120
- # Select Model Type Based on Task
121
- if task == "Text Classification" or task == "Sentiment Analysis":
122
- model = AutoModelForSequenceClassification.from_pretrained(model_choice, num_labels=2)
123
-
124
- elif task == "Question Answering":
125
- model = AutoModelForQuestionAnswering.from_pretrained(model_choice)
126
-
127
- elif task == "Named Entity Recognition (NER)":
128
- model = AutoModelForTokenClassification.from_pretrained(model_choice, num_labels=9)
129
 
130
- elif task == "Text Generation":
131
- model = AutoModelForSeq2SeqLM.from_pretrained(model_choice)
 
132
 
133
- elif task == "Text Summarization":
134
- model = AutoModelForSeq2SeqLM.from_pretrained(model_choice)
135
 
136
- # Load dataset and tokenize
137
- dataset = load_dataset(dataset_source)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
 
 
 
 
 
 
 
 
 
 
 
139
  def tokenize_function(examples):
140
- return tokenizer(examples[column_mapping[task]["input"]], truncation=True, padding="max_length")
141
 
 
142
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
143
- train_dataset = tokenized_datasets[split_mapping[task][0]]
144
- eval_dataset = tokenized_datasets[split_mapping[task][1]]
145
 
146
  # Checkpoint Handling
147
- checkpoint_path = "checkpoint.pth"
148
- if os.path.exists(checkpoint_path):
149
  model.load_state_dict(torch.load(checkpoint_path))
150
- st.write("Resuming from checkpoint...")
151
 
152
  # Move model to device
153
  model.to(torch.device(device))
@@ -162,7 +104,7 @@ def train_model():
162
  per_device_eval_batch_size=batch_size,
163
  num_train_epochs=epochs,
164
  save_strategy="epoch",
165
- learning_rate=learning_rate,
166
  )
167
 
168
  # Trainer setup
@@ -173,75 +115,38 @@ def train_model():
173
  eval_dataset=eval_dataset,
174
  )
175
 
176
- # Progress Bar Setup
177
- progress_bar = st.progress(0)
178
-
179
- # Training Loop with Progress Bar
180
- for epoch in range(epochs):
181
- trainer.train()
182
- results = trainer.evaluate()
183
-
184
- # Save Checkpoint after each epoch
185
- torch.save(model.state_dict(), f"checkpoint_epoch_{epoch+1}.pth")
186
-
187
- # Update Progress Bar
188
- progress_bar.progress((epoch + 1) / epochs)
189
 
190
- # Display Results
191
- st.write(f"Epoch {epoch+1}/{epochs} - Loss: {results['eval_loss']:.4f}")
 
192
 
193
- # Show training metrics chart
194
- metrics = {"Epoch": epoch + 1, "Loss": results['eval_loss']}
195
- st.line_chart(pd.DataFrame([metrics]).set_index("Epoch"))
196
 
197
- time.sleep(2)
 
 
 
198
 
199
- # Enhanced Model Evaluation with Confusion Matrix and Precision-Recall Curve
200
- predictions, labels, _ = trainer.predict(eval_dataset)
201
- pred_labels = np.argmax(predictions, axis=-1)
202
 
203
- # Classification Report
204
- report = classification_report(labels, pred_labels, output_dict=True)
205
- st.write("Classification Report:")
206
- st.write(report)
207
 
208
- # Confusion Matrix
209
- cm = confusion_matrix(labels, pred_labels)
210
- fig, ax = plt.subplots(figsize=(6, 6))
211
- sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=np.unique(labels), yticklabels=np.unique(labels))
212
- st.pyplot(fig)
213
 
214
- # Precision-Recall Curve
215
- precision, recall, _ = precision_recall_curve(labels, predictions[:, 1])
216
- plt.figure(figsize=(6, 6))
217
- plt.plot(recall, precision, marker=".", label="Precision-Recall Curve")
218
- plt.xlabel("Recall")
219
- plt.ylabel("Precision")
220
- plt.title("Precision-Recall Curve")
221
- st.pyplot(plt)
222
-
223
- # Save Model Function
224
- def save_model(model, model_name="trained_model"):
225
- output_dir = f"./models/{model_name}"
226
- model.save_pretrained(output_dir)
227
- tokenizer.save_pretrained(output_dir)
228
- st.write(f"Model saved to {output_dir}")
229
-
230
- # Stop Training Button
231
- if st.sidebar.button("Stop Training"):
232
- st.warning("Training stopped manually.")
233
-
234
- # Training Buttons
235
- if st.sidebar.button("Start Training"):
236
  train_model()
237
 
238
- # Model Inference Interface
239
- if st.sidebar.button("Test Model Inference"):
240
- input_text = st.text_area("Input Text for Inference", "Enter text here to get predictions")
241
- if input_text:
242
- inputs = tokenizer(input_text, return_tensors="pt").to(device)
243
- with torch.no_grad():
244
- model.eval()
245
- outputs = model(**inputs)
246
- prediction = torch.argmax(outputs.logits, dim=-1)
247
- st.write(f"Predicted Label: {prediction.item()}")
 
1
  import streamlit as st
2
  import torch
3
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
4
+ from datasets import load_dataset
 
 
5
  import os
6
+ import pandas as pd
7
  import time
 
 
 
 
 
 
 
8
 
9
  # Set up Streamlit page
10
  st.set_page_config(page_title="AutoTrain AI", page_icon="🚀", layout="wide")
 
13
 
14
  # Sidebar Configuration
15
  st.sidebar.header("Configuration")
16
+ hf_user = st.sidebar.selectbox("Hugging Face User", ["hennings1984"])
17
+ task = st.sidebar.selectbox("Select Task", ["Text Classification", "Sentiment Analysis", "Text Generation", "Translation"])
18
  hardware = st.sidebar.selectbox("Hardware", ["CPU", "Single GPU", "Multi-GPU", "TPU"])
19
+ model_choice = st.sidebar.selectbox("Choose Model", ["bert-base-uncased", "distilbert-base-uncased", "roberta-base", "Custom Model"])
20
+ dataset_source = st.sidebar.selectbox("Dataset Source", ["glue/sst2", "imdb", "ag_news", "Custom"])
21
+
22
+ # Training Parameters
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  epochs = st.sidebar.slider("Number of Epochs", 1, 10, 3)
24
  batch_size = st.sidebar.selectbox("Batch Size", [8, 16, 32, 64], index=1)
25
  learning_rate = st.sidebar.slider("Learning Rate", 1e-6, 1e-3, 2e-5, format="%.6f")
 
26
 
27
  # Check if GPU/TPU is available
28
  device = "cuda" if torch.cuda.is_available() and hardware in ["Single GPU", "Multi-GPU"] else "cpu"
 
31
 
32
  st.sidebar.write(f"**Using Device:** {device.upper()}")
33
 
34
+ # Checkpoint Handling
35
+ resume_training = st.sidebar.checkbox("Resume Training from Checkpoint")
36
+ checkpoint_path = "checkpoint.pth" if resume_training else None
 
 
 
37
 
38
+ # File Paths
39
+ log_file = "train_log.txt"
40
+ metrics_file = "metrics.csv"
41
 
42
+ # Training Buttons
43
+ st.write("### Model Training Control")
44
+ start_train = st.button("Start Training 🚀")
45
+ stop_train = st.button("Stop Training ")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
+ # Live Logs Display
48
+ st.write("### Training Logs (Live Updates)")
49
+ log_area = st.empty()
50
 
51
+ # Live Training Metrics
52
+ st.write("### Training Metrics 📊")
53
 
54
+ # Training Function
55
+ def train_model():
56
+ st.success(f"Training started for {task} with {model_choice} on {device.upper()}")
57
+
58
+ # Load model & tokenizer
59
+ tokenizer = AutoTokenizer.from_pretrained(model_choice) if model_choice != "Custom Model" else None
60
+ model = AutoModelForSequenceClassification.from_pretrained(model_choice, num_labels=2) if model_choice != "Custom Model" else None
61
+
62
+ # Load dataset
63
+ if dataset_source == "Custom":
64
+ uploaded_file = st.sidebar.file_uploader("Upload your dataset", type=["csv", "json"])
65
+ if uploaded_file is not None:
66
+ dataset = pd.read_csv(uploaded_file) if uploaded_file.name.endswith(".csv") else pd.read_json(uploaded_file)
67
+ dataset = dataset.to_dict(orient="records")
68
+ else:
69
+ dataset = load_dataset(dataset_source)
70
 
71
+ # Check available columns and ask user for the column name to tokenize
72
+ if "train" in dataset:
73
+ train_data = dataset["train"]
74
+ columns = train_data.features.keys()
75
+ text_column = st.sidebar.selectbox("Select Text Column", list(columns))
76
+ label_column = st.sidebar.selectbox("Select Label Column", list(columns))
77
+ else:
78
+ st.error("Dataset not in correct format. Ensure it has a 'train' split.")
79
+ return
80
+
81
+ # Tokenization function
82
  def tokenize_function(examples):
83
+ return tokenizer(examples[text_column], truncation=True, padding="max_length")
84
 
85
+ # Apply tokenization
86
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
87
+ train_dataset = tokenized_datasets["train"]
88
+ eval_dataset = tokenized_datasets["test"] if "test" in dataset else tokenized_datasets["validation"]
89
 
90
  # Checkpoint Handling
91
+ if resume_training and os.path.exists(checkpoint_path):
 
92
  model.load_state_dict(torch.load(checkpoint_path))
 
93
 
94
  # Move model to device
95
  model.to(torch.device(device))
 
104
  per_device_eval_batch_size=batch_size,
105
  num_train_epochs=epochs,
106
  save_strategy="epoch",
107
+ learning_rate=learning_rate
108
  )
109
 
110
  # Trainer setup
 
115
  eval_dataset=eval_dataset,
116
  )
117
 
118
+ # Training Loop
119
+ metrics = []
120
+ with open(log_file, "w") as log_file_handle:
121
+ log_file_handle.write("Starting training...\n")
122
+ log_file_handle.flush()
 
 
 
 
 
 
 
 
123
 
124
+ for epoch in range(epochs):
125
+ trainer.train()
126
+ results = trainer.evaluate()
127
 
128
+ # Save Checkpoint
129
+ torch.save(model.state_dict(), f"checkpoint_epoch_{epoch+1}.pth")
 
130
 
131
+ # Log results
132
+ log_text = f"Epoch {epoch+1}: Loss = {results['eval_loss']:.4f}, Accuracy = {results.get('eval_accuracy', 0):.4f}\n"
133
+ log_file_handle.write(log_text)
134
+ log_file_handle.flush()
135
 
136
+ # Save metrics
137
+ metrics.append({"epoch": epoch+1, "loss": results["eval_loss"], "accuracy": results.get("eval_accuracy", 0)})
138
+ pd.DataFrame(metrics).to_csv(metrics_file, index=False)
139
 
140
+ # Update logs & metrics in UI
141
+ log_area.text(log_text)
142
+ st.line_chart(pd.DataFrame(metrics).set_index("epoch"))
 
143
 
144
+ time.sleep(2)
 
 
 
 
145
 
146
+ # Start Training
147
+ if start_train:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  train_model()
149
 
150
+ # Stop Training
151
+ if stop_train:
152
+ st.warning("Training stopped manually.")