Spaces:

luohoa97
/

train

Sleeping

App Files Files Community

luohoa97 committed on Jul 27, 2024

Commit

40510d6

verified ·

1 Parent(s): d0fe7a8

Create app.py

Browse files

Files changed (1) hide show

app.py +134 -0

app.py ADDED Viewed

	@@ -0,0 +1,134 @@

+import streamlit as st
+import pandas as pd
+import torch
+from torch.utils.data import Dataset, DataLoader
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
+from sklearn.model_selection import train_test_split
+import os
+import json
+# Dataset class for PyTorch
+class TextDataset(Dataset):
+    """Map-style dataset pairing tokenizer encodings with per-example labels.
+
+    ``encodings`` maps a feature name (e.g. ``input_ids``) to an indexable
+    collection holding one entry per example; ``labels`` is an indexable
+    collection whose length defines the dataset size.
+    """
+    def __init__(self, encodings, labels):
+        self.encodings = encodings
+        self.labels = labels
+    @staticmethod
+    def _as_new_tensor(value):
+        """Return ``value`` as an independent tensor without a copy-construct warning."""
+        # torch.tensor() on an existing tensor emits a UserWarning; clone().detach()
+        # is the recommended equivalent and still returns a separate copy.
+        if isinstance(value, torch.Tensor):
+            return value.clone().detach()
+        return torch.tensor(value)
+    def __getitem__(self, idx):
+        """Return a dict with one tensor per encoding key plus a ``labels`` tensor."""
+        item = {key: self._as_new_tensor(val[idx]) for key, val in self.encodings.items()}
+        item['labels'] = self._as_new_tensor(self.labels[idx])  # Adding labels for loss calculation
+        return item
+    def __len__(self):
+        """Return the number of examples (one label per example)."""
+        return len(self.labels)
+# Function to load configuration
+def load_config(config_path='config.json'):
+    """Load the JSON configuration file.
+
+    Args:
+        config_path: Path of the JSON file to read.
+
+    Returns:
+        The parsed JSON content, or an empty dict when the file does not
+        exist so that callers can fall back to their own defaults.
+
+    Raises:
+        json.JSONDecodeError: If the file exists but is not valid JSON.
+    """
+    try:
+        with open(config_path, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    except FileNotFoundError:
+        # The configuration file is optional; a missing file means "use defaults".
+        return {}
+# Main function
+def _read_combined_texts(uploaded_files):
+    """Return one text string per CSV row, joining all of its columns with spaces."""
+    combined_texts = []
+    for uploaded_file in uploaded_files:
+        df = pd.read_csv(uploaded_file)
+        if df.empty:
+            # A header-only file contributes no rows; skip it explicitly.
+            continue
+        combined_texts.extend(df.astype(str).agg(' '.join, axis=1))
+    return combined_texts
+def _load_model_and_tokenizer(model_path):
+    """Load the classifier and its tokenizer; an empty ``model_path`` means a fresh BERT model."""
+    if not model_path:
+        model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
+        return model, AutoTokenizer.from_pretrained('bert-base-uncased')
+    model = AutoModelForSequenceClassification.from_pretrained(model_path)
+    st.write(f"Loaded model from {model_path} successfully! 🎉")
+    try:
+        # Prefer the tokenizer stored next to the model so the vocabulary matches its embeddings.
+        tokenizer = AutoTokenizer.from_pretrained(model_path)
+    except (OSError, ValueError):
+        st.warning("No tokenizer found in the model directory; falling back to 'bert-base-uncased'.")
+        tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+    return model, tokenizer
+def _build_datasets(tokenizer, texts):
+    """Tokenize ``texts`` and return ``(train_dataset, val_dataset)`` using an 80/20 split."""
+    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
+    # Create dummy labels (e.g., 0s for all entries)
+    labels = [0] * len(texts)
+    # Split row indices rather than a single tensor so that every encoding field
+    # (input_ids, attention_mask, ...) stays aligned with its example.
+    train_idx, val_idx = train_test_split(list(range(len(texts))), test_size=0.2, random_state=42)
+    def subset(indices):
+        encodings = {key: val[indices] for key, val in inputs.items()}
+        return TextDataset(encodings=encodings, labels=[labels[i] for i in indices])
+    return subset(train_idx), subset(val_idx)
+def main():
+    """Streamlit entry point: upload CSV files, fine-tune a sequence classifier, save it.
+
+    Streamlit re-executes the script on every widget interaction, so all
+    inputs are collected first and training only runs on the rerun triggered
+    by the "Start training" button.
+    """
+    st.title("CSV Data Processing and Model Training 🧠")
+    # Load configuration
+    config = load_config()
+    # Upload multiple CSV files
+    uploaded_files = st.file_uploader("Upload CSV files", accept_multiple_files=True, type="csv")
+    if not uploaded_files:
+        return
+    combined_texts = _read_combined_texts(uploaded_files)
+    st.write("Combined text for training:", combined_texts[:5])  # Show first 5 for verification
+    if len(combined_texts) < 2:
+        # train_test_split cannot build non-empty train and validation sets from fewer than two rows.
+        st.warning("Please upload CSV data with at least two rows.")
+        return
+    # Ask the user if they want to load an existing model or train a new one
+    use_existing_model = st.checkbox("Load an existing local model?", value=False)
+    model_path = ""
+    if use_existing_model:
+        model_path = st.text_input("Enter the path to the local model directory:", value="")
+        if not (model_path and os.path.isdir(model_path)):
+            st.warning("Please provide a valid model directory path.")
+            return
+    # Ask for the save directory before training: editing a widget after
+    # training would rerun the script and retrain the model.
+    save_path = st.text_input("Enter the directory path to save the trained model:", value="./trained_model")
+    if not save_path:
+        st.warning("Please provide a valid directory path to save the model.")
+        return
+    if not st.button("Start training"):
+        return
+    model, tokenizer = _load_model_and_tokenizer(model_path)
+    train_dataset, val_dataset = _build_datasets(tokenizer, combined_texts)
+    # Training arguments
+    training_args = TrainingArguments(
+        output_dir='./results',          # output directory
+        num_train_epochs=1,              # total number of training epochs
+        per_device_train_batch_size=8,   # batch size per device during training
+        per_device_eval_batch_size=8,    # batch size for evaluation
+        warmup_steps=500,                # number of warmup steps for learning rate scheduler
+        weight_decay=0.01,               # strength of weight decay
+        logging_dir='./logs',            # directory for storing logs
+        logging_steps=10,
+        # NOTE(review): newer transformers releases rename this to `eval_strategy` - confirm against the installed version.
+        evaluation_strategy="epoch",
+        # Trainer builds its own DataLoaders, so the configured worker count is passed here.
+        dataloader_num_workers=config.get('num_workers', 4),
+    )
+    # Initialize Trainer
+    trainer = Trainer(
+        model=model,                         # the instantiated 🤗 Transformers model to be trained
+        args=training_args,                  # training arguments, defined above
+        train_dataset=train_dataset,         # training dataset
+        eval_dataset=val_dataset             # evaluation dataset
+    )
+    # Start training
+    trainer.train()
+    os.makedirs(save_path, exist_ok=True)
+    model.save_pretrained(save_path)
+    tokenizer.save_pretrained(save_path)
+    st.write(f"Model saved successfully to {save_path}! 🎉")
+    # Notify user of training completion
+    st.success("Training completed successfully! 🚀")
+# Run the Streamlit app when this file is executed as a script.
+if __name__ == "__main__":
+    main()