prasenjeet099 committed
Commit 70abc44 · verified · 1 Parent(s): c99b642

Update app.py

Files changed (1): app.py +23 -69
app.py CHANGED
@@ -4,8 +4,8 @@ import time
 import os
 import pandas as pd
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
-from datasets import load_dataset, Dataset, DatasetDict
-from sklearn.model_selection import train_test_split
+from datasets import load_dataset
+from tqdm import tqdm  # For progress bar during training
 
 # Set up Streamlit page
 st.set_page_config(page_title="AutoTrain AI", page_icon="🚀", layout="wide")
@@ -18,19 +18,7 @@ hf_user = st.sidebar.selectbox("Hugging Face User", ["hennings1984"])
 task = st.sidebar.selectbox("Select Task", ["Text Classification", "Sentiment Analysis"])
 hardware = st.sidebar.selectbox("Hardware", ["CPU", "Single GPU", "Multi-GPU", "TPU"])
 model_choice = st.sidebar.selectbox("Choose Model", ["bert-base-uncased", "distilbert-base-uncased", "roberta-base"])
-
-# Dataset Configuration
-dataset_source = st.sidebar.selectbox("Dataset Source", ["Hugging Face", "Upload Your Dataset"])
-
-if dataset_source == "Hugging Face":
-    # Choose Hugging Face dataset
-    dataset_name = st.sidebar.text_input("Enter Hugging Face Dataset Name", "imdb")
-else:
-    # Upload Custom Dataset
-    uploaded_file = st.sidebar.file_uploader("Upload Your Dataset (CSV/TSV/JSON)", type=["csv", "json", "tsv"])
-    # Allow the user to map columns
-    text_column = st.sidebar.text_input("Text Column Name", "text")
-    label_column = st.sidebar.text_input("Label Column Name", "label")
+dataset_source = st.sidebar.selectbox("Dataset Source", ["glue/sst2", "imdb", "ag_news", "Custom"])
 
 # Training Parameters
 epochs = st.sidebar.slider("Number of Epochs", 1, 10, 3)
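Note on the new dataset_source choices: load_dataset("glue/sst2") will likely raise an error, since GLUE tasks are configurations of the glue dataset and are passed as a second argument rather than a path suffix. A minimal sketch of a loader that handles both forms (the helper name resolve_dataset is an assumption, not part of this commit):

    from datasets import load_dataset

    def resolve_dataset(source: str):
        # "glue/sst2" -> load_dataset("glue", "sst2"); plain names load directly.
        # Assumes the "name/config" convention of this app's selectbox,
        # not Hub namespaces such as "user/dataset".
        if "/" in source:
            path, config = source.split("/", 1)
            return load_dataset(path, config)
        return load_dataset(source)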
@@ -63,44 +51,7 @@ log_area = st.empty()
 
 # Live Training Metrics
 st.write("### Training Metrics 📊")
-
-# Dataset Loading Logic
-def load_custom_dataset(uploaded_file, text_column, label_column):
-    if uploaded_file is not None:
-        file_type = uploaded_file.name.split('.')[-1]
-        if file_type == "csv":
-            df = pd.read_csv(uploaded_file)
-        elif file_type == "json":
-            df = pd.read_json(uploaded_file)
-        elif file_type == "tsv":
-            df = pd.read_csv(uploaded_file, sep="\t")
-        else:
-            st.error("Unsupported file type")
-            return None
-
-        # Map the columns based on user input
-        df = df[[text_column, label_column]]
-        return df
-    return None
-
-def load_huggingface_dataset(dataset_name):
-    return load_dataset(dataset_name)
-
-def split_dataset(df, text_column, label_column):
-    # Split the dataset into train, validation, and test (80/10/10 split)
-    train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
-    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
-
-    # Convert to Hugging Face Dataset format
-    train_dataset = Dataset.from_pandas(train_df)
-    val_dataset = Dataset.from_pandas(val_df)
-    test_dataset = Dataset.from_pandas(test_df)
-
-    return DatasetDict({
-        "train": train_dataset,
-        "validation": val_dataset,
-        "test": test_dataset
-    })
+progress_bar = st.progress(0)  # Initialize progress bar
 
 # Training Function
 def train_model():
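Removing split_dataset also removes the guarantee of a "validation" split, and train_model below indexes tokenized_datasets["validation"] directly; imdb, for instance, ships only train/test/unsupervised splits. A sketch of carving one out with the datasets library itself, assuming dataset is the loaded DatasetDict:

    # Create a validation split when the loaded dataset lacks one (sketch).
    if "validation" not in dataset:
        split = dataset["train"].train_test_split(test_size=0.1, seed=42)
        dataset["train"] = split["train"]
        dataset["validation"] = split["test"]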
@@ -108,26 +59,21 @@ def train_model():
 
     # Load model & tokenizer
     tokenizer = AutoTokenizer.from_pretrained(model_choice)
-    model = AutoModelForSequenceClassification.from_pretrained(model_choice, num_labels=2)
+    model = AutoModelForSequenceClassification.from_pretrained(model_choice, num_labels=2)  # Adjust num_labels as necessary
 
     # Load dataset
-    if dataset_source == "Hugging Face":
-        dataset = load_huggingface_dataset(dataset_name)
-    else:
-        dataset_df = load_custom_dataset(uploaded_file, text_column, label_column)
-        if dataset_df is not None:
-            # Split dataset if it's not already split
-            dataset = split_dataset(dataset_df, text_column, label_column)
-
-    # Check the dataset structure to identify the correct column name
-    st.write(f"Dataset columns: {dataset['train'].column_names}")
-
+    dataset = load_dataset(dataset_source)
+
     # Tokenization function
     def tokenize_function(examples):
-        # Adjust this based on the actual column name
-        return tokenizer(examples[text_column], truncation=True, padding="max_length")
+        return tokenizer(examples["text"], truncation=True, padding="max_length")
 
     tokenized_datasets = dataset.map(tokenize_function, batched=True)
+
+    # Ensure that the dataset has the correct label column (adjust the label column name if necessary)
+    if "label" not in tokenized_datasets["train"].features:
+        raise ValueError("Dataset does not have a 'label' column for supervised training")
+
     train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]
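The hard-coded examples["text"] only matches some of the selectable datasets: imdb and ag_news expose a "text" column, but glue/sst2 calls it "sentence". One way to keep the tokenizer generic is a per-choice lookup; the TEXT_COLUMNS table below is an illustrative assumption, not part of the commit:

    # Hypothetical mapping from the sidebar choice to that dataset's text column.
    TEXT_COLUMNS = {"glue/sst2": "sentence", "imdb": "text", "ag_news": "text"}
    text_col = TEXT_COLUMNS.get(dataset_source, "text")

    def tokenize_function(examples):
        return tokenizer(examples[text_col], truncation=True, padding="max_length")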
 
@@ -159,14 +105,22 @@ def train_model():
         eval_dataset=eval_dataset,
     )
 
-    # Training Loop
+    # Training Loop with Progress Bar
     metrics = []
     with open(log_file, "w") as log_file_handle:
         log_file_handle.write("Starting training...\n")
         log_file_handle.flush()
 
         for epoch in range(epochs):
-            trainer.train()
+            # Reset progress bar at the start of each epoch
+            progress_bar.progress(0)
+
+            # Step through the epoch manually to update the progress bar in real time
+            for step, batch in enumerate(trainer.get_train_dataloader()):
+                trainer.training_step(model, batch)  # Perform a training step
+                progress_bar.progress((step + 1) / len(trainer.get_train_dataloader()))  # Update progress bar
+
+            # Evaluate the model at the end of each epoch
             results = trainer.evaluate()
 
             # Save Checkpoint
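Two caveats on the new loop: tqdm is imported but never used, and Trainer.training_step only runs a forward/backward pass without ever calling the optimizer, so as written the weights are never updated. A more conventional way to drive the Streamlit progress bar is a TrainerCallback around a single trainer.train() call; a sketch, with the class name being an assumption:

    from transformers import TrainerCallback

    class StreamlitProgressCallback(TrainerCallback):
        # Advance a st.progress bar after every optimizer step.
        def __init__(self, bar):
            self.bar = bar

        def on_step_end(self, args, state, control, **kwargs):
            if state.max_steps:
                self.bar.progress(state.global_step / state.max_steps)

    # Usage: register the callback, set num_train_epochs in TrainingArguments,
    # and let a single trainer.train() call own the epoch loop:
    # trainer = Trainer(..., callbacks=[StreamlitProgressCallback(progress_bar)])
    # trainer.train()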
 