Spaces:
Paused
Paused
Update app.py
#1
by
Reyad-Ahmmed
- opened
app.py
CHANGED
|
@@ -70,14 +70,13 @@ if (should_train_model=='1'): #train model
|
|
| 70 |
#settings
|
| 71 |
model_save_path = path_to_save_trained_model_to
|
| 72 |
bias_non_fleet = 1.0
|
| 73 |
-
epochs_to_run =
|
| 74 |
|
| 75 |
file_path_train = train_file + ".csv"
|
| 76 |
file_path_test = test_file + ".csv"
|
| 77 |
|
| 78 |
# Read the CSV files into pandas DataFrames; they will later be converted to DataTables and used to train and evaluate the model
|
| 79 |
-
|
| 80 |
-
file_train_df = fetch_and_update_training_data(file_path_train)
|
| 81 |
file_test_df = pd.read_csv(file_path_test)
|
| 82 |
|
| 83 |
|
|
@@ -93,10 +92,9 @@ if (should_train_model=='1'): #train model
|
|
| 93 |
|
| 94 |
repo_name = "Reyad-Ahmmed/hf-data-timeframe"
|
| 95 |
|
|
|
|
|
|
|
| 96 |
tokenizer = BertTokenizer.from_pretrained(repo_name, subfolder="bert_embeddings_finetune")
|
| 97 |
-
|
| 98 |
-
#tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
|
| 99 |
-
|
| 100 |
# I made sure to add all the ones in the training and eval data to this list
|
| 101 |
# since we are training using data that only contains the left tag - we don't need right tags added to this list
|
| 102 |
new_tokens = ['<EMPLOYEE_FIRST_NAME>', '<EMPLOYEE_LAST_NAME>','<POINT_ADDRESS>', '<TRUCK_NAME>', '<POINT_CLASS_NAME>', '<POINT_NAME>', '<TRUCK_CLASS_NAME>', '<TRUCK_STATUS_NAME>]']
|
|
@@ -104,9 +102,9 @@ if (should_train_model=='1'): #train model
|
|
| 104 |
|
| 105 |
|
| 106 |
# Model
|
| 107 |
-
model = BertForSequenceClassification.from_pretrained(repo_name, subfolder="bert_embeddings_finetune", output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('
|
| 108 |
-
|
| 109 |
-
|
| 110 |
|
| 111 |
# Reset tokenizer size to include the new size after adding the tags to the tokenizer's tokens
|
| 112 |
model.resize_token_embeddings(len(tokenizer))
|
|
@@ -146,6 +144,8 @@ if (should_train_model=='1'): #train model
|
|
| 146 |
emotions_dataset_train = Dataset.from_dict(emotions_dict_train)
|
| 147 |
emotions_dataset_test = Dataset.from_dict(emotions_dict_test)
|
| 148 |
|
|
|
|
|
|
|
| 149 |
# Step 4: Split dataset into train and validation
|
| 150 |
# Create top level dictionary with both datasets (will contain two keys: one for "train" whose value is the training dataset
|
| 151 |
# and one for "validation" with test dataset)
|
|
@@ -154,10 +154,12 @@ if (should_train_model=='1'): #train model
|
|
| 154 |
'validation': emotions_dataset_test
|
| 155 |
})
|
| 156 |
|
|
|
|
| 157 |
# Define the tokenize function
|
| 158 |
def tokenize(batch):
|
| 159 |
return tokenizer(batch["text"], padding=True, truncation=True)
|
| 160 |
|
|
|
|
| 161 |
# Apply tokenization by mapping the entire dataset (both training and validation) to tokenizer function
|
| 162 |
# this will add the "input_ids" and "attention_mask" columns
|
| 163 |
emotions_encoded = emotions_encoded.map(tokenize, batched=True)
|
|
@@ -179,6 +181,7 @@ if (should_train_model=='1'): #train model
|
|
| 179 |
accuracy = (preds == labels).astype(float).mean()
|
| 180 |
return {"accuracy": accuracy}
|
| 181 |
|
|
|
|
| 182 |
training_args = TrainingArguments(
|
| 183 |
output_dir='./results',
|
| 184 |
num_train_epochs=epochs_to_run,
|
|
@@ -192,6 +195,10 @@ if (should_train_model=='1'): #train model
|
|
| 192 |
evaluation_strategy="epoch",
|
| 193 |
)
|
| 194 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
# This is needed b/c loss_fn is swapped out in order to use weighted loss
|
| 196 |
# Any class weights that are not equal to one will make the model more (if greater than one) or less (if less than one) sensitive to the given label
|
| 197 |
class CustomTrainer(Trainer):
|
|
@@ -207,6 +214,15 @@ if (should_train_model=='1'): #train model
|
|
| 207 |
|
| 208 |
return (loss, outputs) if return_outputs else loss
|
| 209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
trainer = Trainer(
|
| 211 |
model=model,
|
| 212 |
args=training_args,
|
|
@@ -215,6 +231,14 @@ if (should_train_model=='1'): #train model
|
|
| 215 |
tokenizer=tokenizer
|
| 216 |
)
|
| 217 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
# send validation prompts through the model - will be used in error-analysis matrix below
|
| 219 |
preds_output = trainer.predict(emotions_encoded["validation"])
|
| 220 |
|
|
@@ -280,7 +304,7 @@ if (should_train_model=='1'): #train model
|
|
| 280 |
|
| 281 |
# Save the model and tokenizer
|
| 282 |
model.save_pretrained(f"./{model_save_path}")
|
| 283 |
-
tokenizer.save_pretrained(
|
| 284 |
|
| 285 |
#for push repository
|
| 286 |
repo_name = "Reyad-Ahmmed/hf-data-timeframe"
|
|
@@ -296,15 +320,25 @@ if (should_train_model=='1'): #train model
|
|
| 296 |
create_repo(repo_id=repo_name, token=api_token, exist_ok=True)
|
| 297 |
|
| 298 |
# Upload the model and tokenizer to the Hugging Face repository
|
|
|
|
| 299 |
upload_folder(
|
| 300 |
folder_path=f"{model_save_path}",
|
| 301 |
path_in_repo=f"{model_save_path}",
|
| 302 |
repo_id=repo_name,
|
| 303 |
token=api_token,
|
| 304 |
-
commit_message="Push model
|
|
|
|
| 305 |
)
|
| 306 |
|
| 307 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 308 |
else:
|
| 309 |
print('Load Pre-trained')
|
| 310 |
model_save_path = f"./{model_save_path}"
|
|
@@ -314,6 +348,10 @@ else:
|
|
| 314 |
model = AutoModelForSequenceClassification.from_pretrained(model_save_path).to('cpu')
|
| 315 |
tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)
|
| 316 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
#Function to classify user input
|
| 318 |
def classify_user_input(user_input):
|
| 319 |
while True:
|
|
@@ -366,4 +404,4 @@ def classify_user_input(user_input):
|
|
| 366 |
|
| 367 |
|
| 368 |
iface = gr.Interface(fn=classify_user_input, inputs="text", outputs="text")
|
| 369 |
-
iface.launch(share=True)
|
|
|
|
| 70 |
#settings
|
| 71 |
model_save_path = path_to_save_trained_model_to
|
| 72 |
bias_non_fleet = 1.0
|
| 73 |
+
epochs_to_run = 15
|
| 74 |
|
| 75 |
file_path_train = train_file + ".csv"
|
| 76 |
file_path_test = test_file + ".csv"
|
| 77 |
|
| 78 |
# Read the CSV files into pandas DataFrames; they will later be converted to DataTables and used to train and evaluate the model
|
| 79 |
+
file_train_df = pd.read_csv(file_path_train)
|
|
|
|
| 80 |
file_test_df = pd.read_csv(file_path_test)
|
| 81 |
|
| 82 |
|
|
|
|
| 92 |
|
| 93 |
repo_name = "Reyad-Ahmmed/hf-data-timeframe"
|
| 94 |
|
| 95 |
+
# Tokenization - get the BERT tokenizer (must match the model, which is also BERT)
|
| 96 |
+
# tokenizer = BertTokenizer.from_pretrained('./mitra_ai_fleet_bert_tokenizer')
|
| 97 |
tokenizer = BertTokenizer.from_pretrained(repo_name, subfolder="bert_embeddings_finetune")
|
|
|
|
|
|
|
|
|
|
| 98 |
# I made sure to add all the ones in the training and eval data to this list
|
| 99 |
# since we are training using data that only contains the left tag - we don't need right tags added to this list
|
| 100 |
new_tokens = ['<EMPLOYEE_FIRST_NAME>', '<EMPLOYEE_LAST_NAME>','<POINT_ADDRESS>', '<TRUCK_NAME>', '<POINT_CLASS_NAME>', '<POINT_NAME>', '<TRUCK_CLASS_NAME>', '<TRUCK_STATUS_NAME>]']
|
|
|
|
| 102 |
|
| 103 |
|
| 104 |
# Model
|
| 105 |
+
model = BertForSequenceClassification.from_pretrained(repo_name, subfolder="bert_embeddings_finetune", output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cuda')
|
| 106 |
+
# model = BertForSequenceClassification.from_pretrained('./mitra_ai_fleet_bert', output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cuda')
|
| 107 |
+
|
| 108 |
|
| 109 |
# Reset tokenizer size to include the new size after adding the tags to the tokenizer's tokens
|
| 110 |
model.resize_token_embeddings(len(tokenizer))
|
|
|
|
| 144 |
emotions_dataset_train = Dataset.from_dict(emotions_dict_train)
|
| 145 |
emotions_dataset_test = Dataset.from_dict(emotions_dict_test)
|
| 146 |
|
| 147 |
+
|
| 148 |
+
|
| 149 |
# Step 4: Split dataset into train and validation
|
| 150 |
# Create top level dictionary with both datasets (will contain two keys: one for "train" whose value is the training dataset
|
| 151 |
# and one for "validation" with test dataset)
|
|
|
|
| 154 |
'validation': emotions_dataset_test
|
| 155 |
})
|
| 156 |
|
| 157 |
+
|
| 158 |
# Define the tokenize function
|
| 159 |
def tokenize(batch):
|
| 160 |
return tokenizer(batch["text"], padding=True, truncation=True)
|
| 161 |
|
| 162 |
+
|
| 163 |
# Apply tokenization by mapping the entire dataset (both training and validation) to tokenizer function
|
| 164 |
# this will add the "input_ids" and "attention_mask" columns
|
| 165 |
emotions_encoded = emotions_encoded.map(tokenize, batched=True)
|
|
|
|
| 181 |
accuracy = (preds == labels).astype(float).mean()
|
| 182 |
return {"accuracy": accuracy}
|
| 183 |
|
| 184 |
+
|
| 185 |
training_args = TrainingArguments(
|
| 186 |
output_dir='./results',
|
| 187 |
num_train_epochs=epochs_to_run,
|
|
|
|
| 195 |
evaluation_strategy="epoch",
|
| 196 |
)
|
| 197 |
|
| 198 |
+
# notice the bias_non_fleet in next line (it is given a value at top of code)
|
| 199 |
+
# class_weights = torch.tensor([1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,bias_non_fleet,1.0,1.0]) # Replace with your actual class weights
|
| 200 |
+
# class_weights = class_weights.to('cuda' if torch.cuda.is_available() else 'cpu')
|
| 201 |
+
|
| 202 |
# This is needed b/c loss_fn is swapped out in order to use weighted loss
|
| 203 |
# Any class weights that are not equal to one will make the model more (if greater than one) or less (if less than one) sensitive to the given label
|
| 204 |
class CustomTrainer(Trainer):
|
|
|
|
| 214 |
|
| 215 |
return (loss, outputs) if return_outputs else loss
|
| 216 |
|
| 217 |
+
|
| 218 |
+
# trainer = CustomTrainer(
|
| 219 |
+
# model=model,
|
| 220 |
+
# compute_metrics=compute_metrics,
|
| 221 |
+
# args=training_args,
|
| 222 |
+
# train_dataset=emotions_encoded["train"],
|
| 223 |
+
# eval_dataset=emotions_encoded["validation"],
|
| 224 |
+
# tokenizer=tokenizer )
|
| 225 |
+
|
| 226 |
trainer = Trainer(
|
| 227 |
model=model,
|
| 228 |
args=training_args,
|
|
|
|
| 231 |
tokenizer=tokenizer
|
| 232 |
)
|
| 233 |
|
| 234 |
+
# Train the model and set timer to measure the training time
|
| 235 |
+
start_time = time.time()
|
| 236 |
+
trainer.train()
|
| 237 |
+
end_time = time.time()
|
| 238 |
+
execution_time = end_time - start_time
|
| 239 |
+
|
| 240 |
+
print(f"Execution Time: {execution_time:.2f} seconds")
|
| 241 |
+
|
| 242 |
# send validation prompts through the model - will be used in error-analysis matrix below
|
| 243 |
preds_output = trainer.predict(emotions_encoded["validation"])
|
| 244 |
|
|
|
|
| 304 |
|
| 305 |
# Save the model and tokenizer
|
| 306 |
model.save_pretrained(f"./{model_save_path}")
|
| 307 |
+
tokenizer.save_pretrained('./saved_fleet_tokenizer')
|
| 308 |
|
| 309 |
#for push repository
|
| 310 |
repo_name = "Reyad-Ahmmed/hf-data-timeframe"
|
|
|
|
| 320 |
create_repo(repo_id=repo_name, token=api_token, exist_ok=True)
|
| 321 |
|
| 322 |
# Upload the model and tokenizer to the Hugging Face repository
|
| 323 |
+
|
| 324 |
upload_folder(
|
| 325 |
folder_path=f"{model_save_path}",
|
| 326 |
path_in_repo=f"{model_save_path}",
|
| 327 |
repo_id=repo_name,
|
| 328 |
token=api_token,
|
| 329 |
+
commit_message="Push fleet model",
|
| 330 |
+
#overwrite=True # Force overwrite existing files
|
| 331 |
)
|
| 332 |
|
| 333 |
+
upload_folder(
|
| 334 |
+
folder_path="saved_fleet_tokenizer",
|
| 335 |
+
path_in_repo="saved_fleet_tokenizer",
|
| 336 |
+
repo_id=repo_name,
|
| 337 |
+
token=api_token,
|
| 338 |
+
commit_message="Push fleet tokenizer",
|
| 339 |
+
#overwrite=True # Force overwrite existing files
|
| 340 |
+
)
|
| 341 |
+
|
| 342 |
else:
|
| 343 |
print('Load Pre-trained')
|
| 344 |
model_save_path = f"./{model_save_path}"
|
|
|
|
| 348 |
model = AutoModelForSequenceClassification.from_pretrained(model_save_path).to('cpu')
|
| 349 |
tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)
|
| 350 |
|
| 351 |
+
#Define the label mappings (this must match the mapping used during training)
|
| 352 |
+
label_mapping = model.config.label_mapping
|
| 353 |
+
label_mapping_reverse = {value: key for key, value in label_mapping.items()}
|
| 354 |
+
|
| 355 |
#Function to classify user input
|
| 356 |
def classify_user_input(user_input):
|
| 357 |
while True:
|
|
|
|
| 404 |
|
| 405 |
|
| 406 |
iface = gr.Interface(fn=classify_user_input, inputs="text", outputs="text")
|
| 407 |
+
iface.launch(share=True)
|