Rulga committed on
Commit
3b59cc2
·
1 Parent(s): b0e22d6

Enhance training method in FineTuner: Add detailed logging for training process, dataset loading, tokenization, and error handling to improve debugging and traceability.

Browse files
Files changed (1) hide show
  1. src/training/fine_tuner.py +99 -41
src/training/fine_tuner.py CHANGED
@@ -230,58 +230,116 @@ class FineTuner:
230
  (success, message)
231
  """
232
  try:
233
- # Prepare model for training
 
 
 
 
 
 
 
234
  self.prepare_model_for_training()
235
 
236
- # Load dataset
237
- dataset = load_dataset('json', data_files=training_data_path)['train']
 
 
 
 
 
 
 
 
 
 
 
238
 
239
- # Tokenize dataset
240
- tokenized_dataset = self.tokenize_dataset(dataset)
 
 
 
 
 
 
241
 
242
- # Create data collator
243
- data_collator = DataCollatorForLanguageModeling(
244
- tokenizer=self.tokenizer,
245
- mlm=False
246
- )
 
 
 
 
 
247
 
248
- # Setup training arguments
249
- training_args = TrainingArguments(
250
- output_dir=self.output_dir,
251
- num_train_epochs=num_train_epochs,
252
- per_device_train_batch_size=per_device_train_batch_size,
253
- gradient_accumulation_steps=gradient_accumulation_steps,
254
- learning_rate=learning_rate,
255
- weight_decay=0.01,
256
- warmup_ratio=0.1,
257
- logging_steps=logging_steps,
258
- save_strategy=save_strategy,
259
- save_total_limit=2,
260
- remove_unused_columns=False,
261
- push_to_hub=False,
262
- report_to="tensorboard",
263
- load_best_model_at_end=True
264
- )
 
 
 
 
 
265
 
266
- # Create trainer
267
- trainer = Trainer(
268
- model=self.model,
269
- args=training_args,
270
- train_dataset=tokenized_dataset,
271
- data_collator=data_collator,
272
- tokenizer=self.tokenizer
273
- )
 
 
 
 
 
274
 
275
- # Start training
276
- trainer.train()
 
 
 
 
 
 
277
 
278
- # Save model
279
- trainer.save_model()
 
 
 
 
 
 
280
 
281
- return True, f"Model successfully trained and saved to {self.output_dir}"
 
 
282
 
283
  except Exception as e:
284
- return False, f"Training failed: {str(e)}"
 
 
 
 
 
285
 
286
  def upload_model_to_hub(
287
  self,
 
230
  (success, message)
231
  """
232
  try:
233
+ logger.info(f"Starting training process with parameters:")
234
+ logger.info(f"- Training data path: {training_data_path}")
235
+ logger.info(f"- Number of epochs: {num_train_epochs}")
236
+ logger.info(f"- Batch size: {per_device_train_batch_size}")
237
+ logger.info(f"- Learning rate: {learning_rate}")
238
+ logger.info(f"- Device: {self.device}")
239
+
240
+ logger.info("Preparing model for training...")
241
  self.prepare_model_for_training()
242
 
243
+ logger.info("Loading dataset...")
244
+ if not os.path.exists(training_data_path):
245
+ error_msg = f"Training data file not found: {training_data_path}"
246
+ logger.error(error_msg)
247
+ return False, error_msg
248
+
249
+ try:
250
+ dataset = load_dataset('json', data_files=training_data_path)['train']
251
+ logger.info(f"Dataset loaded successfully. Size: {len(dataset)} examples")
252
+ except Exception as e:
253
+ error_msg = f"Failed to load dataset: {str(e)}"
254
+ logger.error(error_msg)
255
+ return False, error_msg
256
 
257
+ logger.info("Tokenizing dataset...")
258
+ try:
259
+ tokenized_dataset = self.tokenize_dataset(dataset)
260
+ logger.info("Dataset tokenized successfully")
261
+ except Exception as e:
262
+ error_msg = f"Failed to tokenize dataset: {str(e)}"
263
+ logger.error(error_msg)
264
+ return False, error_msg
265
 
266
+ logger.info("Creating data collator...")
267
+ try:
268
+ data_collator = DataCollatorForLanguageModeling(
269
+ tokenizer=self.tokenizer,
270
+ mlm=False
271
+ )
272
+ except Exception as e:
273
+ error_msg = f"Failed to create data collator: {str(e)}"
274
+ logger.error(error_msg)
275
+ return False, error_msg
276
 
277
+ logger.info("Setting up training arguments...")
278
+ try:
279
+ training_args = TrainingArguments(
280
+ output_dir=self.output_dir,
281
+ num_train_epochs=num_train_epochs,
282
+ per_device_train_batch_size=per_device_train_batch_size,
283
+ gradient_accumulation_steps=gradient_accumulation_steps,
284
+ learning_rate=learning_rate,
285
+ weight_decay=0.01,
286
+ warmup_ratio=0.1,
287
+ logging_steps=logging_steps,
288
+ save_strategy=save_strategy,
289
+ save_total_limit=2,
290
+ remove_unused_columns=False,
291
+ push_to_hub=False,
292
+ report_to="tensorboard",
293
+ load_best_model_at_end=True
294
+ )
295
+ except Exception as e:
296
+ error_msg = f"Failed to setup training arguments: {str(e)}"
297
+ logger.error(error_msg)
298
+ return False, error_msg
299
 
300
+ logger.info("Initializing trainer...")
301
+ try:
302
+ trainer = Trainer(
303
+ model=self.model,
304
+ args=training_args,
305
+ train_dataset=tokenized_dataset,
306
+ data_collator=data_collator,
307
+ tokenizer=self.tokenizer
308
+ )
309
+ except Exception as e:
310
+ error_msg = f"Failed to initialize trainer: {str(e)}"
311
+ logger.error(error_msg)
312
+ return False, error_msg
313
 
314
+ logger.info("Starting training...")
315
+ try:
316
+ trainer.train()
317
+ logger.info("Training completed successfully")
318
+ except Exception as e:
319
+ error_msg = f"Training failed: {str(e)}"
320
+ logger.error(error_msg)
321
+ return False, error_msg
322
 
323
+ logger.info("Saving model...")
324
+ try:
325
+ trainer.save_model()
326
+ logger.info(f"Model saved to {self.output_dir}")
327
+ except Exception as e:
328
+ error_msg = f"Failed to save model: {str(e)}"
329
+ logger.error(error_msg)
330
+ return False, error_msg
331
 
332
+ success_msg = f"Model successfully trained and saved to {self.output_dir}"
333
+ logger.info(success_msg)
334
+ return True, success_msg
335
 
336
  except Exception as e:
337
+ error_msg = f"Unexpected error during training: {str(e)}"
338
+ logger.error(error_msg)
339
+ # Log full traceback for debugging
340
+ import traceback
341
+ logger.error(f"Full traceback:\n{traceback.format_exc()}")
342
+ return False, error_msg
343
 
344
  def upload_model_to_hub(
345
  self,