Spaces:

CreatorIQ-org
/

rlhf_docker

Sleeping

App Files Community

b2u committed on Dec 3, 2024

Commit

1589415

1 Parent(s): 1331b4b

adding proper training cycle handling

Browse files

Files changed (1) hide show

model.py +42 -54

model.py CHANGED Viewed

@@ -17,13 +17,6 @@ current_dir = Path(__file__).parent
 logger = logging.getLogger(__name__)
-# Add these debug lines
-logger.info("=== DEBUG INFO ===")
-logger.info(f"Python path: {sys.path}")
-logger.info(f"Current directory: {os.getcwd()}")
-logger.info(f"Directory contents: {os.listdir('.')}")
-logger.info("=== END DEBUG INFO ===")
 # Move TextDataset class here
 class TextDataset(Dataset):
     def __init__(self, texts, labels, tokenizer, max_length=128):
@@ -43,10 +36,10 @@ class BertClassifier(LabelStudioMLBase):
         super(BertClassifier, self).__init__(project_id=project_id, label_config=label_config)
         # Load training configuration from environment variables
-        self.learning_rate = float(os.getenv('LEARNING_RATE', 2e-5))
-        self.num_train_epochs = int(os.getenv('NUM_TRAIN_EPOCHS', 2))
-        self.weight_decay = float(os.getenv('WEIGHT_DECAY', 0.01))
-        self.start_training_threshold = int(os.getenv('START_TRAINING_EACH_N_UPDATES', 1))
         logger.info("=== Training Configuration ===")
         logger.info(f"✓ Learning rate: {self.learning_rate}")
@@ -55,38 +48,28 @@ class BertClassifier(LabelStudioMLBase):
         logger.info(f"✓ Training threshold: {self.start_training_threshold}")
         logger.info("============================")
-        logger.info(f"Initializing BertClassifier with project_id: {project_id}")
-        logger.info(f"Label config length: {len(label_config) if label_config else 0}")
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         logger.info(f"Using device: {self.device}")
-        # Define categories
-        self.categories = [
-            'affiliate_classification', 'brand', 'business_and_career',
-            'content_quality', 'date', 'demographic', 'event',
-            'faith_and_religion', 'gaming', 'health',
-            'internal_categorization', 'location', 'number',
-            'performance', 'post_type', 'pricing_tier',
-            'product', 'profession', 'pii', 'social_network',
-            'style_and_fashion', 'no_category'
-        ]
-        self.model_dir = os.path.join(os.path.dirname(__file__), 'model')
-        os.makedirs(self.model_dir, exist_ok=True)
-        # Initialize model and tokenizer
-        try:
-            self._model = AutoModelForSequenceClassification.from_pretrained(
-                'bert-base-uncased',
-                num_labels=len(self.categories)
-            )
-            self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
-            self._model.to(self.device)
-            logger.info("Successfully loaded BERT model and tokenizer")
-        except Exception as e:
-            logger.error(f"Error loading model: {str(e)}")
-            logger.error("Full error details:", exc_info=True)
     def predict(self, tasks, **kwargs):
         """Generate predictions for a list of tasks."""
@@ -150,7 +133,6 @@ class BertClassifier(LabelStudioMLBase):
         return predictions
     def fit(self, event_data, data=None, **kwargs):
-        """Train the model on a single annotation."""
         start_time = datetime.now()
         logger.info("=== FIT METHOD CALLED ===")
@@ -196,11 +178,12 @@ class BertClassifier(LabelStudioMLBase):
                             self._model.train()
                             logger.info("Starting training...")
-                            # Multi-epoch training
                             for epoch in range(self.num_train_epochs):
                                 logger.info(f"Starting epoch {epoch + 1}/{self.num_train_epochs}")
-                                # Single example training
                                 for batch in train_loader:
                                     optimizer.zero_grad()
@@ -217,27 +200,32 @@ class BertClassifier(LabelStudioMLBase):
                                     )
                                     loss = outputs.loss
-                                    logger.info(f"Training loss: {loss.item():.4f}")
                                     # Backward pass
                                     loss.backward()
                                     optimizer.step()
-                                # Save the model
-                                model_path = os.path.join(self.model_dir, 'model_state.pt')
-                                torch.save(self._model.state_dict(), model_path)
-                                logger.info(f"✓ Model saved to {model_path}")
-                                return {
-                                    'status': 'ok',
-                                    'message': f'Successfully trained on: {text[:50]}... -> {label}',
-                                    'time_taken': str(datetime.now() - start_time)
-                                }
                         except Exception as e:
                             logger.error(f"Training error: {str(e)}")
-                            logger.error("Full error details:", exc_info=True)
-                            return {'status': 'error', 'message': f'Training failed: {str(e)}'}
         except Exception as e:
             logger.error(f"Error in fit method: {str(e)}")

 logger = logging.getLogger(__name__)
 # Move TextDataset class here
 class TextDataset(Dataset):
     def __init__(self, texts, labels, tokenizer, max_length=128):
         super(BertClassifier, self).__init__(project_id=project_id, label_config=label_config)
         # Load training configuration from environment variables
+        self.learning_rate = float(os.getenv('LEARNING_RATE'))
+        self.num_train_epochs = int(os.getenv('NUM_TRAIN_EPOCHS'))
+        self.weight_decay = float(os.getenv('WEIGHT_DECAY'))
+        self.start_training_threshold = int(os.getenv('START_TRAINING_EACH_N_UPDATES'))
         logger.info("=== Training Configuration ===")
         logger.info(f"✓ Learning rate: {self.learning_rate}")
         logger.info(f"✓ Training threshold: {self.start_training_threshold}")
         logger.info("============================")
+        # Initialize model and move to device
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         logger.info(f"Using device: {self.device}")
+        # Initialize model
+        self._model = AutoModelForSequenceClassification.from_pretrained(
+            'bert-base-uncased',
+            num_labels=len(self.categories)
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+        # Load saved model if exists
+        model_path = os.path.join(self.model_dir, 'model_state.pt')
+        if os.path.exists(model_path):
+            try:
+                self._model.load_state_dict(torch.load(model_path))
+                logger.info(f"✓ Loaded saved model from {model_path}")
+            except Exception as e:
+                logger.error(f"Failed to load model: {str(e)}")
+        self._model.to(self.device)
+        logger.info("✓ Model ready")
     def predict(self, tasks, **kwargs):
         """Generate predictions for a list of tasks."""
         return predictions
     def fit(self, event_data, data=None, **kwargs):
         start_time = datetime.now()
         logger.info("=== FIT METHOD CALLED ===")
                             self._model.train()
                             logger.info("Starting training...")
+                            # Training loop
+                            total_loss = 0
                             for epoch in range(self.num_train_epochs):
                                 logger.info(f"Starting epoch {epoch + 1}/{self.num_train_epochs}")
+                                epoch_loss = 0
                                 for batch in train_loader:
                                     optimizer.zero_grad()
                                     )
                                     loss = outputs.loss
+                                    epoch_loss += loss.item()
                                     # Backward pass
                                     loss.backward()
                                     optimizer.step()
+                                avg_epoch_loss = epoch_loss / len(train_loader)
+                                total_loss += avg_epoch_loss
+                                logger.info(f"Epoch {epoch + 1} loss: {avg_epoch_loss:.4f}")
+                            avg_training_loss = total_loss / self.num_train_epochs
+                            logger.info(f"Average training loss: {avg_training_loss:.4f}")
+                            # Save model
+                            model_path = os.path.join(self.model_dir, 'model_state.pt')
+                            torch.save(self._model.state_dict(), model_path)
+                            logger.info(f"✓ Model saved to {model_path}")
+                            return {
+                                'status': 'ok',
+                                'message': f'Training completed with avg loss: {avg_training_loss:.4f}'
+                            }
                         except Exception as e:
                             logger.error(f"Training error: {str(e)}")
+                            return {'status': 'error', 'message': str(e)}
         except Exception as e:
             logger.error(f"Error in fit method: {str(e)}")