Spaces:

CreatorIQ-org
/

rlhf_docker

Sleeping

App Files Files Community

b2u committed on Nov 29, 2024

Commit

4debd04

1 Parent(s): 1818eaa

Training logic added

Browse files

Files changed (3) hide show

model.py +86 -13
utils/__init__.py +0 -0
utils/dataset.py +25 -0

model.py CHANGED Viewed

@@ -2,9 +2,13 @@ import torch
 import logging
 import os
 import json
 from label_studio_ml.model import LabelStudioMLBase
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
 from sklearn.preprocessing import LabelEncoder
 logger = logging.getLogger(__name__)
@@ -13,12 +17,12 @@ class BertClassifier(LabelStudioMLBase):
         super(BertClassifier, self).__init__(project_id=project_id, label_config=label_config)
         logger.info(f"Initializing BertClassifier with project_id: {project_id}")
-        logger.info(f"Label config: {label_config}")
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         logger.info(f"Using device: {self.device}")
-        # Define categories that match your Label Studio config
         self.categories = [
             'affiliate_classification', 'brand', 'business_and_career',
             'content_quality', 'date', 'demographic', 'event',
@@ -30,8 +34,7 @@ class BertClassifier(LabelStudioMLBase):
         ]
         self.model_dir = os.path.join(os.path.dirname(__file__), 'model')
-        self._model = None
-        self.tokenizer = None
         # Initialize model and tokenizer
         try:
@@ -60,13 +63,10 @@ class BertClassifier(LabelStudioMLBase):
             for task in tasks:
                 logger.info(f"Processing task ID: {task.get('id')}")
-                # Get the text to classify
                 text = task['data'].get('text', '')
-                logger.info(f"Text to predict: {text}")
                 try:
-                    # Tokenize the text
                     inputs = self.tokenizer(
                         text,
                         truncation=True,
@@ -74,8 +74,7 @@ class BertClassifier(LabelStudioMLBase):
                         return_tensors='pt'
                     ).to(self.device)
-                    # Get model prediction
-                    self._model.eval()  # Set to evaluation mode
                     with torch.no_grad():
                         outputs = self._model(**inputs)
                         probs = torch.softmax(outputs.logits, dim=1)
@@ -102,7 +101,6 @@ class BertClassifier(LabelStudioMLBase):
                 except Exception as e:
                     logger.error(f"Error processing individual task: {str(e)}")
                     logger.error("Full error details:", exc_info=True)
-                    # Add empty prediction for failed task
                     predictions.append({
                         'result': [],
                         'score': 0,
@@ -119,5 +117,80 @@ class BertClassifier(LabelStudioMLBase):
     def fit(self, completions, workdir=None, **kwargs):
         """Train model on labeled data"""
-        logger.info('Starting model training...')
-        return {'status': 'ok'}

 import logging
 import os
 import json
+from datetime import datetime
 from label_studio_ml.model import LabelStudioMLBase
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from torch.utils.data import DataLoader
+from torch.optim import AdamW
 from sklearn.preprocessing import LabelEncoder
+from utils.dataset import TextDataset
 logger = logging.getLogger(__name__)
         super(BertClassifier, self).__init__(project_id=project_id, label_config=label_config)
         logger.info(f"Initializing BertClassifier with project_id: {project_id}")
+        logger.info(f"Label config length: {len(label_config) if label_config else 0}")
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         logger.info(f"Using device: {self.device}")
+        # Define categories
         self.categories = [
             'affiliate_classification', 'brand', 'business_and_career',
             'content_quality', 'date', 'demographic', 'event',
         ]
         self.model_dir = os.path.join(os.path.dirname(__file__), 'model')
+        os.makedirs(self.model_dir, exist_ok=True)
         # Initialize model and tokenizer
         try:
             for task in tasks:
                 logger.info(f"Processing task ID: {task.get('id')}")
                 text = task['data'].get('text', '')
+                logger.info(f"Text to predict: {text[:100]}...")
                 try:
                     inputs = self.tokenizer(
                         text,
                         truncation=True,
                         return_tensors='pt'
                     ).to(self.device)
+                    self._model.eval()
                     with torch.no_grad():
                         outputs = self._model(**inputs)
                         probs = torch.softmax(outputs.logits, dim=1)
                 except Exception as e:
                     logger.error(f"Error processing individual task: {str(e)}")
                     logger.error("Full error details:", exc_info=True)
                     predictions.append({
                         'result': [],
                         'score': 0,
     def fit(self, completions, workdir=None, **kwargs):
         """Fine-tune the classifier on labeled completions and save it to disk.

         Each completion is read as a dict with an optional ``id``, a ``data``
         dict holding the ``text`` to classify, and an ``annotations`` list.
         Only the first annotation is used; its label is the first entry of
         the first non-empty ``value.choices`` list among its ``result`` items.
         Completions without text or without such a label are skipped with a
         warning instead of aborting the whole run.

         Args:
             completions: iterable of completion dicts as described above.
             workdir: accepted for interface compatibility; not used here.
             **kwargs: accepted for interface compatibility; not used here.

         Returns:
             ``{'status': 'ok', 'message': ...}`` after a successful run, or
             ``{'status': 'error', 'message': ...}`` when no usable example
             was found or any exception was raised during training.
         """
         try:
             logger.info('=== STARTING MODEL TRAINING ===')
             logger.info(f'Received {len(completions)} completions for training')

             # Extract training data, tolerating malformed completions.
             texts = []
             label_names = []
             for completion in completions:
                 logger.info(f"Processing completion: {completion.get('id')}")
                 text = (completion.get('data') or {}).get('text', '')
                 label = self._first_choice(completion)
                 if not text or label is None:
                     logger.warning(
                         "Skipping completion %s: missing text or choice label",
                         completion.get('id'),
                     )
                     continue
                 texts.append(text)
                 label_names.append(label)
                 logger.info(f"Added training example: '{text[:50]}...' -> {label}")

             if not texts:
                 logger.warning("No valid training examples found")
                 return {'status': 'error', 'message': 'No valid training examples found'}
             logger.info(f'Prepared {len(texts)} examples for training')

             # Encode labels.
             # NOTE(review): this encoder is local to the call and is fitted only
             # on the labels seen in this batch, so the resulting class ids are
             # not persisted and may not line up with the order of
             # self.categories / the model head - confirm against predict().
             label_encoder = LabelEncoder()
             encoded_labels = label_encoder.fit_transform(label_names)

             # Create dataset
             dataset = TextDataset(texts, encoded_labels, self.tokenizer)
             train_loader = DataLoader(dataset, batch_size=8, shuffle=True)

             # Training setup
             optimizer = AdamW(self._model.parameters(), lr=2e-5)
             self._model.train()

             # Training loop
             num_epochs = 3
             logger.info(f"Starting training for {num_epochs} epochs")
             for epoch in range(num_epochs):
                 total_loss = 0.0
                 for batch in train_loader:
                     optimizer.zero_grad()
                     input_ids = batch['input_ids'].to(self.device)
                     attention_mask = batch['attention_mask'].to(self.device)
                     # Distinct name: do not clobber the list of label strings.
                     batch_labels = batch['labels'].to(self.device)
                     outputs = self._model(
                         input_ids=input_ids,
                         attention_mask=attention_mask,
                         labels=batch_labels
                     )
                     loss = outputs.loss
                     total_loss += loss.item()
                     loss.backward()
                     optimizer.step()
                 avg_loss = total_loss / len(train_loader)
                 logger.info(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

             # Leave the model in inference mode once training is done.
             self._model.eval()

             # Save the model
             save_path = os.path.join(self.model_dir, 'trained_model')
             self._model.save_pretrained(save_path)
             self.tokenizer.save_pretrained(save_path)
             logger.info(f"Model saved to {save_path}")
             logger.info('=== TRAINING COMPLETED SUCCESSFULLY ===')
             return {'status': 'ok', 'message': f'Model trained on {len(texts)} examples'}
         except Exception as e:
             # Top-level boundary: report the failure to the caller as a status
             # dict instead of propagating the exception.
             logger.error(f"Error during training: {str(e)}")
             logger.error("Full error details:", exc_info=True)
             return {'status': 'error', 'message': str(e)}

     @staticmethod
     def _first_choice(completion):
         """Return the first choice label of the first annotation, or None."""
         annotations = completion.get('annotations') or []
         if not annotations:
             return None
         for item in annotations[0].get('result') or []:
             choices = (item.get('value') or {}).get('choices') or []
             if choices:
                 return choices[0]
         return None

utils/__init__.py ADDED Viewed

File without changes

utils/dataset.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import torch
+from torch.utils.data import Dataset
class TextDataset(Dataset):
    """Dataset of pre-tokenized texts paired with integer class labels.

    All texts are tokenized once in ``__init__``; ``__getitem__`` only wraps
    the stored values for one index in tensors.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        """
        Initialize dataset for text classification

        Args:
            texts: sequence of input texts
            labels: sequence of corresponding labels, one per text
            tokenizer: HuggingFace tokenizer; called once on the full list of
                texts with ``truncation=True``, ``padding=True`` and
                ``max_length``. Its result must provide ``.items()`` yielding
                per-field sequences indexable by example.
            max_length: maximum sequence length

        Raises:
            ValueError: if ``texts`` and ``labels`` differ in length.
        """
        texts = list(texts)
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} texts and {len(labels)} labels)"
            )
        self.encodings = tokenizer(
            texts, truncation=True, padding=True, max_length=max_length
        )
        self.labels = labels

    def __getitem__(self, idx):
        """Return a single training example as a dict of tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Class-index targets are made 64-bit integers explicitly, whatever
        # numeric type the caller supplied for the labels.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of examples in dataset"""
        return len(self.labels)