Spaces:

CreatorIQ-org
/

rlhf_docker

Sleeping

App Files Files Community

b2u committed on Dec 2, 2024

Commit

b9a9837

1 Parent(s): 49d18ad

simplifying

Browse files

Files changed (1) hide show

model.py +30 -151

model.py CHANGED Viewed

@@ -141,156 +141,35 @@ class BertClassifier(LabelStudioMLBase):
         logger.info(f"Returning {len(predictions)} predictions")
         return predictions
-    def fit(self, completions, workdir=None, **kwargs):
-        try:
-            logger.info('=== STARTING MODEL TRAINING ===')
-            logger.info(f'Received signal: {completions}')
-            # If we receive a training signal, fetch the actual completions from Label Studio
-            if isinstance(completions, str) and completions in ['START_TRAINING', 'ANNOTATION_CREATED']:
-                try:
-                    # Get completions from Label Studio using the SDK
-                    annotations = self.get_completions()
-                    logger.info(f'Fetched {len(annotations)} annotations from Label Studio')
-                    completions = annotations
-                except Exception as e:
-                    logger.error(f"Error fetching completions from Label Studio: {str(e)}")
-                    logger.error("Full error details:", exc_info=True)
-                    return {'status': 'error', 'message': 'Failed to fetch completions'}
-            if not completions:
-                logger.error("No completions to process")
-                return {'status': 'error', 'message': 'No completions available'}
-            texts = []
-            labels = []
-            # If completions is a list of single characters, join them
-            if isinstance(completions, list) and all(isinstance(c, str) and len(c) == 1 for c in completions):
-                completions = ''.join(completions)
-                logger.info(f'Joined completions: {completions}')
-            # Handle completions as a single string if needed
-            if isinstance(completions, str):
-                try:
-                    completions = json.loads(completions)
-                    logger.info('Successfully parsed completions JSON')
-                except json.JSONDecodeError as e:
-                    logger.error(f"Failed to parse completions string as JSON: {str(e)}")
-                    logger.error(f"Problematic string: {completions}")
-                    return {'status': 'error', 'message': 'Invalid completions format'}
-            # Ensure completions is a list
-            if not isinstance(completions, list):
-                completions = [completions]
-            logger.info(f'Processing {len(completions)} items')
-            for completion in completions:
-                logger.info(f"Completion type: {type(completion)}")
-                logger.info(f"Completion content: {completion}")
-                try:
-                    # Convert string completion to dict if needed
-                    if isinstance(completion, str):
-                        completion = json.loads(completion)
-                    # Extract completion data
-                    completion_id = completion.get('id', 'unknown')
-                    logger.info(f"Processing completion ID: {completion_id}")
-                    # Get the task data containing the text
-                    text = completion.get('data', {}).get('text', '')
-                    # Get annotations/results
-                    annotations = completion.get('annotations', [])
-                    if not annotations and 'result' in completion:
-                        annotations = [{'result': completion['result']}]
-                    # Process each annotation
-                    for annotation in annotations:
-                        results = annotation.get('result', [])
-                        # Find the choices result
-                        for result in results:
-                            if result.get('type') == 'choices':
-                                choices = result.get('value', {}).get('choices', [])
-                                if choices:
-                                    label = choices[0]  # Take the first choice
-                                    if text and label:
-                                        texts.append(text)
-                                        labels.append(label)
-                                        logger.info(f"Added example - Text: {text[:50]}... Label: {label}")
-                except Exception as e:
-                    logger.error(f"Error processing completion: {str(e)}")
-                    logger.error("Full error details:", exc_info=True)
-                    continue
-            if not texts or not labels:
-                logger.error("No valid training examples found")
-                return {'status': 'error', 'message': 'No valid training examples found'}
-            # Convert labels to integers
-            label_encoder = LabelEncoder()
-            encoded_labels = label_encoder.fit_transform(labels)
-            # Save label encoder for inference
-            self.label_encoder = label_encoder
-            logger.info(f"Label mapping: {dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))}")
-            # Create dataset
-            dataset = TextDataset(texts, encoded_labels, self.tokenizer)
-            dataloader = DataLoader(dataset, batch_size=8, shuffle=True)
-            # Training settings
-            optimizer = AdamW(self.model.parameters(), lr=float(os.getenv('LEARNING_RATE', '2e-5')))
-            num_epochs = int(os.getenv('NUM_TRAIN_EPOCHS', '3'))
-            # Training loop
-            logger.info(f"Starting training for {num_epochs} epochs")
-            self.model.train()
-            for epoch in range(num_epochs):
-                total_loss = 0
-                for batch in dataloader:
-                    optimizer.zero_grad()
-                    input_ids = batch['input_ids'].to(self.device)
-                    attention_mask = batch['attention_mask'].to(self.device)
-                    labels = batch['labels'].to(self.device)
-                    outputs = self.model(
-                        input_ids=input_ids,
-                        attention_mask=attention_mask,
-                        labels=labels
-                    )
-                    loss = outputs.loss
-                    total_loss += loss.item()
-                    loss.backward()
-                    optimizer.step()
-                avg_loss = total_loss / len(dataloader)
-                logger.info(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")
-            # Save the fine-tuned model
-            model_dir = os.path.join(os.getenv('MODEL_DIR', ''), os.getenv('FINETUNED_MODEL_NAME', 'finetuned_model'))
-            os.makedirs(model_dir, exist_ok=True)
-            self.model.save_pretrained(model_dir)
-            self.tokenizer.save_pretrained(model_dir)
-            # Save label encoder
-            with open(os.path.join(model_dir, 'label_encoder.json'), 'w') as f:
-                json.dump({
-                    'classes': label_encoder.classes_.tolist()
-                }, f)
-            logger.info(f"Model and label encoder saved to {model_dir}")
-            return {'status': 'ok', 'message': f'Training completed with {len(texts)} examples'}
-        except Exception as e:
-            logger.error(f"Error during training: {str(e)}")
-            logger.error("Full error details:", exc_info=True)
-            return {'status': 'error', 'message': str(e)}

         logger.info(f"Returning {len(predictions)} predictions")
         return predictions
+    def fit(self, event, data, **kwargs):
+        """Handle a training event for a single labeled task.
+
+        Only ``ANNOTATION_CREATED`` and ``ANNOTATION_UPDATED`` events are
+        processed; any other event is acknowledged without doing work. The
+        task is fetched through ``self.label_studio_client`` and every
+        non-empty ``choices`` result of its first annotation is logged as a
+        (text, label) example. Actual model training is still a placeholder.
+
+        Args:
+            event: Name of the event that triggered this call.
+            data: Event payload, expected to be a dict carrying the task id.
+            **kwargs: Accepted for interface compatibility; unused here.
+
+        Returns:
+            A dict with ``status`` (``'ok'`` or ``'error'``) and ``message``.
+        """
+        logger.info(f"Received event: {event}")
+        # Guard clause: events that do not trigger training are acknowledged as-is.
+        if event not in ('ANNOTATION_CREATED', 'ANNOTATION_UPDATED'):
+            return {'status': 'ok', 'message': 'Training completed'}
+        try:
+            payload = data or {}
+            # NOTE(review): Label Studio webhook payloads appear to nest the task
+            # under 'task' rather than exposing a top-level 'task_id' - confirm
+            # against the webhook reference. The fallback is harmless if absent.
+            task_id = payload.get('task_id') or (payload.get('task') or {}).get('id')
+            if not task_id:
+                logger.error("No task ID found in event data")
+                return {'status': 'error', 'message': 'No task ID found'}
+            # Presumably get_task returns a dict-like task; verify against the client.
+            task = self.label_studio_client.get_task(task_id)
+            logger.info(f"Fetched annotation for task ID: {task_id}")
+            # Extract text and label from the first annotation of the task.
+            text = task.get('data', {}).get('text', '')
+            # A `.get(..., [{}])[0]` default only covers a missing key; an empty
+            # list would raise IndexError, so both cases are treated as
+            # "nothing to train on".
+            annotations = task.get('annotations') or []
+            if not annotations:
+                logger.warning(f"Task {task_id} has no annotations; nothing to train on")
+            first_annotation = annotations[0] if annotations else {}
+            results = first_annotation.get('result') or []
+            for result in results:
+                if result.get('type') != 'choices':
+                    continue
+                choices = result.get('value', {}).get('choices') or []
+                if not choices:
+                    # An empty selection carries no label; skip it instead of crashing.
+                    logger.warning("Skipping 'choices' result without a selected label")
+                    continue
+                label = choices[0]  # Single-label setup: take the first choice
+                # TODO: add the training logic here using text and label,
+                # e.g. self.train_model(text, label)
+                logger.info(f"Training on text: {text[:50]}... with label: {label}")
+        except Exception as e:
+            # Top-level boundary: report the failure to the caller, don't raise.
+            logger.error(f"Error during training: {str(e)}")
+            logger.error("Full error details:", exc_info=True)
+            return {'status': 'error', 'message': str(e)}
+        return {'status': 'ok', 'message': 'Training completed'}