Spaces:

CreatorIQ-org
/

rlhf_docker

Sleeping

App Files Files Community

b2u committed on Nov 29, 2024

Commit

c6dedc8

1 Parent(s): da5d943

keep debugging

Browse files

Files changed (1) hide show

model.py +98 -52

model.py CHANGED Viewed

@@ -3,6 +3,7 @@ import torch
 import logging
 import pathlib
 import pickle
 from typing import List, Dict, Optional
 from label_studio_ml.model import LabelStudioMLBase
 from transformers import (
@@ -70,10 +71,11 @@ class BertClassifier(LabelStudioMLBase):
     _model = None
     def __init__(self, project_id=None, label_config=None, **kwargs):
-        # Initialize parent class properly
         super(BertClassifier, self).__init__(project_id=project_id, label_config=label_config)
-        # Your existing initialization code
         self.label_encoder = LabelEncoder()
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.instruction_template = os.getenv('MODEL_INSTRUCTIONS', '{text}')
@@ -82,9 +84,6 @@ class BertClassifier(LabelStudioMLBase):
         self.model_dir = os.path.join(os.path.dirname(__file__), 'model')
         os.makedirs(self.model_dir, exist_ok=True)
-        # Skip Label Studio client initialization
-        self.label_studio_client = None
         # Define your categories
         self.categories = [
             'affiliate_classification', 'brand', 'business_and_career',
@@ -196,6 +195,30 @@ class BertClassifier(LabelStudioMLBase):
             logger.error("Full error details:", exc_info=True)
             raise
     def fit(self, completions, workdir=None, **kwargs):
         """Train model on labeled data"""
         logger.info('Starting model training...')
@@ -210,57 +233,57 @@ class BertClassifier(LabelStudioMLBase):
             # Extract training data
             texts, labels = [], []
-            try:
-                # Get interface info
-                from_name, to_name, value = self.label_interface.get_first_tag_occurence('Choices', 'Text')
-                # Get tasks from Label Studio
                 tasks = self.get_tasks()
-                logger.info(f"Found {len(tasks)} tasks")
-                for task in tasks:
-                    try:
-                        # Get text from task
-                        text = task['data'].get(value)
-                        if not text:
-                            logger.warning(f"No text found in task {task.get('id')}")
-                            continue
-                        # Get annotations
-                        annotations = task.get('annotations', [])
-                        if not annotations:
-                            logger.warning(f"No annotations found for task {task.get('id')}")
                             continue
-                        for annotation in annotations:
-                            try:
-                                # Get choices from result
-                                results = annotation.get('result', [])
-                                if not results:
-                                    logger.warning(f"No results found in annotation for task {task.get('id')}")
-                                    continue
-                                for result in results:
-                                    if result.get('from_name') == from_name and result.get('to_name') == to_name:
-                                        choices = result.get('value', {}).get('choices', [])
-                                        if choices:
-                                            label = choices[0]
-                                            logger.info(f"Successfully extracted: Text='{text[:50]}...', Label='{label}'")
-                                            texts.append(text)
-                                            labels.append(label)
-                                            break
-                            except Exception as e:
-                                logger.error(f"Error processing annotation: {str(e)}")
-                                continue
-                    except Exception as e:
-                        logger.error(f"Error processing task: {str(e)}")
-                        continue
-            except Exception as e:
-                logger.error(f"Error getting tasks: {str(e)}")
-                logger.error("Full error details:", exc_info=True)
             logger.info(f"Prepared {len(texts)} examples for training")
@@ -343,3 +366,26 @@ class BertClassifier(LabelStudioMLBase):
                 'error': str(e),
                 'train_size': len(texts) if 'texts' in locals() else 0
             }

 import logging
 import pathlib
 import pickle
+import json
 from typing import List, Dict, Optional
 from label_studio_ml.model import LabelStudioMLBase
 from transformers import (
     _model = None
     def __init__(self, project_id=None, label_config=None, **kwargs):
         super(BertClassifier, self).__init__(project_id=project_id, label_config=label_config)
+        logger.info(f"Initializing BertClassifier with project_id: {project_id}")
+        logger.info(f"Label config: {label_config}")
         self.label_encoder = LabelEncoder()
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.instruction_template = os.getenv('MODEL_INSTRUCTIONS', '{text}')
         self.model_dir = os.path.join(os.path.dirname(__file__), 'model')
         os.makedirs(self.model_dir, exist_ok=True)
         # Define your categories
         self.categories = [
             'affiliate_classification', 'brand', 'business_and_career',
             logger.error("Full error details:", exc_info=True)
             raise
+    def get_tasks(self):
+        """Get tasks from completions"""
+        try:
+            from_name, to_name, value = self.label_interface.get_first_tag_occurence('Choices', 'Text')
+            # Get all tasks from Label Studio ML backend storage
+            tasks = []
+            # Try to get tasks from Label Studio ML storage
+            storage_path = os.path.join(self.model_dir, 'tasks.json')
+            if os.path.exists(storage_path):
+                try:
+                    with open(storage_path, 'r') as f:
+                        tasks = json.load(f)
+                    logger.info(f"Loaded {len(tasks)} tasks from storage")
+                except Exception as e:
+                    logger.error(f"Error loading tasks from storage: {str(e)}")
+            return tasks
+        except Exception as e:
+            logger.error(f"Error in get_tasks: {str(e)}")
+            return []
     def fit(self, completions, workdir=None, **kwargs):
         """Train model on labeled data"""
         logger.info('Starting model training...')
             # Extract training data
             texts, labels = [], []
+            # If completions is a string (like "START_TRAINING"), try to get tasks
+            if isinstance(completions, str):
                 tasks = self.get_tasks()
+            else:
+                # If completions is a list, use it directly
+                tasks = completions if isinstance(completions, list) else [completions]
+            logger.info(f"Processing {len(tasks)} tasks")
+            # Get interface info
+            from_name, to_name, value = self.label_interface.get_first_tag_occurence('Choices', 'Text')
+            for task in tasks:
+                try:
+                    # Get text from task
+                    text = task['data'].get(value) if isinstance(task, dict) else None
+                    if not text:
+                        logger.warning(f"No text found in task")
+                        continue
+                    # Get annotations
+                    annotations = task.get('annotations', []) if isinstance(task, dict) else []
+                    if not annotations:
+                        logger.warning(f"No annotations found for task")
+                        continue
+                    for annotation in annotations:
+                        try:
+                            # Get choices from result
+                            results = annotation.get('result', [])
+                            if not results:
+                                logger.warning(f"No results found in annotation")
+                                continue
+                            for result in results:
+                                if result.get('from_name') == from_name and result.get('to_name') == to_name:
+                                    choices = result.get('value', {}).get('choices', [])
+                                    if choices:
+                                        label = choices[0]
+                                        logger.info(f"Successfully extracted: Text='{text[:50]}...', Label='{label}'")
+                                        texts.append(text)
+                                        labels.append(label)
+                                        break
+                        except Exception as e:
+                            logger.error(f"Error processing annotation: {str(e)}")
                             continue
+                except Exception as e:
+                    logger.error(f"Error processing task: {str(e)}")
+                    continue
             logger.info(f"Prepared {len(texts)} examples for training")
                 'error': str(e),
                 'train_size': len(texts) if 'texts' in locals() else 0
             }
+    def save_task(self, task):
+        """Save a task to local storage"""
+        try:
+            storage_path = os.path.join(self.model_dir, 'tasks.json')
+            tasks = []
+            # Load existing tasks
+            if os.path.exists(storage_path):
+                with open(storage_path, 'r') as f:
+                    tasks = json.load(f)
+            # Add new task
+            tasks.append(task)
+            # Save tasks
+            with open(storage_path, 'w') as f:
+                json.dump(tasks, f)
+            logger.info(f"Saved task to storage. Total tasks: {len(tasks)}")
+        except Exception as e:
+            logger.error(f"Error saving task: {str(e)}")