Spaces:

CreatorIQ-org
/

rlhf_docker

Sleeping

App Files Community

b2u committed on Nov 29, 2024

Commit

fa5ac26

1 Parent(s): a3330c8

debugging

Browse files

Files changed (1) hide show

model.py +54 -33

model.py CHANGED Viewed

@@ -76,6 +76,9 @@ class BertClassifier(LabelStudioMLBase):
         logger.info(f"Initializing BertClassifier with project_id: {project_id}")
         logger.info(f"Label config: {label_config}")
         self.label_encoder = LabelEncoder()
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.instruction_template = os.getenv('MODEL_INSTRUCTIONS', '{text}')
@@ -178,19 +181,24 @@ class BertClassifier(LabelStudioMLBase):
         return predictions
     def get_tasks(self):
-        """Get tasks from local storage"""
         try:
-            storage_path = os.path.join(self.model_dir, 'tasks.json')
-            if os.path.exists(storage_path):
-                with open(storage_path, 'r') as f:
-                    tasks = json.load(f)
-                logger.info(f"Loaded {len(tasks)} tasks from storage")
-                return tasks
-            else:
-                logger.warning("No tasks found in storage")
-                return []
         except Exception as e:
-            logger.error(f"Error retrieving tasks: {str(e)}")
             return []
     def fit(self, completions, workdir=None, **kwargs):
@@ -207,25 +215,13 @@ class BertClassifier(LabelStudioMLBase):
             # Extract training data
             texts, labels = [], []
-            # If completions is a string (like "START_TRAINING"), try to get tasks
-            if isinstance(completions, str):
-                tasks = self.get_tasks()
-                logger.info(f"Retrieved {len(tasks)} tasks from storage")
-                # Debug first task if available
-                if tasks:
-                    logger.info(f"First task content: {json.dumps(tasks[0], indent=2)}")
-            else:
-                # If completions is a list, use it directly
-                tasks = completions if isinstance(completions, list) else [completions]
-                logger.info(f"Using {len(tasks)} tasks from completions")
-                if tasks:
-                    logger.info(f"First completion content: {json.dumps(tasks[0], indent=2)}")
-            logger.info(f"Processing {len(tasks)} tasks")
             # Get interface info
-            from_name, to_name, value = self.label_interface.get_first_tag_occurence('Choices', 'Text')
-            logger.info(f"Interface info: from_name={from_name}, to_name={to_name}, value={value}")
             for task in tasks:
                 try:
@@ -237,17 +233,20 @@ class BertClassifier(LabelStudioMLBase):
                     # Get annotations
                     annotations = task.get('annotations', [])
-                    logger.info(f"Found {len(annotations)} annotations for task {task.get('id')}")
                     if not annotations:
                         logger.warning(f"No annotations found for task {task.get('id')}")
                         continue
                     for annotation in annotations:
-                        try:
-                            # Debug annotation content
-                            logger.info(f"Annotation content: {json.dumps(annotation, indent=2)}")
                             # Get choices from result
                             results = annotation.get('result', [])
                             if not results:
@@ -263,7 +262,7 @@ class BertClassifier(LabelStudioMLBase):
                                         texts.append(text)
                                         labels.append(label)
                                         break
                         except Exception as e:
                             logger.error(f"Error processing annotation: {str(e)}")
                             continue
@@ -398,3 +397,25 @@ class BertClassifier(LabelStudioMLBase):
         except Exception as e:
             logger.error(f"Error saving task: {str(e)}")
             logger.error("Full error details:", exc_info=True)

         logger.info(f"Initializing BertClassifier with project_id: {project_id}")
         logger.info(f"Label config: {label_config}")
+        # Initialize Label Studio client
+        self.label_studio_client = self.connect_to_label_studio()
         self.label_encoder = LabelEncoder()
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.instruction_template = os.getenv('MODEL_INSTRUCTIONS', '{text}')
         return predictions
     def get_tasks(self):
+        """Get tasks from Label Studio"""
         try:
+            # Get tasks from Label Studio API
+            params = {'project': self.project_id} if self.project_id else {}
+            response = self.label_studio_client.make_request('GET', '/api/tasks', params=params)
+            tasks = response.json()
+            logger.info(f"Retrieved {len(tasks)} tasks from Label Studio API")
+            # Debug first task if available
+            if tasks:
+                logger.info(f"First task content: {json.dumps(tasks[0], indent=2)}")
+            return tasks
         except Exception as e:
+            logger.error(f"Error retrieving tasks from Label Studio: {str(e)}")
+            logger.error("Full error details:", exc_info=True)
             return []
     def fit(self, completions, workdir=None, **kwargs):
             # Extract training data
             texts, labels = [], []
+            # Get tasks from Label Studio
+            tasks = self.get_tasks()
+            logger.info(f"Retrieved {len(tasks)} tasks from Label Studio")
             # Get interface info
+            from_name = 'sentiment'  # This matches your label config
+            to_name = 'text'        # This matches your label config
             for task in tasks:
                 try:
                     # Get annotations
                     annotations = task.get('annotations', [])
+                    if annotations:
+                        logger.info(f"Found {len(annotations)} annotations for task {task.get('id')}")
+                        logger.info(f"Annotation content: {json.dumps(annotations[0], indent=2)}")
                     if not annotations:
                         logger.warning(f"No annotations found for task {task.get('id')}")
                         continue
                     for annotation in annotations:
+                        # Only use completed annotations
+                        if annotation.get('was_cancelled') or not annotation.get('completed_by'):
+                            continue
+                        try:
                             # Get choices from result
                             results = annotation.get('result', [])
                             if not results:
                                         texts.append(text)
                                         labels.append(label)
                                         break
                         except Exception as e:
                             logger.error(f"Error processing annotation: {str(e)}")
                             continue
         except Exception as e:
             logger.error(f"Error saving task: {str(e)}")
             logger.error("Full error details:", exc_info=True)
+    def connect_to_label_studio(self):
+        """Connect to Label Studio API"""
+        try:
+            from label_studio_sdk import Client
+            # Get Label Studio connection details from environment
+            ls_url = os.getenv('LABEL_STUDIO_URL', 'http://localhost:8080')
+            ls_token = os.getenv('LABEL_STUDIO_API_TOKEN')
+            if not ls_token:
+                raise ValueError("LABEL_STUDIO_API_TOKEN environment variable is not set")
+            # Initialize client
+            client = Client(url=ls_url, api_key=ls_token)
+            logger.info(f"Connected to Label Studio at {ls_url}")
+            return client
+        except Exception as e:
+            logger.error(f"Error connecting to Label Studio: {str(e)}")
+            logger.error("Full error details:", exc_info=True)
+            raise