b2u committed on
Commit
1b11c8f
·
1 Parent(s): c352a3a

rolling back

Browse files
Files changed (1) hide show
  1. model.py +9 -422
model.py CHANGED
@@ -1,450 +1,37 @@
1
- import os
2
  import torch
3
  import logging
4
- import pathlib
5
- import pickle
6
  import json
7
- from typing import List, Dict, Optional
8
  from label_studio_ml.model import LabelStudioMLBase
9
- from transformers import (
10
- AutoModelForSequenceClassification,
11
- AutoTokenizer,
12
- Trainer,
13
- TrainingArguments
14
- )
15
- from datasets import Dataset
16
  from sklearn.preprocessing import LabelEncoder
17
- from label_studio_sdk.label_interface.objects import PredictionValue
18
- from label_studio_ml.response import ModelResponse
19
- from label_studio_sdk import Client
20
 
21
  logger = logging.getLogger(__name__)
22
 
23
-
24
# Report the compute device once at import time so operators can see
# whether GPU acceleration is active for this backend process.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"There are {torch.cuda.device_count()} GPU(s) available.")
    print(f"We will use the GPU: {torch.cuda.get_device_name(0)}")
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
31
-
32
-
33
class BertClassifier(LabelStudioMLBase):
    """BERT-based text classification backend for Label Studio.

    Fine-tunes a Hugging Face sequence-classification checkpoint (anything
    accepted by ``AutoModelForSequenceClassification``) on annotations pulled
    from a Label Studio project, then serves predictions back to it.

    Configuration is read from environment variables:
        LABEL_STUDIO_HOST: URL of the Label Studio instance.
        LABEL_STUDIO_API_KEY: API key for the Label Studio instance.
        START_TRAINING_EACH_N_UPDATES: labeled tasks to accumulate before training.
        LEARNING_RATE: learning rate for fine-tuning.
        NUM_TRAIN_EPOCHS: number of training epochs.
        WEIGHT_DECAY: weight decay for fine-tuning.
        BASELINE_MODEL_NAME: base checkpoint to fine-tune from.
        MODEL_DIR: directory where the trained model is saved.
        FINETUNED_MODEL_NAME: name given to the fine-tuned model.
    """

    LABEL_STUDIO_HOST = os.getenv('LABEL_STUDIO_HOST', 'http://localhost:8080')
    LABEL_STUDIO_API_KEY = os.getenv('LABEL_STUDIO_API_KEY')
    START_TRAINING_EACH_N_UPDATES = int(os.getenv('START_TRAINING_EACH_N_UPDATES', 10))
    LEARNING_RATE = float(os.getenv('LEARNING_RATE', 2e-5))
    NUM_TRAIN_EPOCHS = int(os.getenv('NUM_TRAIN_EPOCHS', 3))
    WEIGHT_DECAY = float(os.getenv('WEIGHT_DECAY', 0.01))
    baseline_model_name = os.getenv('BASELINE_MODEL_NAME', 'bert-base-multilingual-cased')
    MODEL_DIR = os.getenv('MODEL_DIR', './results')
    finetuned_model_name = os.getenv('FINETUNED_MODEL_NAME', 'finetuned-model')

    # Shared model handle; populated lazily on first use (see _lazy_init).
    _model = None
73
  def __init__(self, project_id=None, label_config=None, **kwargs):
74
  super(BertClassifier, self).__init__(project_id=project_id, label_config=label_config)
75
 
76
  logger.info(f"Initializing BertClassifier with project_id: {project_id}")
77
  logger.info(f"Label config: {label_config}")
78
 
79
- # Initialize basic attributes
80
  self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
81
- self.version = 'v0.0.1'
82
- self.model_dir = f'BertClassifier-{self.version}'
83
-
84
- # Define categories
85
- self.categories = [
86
- 'affiliate_classification', 'brand', 'business_and_career',
87
- 'content_quality', 'date', 'demographic', 'event',
88
- 'faith_and_religion', 'gaming', 'health',
89
- 'internal_categorization', 'location', 'number',
90
- 'performance', 'post_type', 'pricing_tier',
91
- 'product', 'profession', 'pii', 'social_network',
92
- 'style_and_fashion', 'no_category'
93
- ]
94
-
95
- # Initialize model and tokenizer as None - they'll be loaded when needed
96
  self._model = None
97
  self.tokenizer = None
98
-
99
- logger.info("BertClassifier initialized successfully")
100
-
101
- def get_labels(self):
102
- li = self.label_interface
103
- from_name, _, _ = li.get_first_tag_occurence('Choices', 'Text')
104
- tag = li.get_tag(from_name)
105
- return tag.labels
106
-
107
- def setup(self):
108
- """Setup the model - this is called when Label Studio connects"""
109
- try:
110
- # Initialize model directory
111
- os.makedirs(self.model_dir, exist_ok=True)
112
-
113
- # Return the required information for Label Studio
114
- return {
115
- 'model_class': 'BertClassifier', # Must match your class name
116
- 'model_params': {
117
- 'device': str(self.device),
118
- 'version': self.version
119
- },
120
- 'label_config': {
121
- 'from_name': 'sentiment',
122
- 'to_name': 'text',
123
- 'type': 'choices',
124
- 'labels': self.categories
125
- },
126
- 'api_version': '2' # Important: specify API version
127
- }
128
-
129
- except Exception as e:
130
- logger.error(f"Error in setup: {str(e)}")
131
- logger.error("Full error details:", exc_info=True)
132
- raise
133
-
134
- def _lazy_init(self):
135
- if not hasattr(self, '_model') or self._model is None:
136
- try:
137
- # Try to load fine-tuned model
138
- model_path = os.path.join(self.MODEL_DIR, 'fine_tuned_model')
139
- if os.path.exists(model_path):
140
- logger.info('Loading fine-tuned model...')
141
- self._model = AutoModelForSequenceClassification.from_pretrained(
142
- model_path,
143
- num_labels=len(self.categories)
144
- )
145
- self.tokenizer = AutoTokenizer.from_pretrained(model_path)
146
- else:
147
- logger.info('Loading base model...')
148
- self._model = AutoModelForSequenceClassification.from_pretrained(
149
- 'bert-base-multilingual-cased',
150
- num_labels=len(self.categories)
151
- )
152
- self.tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
153
-
154
- self._model.to(self.device)
155
-
156
- # Load label encoder if exists
157
- label_encoder_path = os.path.join(self.MODEL_DIR, 'label_encoder.pkl')
158
- if os.path.exists(label_encoder_path):
159
- with open(label_encoder_path, 'rb') as f:
160
- self.label_encoder = pickle.load(f)
161
-
162
- except Exception as e:
163
- logger.error(f'Error initializing model: {str(e)}')
164
- raise
165
 
166
  def predict(self, tasks, **kwargs):
167
  """Make predictions for tasks"""
168
  predictions = []
169
-
170
- try:
171
- # Save tasks
172
- for task in tasks:
173
- self.save_task(task)
174
- logger.info(f"Saved task: {task.get('id', 'unknown')}")
175
-
176
- # Get text from task
177
- text = task.get('data', {}).get('text', '')
178
-
179
- # For now, return a default prediction (you can improve this later)
180
- predictions.append({
181
- 'result': [{
182
- 'from_name': 'sentiment',
183
- 'to_name': 'text',
184
- 'type': 'choices',
185
- 'value': {
186
- 'choices': ['no_category'] # Default prediction
187
- },
188
- 'score': 0.5 # Confidence score between 0 and 1
189
- }],
190
- 'model_version': self.version
191
- })
192
-
193
- except Exception as e:
194
- logger.error(f"Error in predict: {str(e)}")
195
- logger.error("Full error details:", exc_info=True)
196
- # Return empty predictions in case of error
197
- predictions = [{
198
  'result': [],
199
- 'model_version': self.version
200
- } for _ in tasks]
201
-
202
  return predictions
203
 
204
- def get_tasks(self):
205
- """Get tasks from Label Studio"""
206
- try:
207
- # Get tasks from Label Studio API
208
- params = {'project': self.project_id} if self.project_id else {}
209
- response = self.label_studio_client.make_request('GET', '/api/tasks', params=params)
210
- tasks = response.json()
211
-
212
- logger.info(f"Retrieved {len(tasks)} tasks from Label Studio API")
213
-
214
- # Debug first task if available
215
- if tasks:
216
- logger.info(f"First task content: {json.dumps(tasks[0], indent=2)}")
217
-
218
- return tasks
219
-
220
- except Exception as e:
221
- logger.error(f"Error retrieving tasks from Label Studio: {str(e)}")
222
- logger.error("Full error details:", exc_info=True)
223
- return []
224
-
225
  def fit(self, completions, workdir=None, **kwargs):
226
  """Train model on labeled data"""
227
  logger.info('Starting model training...')
228
-
229
- try:
230
- # Get use_ground_truth parameter
231
- use_ground_truth = kwargs.get('use_ground_truth', True)
232
- logger.info(f"Training with use_ground_truth={use_ground_truth}")
233
-
234
- # Debug completions
235
- logger.info("=== DEBUG COMPLETIONS START ===")
236
- logger.info(f"Type of completions: {type(completions)}")
237
- logger.info(f"Completions content: {completions}")
238
- logger.info("=== DEBUG COMPLETIONS END ===")
239
-
240
- # Extract training data
241
- texts, labels = [], []
242
-
243
- # Get tasks from Label Studio
244
- tasks = self.get_tasks()
245
- logger.info(f"Retrieved {len(tasks)} tasks from Label Studio")
246
-
247
- # Get interface info
248
- from_name = 'sentiment' # This matches your label config
249
- to_name = 'text' # This matches your label config
250
-
251
- for task in tasks:
252
- try:
253
- # Get text from task
254
- text = task['data'].get('text')
255
- if not text:
256
- logger.warning(f"No text found in task {task.get('id')}")
257
- continue
258
-
259
- # Get annotations
260
- annotations = task.get('annotations', [])
261
- if use_ground_truth:
262
- # Also include ground truth annotations
263
- annotations.extend(task.get('ground_truth', []))
264
-
265
- if annotations:
266
- logger.info(f"Found {len(annotations)} annotations for task {task.get('id')}")
267
- logger.info(f"Annotation content: {json.dumps(annotations[0], indent=2)}")
268
-
269
- if not annotations:
270
- logger.warning(f"No annotations found for task {task.get('id')}")
271
- continue
272
-
273
- for annotation in annotations:
274
- # Only use completed annotations
275
- if annotation.get('was_cancelled') or not annotation.get('completed_by'):
276
- continue
277
-
278
- try:
279
- # Get choices from result
280
- results = annotation.get('result', [])
281
- if not results:
282
- logger.warning(f"No results found in annotation for task {task.get('id')}")
283
- continue
284
-
285
- for result in results:
286
- if result.get('from_name') == from_name and result.get('to_name') == to_name:
287
- choices = result.get('value', {}).get('choices', [])
288
- if choices:
289
- label = choices[0]
290
- logger.info(f"Successfully extracted: Text='{text[:50]}...', Label='{label}'")
291
- texts.append(text)
292
- labels.append(label)
293
- break
294
-
295
- except Exception as e:
296
- logger.error(f"Error processing annotation: {str(e)}")
297
- continue
298
-
299
- except Exception as e:
300
- logger.error(f"Error processing task: {str(e)}")
301
- continue
302
-
303
- logger.info(f"Prepared {len(texts)} examples for training")
304
-
305
- if not texts:
306
- raise ValueError("No valid training examples found")
307
-
308
- # Convert labels to numeric using label encoder
309
- numeric_labels = self.label_encoder.transform(labels)
310
-
311
- # Create dataset
312
- train_dataset = Dataset.from_dict({
313
- 'text': texts,
314
- 'label': numeric_labels
315
- })
316
-
317
- # Initialize tokenizer and model if not already done
318
- self._lazy_init()
319
-
320
- # Tokenize the texts
321
- def tokenize_function(examples):
322
- return self.tokenizer(
323
- examples['text'],
324
- padding='max_length',
325
- truncation=True,
326
- max_length=512
327
- )
328
-
329
- tokenized_dataset = train_dataset.map(tokenize_function, batched=True)
330
-
331
- # Define training arguments
332
- training_args = TrainingArguments(
333
- output_dir=os.path.join(self.model_dir, "results"),
334
- num_train_epochs=3,
335
- per_device_train_batch_size=8,
336
- per_device_eval_batch_size=8,
337
- warmup_steps=500,
338
- weight_decay=0.01,
339
- logging_dir=os.path.join(self.model_dir, "logs"),
340
- logging_steps=10,
341
- save_strategy="epoch",
342
- )
343
-
344
- # Initialize trainer
345
- trainer = Trainer(
346
- model=self._model,
347
- args=training_args,
348
- train_dataset=tokenized_dataset,
349
- )
350
-
351
- # Train the model
352
- logger.info('Training started...')
353
- trainer.train()
354
- logger.info("Training completed successfully")
355
-
356
- # Save the fine-tuned model
357
- model_path = os.path.join(self.model_dir, 'fine_tuned_model')
358
- trainer.save_model(model_path)
359
- self.tokenizer.save_pretrained(model_path)
360
- logger.info(f"Model saved to {model_path}")
361
-
362
- # Save label encoder
363
- label_encoder_path = os.path.join(self.model_dir, 'label_encoder.pkl')
364
- with open(label_encoder_path, 'wb') as f:
365
- pickle.dump(self.label_encoder, f)
366
-
367
- return {
368
- 'model_path': model_path,
369
- 'label_encoder_path': label_encoder_path,
370
- 'categories': self.categories,
371
- 'metrics': trainer.state.log_history,
372
- 'status': 'success',
373
- 'train_size': len(texts)
374
- }
375
-
376
- except Exception as e:
377
- logger.error(f"Training failed: {str(e)}")
378
- logger.error("Full error details:", exc_info=True)
379
- return {
380
- 'status': 'error',
381
- 'error': str(e),
382
- 'train_size': len(texts) if 'texts' in locals() else 0
383
- }
384
-
385
- def save_task(self, task):
386
- """Save a task to local storage"""
387
- try:
388
- storage_path = os.path.join(self.model_dir, 'tasks.json')
389
- tasks = []
390
-
391
- # Load existing tasks
392
- if os.path.exists(storage_path):
393
- with open(storage_path, 'r') as f:
394
- tasks = json.load(f)
395
- logger.info(f"Loaded {len(tasks)} existing tasks")
396
-
397
- # Check if task already exists
398
- task_id = task.get('id')
399
- task_exists = False
400
-
401
- if task_id:
402
- for i, existing_task in enumerate(tasks):
403
- if existing_task.get('id') == task_id:
404
- # Preserve existing annotations
405
- existing_annotations = existing_task.get('annotations', [])
406
- if existing_annotations:
407
- task['annotations'] = existing_annotations
408
-
409
- # Update existing task
410
- tasks[i] = task
411
- task_exists = True
412
- logger.info(f"Updated existing task {task_id} with {len(existing_annotations)} annotations")
413
- break
414
-
415
- # Add new task if it doesn't exist
416
- if not task_exists:
417
- tasks.append(task)
418
- logger.info(f"Added new task {task_id}")
419
-
420
- # Save tasks
421
- with open(storage_path, 'w') as f:
422
- json.dump(tasks, f)
423
-
424
- logger.info(f"Saved tasks to storage. Total tasks: {len(tasks)}")
425
-
426
- except Exception as e:
427
- logger.error(f"Error saving task: {str(e)}")
428
- logger.error("Full error details:", exc_info=True)
429
-
430
- def connect_to_label_studio(self):
431
- """Connect to Label Studio API"""
432
- try:
433
- from label_studio_sdk import Client
434
-
435
- # Get Label Studio connection details from environment
436
- ls_url = os.getenv('LABEL_STUDIO_URL', 'http://localhost:8080')
437
- ls_token = os.getenv('LABEL_STUDIO_API_TOKEN')
438
-
439
- if not ls_token:
440
- raise ValueError("LABEL_STUDIO_API_TOKEN environment variable is not set")
441
-
442
- # Initialize client
443
- client = Client(url=ls_url, api_key=ls_token)
444
- logger.info(f"Connected to Label Studio at {ls_url}")
445
- return client
446
-
447
- except Exception as e:
448
- logger.error(f"Error connecting to Label Studio: {str(e)}")
449
- logger.error("Full error details:", exc_info=True)
450
- raise
 
 
1
  import torch
2
  import logging
3
+ import os
 
4
  import json
 
5
  from label_studio_ml.model import LabelStudioMLBase
6
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
 
 
 
 
 
 
7
  from sklearn.preprocessing import LabelEncoder
 
 
 
8
 
9
  logger = logging.getLogger(__name__)
10
 
 
 
 
 
 
 
 
 
 
 
11
class BertClassifier(LabelStudioMLBase):
    """Minimal BERT classifier backend for Label Studio (rolled-back stub)."""

    def __init__(self, project_id=None, label_config=None, **kwargs):
        """Record connection info and defer all model loading."""
        super().__init__(project_id=project_id, label_config=label_config)

        logger.info(f"Initializing BertClassifier with project_id: {project_id}")
        logger.info(f"Label config: {label_config}")

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Model artifacts live next to this file.
        self.model_dir = os.path.join(os.path.dirname(__file__), 'model')

        # Neither model nor tokenizer is loaded yet.
        self._model = None
        self.tokenizer = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  def predict(self, tasks, **kwargs):
24
  """Make predictions for tasks"""
25
  predictions = []
26
+ for task in tasks:
27
+ predictions.append({
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  'result': [],
29
+ 'score': 0,
30
+ 'model_version': self.model_dir
31
+ })
32
  return predictions
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  def fit(self, completions, workdir=None, **kwargs):
35
  """Train model on labeled data"""
36
  logger.info('Starting model training...')
37
+ return {'status': 'ok'}