Update train.py
train.py CHANGED
@@ -1,53 +1,234 @@
-
-# This file contains steps 1 to 4
-
-from datasets import load_dataset
+from datasets import load_dataset, load_metric
 from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
-
-#
-
-
-
-
-
-
-
-        examples["question"],
-        examples["context"],
-        truncation=True,
-        max_length=384,
-        stride=128,
-        return_overflowing_tokens=True,
-        padding="max_length"
-    )
-
-tokenized_dataset = dataset.map(preprocess_function, batched=True)
-
-# Step 3: Train the Model
-model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")
-
-training_args = TrainingArguments(
-    output_dir="./results",
-    evaluation_strategy="epoch",
-    learning_rate=3e-5,
-    per_device_train_batch_size=16,
-    num_train_epochs=3,
-    weight_decay=0.01,
-    push_to_hub=True, # Automatically push to the Hugging Face Hub
-    hub_model_id="username/qa_model_repo" # Replace with your username and model repo name
-)
-
-
-
-
-
-
-)
-
-
+import os
+import logging
+import numpy as np
+import torch
+from tqdm.auto import tqdm
+
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler('training.log'),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
+
+# Set up cache directory and token
+os.environ["HF_HOME"] = "/tmp/cache"
+os.makedirs("/tmp/cache", exist_ok=True)
+
+# Get Hugging Face token securely
+HF_TOKEN = os.getenv("HF_TOKEN")
+if HF_TOKEN is None:
+    raise ValueError("Hugging Face access token not found. Set it in the environment as 'HF_TOKEN'")
+
+MODEL_HUB_ID = "Alaaeldin/example-model"  # Replace with your Hugging Face username
+BASE_MODEL = "deepset/roberta-base-squad2"
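+# deepset/roberta-base-squad2 is itself already fine-tuned for extractive QA,
+# so this run further fine-tunes it on a small SQuAD subset.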
+
+class ModelTrainer:
+    def __init__(self):
+        self.metric = load_metric("squad")
+        self.tokenizer = None
+        self.model = None
+
+    def load_tokenizer_and_model(self):
+        """Load the tokenizer and model with error handling"""
+        try:
+            logger.info(f"Loading tokenizer and model from {BASE_MODEL}")
+            self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
+            self.model = AutoModelForQuestionAnswering.from_pretrained(BASE_MODEL)
+            return True
+        except Exception as e:
+            logger.error(f"Error loading tokenizer and model: {e}")
+            raise
+
+    def preprocess_function(self, examples):
+        """Preprocess the dataset examples"""
+        try:
+            tokenized_examples = self.tokenizer(
+                examples["question"],
+                examples["context"],
+                truncation="only_second",  # truncate only the context; required with return_overflowing_tokens for pairs
+                max_length=384,
+                stride=128,
+                return_overflowing_tokens=True,
+                return_offsets_mapping=True,
+                padding="max_length",
+            )
+
+            sample_mapping = tokenized_examples["overflow_to_sample_mapping"]
+            tokenized_examples["start_positions"] = []
+            tokenized_examples["end_positions"] = []
+
+            for i, offsets in enumerate(tokenized_examples["offset_mapping"]):
+                sample_idx = sample_mapping[i]
+                answers = examples["answers"][sample_idx]
+
+                # Default values (answer not present in this feature)
+                start_position = 0
+                end_position = 0
+
+                if len(answers["answer_start"]) > 0 and len(answers["text"]) > 0:
+                    start_char = answers["answer_start"][0]
+                    end_char = start_char + len(answers["text"][0])
+
+                    # Search only context tokens; question and special tokens
+                    # would otherwise match, since their offsets also start at 0
+                    sequence_ids = tokenized_examples.sequence_ids(i)
+                    context_start = sequence_ids.index(1)
+                    context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)
+
+                    # Label only if the answer lies fully inside this context window
+                    if offsets[context_start][0] <= start_char and offsets[context_end][1] >= end_char:
+                        # Find start position
+                        token_start_index = context_start
+                        while token_start_index <= context_end and offsets[token_start_index][0] <= start_char:
+                            token_start_index += 1
+                        start_position = token_start_index - 1
+
+                        # Find end position
+                        token_end_index = context_end
+                        while token_end_index >= context_start and offsets[token_end_index][1] >= end_char:
+                            token_end_index -= 1
+                        end_position = token_end_index + 1
+
+                tokenized_examples["start_positions"].append(start_position)
+                tokenized_examples["end_positions"].append(end_position)
+
+            return tokenized_examples
+        except Exception as e:
+            logger.error(f"Error in preprocessing: {e}")
+            raise
+
+    def compute_metrics(self, eval_pred):
+        """Compute evaluation metrics.
+
+        The "squad" metric expects text predictions keyed by example id, which
+        needs full answer post-processing; as a lightweight proxy we report
+        the accuracy of the predicted start/end token positions instead.
+        """
+        predictions, labels = eval_pred
+        start_logits, end_logits = predictions
+
+        start_predictions = np.argmax(start_logits, axis=-1)
+        end_predictions = np.argmax(end_logits, axis=-1)
+
+        return {
+            "start_accuracy": float((start_predictions == labels[0]).mean()),
+            "end_accuracy": float((end_predictions == labels[1]).mean()),
+        }
+
+    def validate_model_outputs(self, model, tokenizer):
+        """Validate model outputs with a test example"""
+        logger.info("Validating model outputs...")
+        try:
+            test_question = "What is the capital of France?"
+            test_context = "Paris is the capital of France."
+
+            inputs = tokenizer(
+                test_question,
+                test_context,
+                return_tensors="pt",
+                truncation=True,
+                max_length=384,
+                padding="max_length"
+            )
+
+            outputs = model(**inputs)
+
+            if not (isinstance(outputs.start_logits, torch.Tensor) and
+                    isinstance(outputs.end_logits, torch.Tensor)):
+                raise ValueError("Model outputs validation failed")
+
+            logger.info("Model validation successful!")
+            return True
+        except Exception as e:
+            logger.error(f"Model validation failed: {e}")
+            raise
+
+    def train(self):
+        """Main training function"""
+        try:
+            logger.info("Starting training pipeline...")
+
+            # Load dataset with a smaller subset
+            logger.info("Loading SQuAD dataset...")
+            dataset = load_dataset("squad", split={
+                'train': 'train[:1000]',
+                'validation': 'validation[:100]'
+            })
+
+            # Load tokenizer and model
+            self.load_tokenizer_and_model()
+
+            # Preprocess dataset
+            logger.info("Preprocessing dataset...")
+            tokenized_dataset = dataset.map(
+                self.preprocess_function,
+                batched=True,
+                remove_columns=dataset["train"].column_names,
+                num_proc=2  # Reduced for Spaces
+            )
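+            # return_overflowing_tokens can split one example into several
+            # features, so the mapped dataset may hold more rows than the raw split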
+
+            # Set up training arguments
+            output_dir = "/tmp/results"
+            os.makedirs(output_dir, exist_ok=True)
+
+            training_args = TrainingArguments(
+                output_dir=output_dir,
+                evaluation_strategy="steps",
+                eval_steps=100,
+                save_strategy="steps",
+                save_steps=100,
+                learning_rate=3e-5,
+                per_device_train_batch_size=4,
+                per_device_eval_batch_size=4,
+                num_train_epochs=1,
+                weight_decay=0.01,
+                load_best_model_at_end=True,
+                metric_for_best_model="eval_loss",
+                push_to_hub=True,
+                hub_model_id=MODEL_HUB_ID,
+                hub_token=HF_TOKEN,
+                report_to=["tensorboard"],
+                logging_dir="./logs",
+                logging_steps=50,
+                gradient_accumulation_steps=4,
+                warmup_steps=100,
+            )
+
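+            # With per_device_train_batch_size=4 and gradient_accumulation_steps=4,
+            # the effective training batch size is 16 per device.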
+            # Initialize trainer
+            trainer = Trainer(
+                model=self.model,
+                args=training_args,
+                train_dataset=tokenized_dataset["train"],
+                eval_dataset=tokenized_dataset["validation"],
+                compute_metrics=self.compute_metrics,
+            )
+
+            # Train the model
+            logger.info("Starting training...")
+            trainer.train()
+
+            # Validate model
+            self.validate_model_outputs(self.model, self.tokenizer)
+
+            # Save and push to hub
+            logger.info("Saving and pushing model to Hugging Face Hub...")
+            trainer.save_model()
+            self.model.push_to_hub(MODEL_HUB_ID, use_auth_token=HF_TOKEN)
+            self.tokenizer.push_to_hub(MODEL_HUB_ID, use_auth_token=HF_TOKEN)
+
+            logger.info("Training pipeline completed successfully!")
+
+        except Exception as e:
+            logger.error(f"Training pipeline failed: {e}")
+            raise
+
+if __name__ == "__main__":
+    trainer = ModelTrainer()
+    trainer.train()
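A minimal sketch of how the pushed checkpoint could be exercised after this commit, assuming the push above succeeded; the repo id matches the MODEL_HUB_ID configured in the script, and the question-answering pipeline is the standard transformers API:

from transformers import pipeline

qa = pipeline("question-answering", model="Alaaeldin/example-model")
print(qa(question="What is the capital of France?",
         context="Paris is the capital of France."))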