# NOTE: extraction artifacts (file-size banner, git blame hashes, line-number gutter) removed.
import os
import json
import logging
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from huggingface_hub import HfFolder
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Redirect Hugging Face caches to a writable location.
# TRANSFORMERS_CACHE is deprecated in recent transformers releases in favor of
# HF_HOME; set both so old and new library versions pick up the same directory.
os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'
os.environ.setdefault('HF_HOME', '/tmp/huggingface_cache')
def compute_metrics(pred):
    """Compute evaluation metrics for a Trainer prediction batch.

    Args:
        pred: EvalPrediction-like object exposing ``label_ids`` (true labels)
            and ``predictions`` (per-class scores/logits).

    Returns:
        dict with 'accuracy', 'f1', 'precision', 'recall' (weighted averages).
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1,
        'precision': prec,
        'recall': rec,
    }
def setup_training():
    """Fine-tune a sequence-classification model on the MarbleX dataset.

    Reads hyperparameters from ``config.json`` in the working directory.
    Expected keys: model_name, target_column, text_columns, num_train_epochs,
    per_device_train_batch_size, per_device_eval_batch_size, warmup_ratio,
    weight_decay, learning_rate, fp16, push_to_hub, hub_model_id.

    Side effects: downloads dataset/model, trains, writes ./results and
    ./logs, and optionally pushes the trained model to the Hugging Face Hub.
    """
    logging.info("Starting the training setup process")

    # Load configuration
    with open('config.json', 'r') as f:
        config = json.load(f)
    logging.info(f"Loaded configuration: {config}")

    # Load your dataset with the specified configuration
    logging.info("Loading the MarbleX dataset")
    dataset = load_dataset("Oranblock/marblex_dataset", "config1")  # Replace "config1" with the appropriate config name

    # Print dataset structure
    logging.info(f"Dataset columns: {dataset['train'].column_names}")

    # Determine the number of unique labels from the target column
    unique_labels = dataset['train'].unique(config['target_column'])
    num_labels = len(unique_labels)
    logging.info(f"Number of unique labels: {num_labels}")
    logging.info(f"Dataset loaded. Train size: {len(dataset['train'])}, Test size: {len(dataset['test'])}")

    # Load tokenizer and model sized to the label set
    logging.info(f"Loading tokenizer and model: {config['model_name']}")
    tokenizer = AutoTokenizer.from_pretrained(config['model_name'])
    model = AutoModelForSequenceClassification.from_pretrained(
        config['model_name'],
        num_labels=num_labels
    )

    # Tokenize the dataset
    logging.info("Tokenizing the dataset")

    def tokenize_function(examples):
        # Concatenate all feature columns into a single input.
        # NOTE(review): assumes the text_columns values are strings (or
        # tokenizer-compatible) — confirm against the dataset schema.
        features = np.stack([examples[col] for col in config['text_columns']], axis=1)
        return tokenizer(features.tolist(), padding="max_length", truncation=True)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # FIX: Trainer computes the loss from a column literally named "labels".
    # If the configured target column has a different name, rename it so the
    # labels are actually fed to the model instead of being dropped.
    if config['target_column'] != 'labels':
        tokenized_datasets = tokenized_datasets.rename_column(config['target_column'], 'labels')
    logging.info("Dataset tokenization completed")

    # Set up training arguments
    logging.info("Setting up training arguments")
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=config['num_train_epochs'],
        per_device_train_batch_size=config['per_device_train_batch_size'],
        per_device_eval_batch_size=config['per_device_eval_batch_size'],
        warmup_ratio=config['warmup_ratio'],
        weight_decay=config['weight_decay'],
        learning_rate=config['learning_rate'],
        fp16=config['fp16'],
        evaluation_strategy="epoch",
        save_strategy="epoch",  # must match evaluation_strategy for load_best_model_at_end
        load_best_model_at_end=True,
        push_to_hub=config['push_to_hub'],
        hub_model_id=config['hub_model_id'],
        logging_dir='./logs',
        logging_steps=100,
    )

    # Initialize Trainer
    logging.info("Initializing Trainer")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],  # Use 'test' split for evaluation
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # Start training
    logging.info("Starting the training process")
    trainer.train()

    # Evaluate the model
    logging.info("Evaluating the model")
    eval_results = trainer.evaluate()
    logging.info(f"Evaluation results: {eval_results}")

    # Push model to hub if configured
    if config['push_to_hub']:
        logging.info("Pushing model to Hugging Face Hub")
        trainer.push_to_hub()
        logging.info(f"Model pushed to {config['hub_model_id']}")

    logging.info("Training process completed")
if __name__ == "__main__":
    # Persist the Hugging Face auth token (required for hub pushes) if one
    # was supplied via the environment, then kick off training.
    token = os.environ.get('HF_TOKEN')
    if token is None:
        logging.warning("HF_TOKEN not found in environment variables")
    else:
        HfFolder.save_token(token)
        logging.info("Hugging Face token set")
    setup_training()