File size: 4,472 Bytes
53096ab
 
 
d6bb7e9
53096ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
513e2df
53096ab
 
513e2df
53096ab
513e2df
341945b
 
 
 
af5b0b6
 
 
 
 
e7e56a6
53096ab
 
 
 
 
 
af5b0b6
53096ab
 
 
 
 
d6bb7e9
 
 
53096ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e7e56a6
53096ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import os
import json
import logging
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from huggingface_hub import HfFolder

# Set up logging
# Root-logger config: INFO level, timestamped lines. Uses the root logger
# (logging.info(...) below) rather than a module-level getLogger(__name__).
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Set cache directory to a writable location
# NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers
# releases in favor of HF_HOME — confirm against the pinned transformers
# version before relying on this.
os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a Trainer EvalPrediction.

    ``pred`` carries ``label_ids`` (gold labels) and ``predictions`` (per-class
    logits); the predicted class is the argmax over the last axis.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # 'weighted' averages per-class scores by class support, so imbalanced
    # label distributions are reflected in the aggregate numbers.
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

def _load_config(path='config.json'):
    """Load the JSON training configuration from *path*."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def setup_training():
    """Fine-tune a sequence-classification model on the MarbleX dataset.

    Reads hyperparameters from ``./config.json`` (model_name, text_columns,
    target_column, batch sizes, learning rate, hub settings, ...), trains with
    the HF ``Trainer``, evaluates on the ``test`` split, and optionally pushes
    the result to the Hugging Face Hub.

    Raises:
        FileNotFoundError: if ``config.json`` is missing.
        KeyError: if a required config key is absent.
    """
    logging.info("Starting the training setup process")

    config = _load_config()
    logging.info("Loaded configuration: %s", config)

    # Load your dataset with the specified configuration
    logging.info("Loading the MarbleX dataset")
    # TODO: the dataset config name is hard-coded; consider reading it from config.json.
    dataset = load_dataset("Oranblock/marblex_dataset", "config1")

    logging.info("Dataset columns: %s", dataset['train'].column_names)

    # Build a stable label -> id mapping. Sorting makes the id assignment
    # deterministic across runs; when labels are already 0..n-1 ints this is
    # the identity mapping, so existing integer-labeled data is unaffected.
    target_column = config['target_column']
    unique_labels = sorted(dataset['train'].unique(target_column))
    num_labels = len(unique_labels)
    label2id = {label: idx for idx, label in enumerate(unique_labels)}
    logging.info("Number of unique labels: %d", num_labels)

    logging.info("Dataset loaded. Train size: %d, Test size: %d",
                 len(dataset['train']), len(dataset['test']))

    # Load tokenizer and model
    logging.info("Loading tokenizer and model: %s", config['model_name'])
    tokenizer = AutoTokenizer.from_pretrained(config['model_name'])
    model = AutoModelForSequenceClassification.from_pretrained(
        config['model_name'],
        num_labels=num_labels,
    )

    logging.info("Tokenizing the dataset")
    text_columns = config['text_columns']

    def tokenize_function(examples):
        """Concatenate the feature columns into one text per example and tokenize.

        Fix: the previous np.stack(...).tolist() handed the tokenizer a list of
        lists, which it rejects for non-string columns and otherwise interprets
        as pre-tokenized input. Joining into a single string per example gives
        the tokenizer what it expects.
        """
        rows = zip(*(examples[col] for col in text_columns))
        texts = [" ".join(str(value) for value in row) for row in rows]
        batch = tokenizer(texts, padding="max_length", truncation=True)
        # The Trainer expects integer class ids under the 'labels' key.
        batch["labels"] = [label2id[value] for value in examples[target_column]]
        return batch

    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    logging.info("Dataset tokenization completed")

    # Set up training arguments
    logging.info("Setting up training arguments")
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=config['num_train_epochs'],
        per_device_train_batch_size=config['per_device_train_batch_size'],
        per_device_eval_batch_size=config['per_device_eval_batch_size'],
        warmup_ratio=config['warmup_ratio'],
        weight_decay=config['weight_decay'],
        learning_rate=config['learning_rate'],
        fp16=config['fp16'],
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # "Best" defaults to lowest eval loss; set metric_for_best_model to
        # rank checkpoints by one of compute_metrics' keys instead.
        load_best_model_at_end=True,
        push_to_hub=config['push_to_hub'],
        hub_model_id=config['hub_model_id'],
        logging_dir='./logs',
        logging_steps=100,
    )

    # Initialize Trainer
    logging.info("Initializing Trainer")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],  # Use 'test' split for evaluation
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Start training
    logging.info("Starting the training process")
    trainer.train()

    # Evaluate the model
    logging.info("Evaluating the model")
    eval_results = trainer.evaluate()
    logging.info("Evaluation results: %s", eval_results)

    # Push model to hub if configured
    if config['push_to_hub']:
        logging.info("Pushing model to Hugging Face Hub")
        trainer.push_to_hub()
        logging.info("Model pushed to %s", config['hub_model_id'])

    logging.info("Training process completed")

if __name__ == "__main__":
    # Persist the Hub token (if provided) so authenticated pushes work.
    token = os.environ.get('HF_TOKEN')
    if token is None:
        logging.warning("HF_TOKEN not found in environment variables")
    else:
        HfFolder.save_token(token)
        logging.info("Hugging Face token set")

    setup_training()