File size: 3,167 Bytes
a79e8b4
 
 
 
 
5fb33bc
 
a79e8b4
 
5fb33bc
a79e8b4
 
5fb33bc
 
 
 
 
 
 
a79e8b4
5fb33bc
 
a79e8b4
5fb33bc
 
a79e8b4
5fb33bc
 
a79e8b4
 
5fb33bc
a79e8b4
 
 
 
 
 
 
5fb33bc
 
a79e8b4
 
 
 
 
 
 
5fb33bc
a79e8b4
 
 
 
5fb33bc
 
 
a79e8b4
 
5fb33bc
a79e8b4
 
 
5fb33bc
 
 
a79e8b4
 
5fb33bc
a79e8b4
5fb33bc
 
a79e8b4
 
5fb33bc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# src/model.py
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import logging
from huggingface_hub import login
from src.config import MODEL_NAME, HF_MODEL_PATH, LOCAL_MODEL_PATH, BATCH_SIZE, EPOCHS, HF_TOKEN, LOG_FILE

def setup_logging():
    """Configure root logging to append timestamped records to LOG_FILE."""
    record_format = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format=record_format,
    )

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for an evaluation batch.

    Fixes a crash in the previous implementation: sklearn's
    ``classification_report`` with a fixed 4-entry ``target_names`` raises
    ``ValueError`` whenever the batch does not contain every class (common
    early in training when predictions collapse onto one label). This
    version always scores all classes present in the logit dimension and
    gives an absent class an F1 of 0 — the same value sklearn's default
    ``zero_division=0`` produces — so the returned metrics are unchanged
    on well-formed batches.

    Args:
        eval_pred: tuple of (logits, labels); logits has shape
            (n_samples, n_classes), labels is an int array of class ids.

    Returns:
        dict with "accuracy" and "f1" (macro F1 over all classes).
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)
    labels = np.asarray(labels)
    predictions = np.argmax(logits, axis=-1)

    accuracy = float(np.mean(predictions == labels))

    f1_scores = []
    for cls in range(logits.shape[-1]):
        tp = np.sum((predictions == cls) & (labels == cls))
        fp = np.sum((predictions == cls) & (labels != cls))
        fn = np.sum((predictions != cls) & (labels == cls))
        denom = 2 * tp + fp + fn
        # denom == 0 means the class appears in neither truth nor
        # prediction; score it 0, matching sklearn's zero_division=0.
        f1_scores.append(2 * tp / denom if denom else 0.0)

    return {"accuracy": accuracy, "f1": float(np.mean(f1_scores))}

def train_model(train_dataset, val_dataset):
    """Fine-tune DistilBERT on product-category data and publish to the Hub.

    Args:
        train_dataset: HF dataset with a "category" column; mapped to
            integer "labels" before training.
        val_dataset: HF dataset evaluated once per epoch.

    Returns:
        (model, label_map): the fine-tuned model and the
        category-name -> class-id mapping used for the labels.
    """
    setup_logging()
    # Authenticate up front: push_to_hub below needs Hub credentials.
    login(token=HF_TOKEN)

    categories = ["Electronics", "Household", "Books", "Clothing & Accessories"]
    label_map = {name: idx for idx, name in enumerate(categories)}

    def attach_labels(example):
        return {"labels": label_map[example["category"]]}

    train_dataset = train_dataset.map(attach_labels)
    val_dataset = val_dataset.map(attach_labels)

    classifier = DistilBertForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=len(categories)
    )

    run_config = TrainingArguments(
        output_dir=LOCAL_MODEL_PATH,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir="logs/",
        logging_steps=100,
        push_to_hub=True,
        hub_model_id=HF_MODEL_PATH,
    )

    trainer = Trainer(
        model=classifier,
        args=run_config,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    logging.info("Starting model training")
    trainer.train()
    # Upload the trained weights/config to the configured Hub repo.
    trainer.push_to_hub()
    classifier.save_pretrained(LOCAL_MODEL_PATH)
    logging.info(f"Model saved locally to {LOCAL_MODEL_PATH} and pushed to {HF_MODEL_PATH}")
    return classifier, label_map

def evaluate_model(model, test_dataset):
    """Score a trained model on the test split and log the results.

    Args:
        model: fine-tuned sequence-classification model.
        test_dataset: HF dataset with a "category" column; mapped to
            integer "labels" before evaluation.

    Returns:
        (report, cm, results): the sklearn text classification report,
        the confusion matrix, and the Trainer.evaluate() metrics dict.
    """
    setup_logging()
    categories = ["Electronics", "Household", "Books", "Clothing & Accessories"]
    label_map = {name: idx for idx, name in enumerate(categories)}
    test_dataset = test_dataset.map(
        lambda example: {"labels": label_map[example["category"]]}
    )

    trainer = Trainer(model=model, compute_metrics=compute_metrics)
    results = trainer.evaluate(test_dataset)
    # NOTE(review): predict() re-runs inference over the entire split a
    # second time; its .metrics carries the same numbers (with "test_"-
    # prefixed keys), so this could be collapsed if callers can adapt.
    pred_logits = trainer.predict(test_dataset).predictions
    pred_labels = np.argmax(pred_logits, axis=1)
    true_labels = [example["labels"] for example in test_dataset]

    report = classification_report(true_labels, pred_labels, target_names=label_map.keys())
    cm = confusion_matrix(true_labels, pred_labels)
    logging.info(f"Classification Report:\n{report}")
    logging.info(f"Confusion Matrix:\n{cm}")
    return report, cm, results