# src/model.py
import logging

import numpy as np
from huggingface_hub import login
from sklearn.metrics import classification_report, confusion_matrix
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

from src.config import MODEL_NAME, HF_MODEL_PATH, LOCAL_MODEL_PATH, BATCH_SIZE, EPOCHS, HF_TOKEN, LOG_FILE

# Category-name -> class-id mapping shared by training and evaluation.
# Insertion order matters: ids 0..3 must line up with the target_names
# passed to classification_report below.
LABEL_MAP = {"Electronics": 0, "Household": 1, "Books": 2, "Clothing & Accessories": 3}


def setup_logging():
    """Configure file logging (a no-op if logging is already configured)."""
    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from Trainer predictions."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    report = classification_report(
        labels, predictions, output_dict=True, target_names=list(LABEL_MAP)
    )
    return {"accuracy": report["accuracy"], "f1": report["macro avg"]["f1-score"]}


def train_model(train_dataset, val_dataset):
    """Fine-tune DistilBERT and push the result to the Hugging Face Hub."""
    setup_logging()
    login(token=HF_TOKEN)  # Authenticate so push_to_hub can upload checkpoints

    model = DistilBertForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=len(LABEL_MAP)
    )

    # Attach the integer "labels" column the Trainer expects.
    train_dataset = train_dataset.map(lambda x: {"labels": LABEL_MAP[x["category"]]})
    val_dataset = val_dataset.map(lambda x: {"labels": LABEL_MAP[x["category"]]})

    training_args = TrainingArguments(
        output_dir=LOCAL_MODEL_PATH,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir="logs/",
        logging_steps=100,
        push_to_hub=True,
        hub_model_id=HF_MODEL_PATH,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    logging.info("Starting model training")
    trainer.train()
    trainer.push_to_hub()  # Upload the final model to the Hub
    model.save_pretrained(LOCAL_MODEL_PATH)
    logging.info(f"Model saved locally to {LOCAL_MODEL_PATH} and pushed to {HF_MODEL_PATH}")
    return model, LABEL_MAP


def evaluate_model(model, test_dataset):
    """Evaluate the model on the test split and log the metrics."""
    setup_logging()
    test_dataset = test_dataset.map(lambda x: {"labels": LABEL_MAP[x["category"]]})

    # A bare Trainer (default TrainingArguments) is enough for inference.
    trainer = Trainer(model=model, compute_metrics=compute_metrics)
    results = trainer.evaluate(test_dataset)

    predictions = trainer.predict(test_dataset).predictions
    pred_labels = np.argmax(predictions, axis=-1)
    true_labels = [x["labels"] for x in test_dataset]

    report = classification_report(true_labels, pred_labels, target_names=list(LABEL_MAP))
    cm = confusion_matrix(true_labels, pred_labels)
    logging.info(f"Classification Report:\n{report}")
    logging.info(f"Confusion Matrix:\n{cm}")
    return report, cm, results
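

# Minimal usage sketch, assuming the project tokenizes its data elsewhere.
# Everything under the guard is illustrative: the toy texts, the inline
# DistilBertTokenizerFast call, and reusing one tiny split for train, val,
# and test are stand-ins for the real pipeline, and a valid HF_TOKEN is
# required because train_model logs in to and pushes to the Hugging Face Hub.
if __name__ == "__main__":
    from datasets import Dataset
    from transformers import DistilBertTokenizerFast

    tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)
    toy = Dataset.from_dict(
        {
            "text": [
                "Noise-cancelling wireless headphones",
                "Stainless steel kitchen storage rack",
                "A paperback mystery novel",
                "Men's cotton crew-neck t-shirt",
            ],
            # One example per class so classification_report sees all four labels.
            "category": ["Electronics", "Household", "Books", "Clothing & Accessories"],
        }
    ).map(lambda x: tokenizer(x["text"], truncation=True, padding="max_length", max_length=64))

    model, label_map = train_model(toy, toy)  # toy data: same split for train and val
    report, cm, results = evaluate_model(model, toy)
    print(report)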