# NOTE: Hugging Face file-view page residue removed here
# (user ayush2917, commit 5fb33bc "Update src/model.py") — it was not Python code.
# src/model.py
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import logging
from huggingface_hub import login
from src.config import MODEL_NAME, HF_MODEL_PATH, LOCAL_MODEL_PATH, BATCH_SIZE, EPOCHS, HF_TOKEN, LOG_FILE
def setup_logging():
    """Configure the root logger to append timestamped records to LOG_FILE."""
    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for a Trainer evaluation step.

    Implemented with numpy over the fixed label space 0..3 instead of
    sklearn's classification_report: the report raises a ValueError when an
    evaluation pass happens to observe fewer distinct classes than the four
    target_names, which can abort training mid-run. Values are identical to
    the sklearn report when all classes are present (0/0 F1 for an absent
    class is treated as 0.0, matching sklearn's zero_division handling).

    Args:
        eval_pred: ``(logits, labels)`` pair as supplied by ``transformers.Trainer``.

    Returns:
        dict: ``{"accuracy": float, "f1": float}`` where ``f1`` is the macro
        average over the four product-category classes.
    """
    logits, labels = eval_pred
    labels = np.asarray(labels)
    predictions = np.argmax(logits, axis=-1)
    accuracy = float(np.mean(predictions == labels))
    f1_scores = []
    # Fixed classes: Electronics=0, Household=1, Books=2, Clothing & Accessories=3
    for cls in range(4):
        tp = np.sum((predictions == cls) & (labels == cls))
        fp = np.sum((predictions == cls) & (labels != cls))
        fn = np.sum((predictions != cls) & (labels == cls))
        denom = 2 * tp + fp + fn
        # 0/0 -> 0.0 for a class absent from both y_true and y_pred
        f1_scores.append(2 * tp / denom if denom else 0.0)
    return {"accuracy": accuracy, "f1": float(np.mean(f1_scores))}
def train_model(train_dataset, val_dataset):
    """Fine-tune DistilBERT on 4-class product-category data and push to the Hub.

    Args:
        train_dataset: HF dataset with a "category" string column; an integer
            "labels" column is derived here.
        val_dataset: HF dataset with a "category" string column, evaluated
            once per epoch.

    Returns:
        tuple: (model, label_map) — the fine-tuned model and the
        category-name -> integer-id mapping used to encode labels.
    """
    setup_logging()
    login(token=HF_TOKEN)  # Authenticate so push_to_hub can upload checkpoints
    # Fresh classification head sized for the four product categories.
    model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=4)
    # Encode string categories as integer ids; the ordering must match the
    # class indices assumed by compute_metrics and evaluate_model.
    label_map = {"Electronics": 0, "Household": 1, "Books": 2, "Clothing & Accessories": 3}
    train_dataset = train_dataset.map(lambda x: {"labels": label_map[x["category"]]})
    val_dataset = val_dataset.map(lambda x: {"labels": label_map[x["category"]]})
    training_args = TrainingArguments(
        output_dir=LOCAL_MODEL_PATH,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        # NOTE(review): `evaluation_strategy` was renamed `eval_strategy` in
        # recent transformers releases — confirm against the pinned version.
        evaluation_strategy="epoch",
        save_strategy="epoch",  # checkpoint at every epoch boundary
        logging_dir="logs/",
        logging_steps=100,
        push_to_hub=True,  # checkpoints are mirrored to the Hub repo below
        hub_model_id=HF_MODEL_PATH,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )
    logging.info("Starting model training")
    trainer.train()
    trainer.push_to_hub()  # Push the final model to the Hugging Face Hub
    model.save_pretrained(LOCAL_MODEL_PATH)  # keep a local copy as well
    logging.info(f"Model saved locally to {LOCAL_MODEL_PATH} and pushed to {HF_MODEL_PATH}")
    return model, label_map
def evaluate_model(model, test_dataset):
    """Evaluate a fine-tuned classifier on the test split and log metrics.

    Args:
        model: trained DistilBertForSequenceClassification instance.
        test_dataset: HF dataset with a "category" string column; an integer
            "labels" column is derived here.

    Returns:
        tuple: (report, cm, results) — sklearn text classification report,
        confusion matrix, and the Trainer metrics dict (eval_*-prefixed keys).
    """
    setup_logging()
    # Must match the mapping used in train_model so ids line up with the head.
    label_map = {"Electronics": 0, "Household": 1, "Books": 2, "Clothing & Accessories": 3}
    test_dataset = test_dataset.map(lambda x: {"labels": label_map[x["category"]]})
    trainer = Trainer(model=model, compute_metrics=compute_metrics)
    # Single forward pass: predict() already runs compute_metrics, so the
    # previous separate trainer.evaluate() call (a second full inference
    # sweep over the test set) was redundant. metric_key_prefix="eval"
    # keeps the metric keys identical to what evaluate() returned.
    output = trainer.predict(test_dataset, metric_key_prefix="eval")
    results = output.metrics
    pred_labels = np.argmax(output.predictions, axis=1)
    # label_ids come from the same prediction pass, guaranteed aligned with
    # predictions — no need to re-iterate the dataset.
    true_labels = output.label_ids
    report = classification_report(true_labels, pred_labels, target_names=list(label_map))
    cm = confusion_matrix(true_labels, pred_labels)
    logging.info(f"Classification Report:\n{report}")
    logging.info(f"Confusion Matrix:\n{cm}")
    return report, cm, results