# NOTE: Hugging Face file-view page residue removed here
# (user ayush2917, commit 5fb33bc "Update src/model.py") — it was not Python code.
# src/model.py
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import logging
from huggingface_hub import login
from src.config import MODEL_NAME, HF_MODEL_PATH, LOCAL_MODEL_PATH, BATCH_SIZE, EPOCHS, HF_TOKEN, LOG_FILE
def setup_logging():
    """Configure the root logger to append timestamped records to LOG_FILE."""
    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for a Trainer evaluation step.

    Implemented with numpy over the fixed label space 0..3 instead of
    sklearn's classification_report: the report raises a ValueError when an
    evaluation pass happens to observe fewer distinct classes than the four
    target_names, which can abort training mid-run. Values are identical to
    the sklearn report when all classes are present (0/0 F1 for an absent
    class is treated as 0.0, matching sklearn's zero_division handling).

    Args:
        eval_pred: ``(logits, labels)`` pair as supplied by ``transformers.Trainer``.

    Returns:
        dict: ``{"accuracy": float, "f1": float}`` where ``f1`` is the macro
        average over the four product-category classes.
    """
    logits, labels = eval_pred
    labels = np.asarray(labels)
    predictions = np.argmax(logits, axis=-1)
    accuracy = float(np.mean(predictions == labels))
    f1_scores = []
    # Fixed classes: Electronics=0, Household=1, Books=2, Clothing & Accessories=3
    for cls in range(4):
        tp = np.sum((predictions == cls) & (labels == cls))
        fp = np.sum((predictions == cls) & (labels != cls))
        fn = np.sum((predictions != cls) & (labels == cls))
        denom = 2 * tp + fp + fn
        # 0/0 -> 0.0 for a class absent from both y_true and y_pred
        f1_scores.append(2 * tp / denom if denom else 0.0)
    return {"accuracy": accuracy, "f1": float(np.mean(f1_scores))}
def train_model(train_dataset, val_dataset):
    """Fine-tune DistilBERT on 4-class product-category data and push to the Hub.

    Args:
        train_dataset: HF dataset with a "category" string column; an integer
            "labels" column is derived here.
        val_dataset: HF dataset with a "category" string column, evaluated
            once per epoch.

    Returns:
        tuple: (model, label_map) — the fine-tuned model and the
        category-name -> integer-id mapping used to encode labels.
    """
    setup_logging()
    login(token=HF_TOKEN)  # Authenticate so push_to_hub can upload checkpoints
    # Fresh classification head sized for the four product categories.
    model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=4)
    # Encode string categories as integer ids; the ordering must match the
    # class indices assumed by compute_metrics and evaluate_model.
    label_map = {"Electronics": 0, "Household": 1, "Books": 2, "Clothing & Accessories": 3}
    train_dataset = train_dataset.map(lambda x: {"labels": label_map[x["category"]]})
    val_dataset = val_dataset.map(lambda x: {"labels": label_map[x["category"]]})
    training_args = TrainingArguments(
        output_dir=LOCAL_MODEL_PATH,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        # NOTE(review): `evaluation_strategy` was renamed `eval_strategy` in
        # recent transformers releases — confirm against the pinned version.
        evaluation_strategy="epoch",
        save_strategy="epoch",  # checkpoint at every epoch boundary
        logging_dir="logs/",
        logging_steps=100,
        push_to_hub=True,  # checkpoints are mirrored to the Hub repo below
        hub_model_id=HF_MODEL_PATH,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )
    logging.info("Starting model training")
    trainer.train()
    trainer.push_to_hub()  # Push the final model to the Hugging Face Hub
    model.save_pretrained(LOCAL_MODEL_PATH)  # keep a local copy as well
    logging.info(f"Model saved locally to {LOCAL_MODEL_PATH} and pushed to {HF_MODEL_PATH}")
    return model, label_map
def evaluate_model(model, test_dataset):
    """Evaluate a fine-tuned classifier on the test split and log metrics.

    Args:
        model: trained DistilBertForSequenceClassification instance.
        test_dataset: HF dataset with a "category" string column; an integer
            "labels" column is derived here.

    Returns:
        tuple: (report, cm, results) — sklearn text classification report,
        confusion matrix, and the Trainer metrics dict (eval_*-prefixed keys).
    """
    setup_logging()
    # Must match the mapping used in train_model so ids line up with the head.
    label_map = {"Electronics": 0, "Household": 1, "Books": 2, "Clothing & Accessories": 3}
    test_dataset = test_dataset.map(lambda x: {"labels": label_map[x["category"]]})
    trainer = Trainer(model=model, compute_metrics=compute_metrics)
    # Single forward pass: predict() already runs compute_metrics, so the
    # previous separate trainer.evaluate() call (a second full inference
    # sweep over the test set) was redundant. metric_key_prefix="eval"
    # keeps the metric keys identical to what evaluate() returned.
    output = trainer.predict(test_dataset, metric_key_prefix="eval")
    results = output.metrics
    pred_labels = np.argmax(output.predictions, axis=1)
    # label_ids come from the same prediction pass, guaranteed aligned with
    # predictions — no need to re-iterate the dataset.
    true_labels = output.label_ids
    report = classification_report(true_labels, pred_labels, target_names=list(label_map))
    cm = confusion_matrix(true_labels, pred_labels)
    logging.info(f"Classification Report:\n{report}")
    logging.info(f"Confusion Matrix:\n{cm}")
    return report, cm, results