ayush2917 committed on
Commit
5fb33bc
·
verified ·
1 Parent(s): 2eb5a40

Update src/model.py

Browse files
Files changed (1) hide show
  1. src/model.py +31 -36
src/model.py CHANGED
@@ -1,44 +1,34 @@
1
  # src/model.py
2
  from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
3
  from sklearn.metrics import classification_report, confusion_matrix
4
- import torch
5
  import numpy as np
6
- import pandas as pd
7
  import logging
8
- from src.config import MODEL_PATH, BATCH_SIZE, EPOCHS
 
9
 
10
  def setup_logging():
11
- logging.basicConfig(filename="logs/app.log", level=logging.INFO,
12
  format="%(asctime)s - %(levelname)s - %(message)s")
13
 
14
- class EcommerceDataset(torch.utils.data.Dataset):
15
- def __init__(self, encodings, labels):
16
- self.encodings = encodings
17
- self.labels = labels
 
 
 
18
 
19
- def __getitem__(self, idx):
20
- item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
21
- item["labels"] = torch.tensor(self.labels[idx])
22
- return item
23
-
24
- def __len__(self):
25
- return len(self.labels)
26
-
27
- def train_model(train_encodings, train_labels, val_encodings, val_labels):
28
- """Fine-tune DistilBERT for classification."""
29
  setup_logging()
30
- model = DistilBertForSequenceClassification.from_pretrained(
31
- "distilbert-base-uncased", num_labels=4
32
- )
33
  label_map = {"Electronics": 0, "Household": 1, "Books": 2, "Clothing & Accessories": 3}
34
- train_labels = [label_map[label] for label in train_labels]
35
- val_labels = [label_map[label] for label in val_labels]
36
-
37
- train_dataset = EcommerceDataset(train_encodings, train_labels)
38
- val_dataset = EcommerceDataset(val_encodings, val_labels)
39
 
40
  training_args = TrainingArguments(
41
- output_dir=MODEL_PATH,
42
  num_train_epochs=EPOCHS,
43
  per_device_train_batch_size=BATCH_SIZE,
44
  per_device_eval_batch_size=BATCH_SIZE,
@@ -46,6 +36,8 @@ def train_model(train_encodings, train_labels, val_encodings, val_labels):
46
  save_strategy="epoch",
47
  logging_dir="logs/",
48
  logging_steps=100,
 
 
49
  )
50
 
51
  trainer = Trainer(
@@ -53,26 +45,29 @@ def train_model(train_encodings, train_labels, val_encodings, val_labels):
53
  args=training_args,
54
  train_dataset=train_dataset,
55
  eval_dataset=val_dataset,
 
56
  )
57
 
58
  logging.info("Starting model training")
59
  trainer.train()
60
- model.save_pretrained(MODEL_PATH)
61
- logging.info(f"Model saved to {MODEL_PATH}")
 
62
  return model, label_map
63
 
64
- def evaluate_model(model, test_encodings, test_labels):
65
  """Evaluate model and log metrics."""
66
  setup_logging()
67
  label_map = {"Electronics": 0, "Household": 1, "Books": 2, "Clothing & Accessories": 3}
68
- test_labels = [label_map[label] for label in test_labels]
69
- test_dataset = EcommerceDataset(test_encodings, test_labels)
70
- trainer = Trainer(model=model)
71
  predictions = trainer.predict(test_dataset).predictions
72
  pred_labels = np.argmax(predictions, axis=1)
 
73
 
74
- report = classification_report(test_labels, pred_labels, target_names=label_map.keys())
 
75
  logging.info(f"Classification Report:\n{report}")
76
- cm = confusion_matrix(test_labels, pred_labels)
77
  logging.info(f"Confusion Matrix:\n{cm}")
78
- return report, cm
 
1
  # src/model.py
2
  from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
3
  from sklearn.metrics import classification_report, confusion_matrix
 
4
  import numpy as np
 
5
  import logging
6
+ from huggingface_hub import login
7
+ from src.config import MODEL_NAME, HF_MODEL_PATH, LOCAL_MODEL_PATH, BATCH_SIZE, EPOCHS, HF_TOKEN, LOG_FILE
8
 
def setup_logging():
    """Configure root logging to write INFO-and-above records to ``LOG_FILE``.

    The parent directory of ``LOG_FILE`` is created first, because
    ``logging.basicConfig`` opens the file immediately and would otherwise
    raise ``FileNotFoundError`` on a fresh checkout. Calling this function
    more than once is harmless: ``basicConfig`` is a no-op once the root
    logger already has handlers.
    """
    import os  # local import: only this function needs it

    log_dir = os.path.dirname(LOG_FILE)
    if log_dir:  # empty when LOG_FILE is a bare filename in the CWD
        os.makedirs(log_dir, exist_ok=True)

    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )
12
 
13
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from a ``Trainer`` evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class for each
            row is the argmax of ``logits`` over the last axis; ``labels``
            holds the integer class ids.

    Returns:
        dict: ``{"accuracy": float, "f1": float}`` where ``f1`` is the
        macro-averaged F1 score over the four product categories.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    class_names = ["Electronics", "Household", "Books", "Clothing & Accessories"]
    # Pin the label ids explicitly: without ``labels=`` the report infers the
    # classes from the data and raises ValueError when a split happens to
    # contain fewer classes than there are ``target_names``.
    report = classification_report(
        labels,
        predictions,
        labels=list(range(len(class_names))),
        target_names=class_names,
        output_dict=True,
        zero_division=0,  # a class with no samples scores 0 instead of warning
    )

    # Computed directly rather than read from the report, whose "accuracy"
    # key is not emitted for every combination of labels and data.
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy, "f1": report["macro avg"]["f1-score"]}
20
 
21
+ def train_model(train_dataset, val_dataset):
22
+ """Fine-tune DistilBERT and push to Hugging Face Hub."""
 
 
 
 
 
 
 
 
23
  setup_logging()
24
+ login(token=HF_TOKEN) # Log in to Hugging Face Hub
25
+ model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=4)
 
26
  label_map = {"Electronics": 0, "Household": 1, "Books": 2, "Clothing & Accessories": 3}
27
+ train_dataset = train_dataset.map(lambda x: {"labels": label_map[x["category"]]})
28
+ val_dataset = val_dataset.map(lambda x: {"labels": label_map[x["category"]]})
 
 
 
29
 
30
  training_args = TrainingArguments(
31
+ output_dir=LOCAL_MODEL_PATH,
32
  num_train_epochs=EPOCHS,
33
  per_device_train_batch_size=BATCH_SIZE,
34
  per_device_eval_batch_size=BATCH_SIZE,
 
36
  save_strategy="epoch",
37
  logging_dir="logs/",
38
  logging_steps=100,
39
+ push_to_hub=True,
40
+ hub_model_id=HF_MODEL_PATH,
41
  )
42
 
43
  trainer = Trainer(
 
45
  args=training_args,
46
  train_dataset=train_dataset,
47
  eval_dataset=val_dataset,
48
+ compute_metrics=compute_metrics,
49
  )
50
 
51
  logging.info("Starting model training")
52
  trainer.train()
53
+ trainer.push_to_hub() # Push model to Hugging Face Hub
54
+ model.save_pretrained(LOCAL_MODEL_PATH)
55
+ logging.info(f"Model saved locally to {LOCAL_MODEL_PATH} and pushed to {HF_MODEL_PATH}")
56
  return model, label_map
57
 
58
def evaluate_model(model, test_dataset):
    """Evaluate ``model`` on ``test_dataset`` and log the resulting metrics.

    Args:
        model: The sequence-classification model to evaluate.
        test_dataset: Dataset whose rows carry a ``"category"`` field holding
            one of the four category names; it is mapped to an integer
            ``"labels"`` field before prediction. An unknown category raises
            ``KeyError``.

    Returns:
        tuple: ``(report, cm, results)`` - the text classification report,
        the confusion matrix, and the metrics dict produced by the trainer
        (keys prefixed with ``eval_``).
    """
    setup_logging()
    label_map = {"Electronics": 0, "Household": 1, "Books": 2, "Clothing & Accessories": 3}
    class_names = list(label_map)
    class_ids = list(label_map.values())

    test_dataset = test_dataset.map(lambda x: {"labels": label_map[x["category"]]})
    trainer = Trainer(model=model, compute_metrics=compute_metrics)

    # One forward pass yields predictions, labels and metrics together;
    # the "eval" prefix keeps the metric keys the same as ``trainer.evaluate``.
    output = trainer.predict(test_dataset, metric_key_prefix="eval")
    results = output.metrics
    pred_labels = np.argmax(output.predictions, axis=1)
    true_labels = output.label_ids

    # Explicit ``labels`` keep the report and matrix at four classes even
    # when the test split is missing one of them.
    report = classification_report(
        true_labels,
        pred_labels,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,
    )
    cm = confusion_matrix(true_labels, pred_labels, labels=class_ids)

    logging.info(f"Classification Report:\n{report}")
    logging.info(f"Confusion Matrix:\n{cm}")
    return report, cm, results