# =========================================================
# BERT PREPROCESSING + TRAINING + ARTIFACT GENERATION
# =========================================================
# NOTE(review): the original file contained two near-identical copies of this
# pipeline pasted into each other, so the data was loaded/cleaned twice, the
# model was built and TRAINED twice, and artifacts were written to two
# different directories ("classification/artifacts" and BASE_DIR/artifacts).
# This version keeps a single deduplicated pipeline using the Windows-safe
# os.path.join paths from the newer copy.
import os
import pickle
import re

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    matthews_corrcoef,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    Trainer,
    TrainingArguments,
)

# ---------------------------------------------------------
# CONFIG (paths built with os.path.join -> Windows safe)
# ---------------------------------------------------------
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = os.path.join(BASE_DIR, "train.csv")
ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
MODEL_DIR = os.path.join(ARTIFACT_DIR, "bert_model")

MAX_LENGTH = 100        # max tokens per example fed to BERT
EPOCHS = 3
BATCH_SIZE = 16
LEARNING_RATE = 2e-5

os.makedirs(ARTIFACT_DIR, exist_ok=True)

# ---------------------------------------------------------
# 1. LOAD DATA
# ---------------------------------------------------------
print(f"šŸ“„ Loading dataset from: {DATA_PATH}")
df = pd.read_csv(DATA_PATH)
df = df[["text", "label"]]
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)


# ---------------------------------------------------------
# 2. CLEAN TEXT (BERT SAFE)
# ---------------------------------------------------------
def clean_text(text):
    """Strip HTML tags and non-ASCII runs, then collapse whitespace.

    Accepts any value (coerced via str()) and returns a cleaned str.
    """
    text = str(text)
    text = re.sub(r"<.*?>", " ", text)          # drop HTML tags
    text = re.sub(r"[^\x00-\x7F]+", " ", text)  # drop non-ASCII characters
    text = re.sub(r"\s+", " ", text).strip()    # collapse whitespace
    return text


df["text"] = df["text"].apply(clean_text)

# ---------------------------------------------------------
# 3. LABEL ENCODING
# ---------------------------------------------------------
label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["label"])
# class name -> integer id, for use at inference time
label_map = dict(
    zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))
)

with open(os.path.join(ARTIFACT_DIR, "label_encoder.pkl"), "wb") as f:
    pickle.dump(label_encoder, f)
with open(os.path.join(ARTIFACT_DIR, "label_map.pkl"), "wb") as f:
    pickle.dump(label_map, f)

NUM_LABELS = len(label_map)
print(f"āœ… Number of classes: {NUM_LABELS}")

# ---------------------------------------------------------
# 4. TRAIN / VAL / TEST SPLIT (70 / 15 / 15, stratified)
# ---------------------------------------------------------
train_df, temp_df = train_test_split(
    df, test_size=0.30, stratify=df["label_id"], random_state=42
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.50, stratify=temp_df["label_id"], random_state=42
)

# Save processed splits for reproducibility / later inspection
train_df.to_csv(os.path.join(ARTIFACT_DIR, "train.csv"), index=False)
val_df.to_csv(os.path.join(ARTIFACT_DIR, "val.csv"), index=False)
test_df.to_csv(os.path.join(ARTIFACT_DIR, "test.csv"), index=False)

# ---------------------------------------------------------
# 5. TOKENIZER
# ---------------------------------------------------------
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
with open(os.path.join(ARTIFACT_DIR, "tokenizer.pkl"), "wb") as f:
    pickle.dump(tokenizer, f)


# ---------------------------------------------------------
# 6. TORCH DATASET
# ---------------------------------------------------------
class GrievanceDataset(Dataset):
    """Text + integer-label dataset, eagerly tokenized at construction."""

    def __init__(self, texts, labels):
        # Tokenize the whole split up front; padding/truncation to MAX_LENGTH
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH,
        )
        self.labels = list(labels)

    def __getitem__(self, idx):
        # Return one example as tensors, in the format Trainer expects
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = GrievanceDataset(train_df["text"], train_df["label_id"])
val_dataset = GrievanceDataset(val_df["text"], val_df["label_id"])
test_dataset = GrievanceDataset(test_df["text"], test_df["label_id"])

# ---------------------------------------------------------
# 7. MODEL
# ---------------------------------------------------------
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=NUM_LABELS
)


# ---------------------------------------------------------
# 8. METRICS
# ---------------------------------------------------------
def compute_metrics(eval_pred):
    """Compute accuracy / balanced accuracy / weighted F1 / MCC from logits."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "balanced_accuracy": balanced_accuracy_score(labels, preds),
        "f1_weighted": f1_score(labels, preds, average="weighted"),
        "mcc": matthews_corrcoef(labels, preds),
    }


# ---------------------------------------------------------
# 9. TRAINING
# ---------------------------------------------------------
training_args = TrainingArguments(
    output_dir=os.path.join(ARTIFACT_DIR, "results"),
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_steps=100,
    save_strategy="no",   # final model is saved explicitly below
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# ---------------------------------------------------------
# 10. FINAL TEST EVALUATION
# ---------------------------------------------------------
predictions = trainer.predict(test_dataset)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)

print("\n===== FINAL TEST METRICS =====")
print(f"Accuracy : {accuracy_score(y_true, y_pred):.4f}")
print(f"Balanced Accuracy : {balanced_accuracy_score(y_true, y_pred):.4f}")
print(f"Weighted F1 : {f1_score(y_true, y_pred, average='weighted'):.4f}")
print(f"MCC : {matthews_corrcoef(y_true, y_pred):.4f}")

# ---------------------------------------------------------
# 11. SAVE TRAINED MODEL
# ---------------------------------------------------------
model.save_pretrained(MODEL_DIR)

print("\nāœ… PREPROCESSING + TRAINING + ARTIFACT GENERATION COMPLETED")