# NOTE(review): the three lines that were here ("Spaces: / Sleeping / Sleeping")
# were Hugging Face Spaces page-status residue from the scrape, not source code.
# =========================================================
# BERT PREPROCESSING + TRAINING + ARTIFACT GENERATION
# =========================================================
import os
import pickle
import re

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    matthews_corrcoef,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    Trainer,
    TrainingArguments,
)
# ---------------------------------------------------------
# CONFIG
# ---------------------------------------------------------
# Anchor every path to the directory containing this script so the run
# does not depend on the current working directory.  The original
# ARTIFACT_DIR ("classification/artifacts") was CWD-relative and
# inconsistent with the BASE_DIR-anchored paths used elsewhere in this
# file.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = os.path.join(BASE_DIR, "train.csv")  # CHANGE if needed
print("📄 Loading dataset from:", DATA_PATH)
ARTIFACT_DIR = os.path.join(BASE_DIR, "classification", "artifacts")
MODEL_DIR = os.path.join(ARTIFACT_DIR, "bert_model")

# Training hyper-parameters.
MAX_LENGTH = 100      # max BERT tokens per example
EPOCHS = 3
BATCH_SIZE = 16
LEARNING_RATE = 2e-5

os.makedirs(ARTIFACT_DIR, exist_ok=True)

# ---------------------------------------------------------
# 1. LOAD DATA
# ---------------------------------------------------------
df = pd.read_csv(DATA_PATH)
df = df[['text', 'label']]        # keep only the columns this script uses
df.dropna(inplace=True)           # drop rows missing text or label
df.drop_duplicates(inplace=True)  # drop exact duplicate rows
# ---------------------------------------------------------
# 2. CLEAN TEXT (BERT SAFE)
# ---------------------------------------------------------
def clean_text(text):
    """Normalize raw text for BERT tokenization.

    Strips HTML-style tags and non-ASCII characters, then collapses
    runs of whitespace to single spaces and trims the ends.  Non-string
    input is stringified first.
    """
    cleaned = str(text)
    # Each pattern is replaced with a single space so that word
    # boundaries survive the removal; the trailing strip tidies the ends.
    for pattern in (r"<.*?>", r"[^\x00-\x7F]+", r"\s+"):
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.strip()
df['text'] = df['text'].apply(clean_text)

# ---------------------------------------------------------
# 3. LABEL ENCODING
# ---------------------------------------------------------
# Fit an integer encoding over the raw label strings and keep a
# class -> id lookup for decoding predictions later.
label_encoder = LabelEncoder()
df['label_id'] = label_encoder.fit_transform(df['label'])
encoded_ids = label_encoder.transform(label_encoder.classes_)
label_map = dict(zip(label_encoder.classes_, encoded_ids))

# Persist the encoder and the mapping as artifacts.
with open(f"{ARTIFACT_DIR}/label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)
with open(f"{ARTIFACT_DIR}/label_map.pkl", "wb") as f:
    pickle.dump(label_map, f)
# =========================================================
# BERT PREPROCESSING + TRAINING + ARTIFACT GENERATION
# =========================================================
# NOTE(review): everything from here down is a second, pasted-in copy of
# the script — re-importing modules is a no-op, but the duplication means
# preprocessing and training run twice; worth consolidating to one copy.
import os
import re
import pickle
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    balanced_accuracy_score,
    matthews_corrcoef,
)
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from torch.utils.data import Dataset
# ---------------------------------------------------------
# PATH CONFIG (WINDOWS SAFE)
# ---------------------------------------------------------
# Paths are anchored to this file's directory so the run is independent
# of the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = os.path.join(BASE_DIR, "train.csv")
ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
MODEL_DIR = os.path.join(ARTIFACT_DIR, "bert_model")

# Training hyper-parameters.
MAX_LENGTH = 100      # max tokens per example
EPOCHS = 3
BATCH_SIZE = 16
LEARNING_RATE = 2e-5

os.makedirs(ARTIFACT_DIR, exist_ok=True)

# ---------------------------------------------------------
# 1. LOAD DATA
# ---------------------------------------------------------
print(f"📄 Loading dataset from: {DATA_PATH}")
# Keep only the two columns used downstream, then drop missing rows and
# exact duplicates in one chained pass.
df = pd.read_csv(DATA_PATH)[['text', 'label']].dropna().drop_duplicates()
# ---------------------------------------------------------
# 2. CLEAN TEXT (BERT SAFE)
# ---------------------------------------------------------
def clean_text(text):
    """Make a value safe for BERT tokenization.

    Removes HTML-style tags and any non-ASCII characters, then collapses
    whitespace runs to single spaces and trims the ends.
    """
    s = str(text)
    s = re.sub(r"<.*?>", " ", s)            # drop HTML-style tags
    s = re.sub(r"[^\x00-\x7F]+", " ", s)    # drop non-ASCII characters
    return re.sub(r"\s+", " ", s).strip()   # normalize whitespace
df["text"] = df["text"].apply(clean_text)

# ---------------------------------------------------------
# 3. LABEL ENCODING
# ---------------------------------------------------------
label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["label"])

# class name -> integer id, persisted for inference-time decoding.
class_ids = label_encoder.transform(label_encoder.classes_)
label_map = dict(zip(label_encoder.classes_, class_ids))

# Save label artifacts.
for artifact_name, artifact in (("label_encoder.pkl", label_encoder),
                                ("label_map.pkl", label_map)):
    with open(os.path.join(ARTIFACT_DIR, artifact_name), "wb") as f:
        pickle.dump(artifact, f)

NUM_LABELS = len(label_map)
print(f"✅ Number of classes: {NUM_LABELS}")
# ---------------------------------------------------------
# 4. TRAIN / VAL / TEST SPLIT  (70 / 15 / 15)
# ---------------------------------------------------------
# First peel off 30% as a temporary hold-out, stratified on the encoded
# label so class ratios are preserved in every split.
train_df, temp_df = train_test_split(
    df, test_size=0.30, stratify=df["label_id"], random_state=42
)
# Halve the hold-out into validation and test.
val_df, test_df = train_test_split(
    temp_df, test_size=0.50, stratify=temp_df["label_id"], random_state=42
)

# Persist the processed splits next to the other artifacts.
for split_name, split_df in (("train.csv", train_df),
                             ("val.csv", val_df),
                             ("test.csv", test_df)):
    split_df.to_csv(os.path.join(ARTIFACT_DIR, split_name), index=False)
# ---------------------------------------------------------
# 5. TOKENIZER
# ---------------------------------------------------------
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Keep the pickle for backward compatibility with any existing loaders,
# but also save in the official Hugging Face format: pickled tokenizers
# can break across transformers versions, save_pretrained output does not.
with open(os.path.join(ARTIFACT_DIR, "tokenizer.pkl"), "wb") as f:
    pickle.dump(tokenizer, f)
tokenizer.save_pretrained(os.path.join(ARTIFACT_DIR, "tokenizer"))
# ---------------------------------------------------------
# 6. TORCH DATASET
# ---------------------------------------------------------
class GrievanceDataset(Dataset):
    """Torch dataset wrapping pre-tokenized texts and integer labels.

    Tokenization happens once, eagerly, in the constructor, using the
    module-level `tokenizer` and MAX_LENGTH.
    """

    def __init__(self, texts, labels):
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH,
        )
        self.labels = list(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One sample = the idx-th slice of every encoding field plus the
        # label, all as tensors (the layout HF Trainer expects).
        sample = {
            field: torch.tensor(values[idx])
            for field, values in self.encodings.items()
        }
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample
# One dataset per split.
train_dataset = GrievanceDataset(train_df["text"], train_df["label_id"])
val_dataset = GrievanceDataset(val_df["text"], val_df["label_id"])
test_dataset = GrievanceDataset(test_df["text"], test_df["label_id"])

# ---------------------------------------------------------
# 7. MODEL
# ---------------------------------------------------------
# Pretrained BERT encoder with a fresh classification head sized to the
# number of encoded classes.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=NUM_LABELS
)
# ---------------------------------------------------------
# 8. METRICS
# ---------------------------------------------------------
def compute_metrics(eval_pred):
    """Evaluation metrics for HF Trainer from a (logits, labels) pair."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    metrics = {}
    metrics["accuracy"] = accuracy_score(labels, preds)
    metrics["balanced_accuracy"] = balanced_accuracy_score(labels, preds)
    metrics["f1_weighted"] = f1_score(labels, preds, average="weighted")
    metrics["mcc"] = matthews_corrcoef(labels, preds)
    return metrics
# ---------------------------------------------------------
# 9. TRAINING
# ---------------------------------------------------------
training_args = TrainingArguments(
    output_dir=os.path.join(ARTIFACT_DIR, "results"),
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_steps=100,
    # Evaluate on the validation set once per epoch.  Without this the
    # eval_dataset/compute_metrics passed to Trainer below were never
    # used during training (default strategy is "no").
    # NOTE(review): renamed to `eval_strategy` in transformers >= 4.41 —
    # adjust if this project runs a newer version.
    evaluation_strategy="epoch",
    save_strategy="no",   # final model is saved explicitly at the end
    report_to="none",     # disable wandb/tensorboard reporting
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()
# ---------------------------------------------------------
# 10. FINAL TEST EVALUATION
# ---------------------------------------------------------
predictions = trainer.predict(test_dataset)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)

# Compute each metric once, then report.
acc = accuracy_score(y_true, y_pred)
bal_acc = balanced_accuracy_score(y_true, y_pred)
f1_weighted = f1_score(y_true, y_pred, average='weighted')
mcc = matthews_corrcoef(y_true, y_pred)

print("\n===== FINAL TEST METRICS =====")
print(f"Accuracy : {acc:.4f}")
print(f"Balanced Accuracy : {bal_acc:.4f}")
print(f"Weighted F1 : {f1_weighted:.4f}")
print(f"MCC : {mcc:.4f}")

# ---------------------------------------------------------
# 11. SAVE TRAINED MODEL
# ---------------------------------------------------------
model.save_pretrained(MODEL_DIR)
print("\n✅ PREPROCESSING + TRAINING COMPLETED SUCCESSFULLY")
NUM_LABELS = len(label_map)

# ---------------------------------------------------------
# 4. TRAIN / VAL / TEST SPLIT
# ---------------------------------------------------------
# 70% train; the remaining 30% is halved into validation and test,
# stratified on the encoded label so class ratios are preserved.
train_df, temp_df = train_test_split(
    df,
    test_size=0.30,
    stratify=df['label_id'],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df['label_id'],
    random_state=42,
)

# Save preprocessed splits.
train_df.to_csv(f"{ARTIFACT_DIR}/train.csv", index=False)
val_df.to_csv(f"{ARTIFACT_DIR}/val.csv", index=False)
test_df.to_csv(f"{ARTIFACT_DIR}/test.csv", index=False)

# ---------------------------------------------------------
# 5. TOKENIZER
# ---------------------------------------------------------
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
with open(f"{ARTIFACT_DIR}/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
# ---------------------------------------------------------
# 6. DATASET CLASS
# ---------------------------------------------------------
class GrievanceDataset(Dataset):
    """Dataset of BERT encodings plus integer class labels."""

    def __init__(self, texts, labels):
        # Tokenize everything up front; padding/truncation to MAX_LENGTH
        # keeps every encoded sequence the same width.
        self.encodings = tokenizer(
            list(texts), truncation=True, padding=True, max_length=MAX_LENGTH
        )
        self.labels = list(labels)

    def __getitem__(self, idx):
        item = {name: torch.tensor(column[idx])
                for name, column in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
# Build one dataset per split.
train_dataset = GrievanceDataset(train_df['text'], train_df['label_id'])
val_dataset = GrievanceDataset(val_df['text'], val_df['label_id'])
test_dataset = GrievanceDataset(test_df['text'], test_df['label_id'])

# ---------------------------------------------------------
# 7. MODEL
# ---------------------------------------------------------
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=NUM_LABELS,
)
# ---------------------------------------------------------
# 8. METRICS
# ---------------------------------------------------------
def compute_metrics(eval_pred):
    """Evaluation metrics for HF Trainer.

    Args:
        eval_pred: (logits, labels) pair as produced by Trainer.

    Returns:
        dict with accuracy, balanced accuracy, weighted F1 and MCC.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "balanced_accuracy": balanced_accuracy_score(labels, preds),
        # Key renamed from "f1" to "f1_weighted" for consistency with the
        # other compute_metrics definition earlier in this file.
        "f1_weighted": f1_score(labels, preds, average="weighted"),
        "mcc": matthews_corrcoef(labels, preds),
    }
# ---------------------------------------------------------
# 9. TRAINING
# ---------------------------------------------------------
training_args = TrainingArguments(
    output_dir=f"{ARTIFACT_DIR}/results",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_steps=100,
    # Evaluate on the validation set once per epoch; previously no
    # evaluation strategy was set, so the eval_dataset below was never
    # evaluated during training (default strategy is "no").
    # NOTE(review): renamed to `eval_strategy` in transformers >= 4.41.
    evaluation_strategy="epoch",
    save_strategy="no",   # final model is saved explicitly at the end
    report_to="none",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # NOTE(review): `tokenizer=` is deprecated in favor of
    # `processing_class=` in newer transformers; kept for compatibility.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()
# ---------------------------------------------------------
# 10. FINAL TEST EVALUATION
# ---------------------------------------------------------
predictions = trainer.predict(test_dataset)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)

# Compute each metric once, then report.
acc = accuracy_score(y_true, y_pred)
bal_acc = balanced_accuracy_score(y_true, y_pred)
f1_weighted = f1_score(y_true, y_pred, average='weighted')
mcc = matthews_corrcoef(y_true, y_pred)

print("\n===== FINAL TEST METRICS =====")
print(f"Accuracy : {acc:.4f}")
print(f"Balanced Accuracy : {bal_acc:.4f}")
print(f"Weighted F1 : {f1_weighted:.4f}")
print(f"MCC : {mcc:.4f}")

# ---------------------------------------------------------
# 11. SAVE TRAINED MODEL
# ---------------------------------------------------------
model.save_pretrained(MODEL_DIR)
print("\n✅ PREPROCESSING + TRAINING + ARTIFACT GENERATION COMPLETED")