# =========================================================
# INDICBERT PREPROCESSING + TRAINING + ARTIFACT GENERATION
# Hindi + Telugu Grievance Classification
# =========================================================
import os
import pickle
import re

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    matthews_corrcoef,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
# =========================================================
# CONFIG
# =========================================================
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = os.path.join(BASE_DIR, "indic_train.csv")
ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
MODEL_DIR = os.path.join(ARTIFACT_DIR, "indicbert_model")

MAX_LENGTH = 128
EPOCHS = 4
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
MODEL_NAME = "ai4bharat/indic-bert"

# Make sure both artifact directories exist before anything is written.
for _dir in (ARTIFACT_DIR, MODEL_DIR):
    os.makedirs(_dir, exist_ok=True)

print(f"📄 Loading dataset from: {DATA_PATH}")

# =========================================================
# LOAD DATA
# =========================================================
# Keep only the two columns the pipeline uses, then drop rows that
# are unusable (missing values) or redundant (exact duplicates).
df = pd.read_csv(DATA_PATH)
df = df[["text", "label"]]
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
# =========================================================
# CLEAN TEXT (KEEP HINDI & TELUGU SAFE)
# =========================================================
def clean_text(text):
    """Normalize one grievance string while preserving Indic script.

    Strips HTML-like ``<...>`` tags, replaces every character outside
    Devanagari (U+0900-U+097F), Telugu (U+0C00-U+0C7F) and printable
    ASCII (U+0020-U+007F) with a space, then collapses whitespace runs
    into single spaces and trims the ends.
    """
    cleaned = re.sub(r"<.*?>", " ", str(text))
    cleaned = re.sub(r"[^\u0900-\u097F\u0C00-\u0C7F\u0020-\u007F]", " ", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()
df["text"] = df["text"].apply(clean_text)

# =========================================================
# LABEL ENCODING
# =========================================================
label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["label"])

# class name -> integer id, in the encoder's (sorted) class order
label_map = {
    cls: enc
    for cls, enc in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}

# Persist both encoder and plain mapping so inference code can decode
# predictions without refitting.
with open(os.path.join(ARTIFACT_DIR, "label_encoder.pkl"), "wb") as f:
    pickle.dump(label_encoder, f)
with open(os.path.join(ARTIFACT_DIR, "label_map.pkl"), "wb") as f:
    pickle.dump(label_map, f)

NUM_LABELS = len(label_map)
print(f"✅ Number of classes: {NUM_LABELS}")
# =========================================================
# TRAIN / VAL / TEST SPLIT (70 / 15 / 15, stratified on label)
# =========================================================
train_df, temp_df = train_test_split(
    df, test_size=0.30, stratify=df["label_id"], random_state=42
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.50, stratify=temp_df["label_id"], random_state=42
)

# Persist the exact splits used for this run so results are reproducible.
for split_frame, split_file in (
    (train_df, "indic_train.csv"),
    (val_df, "indic_val.csv"),
    (test_df, "indic_test.csv"),
):
    split_frame.to_csv(os.path.join(ARTIFACT_DIR, split_file), index=False)

# =========================================================
# TOKENIZER
# =========================================================
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# NOTE(review): pickling a tokenizer is fragile across transformers
# versions; save_pretrained() (done at the end of this script) is the
# supported persistence path — confirm this pickle is actually consumed
# downstream before relying on it.
with open(os.path.join(ARTIFACT_DIR, "indic_tokenizer.pkl"), "wb") as f:
    pickle.dump(tokenizer, f)
# =========================================================
# DATASET CLASS
# =========================================================
class GrievanceDataset(Dataset):
    """Torch dataset of pre-tokenized grievance texts with integer labels.

    Tokenization happens eagerly in ``__init__`` via the module-level
    ``tokenizer`` (truncated/padded to ``MAX_LENGTH``); ``__getitem__``
    materializes one example as tensors in the dict format expected by
    the transformers ``Trainer``.
    """

    def __init__(self, texts, labels):
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH,
        )
        self.labels = list(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {
            name: torch.tensor(field[idx])
            for name, field in self.encodings.items()
        }
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample
# Build one dataset object per split.
train_dataset = GrievanceDataset(train_df["text"], train_df["label_id"])
val_dataset = GrievanceDataset(val_df["text"], val_df["label_id"])
test_dataset = GrievanceDataset(test_df["text"], test_df["label_id"])
# =========================================================
# MODEL
# =========================================================
# Pretrained encoder with a freshly initialized classification head
# sized to the number of labels found in this dataset.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=NUM_LABELS
)
# =========================================================
# METRICS
# =========================================================
def compute_metrics(eval_pred):
    """Return accuracy, balanced accuracy, weighted F1 and MCC for an eval step.

    ``eval_pred`` unpacks into (logits, labels) as supplied by the
    transformers ``Trainer``; predictions are the argmax over the class axis.
    """
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "balanced_accuracy": balanced_accuracy_score(labels, preds),
        "f1_weighted": f1_score(labels, preds, average="weighted"),
        "mcc": matthews_corrcoef(labels, preds),
    }
# =========================================================
# TRAINING
# =========================================================
training_args = TrainingArguments(
    output_dir=f"{ARTIFACT_DIR}/indic_results",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_steps=100,
    # NOTE(review): no evaluation strategy is set, so with the default
    # ("no") the eval_dataset below is never scored during training —
    # confirm whether per-epoch evaluation was intended.
    save_strategy="no",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

print("\n🚀 Training IndicBERT Model...\n")
trainer.train()
# =========================================================
# FINAL TEST EVALUATION
# =========================================================
predictions = trainer.predict(test_dataset)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)

print("\n===== FINAL TEST METRICS =====")
print(f"Accuracy : {accuracy_score(y_true, y_pred):.4f}")
print(f"Balanced Accuracy : {balanced_accuracy_score(y_true, y_pred):.4f}")
print(f"Weighted F1 : {f1_score(y_true, y_pred, average='weighted'):.4f}")
print(f"MCC : {matthews_corrcoef(y_true, y_pred):.4f}")

# =========================================================
# SAVE MODEL
# =========================================================
# save_pretrained() writes config + weights (and tokenizer files) in
# the standard Hugging Face layout, reloadable via from_pretrained().
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

print("\n✅ INDICBERT TRAINING COMPLETED SUCCESSFULLY")