# civicconnect-ai-engine/classification/indic_bert_model.py
# =========================================================
# INDICBERT PREPROCESSING + TRAINING + ARTIFACT GENERATION
# Hindi + Telugu Grievance Classification
# =========================================================
import os
import re
import pickle
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    balanced_accuracy_score,
    matthews_corrcoef
)
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from torch.utils.data import Dataset
# =========================================================
# CONFIG
# =========================================================
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = os.path.join(BASE_DIR, "indic_train.csv")
ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
MODEL_DIR = os.path.join(ARTIFACT_DIR, "indicbert_model")
MAX_LENGTH = 128
EPOCHS = 4
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
MODEL_NAME = "ai4bharat/indic-bert"
os.makedirs(ARTIFACT_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)
print(f"📄 Loading dataset from: {DATA_PATH}")
# =========================================================
# LOAD DATA
# =========================================================
df = pd.read_csv(DATA_PATH)
# Keep only the columns the classifier needs, then drop empty rows and
# exact duplicates. Chaining avoids in-place edits on a column slice.
df = df[["text", "label"]].dropna().drop_duplicates()
# =========================================================
# CLEAN TEXT (KEEP HINDI & TELUGU SAFE)
# =========================================================
def clean_text(text):
    text = str(text)
    # Remove HTML tags
    text = re.sub(r"<.*?>", " ", text)
    # Drop everything outside Devanagari (U+0900-097F), Telugu (U+0C00-0C7F)
    # and printable ASCII, so Hindi and Telugu text survives the cleanup
    text = re.sub(r"[^\u0900-\u097F\u0C00-\u0C7F\u0020-\u007F]", " ", text)
    # Collapse runs of whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text
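# Illustrative behaviour on a hypothetical input: markup and emoji are
# stripped, Indic script survives, whitespace is collapsed:
#   clean_text("<p>सड़क   पर गड्ढे हैं 🚧</p>")  ->  "सड़क पर गड्ढे हैं"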
df["text"] = df["text"].apply(clean_text)
# =========================================================
# LABEL ENCODING
# =========================================================
label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["label"])
label_map = dict(zip(
    label_encoder.classes_,
    label_encoder.transform(label_encoder.classes_)
))
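# label_map pairs each class name with its integer id, e.g. (category names
# are dataset-specific; these are assumed examples only):
#   {"roads": 0, "sanitation": 1, "water_supply": 2}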
# SAVE LABEL ARTIFACTS
with open(os.path.join(ARTIFACT_DIR, "label_encoder.pkl"), "wb") as f:
    pickle.dump(label_encoder, f)
with open(os.path.join(ARTIFACT_DIR, "label_map.pkl"), "wb") as f:
    pickle.dump(label_map, f)
NUM_LABELS = len(label_map)
print(f"✅ Number of classes: {NUM_LABELS}")
# =========================================================
# TRAIN / VAL / TEST SPLIT
# =========================================================
# 70/15/15 split: hold out 30%, then halve it into validation and test.
# Stratifying on label_id preserves class proportions in every split.
train_df, temp_df = train_test_split(
    df,
    test_size=0.30,
    stratify=df["label_id"],
    random_state=42
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df["label_id"],
    random_state=42
)
# SAVE SPLITS
train_df.to_csv(os.path.join(ARTIFACT_DIR, "indic_train.csv"), index=False)
val_df.to_csv(os.path.join(ARTIFACT_DIR, "indic_val.csv"), index=False)
test_df.to_csv(os.path.join(ARTIFACT_DIR, "indic_test.csv"), index=False)
# =========================================================
# TOKENIZER
# =========================================================
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Pickling the tokenizer is a convenience for downstream loading; the
# canonical copy is written with tokenizer.save_pretrained(MODEL_DIR) below.
with open(os.path.join(ARTIFACT_DIR, "indic_tokenizer.pkl"), "wb") as f:
    pickle.dump(tokenizer, f)
# =========================================================
# DATASET CLASS
# =========================================================
class GrievanceDataset(Dataset):
    def __init__(self, texts, labels):
        # Tokenize the whole split up front: truncation caps sequences at
        # MAX_LENGTH tokens, padding=True pads to the longest sequence seen.
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH
        )
        self.labels = list(labels)

    def __getitem__(self, idx):
        item = {
            key: torch.tensor(val[idx])
            for key, val in self.encodings.items()
        }
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
train_dataset = GrievanceDataset(train_df["text"], train_df["label_id"])
val_dataset = GrievanceDataset(val_df["text"], val_df["label_id"])
test_dataset = GrievanceDataset(test_df["text"], test_df["label_id"])
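# Optional sanity check: each item is a dict of tensors; for this tokenizer
# the keys are typically input_ids, token_type_ids and attention_mask,
# plus the labels entry added in __getitem__.
# sample = train_dataset[0]
# print({k: v.shape for k, v in sample.items()})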
# =========================================================
# MODEL
# =========================================================
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS
)
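# Note: from_pretrained attaches a freshly initialised classification head to
# the pretrained encoder, so the "some weights ... newly initialized" warning
# transformers prints at this point is expected and harmless.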
# =========================================================
# METRICS
# =========================================================
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "balanced_accuracy": balanced_accuracy_score(labels, preds),
        "f1_weighted": f1_score(labels, preds, average="weighted"),
        "mcc": matthews_corrcoef(labels, preds)
    }
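# Balanced accuracy and MCC are reported alongside accuracy and weighted F1
# because grievance categories are rarely balanced; plain accuracy can look
# good even when minority classes are misclassified.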
# =========================================================
# TRAINING
# =========================================================
training_args = TrainingArguments(
    output_dir=f"{ARTIFACT_DIR}/indic_results",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_steps=100,
    # Evaluate on the validation set once per epoch; without this the
    # eval_dataset and compute_metrics passed to Trainer are never used
    # during training. (Spelled eval_strategy in transformers >= 4.46.)
    evaluation_strategy="epoch",
    save_strategy="no",
    report_to="none"
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)
print("\n🚀 Training IndicBERT Model...\n")
trainer.train()
# =========================================================
# FINAL TEST EVALUATION
# =========================================================
predictions = trainer.predict(test_dataset)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)
print("\n===== FINAL TEST METRICS =====")
print(f"Accuracy : {accuracy_score(y_true, y_pred):.4f}")
print(f"Balanced Accuracy : {balanced_accuracy_score(y_true, y_pred):.4f}")
print(f"Weighted F1 : {f1_score(y_true, y_pred, average='weighted'):.4f}")
print(f"MCC : {matthews_corrcoef(y_true, y_pred):.4f}")
# =========================================================
# SAVE MODEL
# =========================================================
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)
print("\n✅ INDICBERT TRAINING COMPLETED SUCCESSFULLY")