# =========================================================
# INDICBERT PREPROCESSING + TRAINING + ARTIFACT GENERATION
# Hindi + Telugu Grievance Classification
# =========================================================
import os
import pickle
import re

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    matthews_corrcoef,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
# =========================================================
# CONFIG
# =========================================================
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = os.path.join(BASE_DIR, "indic_train.csv")
ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
MODEL_DIR = os.path.join(ARTIFACT_DIR, "indicbert_model")

MAX_LENGTH = 128
EPOCHS = 4
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
MODEL_NAME = "ai4bharat/indic-bert"

# Make sure both artifact directories exist before anything is written.
for _dir in (ARTIFACT_DIR, MODEL_DIR):
    os.makedirs(_dir, exist_ok=True)

print(f"📄 Loading dataset from: {DATA_PATH}")

# =========================================================
# LOAD DATA
# =========================================================
# Keep only the two columns the pipeline uses, then drop rows that
# are unusable (missing values) or redundant (exact duplicates).
df = pd.read_csv(DATA_PATH)
df = df[["text", "label"]]
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
# =========================================================
# CLEAN TEXT (KEEP HINDI & TELUGU SAFE)
# =========================================================
def clean_text(text):
    """Normalize one grievance string while preserving Indic script.

    Strips HTML-like ``<...>`` tags, replaces every character outside
    Devanagari (U+0900-U+097F), Telugu (U+0C00-U+0C7F) and printable
    ASCII (U+0020-U+007F) with a space, then collapses whitespace runs
    into single spaces and trims the ends.
    """
    cleaned = re.sub(r"<.*?>", " ", str(text))
    cleaned = re.sub(r"[^\u0900-\u097F\u0C00-\u0C7F\u0020-\u007F]", " ", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()
df["text"] = df["text"].apply(clean_text)

# =========================================================
# LABEL ENCODING
# =========================================================
label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["label"])

# class name -> integer id, in the encoder's (sorted) class order
label_map = {
    cls: enc
    for cls, enc in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}

# Persist both encoder and plain mapping so inference code can decode
# predictions without refitting.
with open(os.path.join(ARTIFACT_DIR, "label_encoder.pkl"), "wb") as f:
    pickle.dump(label_encoder, f)
with open(os.path.join(ARTIFACT_DIR, "label_map.pkl"), "wb") as f:
    pickle.dump(label_map, f)

NUM_LABELS = len(label_map)
print(f"✅ Number of classes: {NUM_LABELS}")
# =========================================================
# TRAIN / VAL / TEST SPLIT (70 / 15 / 15, stratified on label)
# =========================================================
train_df, temp_df = train_test_split(
    df, test_size=0.30, stratify=df["label_id"], random_state=42
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.50, stratify=temp_df["label_id"], random_state=42
)

# Persist the exact splits used for this run so results are reproducible.
for split_frame, split_file in (
    (train_df, "indic_train.csv"),
    (val_df, "indic_val.csv"),
    (test_df, "indic_test.csv"),
):
    split_frame.to_csv(os.path.join(ARTIFACT_DIR, split_file), index=False)

# =========================================================
# TOKENIZER
# =========================================================
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# NOTE(review): pickling a tokenizer is fragile across transformers
# versions; save_pretrained() (done at the end of this script) is the
# supported persistence path — confirm this pickle is actually consumed
# downstream before relying on it.
with open(os.path.join(ARTIFACT_DIR, "indic_tokenizer.pkl"), "wb") as f:
    pickle.dump(tokenizer, f)
# =========================================================
# DATASET CLASS
# =========================================================
class GrievanceDataset(Dataset):
    """Torch dataset of pre-tokenized grievance texts with integer labels.

    Tokenization happens eagerly in ``__init__`` via the module-level
    ``tokenizer`` (truncated/padded to ``MAX_LENGTH``); ``__getitem__``
    materializes one example as tensors in the dict format expected by
    the transformers ``Trainer``.
    """

    def __init__(self, texts, labels):
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH,
        )
        self.labels = list(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {
            name: torch.tensor(field[idx])
            for name, field in self.encodings.items()
        }
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample
# Build one dataset object per split.
train_dataset = GrievanceDataset(train_df["text"], train_df["label_id"])
val_dataset = GrievanceDataset(val_df["text"], val_df["label_id"])
test_dataset = GrievanceDataset(test_df["text"], test_df["label_id"])
# =========================================================
# MODEL
# =========================================================
# Pretrained encoder with a freshly initialized classification head
# sized to the number of labels found in this dataset.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=NUM_LABELS
)
# =========================================================
# METRICS
# =========================================================
def compute_metrics(eval_pred):
    """Return accuracy, balanced accuracy, weighted F1 and MCC for an eval step.

    ``eval_pred`` unpacks into (logits, labels) as supplied by the
    transformers ``Trainer``; predictions are the argmax over the class axis.
    """
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "balanced_accuracy": balanced_accuracy_score(labels, preds),
        "f1_weighted": f1_score(labels, preds, average="weighted"),
        "mcc": matthews_corrcoef(labels, preds),
    }
# =========================================================
# TRAINING
# =========================================================
training_args = TrainingArguments(
    output_dir=f"{ARTIFACT_DIR}/indic_results",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_steps=100,
    # NOTE(review): no evaluation strategy is set, so with the default
    # ("no") the eval_dataset below is never scored during training —
    # confirm whether per-epoch evaluation was intended.
    save_strategy="no",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

print("\n🚀 Training IndicBERT Model...\n")
trainer.train()
# =========================================================
# FINAL TEST EVALUATION
# =========================================================
predictions = trainer.predict(test_dataset)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)

print("\n===== FINAL TEST METRICS =====")
print(f"Accuracy : {accuracy_score(y_true, y_pred):.4f}")
print(f"Balanced Accuracy : {balanced_accuracy_score(y_true, y_pred):.4f}")
print(f"Weighted F1 : {f1_score(y_true, y_pred, average='weighted'):.4f}")
print(f"MCC : {matthews_corrcoef(y_true, y_pred):.4f}")

# =========================================================
# SAVE MODEL
# =========================================================
# save_pretrained() writes config + weights (and tokenizer files) in
# the standard Hugging Face layout, reloadable via from_pretrained().
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

print("\n✅ INDICBERT TRAINING COMPLETED SUCCESSFULLY")