# =========================================================
# BERT MODEL — CATEGORY CLASSIFICATION (ENGLISH)
# =========================================================
import os
import pickle
import re
from typing import Optional

import torch
from transformers import BertForSequenceClassification

# ── Path config ───────────────────────────────────────────
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
MAX_LENGTH = 128

# Predictions whose top-class probability falls below this are
# reported as the fallback category "Other".
CONFIDENCE_THRESHOLD = 0.30

# ── Load artifacts ────────────────────────────────────────
# NOTE(review): pickle.load executes arbitrary code from the file it
# reads — acceptable only because these artifacts are produced locally
# by the training pipeline; never point ARTIFACT_DIR at untrusted data.
with open(os.path.join(ARTIFACT_DIR, "tokenizer.pkl"), "rb") as f:
    tokenizer = pickle.load(f)

with open(os.path.join(ARTIFACT_DIR, "label_encoder.pkl"), "rb") as f:
    label_encoder = pickle.load(f)

# ── Load model from HF Hub ────────────────────────────────
model = BertForSequenceClassification.from_pretrained(
    "mohanbot799s/civicconnect-bert-en"
)
model.eval()  # inference mode: disables dropout etc.

# ── Edge-case constants ───────────────────────────────────
# A message that is nothing but a bare category label carries no
# actionable grievance content and is rejected by validate_input().
LABEL_WORDS = {
    "water",
    "electricity",
    "roads",
    "garbage",
    "sanitation",
    "pollution",
    "transport",
    "animals",
}

# Greetings, pleasantries, tests and other non-grievance chatter that
# should be rejected before ever reaching the model.
NON_GRIEVANCE_PHRASES = {
    "hello", "hi", "hi there", "hey", "hey there",
    "good morning", "good afternoon", "good evening", "good day",
    "greetings", "namaste",
    "how are you", "how are you doing",
    "hope you are doing well", "hope everything is fine",
    "just checking in", "nice to meet you", "long time no see",
    "good weather", "nice weather", "weather is nice", "weather is good",
    "it is a sunny day", "it is raining today", "pleasant weather",
    "cool weather today", "hot weather today", "cold weather today",
    "it is a good day",
    "everything is fine", "all good", "no issues", "no problem",
    "things are okay", "everything looks good", "nothing to complain",
    "all services are working",
    "thank you", "thanks", "thanks a lot", "thank you very much",
    "appreciate it", "appreciate your help",
    "great work", "good job", "well done", "excellent service",
    "for your information", "just informing", "sharing information",
    "today is a holiday", "office opens at 10 am",
    "school reopens next week", "meeting scheduled tomorrow",
    "okay", "ok", "alright", "fine", "cool", "great", "nice",
    "regards", "best regards", "with regards", "kind regards",
    "thank you and regards", "thank you very much sir",
    "test", "testing", "demo", "sample text", "random text",
    "🙂", "👍", "🙏", "😂", "🔥", "!!!", "???",
}

# Pre-compiled patterns for clean_text (hoisted so they are compiled once).
_HTML_TAG_RE = re.compile(r"<.*?>")
_WHITESPACE_RE = re.compile(r"\s+")


# ── Text cleaning ─────────────────────────────────────────
def clean_text(text: str) -> str:
    """Strip HTML-like tags and collapse whitespace runs to single spaces."""
    text = str(text)
    text = _HTML_TAG_RE.sub(" ", text)
    return _WHITESPACE_RE.sub(" ", text).strip()


# ── Input validation ──────────────────────────────────────
def validate_input(text: str) -> Optional[str]:
    """Return a rejection reason for unusable input, or ``None`` if valid.

    Possible reasons: ``"empty_text"``, ``"too_short"`` (< 10 chars),
    ``"too_few_words"`` (< 3 words), ``"label_only"`` (a bare category
    word), ``"non_grievance_text"`` (greeting / pleasantry / test text).
    """
    if not text or not text.strip():
        return "empty_text"

    text_l = text.strip().lower()

    if len(text_l) < 10:
        return "too_short"
    if len(text_l.split()) < 3:
        return "too_few_words"
    if text_l in LABEL_WORDS:
        return "label_only"
    if text_l in NON_GRIEVANCE_PHRASES:
        return "non_grievance_text"
    return None


# ── Predict ───────────────────────────────────────────────
def predict(
    text: str,
    input_ids=None,
    attention_mask=None,
) -> dict:
    """Classify a grievance text into a civic category.

    Parameters
    ----------
    text : str
        Raw user message; validated and cleaned before tokenization.
    input_ids, attention_mask : optional
        Pre-tokenized tensors. When omitted, `text` is tokenized here.
        If only `input_ids` is given, an all-ones mask is synthesized.

    Returns
    -------
    dict with keys ``status``, ``reason``, ``category``, ``confidence``
    and ``class_index``. Low-confidence predictions (below
    ``CONFIDENCE_THRESHOLD``) are mapped to category ``"Other"``.
    """
    reason = validate_input(text)
    if reason:
        return {
            "status": "failed",
            "reason": reason,
            "category": None,
            "confidence": 0.0,
            "class_index": None,
        }

    cleaned = clean_text(text)

    if input_ids is None:
        enc = tokenizer(
            cleaned,
            return_tensors="pt",
            truncation=True,
            padding=False,
            max_length=MAX_LENGTH,
        )
        input_ids = enc["input_ids"]
        attention_mask = enc["attention_mask"]
    elif attention_mask is None:
        # Caller supplied ids without a mask: attend to every token
        # (equivalent to no padding, which matches the tokenizer path).
        attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = torch.softmax(outputs.logits, dim=1)
        conf, pred = torch.max(probs, dim=1)

    confidence = conf.item()
    predicted_index = pred.item()

    if confidence < CONFIDENCE_THRESHOLD:
        return {
            "status": "success",
            "reason": "low_confidence",
            "category": "Other",
            "confidence": round(confidence, 4),
            "class_index": predicted_index,
        }

    label = label_encoder.inverse_transform([predicted_index])[0]
    return {
        "status": "success",
        "reason": None,  # present so every response shares one schema
        "category": label,
        "confidence": round(confidence, 4),
        "class_index": predicted_index,
    }


def get_model_and_tokenizer():
    """Expose the module-level model and tokenizer to callers."""
    return model, tokenizer