MOHAN799S committed on
Commit
6dfb8b7
·
1 Parent(s): cfaf6ec

Load models from HF Hub instead of local paths

Browse files
classification/bert_classify.py CHANGED
@@ -11,8 +11,7 @@ from transformers import BertForSequenceClassification
11
  # ── Path config ───────────────────────────────────────────
12
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
13
  ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
14
- MODEL_DIR = os.path.join(ARTIFACT_DIR, "bert_model")
15
- MAX_LENGTH = 128 # FIX: was 100 — aligned with IG explainer and indic module
16
 
17
  # ── Load artifacts ────────────────────────────────────────
18
  with open(os.path.join(ARTIFACT_DIR, "tokenizer.pkl"), "rb") as f:
@@ -21,8 +20,9 @@ with open(os.path.join(ARTIFACT_DIR, "tokenizer.pkl"), "rb") as f:
21
  with open(os.path.join(ARTIFACT_DIR, "label_encoder.pkl"), "rb") as f:
22
  label_encoder = pickle.load(f)
23
 
 
24
  model = BertForSequenceClassification.from_pretrained(
25
- MODEL_DIR, local_files_only=True
26
  )
27
  model.eval()
28
 
@@ -61,10 +61,6 @@ NON_GRIEVANCE_PHRASES = {
61
  def clean_text(text: str) -> str:
62
  text = str(text)
63
  text = re.sub(r"<.*?>", " ", text)
64
- # FIX: do NOT strip non-ASCII here — this module receives English
65
- # only (language detection in main.py routes correctly), but
66
- # stripping non-ASCII would silently corrupt any mis-routed Indic text.
67
- # Keep only the HTML-strip; whitespace normalisation is sufficient.
68
  text = re.sub(r"\s+", " ", text).strip()
69
  return text
70
 
@@ -88,22 +84,9 @@ def validate_input(text: str):
88
  # ── Predict ───────────────────────────────────────────────
89
  def predict(
90
  text: str,
91
- input_ids=None, # O3: pre-tokenised tensor from main.py
92
- attention_mask=None, # O3: pre-tokenised tensor from main.py
93
  ) -> dict:
94
- """
95
- Predict grievance category for English text.
96
-
97
- Args:
98
- text : Raw input string (always required for validation).
99
- input_ids : Optional pre-tokenised tensor (1, seq_len).
100
- When provided by main.py the internal tokenisation
101
- step is skipped — eliminates duplicate tokenisation.
102
- attention_mask : Required when input_ids is provided.
103
-
104
- Returns dict with keys: status, category, confidence, class_index.
105
- """
106
- # 1. Rule-based validation (always on raw text)
107
  reason = validate_input(text)
108
  if reason:
109
  return {
@@ -114,12 +97,8 @@ def predict(
114
  "class_index": None,
115
  }
116
 
117
- # 2. Clean text for model consumption
118
  cleaned = clean_text(text)
119
 
120
- # 3. O3: use pre-tokenised tensors if supplied; otherwise tokenise now.
121
- # padding=False — single-string inference needs no padding;
122
- # avoids [PAD] tokens appearing in IG attributions.
123
  if input_ids is None:
124
  enc = tokenizer(
125
  cleaned,
@@ -131,16 +110,14 @@ def predict(
131
  input_ids = enc["input_ids"]
132
  attention_mask = enc["attention_mask"]
133
 
134
- # 4. Forward pass
135
  with torch.no_grad():
136
  outputs = model(input_ids=input_ids, attention_mask=attention_mask)
137
 
138
  probs = torch.softmax(outputs.logits, dim=1)
139
  conf, pred = torch.max(probs, dim=1)
140
- confidence = conf.item()
141
  predicted_index = pred.item()
142
 
143
- # 5. Confidence gate
144
  if confidence < 0.30:
145
  return {
146
  "status": "success",
 
11
  # ── Path config ───────────────────────────────────────────
12
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
13
  ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
14
+ MAX_LENGTH = 128
 
15
 
16
  # ── Load artifacts ────────────────────────────────────────
17
  with open(os.path.join(ARTIFACT_DIR, "tokenizer.pkl"), "rb") as f:
 
20
  with open(os.path.join(ARTIFACT_DIR, "label_encoder.pkl"), "rb") as f:
21
  label_encoder = pickle.load(f)
22
 
23
+ # ── Load model from HF Hub ────────────────────────────────
24
  model = BertForSequenceClassification.from_pretrained(
25
+ "mohanbot799s/civicconnect-bert-en"
26
  )
27
  model.eval()
28
 
 
61
  def clean_text(text: str) -> str:
62
  text = str(text)
63
  text = re.sub(r"<.*?>", " ", text)
 
 
 
 
64
  text = re.sub(r"\s+", " ", text).strip()
65
  return text
66
 
 
84
  # ── Predict ───────────────────────────────────────────────
85
  def predict(
86
  text: str,
87
+ input_ids=None,
88
+ attention_mask=None,
89
  ) -> dict:
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  reason = validate_input(text)
91
  if reason:
92
  return {
 
97
  "class_index": None,
98
  }
99
 
 
100
  cleaned = clean_text(text)
101
 
 
 
 
102
  if input_ids is None:
103
  enc = tokenizer(
104
  cleaned,
 
110
  input_ids = enc["input_ids"]
111
  attention_mask = enc["attention_mask"]
112
 
 
113
  with torch.no_grad():
114
  outputs = model(input_ids=input_ids, attention_mask=attention_mask)
115
 
116
  probs = torch.softmax(outputs.logits, dim=1)
117
  conf, pred = torch.max(probs, dim=1)
118
+ confidence = conf.item()
119
  predicted_index = pred.item()
120
 
 
121
  if confidence < 0.30:
122
  return {
123
  "status": "success",
classification/indic_bert_classify.py CHANGED
@@ -11,17 +11,18 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
11
  # ── Path config ───────────────────────────────────────────
12
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
13
  ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
14
- MODEL_DIR = os.path.join(ARTIFACT_DIR, "indicbert_model")
15
  MAX_LENGTH = 128
16
 
17
  # ── Load artifacts ────────────────────────────────────────
18
- tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)
 
19
 
20
  with open(os.path.join(ARTIFACT_DIR, "label_encoder.pkl"), "rb") as f:
21
  label_encoder = pickle.load(f)
22
 
 
23
  model = AutoModelForSequenceClassification.from_pretrained(
24
- MODEL_DIR, local_files_only=True
25
  )
26
  model.eval()
27
 
@@ -29,14 +30,13 @@ model.eval()
29
  LABEL_WORDS = {
30
  "water", "electricity", "roads", "garbage",
31
  "sanitation", "pollution", "transport", "animals",
32
- "पानी", "बिजली", "सड़क", "कचरा",
33
  "నీరు", "విద్యుత్", "రోడ్డు", "చెత్త",
34
  }
35
 
36
  NON_GRIEVANCE_PHRASES = {
37
  "hello", "hi", "good morning", "good evening",
38
  "thank you", "thanks", "all good", "no issues", "test", "demo",
39
- "नमस्ते", "धन्यवाद", "सब ठीक है", "कोई समस्या नहीं",
40
  "నమస్తే", "ధన్యవాదాలు", "అన్నీ బాగున్నాయి", "సమస్య లేదు",
41
  }
42
 
@@ -45,7 +45,6 @@ NON_GRIEVANCE_PHRASES = {
45
  def clean_text(text: str) -> str:
46
  text = str(text)
47
  text = re.sub(r"<.*?>", " ", text)
48
- # Keep Hindi (0900-097F), Telugu (0C00-0C7F), basic ASCII (0020-007F)
49
  text = re.sub(r"[^\u0900-\u097F\u0C00-\u0C7F\u0020-\u007F]", " ", text)
50
  text = re.sub(r"\s+", " ", text).strip()
51
  return text
@@ -70,20 +69,9 @@ def validate_input(text: str):
70
  # ── Predict ───────────────────────────────────────────────
71
  def predict(
72
  text: str,
73
- input_ids=None, # O3: pre-tokenised tensor from main.py
74
- attention_mask=None, # O3: pre-tokenised tensor from main.py
75
  ) -> dict:
76
- """
77
- Predict grievance category for Hindi / Telugu text.
78
-
79
- Args:
80
- text : Raw input string (always required for validation).
81
- input_ids : Optional pre-tokenised tensor (1, seq_len).
82
- attention_mask : Required when input_ids is provided.
83
-
84
- Returns dict with keys: status, category, confidence, class_index.
85
- """
86
- # 1. Rule-based validation
87
  reason = validate_input(text)
88
  if reason:
89
  return {
@@ -94,10 +82,8 @@ def predict(
94
  "class_index": None,
95
  }
96
 
97
- # 2. Clean text
98
  cleaned = clean_text(text)
99
 
100
- # 3. O3: use pre-tokenised tensors if supplied; otherwise tokenise now.
101
  if input_ids is None:
102
  enc = tokenizer(
103
  cleaned,
@@ -109,7 +95,6 @@ def predict(
109
  input_ids = enc["input_ids"]
110
  attention_mask = enc["attention_mask"]
111
 
112
- # 4. Forward pass
113
  with torch.no_grad():
114
  outputs = model(input_ids=input_ids, attention_mask=attention_mask)
115
 
@@ -118,7 +103,6 @@ def predict(
118
  confidence = conf.item()
119
  predicted_index = pred.item()
120
 
121
- # 5. Confidence gate
122
  if confidence < 0.30:
123
  return {
124
  "status": "success",
 
11
  # ── Path config ───────────────────────────────────────────
12
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
13
  ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
 
14
  MAX_LENGTH = 128
15
 
16
  # ── Load artifacts ────────────────────────────────────────
17
+ # ── Load tokenizer from HF Hub ───────────────────────────
18
+ tokenizer = AutoTokenizer.from_pretrained("mohanbot799s/civicconnect-bert-indic")
19
 
20
  with open(os.path.join(ARTIFACT_DIR, "label_encoder.pkl"), "rb") as f:
21
  label_encoder = pickle.load(f)
22
 
23
+ # ── Load model from HF Hub ────────────────────────────────
24
  model = AutoModelForSequenceClassification.from_pretrained(
25
+ "mohanbot799s/civicconnect-bert-indic"
26
  )
27
  model.eval()
28
 
 
30
  LABEL_WORDS = {
31
  "water", "electricity", "roads", "garbage",
32
  "sanitation", "pollution", "transport", "animals",
33
+ "పానీ", "బిజలీ", "సడక", "కచరా",
34
  "నీరు", "విద్యుత్", "రోడ్డు", "చెత్త",
35
  }
36
 
37
  NON_GRIEVANCE_PHRASES = {
38
  "hello", "hi", "good morning", "good evening",
39
  "thank you", "thanks", "all good", "no issues", "test", "demo",
 
40
  "నమస్తే", "ధన్యవాదాలు", "అన్నీ బాగున్నాయి", "సమస్య లేదు",
41
  }
42
 
 
45
  def clean_text(text: str) -> str:
46
  text = str(text)
47
  text = re.sub(r"<.*?>", " ", text)
 
48
  text = re.sub(r"[^\u0900-\u097F\u0C00-\u0C7F\u0020-\u007F]", " ", text)
49
  text = re.sub(r"\s+", " ", text).strip()
50
  return text
 
69
  # ── Predict ───────────────────────────────────────────────
70
  def predict(
71
  text: str,
72
+ input_ids=None,
73
+ attention_mask=None,
74
  ) -> dict:
 
 
 
 
 
 
 
 
 
 
 
75
  reason = validate_input(text)
76
  if reason:
77
  return {
 
82
  "class_index": None,
83
  }
84
 
 
85
  cleaned = clean_text(text)
86
 
 
87
  if input_ids is None:
88
  enc = tokenizer(
89
  cleaned,
 
95
  input_ids = enc["input_ids"]
96
  attention_mask = enc["attention_mask"]
97
 
 
98
  with torch.no_grad():
99
  outputs = model(input_ids=input_ids, attention_mask=attention_mask)
100
 
 
103
  confidence = conf.item()
104
  predicted_index = pred.item()
105
 
 
106
  if confidence < 0.30:
107
  return {
108
  "status": "success",
sentiment_analysis/bert_predict.py CHANGED
@@ -11,8 +11,9 @@ from transformers import BertTokenizer, BertForSequenceClassification
11
  BASE_DIR = os.path.dirname(__file__)
12
  MODEL_DIR = os.path.join(BASE_DIR, "artifacts", "urgency_bert_model")
13
 
14
- tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)
15
- model = BertForSequenceClassification.from_pretrained(MODEL_DIR)
 
16
 
17
  label_encoder = pickle.load(
18
  open(os.path.join(MODEL_DIR, "label_encoder.pkl"), "rb")
@@ -26,20 +27,9 @@ MAX_LENGTH = 128
26
  # ── Predict ───────────────────────────────────────────────
27
  def predict_urgency(
28
  text: str,
29
- input_ids=None, # O3: pre-tokenised tensor from main.py
30
- attention_mask=None, # O3: pre-tokenised tensor from main.py
31
  ) -> dict:
32
- """
33
- Predict urgency level for English grievance text.
34
-
35
- Args:
36
- text : Raw input string.
37
- input_ids : Optional pre-tokenised tensor (1, seq_len).
38
- attention_mask : Required when input_ids is provided.
39
-
40
- Returns dict with keys: urgency, confidence, class_index.
41
- """
42
- # O3: use pre-tokenised tensors if supplied; otherwise tokenise now.
43
  if input_ids is None:
44
  enc = tokenizer(
45
  text,
@@ -72,7 +62,6 @@ def get_model_and_tokenizer():
72
  return model, tokenizer
73
 
74
 
75
- # ── Standalone test ───────────────────────────────────────
76
  if __name__ == "__main__":
77
  print("\nBERT Urgency Prediction Test")
78
  while True:
 
11
  BASE_DIR = os.path.dirname(__file__)
12
  MODEL_DIR = os.path.join(BASE_DIR, "artifacts", "urgency_bert_model")
13
 
14
+ # ── Load tokenizer + model from HF Hub ───────────────────
15
+ tokenizer = BertTokenizer.from_pretrained("mohanbot799s/civicconnect-urgency-en")
16
+ model = BertForSequenceClassification.from_pretrained("mohanbot799s/civicconnect-urgency-en")
17
 
18
  label_encoder = pickle.load(
19
  open(os.path.join(MODEL_DIR, "label_encoder.pkl"), "rb")
 
27
  # ── Predict ───────────────────────────────────────────────
28
  def predict_urgency(
29
  text: str,
30
+ input_ids=None,
31
+ attention_mask=None,
32
  ) -> dict:
 
 
 
 
 
 
 
 
 
 
 
33
  if input_ids is None:
34
  enc = tokenizer(
35
  text,
 
62
  return model, tokenizer
63
 
64
 
 
65
  if __name__ == "__main__":
66
  print("\nBERT Urgency Prediction Test")
67
  while True:
sentiment_analysis/indic_bert_predict.py CHANGED
@@ -12,8 +12,9 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
12
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
13
  MODEL_DIR = os.path.join(BASE_DIR, "artifacts", "indic_urgency_model")
14
 
15
- tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
16
- model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
 
17
  model.eval()
18
 
19
  with open(os.path.join(MODEL_DIR, "label_encoder.pkl"), "rb") as f:
@@ -33,20 +34,9 @@ def clean_text(text: str) -> str:
33
  # ── Predict ───────────────────────────────────────────────
34
  def predict(
35
  text: str,
36
- input_ids=None, # O3: pre-tokenised tensor from main.py
37
- attention_mask=None, # O3: pre-tokenised tensor from main.py
38
  ) -> dict:
39
- """
40
- Predict urgency level for Hindi / Telugu grievance text.
41
-
42
- Args:
43
- text : Raw input string.
44
- input_ids : Optional pre-tokenised tensor (1, seq_len).
45
- attention_mask : Required when input_ids is provided.
46
-
47
- Returns dict with keys: urgency, confidence, class_index.
48
- """
49
- # O3: use pre-tokenised tensors if supplied; otherwise tokenise now.
50
  if input_ids is None:
51
  cleaned = clean_text(text)
52
  enc = tokenizer(
@@ -80,7 +70,6 @@ def get_model_and_tokenizer():
80
  return model, tokenizer
81
 
82
 
83
- # ── Standalone test ───────────────────────────────────────
84
  if __name__ == "__main__":
85
  while True:
86
  text = input("\nEnter Hindi/Telugu grievance (or 'exit'): ")
 
12
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
13
  MODEL_DIR = os.path.join(BASE_DIR, "artifacts", "indic_urgency_model")
14
 
15
+ # ── Load tokenizer + model from HF Hub ───────────────────
16
+ tokenizer = AutoTokenizer.from_pretrained("mohanbot799s/civicconnect-urgency-indic")
17
+ model = AutoModelForSequenceClassification.from_pretrained("mohanbot799s/civicconnect-urgency-indic")
18
  model.eval()
19
 
20
  with open(os.path.join(MODEL_DIR, "label_encoder.pkl"), "rb") as f:
 
34
  # ── Predict ───────────────────────────────────────────────
35
  def predict(
36
  text: str,
37
+ input_ids=None,
38
+ attention_mask=None,
39
  ) -> dict:
 
 
 
 
 
 
 
 
 
 
 
40
  if input_ids is None:
41
  cleaned = clean_text(text)
42
  enc = tokenizer(
 
70
  return model, tokenizer
71
 
72
 
 
73
  if __name__ == "__main__":
74
  while True:
75
  text = input("\nEnter Hindi/Telugu grievance (or 'exit'): ")