MOHAN799S committed on
Commit
6dfb8b7
·
1 Parent(s): cfaf6ec

Load models from HF Hub instead of local paths

Browse files
classification/bert_classify.py CHANGED
@@ -11,8 +11,7 @@ from transformers import BertForSequenceClassification
11
  # ── Path config ───────────────────────────────────────────
12
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
13
  ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
14
- MODEL_DIR = os.path.join(ARTIFACT_DIR, "bert_model")
15
- MAX_LENGTH = 128 # FIX: was 100 — aligned with IG explainer and indic module
16
 
17
  # ── Load artifacts ────────────────────────────────────────
18
  with open(os.path.join(ARTIFACT_DIR, "tokenizer.pkl"), "rb") as f:
@@ -21,8 +20,9 @@ with open(os.path.join(ARTIFACT_DIR, "tokenizer.pkl"), "rb") as f:
21
  with open(os.path.join(ARTIFACT_DIR, "label_encoder.pkl"), "rb") as f:
22
  label_encoder = pickle.load(f)
23
 
 
24
  model = BertForSequenceClassification.from_pretrained(
25
- MODEL_DIR, local_files_only=True
26
  )
27
  model.eval()
28
 
@@ -61,10 +61,6 @@ NON_GRIEVANCE_PHRASES = {
61
  def clean_text(text: str) -> str:
62
  text = str(text)
63
  text = re.sub(r"<.*?>", " ", text)
64
- # FIX: do NOT strip non-ASCII here — this module receives English
65
- # only (language detection in main.py routes correctly), but
66
- # stripping non-ASCII would silently corrupt any mis-routed Indic text.
67
- # Keep only the HTML-strip; whitespace normalisation is sufficient.
68
  text = re.sub(r"\s+", " ", text).strip()
69
  return text
70
 
@@ -88,22 +84,9 @@ def validate_input(text: str):
88
  # ── Predict ───────────────────────────────────────────────
89
  def predict(
90
  text: str,
91
- input_ids=None, # O3: pre-tokenised tensor from main.py
92
- attention_mask=None, # O3: pre-tokenised tensor from main.py
93
  ) -> dict:
94
- """
95
- Predict grievance category for English text.
96
-
97
- Args:
98
- text : Raw input string (always required for validation).
99
- input_ids : Optional pre-tokenised tensor (1, seq_len).
100
- When provided by main.py the internal tokenisation
101
- step is skipped — eliminates duplicate tokenisation.
102
- attention_mask : Required when input_ids is provided.
103
-
104
- Returns dict with keys: status, category, confidence, class_index.
105
- """
106
- # 1. Rule-based validation (always on raw text)
107
  reason = validate_input(text)
108
  if reason:
109
  return {
@@ -114,12 +97,8 @@ def predict(
114
  "class_index": None,
115
  }
116
 
117
- # 2. Clean text for model consumption
118
  cleaned = clean_text(text)
119
 
120
- # 3. O3: use pre-tokenised tensors if supplied; otherwise tokenise now.
121
- # padding=False — single-string inference needs no padding;
122
- # avoids [PAD] tokens appearing in IG attributions.
123
  if input_ids is None:
124
  enc = tokenizer(
125
  cleaned,
@@ -131,16 +110,14 @@ def predict(
131
  input_ids = enc["input_ids"]
132
  attention_mask = enc["attention_mask"]
133
 
134
- # 4. Forward pass
135
  with torch.no_grad():
136
  outputs = model(input_ids=input_ids, attention_mask=attention_mask)
137
 
138
  probs = torch.softmax(outputs.logits, dim=1)
139
  conf, pred = torch.max(probs, dim=1)
140
- confidence = conf.item()
141
  predicted_index = pred.item()
142
 
143
- # 5. Confidence gate
144
  if confidence < 0.30:
145
  return {
146
  "status": "success",
 
11
  # ── Path config ───────────────────────────────────────────
12
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
13
  ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
14
+ MAX_LENGTH = 128
 
15
 
16
  # ── Load artifacts ────────────────────────────────────────
17
  with open(os.path.join(ARTIFACT_DIR, "tokenizer.pkl"), "rb") as f:
 
20
  with open(os.path.join(ARTIFACT_DIR, "label_encoder.pkl"), "rb") as f:
21
  label_encoder = pickle.load(f)
22
 
23
+ # ── Load model from HF Hub ────────────────────────────────
24
  model = BertForSequenceClassification.from_pretrained(
25
+ "mohanbot799s/civicconnect-bert-en"
26
  )
27
  model.eval()
28
 
 
61
  def clean_text(text: str) -> str:
62
  text = str(text)
63
  text = re.sub(r"<.*?>", " ", text)
 
 
 
 
64
  text = re.sub(r"\s+", " ", text).strip()
65
  return text
66
 
 
84
  # ── Predict ───────────────────────────────────────────────
85
  def predict(
86
  text: str,
87
+ input_ids=None,
88
+ attention_mask=None,
89
  ) -> dict:
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  reason = validate_input(text)
91
  if reason:
92
  return {
 
97
  "class_index": None,
98
  }
99
 
 
100
  cleaned = clean_text(text)
101
 
 
 
 
102
  if input_ids is None:
103
  enc = tokenizer(
104
  cleaned,
 
110
  input_ids = enc["input_ids"]
111
  attention_mask = enc["attention_mask"]
112
 
 
113
  with torch.no_grad():
114
  outputs = model(input_ids=input_ids, attention_mask=attention_mask)
115
 
116
  probs = torch.softmax(outputs.logits, dim=1)
117
  conf, pred = torch.max(probs, dim=1)
118
+ confidence = conf.item()
119
  predicted_index = pred.item()
120
 
 
121
  if confidence < 0.30:
122
  return {
123
  "status": "success",
classification/indic_bert_classify.py CHANGED
@@ -11,17 +11,18 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
11
  # ── Path config ───────────────────────────────────────────
12
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
13
  ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
14
- MODEL_DIR = os.path.join(ARTIFACT_DIR, "indicbert_model")
15
  MAX_LENGTH = 128
16
 
17
  # ── Load artifacts ────────────────────────────────────────
18
- tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)
 
19
 
20
  with open(os.path.join(ARTIFACT_DIR, "label_encoder.pkl"), "rb") as f:
21
  label_encoder = pickle.load(f)
22
 
 
23
  model = AutoModelForSequenceClassification.from_pretrained(
24
- MODEL_DIR, local_files_only=True
25
  )
26
  model.eval()
27
 
@@ -29,14 +30,13 @@ model.eval()
29
  LABEL_WORDS = {
30
  "water", "electricity", "roads", "garbage",
31
  "sanitation", "pollution", "transport", "animals",
32
- "पानी", "बिजली", "सड़क", "कचरा",
33
  "నీరు", "విద్యుత్", "రోడ్డు", "చెత్త",
34
  }
35
 
36
  NON_GRIEVANCE_PHRASES = {
37
  "hello", "hi", "good morning", "good evening",
38
  "thank you", "thanks", "all good", "no issues", "test", "demo",
39
- "नमस्ते", "धन्यवाद", "सब ठीक है", "कोई समस्या नहीं",
40
  "నమస్తే", "ధన్యవాదాలు", "అన్నీ బాగున్నాయి", "సమస్య లేదు",
41
  }
42
 
@@ -45,7 +45,6 @@ NON_GRIEVANCE_PHRASES = {
45
  def clean_text(text: str) -> str:
46
  text = str(text)
47
  text = re.sub(r"<.*?>", " ", text)
48
- # Keep Hindi (0900-097F), Telugu (0C00-0C7F), basic ASCII (0020-007F)
49
  text = re.sub(r"[^\u0900-\u097F\u0C00-\u0C7F\u0020-\u007F]", " ", text)
50
  text = re.sub(r"\s+", " ", text).strip()
51
  return text
@@ -70,20 +69,9 @@ def validate_input(text: str):
70
  # ── Predict ───────────────────────────────────────────────
71
  def predict(
72
  text: str,
73
- input_ids=None, # O3: pre-tokenised tensor from main.py
74
- attention_mask=None, # O3: pre-tokenised tensor from main.py
75
  ) -> dict:
76
- """
77
- Predict grievance category for Hindi / Telugu text.
78
-
79
- Args:
80
- text : Raw input string (always required for validation).
81
- input_ids : Optional pre-tokenised tensor (1, seq_len).
82
- attention_mask : Required when input_ids is provided.
83
-
84
- Returns dict with keys: status, category, confidence, class_index.
85
- """
86
- # 1. Rule-based validation
87
  reason = validate_input(text)
88
  if reason:
89
  return {
@@ -94,10 +82,8 @@ def predict(
94
  "class_index": None,
95
  }
96
 
97
- # 2. Clean text
98
  cleaned = clean_text(text)
99
 
100
- # 3. O3: use pre-tokenised tensors if supplied; otherwise tokenise now.
101
  if input_ids is None:
102
  enc = tokenizer(
103
  cleaned,
@@ -109,7 +95,6 @@ def predict(
109
  input_ids = enc["input_ids"]
110
  attention_mask = enc["attention_mask"]
111
 
112
- # 4. Forward pass
113
  with torch.no_grad():
114
  outputs = model(input_ids=input_ids, attention_mask=attention_mask)
115
 
@@ -118,7 +103,6 @@ def predict(
118
  confidence = conf.item()
119
  predicted_index = pred.item()
120
 
121
- # 5. Confidence gate
122
  if confidence < 0.30:
123
  return {
124
  "status": "success",
 
11
  # ── Path config ───────────────────────────────────────────
12
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
13
  ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
 
14
  MAX_LENGTH = 128
15
 
16
  # ── Load artifacts ────────────────────────────────────────
17
+ # ── Load tokenizer from HF Hub ───────────────────────────
18
+ tokenizer = AutoTokenizer.from_pretrained("mohanbot799s/civicconnect-bert-indic")
19
 
20
  with open(os.path.join(ARTIFACT_DIR, "label_encoder.pkl"), "rb") as f:
21
  label_encoder = pickle.load(f)
22
 
23
+ # ── Load model from HF Hub ────────────────────────────────
24
  model = AutoModelForSequenceClassification.from_pretrained(
25
+ "mohanbot799s/civicconnect-bert-indic"
26
  )
27
  model.eval()
28
 
 
30
  LABEL_WORDS = {
31
  "water", "electricity", "roads", "garbage",
32
  "sanitation", "pollution", "transport", "animals",
33
+ "పానీ", "బిజలీ", "సడక", "కచరా",
34
  "నీరు", "విద్యుత్", "రోడ్డు", "చెత్త",
35
  }
36
 
37
  NON_GRIEVANCE_PHRASES = {
38
  "hello", "hi", "good morning", "good evening",
39
  "thank you", "thanks", "all good", "no issues", "test", "demo",
 
40
  "నమస్తే", "ధన్యవాదాలు", "అన్నీ బాగున్నాయి", "సమస్య లేదు",
41
  }
42
 
 
45
  def clean_text(text: str) -> str:
46
  text = str(text)
47
  text = re.sub(r"<.*?>", " ", text)
 
48
  text = re.sub(r"[^\u0900-\u097F\u0C00-\u0C7F\u0020-\u007F]", " ", text)
49
  text = re.sub(r"\s+", " ", text).strip()
50
  return text
 
69
  # ── Predict ───────────────────────────────────────────────
70
  def predict(
71
  text: str,
72
+ input_ids=None,
73
+ attention_mask=None,
74
  ) -> dict:
 
 
 
 
 
 
 
 
 
 
 
75
  reason = validate_input(text)
76
  if reason:
77
  return {
 
82
  "class_index": None,
83
  }
84
 
 
85
  cleaned = clean_text(text)
86
 
 
87
  if input_ids is None:
88
  enc = tokenizer(
89
  cleaned,
 
95
  input_ids = enc["input_ids"]
96
  attention_mask = enc["attention_mask"]
97
 
 
98
  with torch.no_grad():
99
  outputs = model(input_ids=input_ids, attention_mask=attention_mask)
100
 
 
103
  confidence = conf.item()
104
  predicted_index = pred.item()
105
 
 
106
  if confidence < 0.30:
107
  return {
108
  "status": "success",
sentiment_analysis/bert_predict.py CHANGED
@@ -11,8 +11,9 @@ from transformers import BertTokenizer, BertForSequenceClassification
11
  BASE_DIR = os.path.dirname(__file__)
12
  MODEL_DIR = os.path.join(BASE_DIR, "artifacts", "urgency_bert_model")
13
 
14
- tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)
15
- model = BertForSequenceClassification.from_pretrained(MODEL_DIR)
 
16
 
17
  label_encoder = pickle.load(
18
  open(os.path.join(MODEL_DIR, "label_encoder.pkl"), "rb")
@@ -26,20 +27,9 @@ MAX_LENGTH = 128
26
  # ── Predict ───────────────────────────────────────────────
27
  def predict_urgency(
28
  text: str,
29
- input_ids=None, # O3: pre-tokenised tensor from main.py
30
- attention_mask=None, # O3: pre-tokenised tensor from main.py
31
  ) -> dict:
32
- """
33
- Predict urgency level for English grievance text.
34
-
35
- Args:
36
- text : Raw input string.
37
- input_ids : Optional pre-tokenised tensor (1, seq_len).
38
- attention_mask : Required when input_ids is provided.
39
-
40
- Returns dict with keys: urgency, confidence, class_index.
41
- """
42
- # O3: use pre-tokenised tensors if supplied; otherwise tokenise now.
43
  if input_ids is None:
44
  enc = tokenizer(
45
  text,
@@ -72,7 +62,6 @@ def get_model_and_tokenizer():
72
  return model, tokenizer
73
 
74
 
75
- # ── Standalone test ───────────────────────────────────────
76
  if __name__ == "__main__":
77
  print("\nBERT Urgency Prediction Test")
78
  while True:
 
11
  BASE_DIR = os.path.dirname(__file__)
12
  MODEL_DIR = os.path.join(BASE_DIR, "artifacts", "urgency_bert_model")
13
 
14
+ # ── Load tokenizer + model from HF Hub ───────────────────
15
+ tokenizer = BertTokenizer.from_pretrained("mohanbot799s/civicconnect-urgency-en")
16
+ model = BertForSequenceClassification.from_pretrained("mohanbot799s/civicconnect-urgency-en")
17
 
18
  label_encoder = pickle.load(
19
  open(os.path.join(MODEL_DIR, "label_encoder.pkl"), "rb")
 
27
  # ── Predict ───────────────────────────────────────────────
28
  def predict_urgency(
29
  text: str,
30
+ input_ids=None,
31
+ attention_mask=None,
32
  ) -> dict:
 
 
 
 
 
 
 
 
 
 
 
33
  if input_ids is None:
34
  enc = tokenizer(
35
  text,
 
62
  return model, tokenizer
63
 
64
 
 
65
  if __name__ == "__main__":
66
  print("\nBERT Urgency Prediction Test")
67
  while True:
sentiment_analysis/indic_bert_predict.py CHANGED
@@ -12,8 +12,9 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
12
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
13
  MODEL_DIR = os.path.join(BASE_DIR, "artifacts", "indic_urgency_model")
14
 
15
- tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
16
- model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
 
17
  model.eval()
18
 
19
  with open(os.path.join(MODEL_DIR, "label_encoder.pkl"), "rb") as f:
@@ -33,20 +34,9 @@ def clean_text(text: str) -> str:
33
  # ── Predict ───────────────────────────────────────────────
34
  def predict(
35
  text: str,
36
- input_ids=None, # O3: pre-tokenised tensor from main.py
37
- attention_mask=None, # O3: pre-tokenised tensor from main.py
38
  ) -> dict:
39
- """
40
- Predict urgency level for Hindi / Telugu grievance text.
41
-
42
- Args:
43
- text : Raw input string.
44
- input_ids : Optional pre-tokenised tensor (1, seq_len).
45
- attention_mask : Required when input_ids is provided.
46
-
47
- Returns dict with keys: urgency, confidence, class_index.
48
- """
49
- # O3: use pre-tokenised tensors if supplied; otherwise tokenise now.
50
  if input_ids is None:
51
  cleaned = clean_text(text)
52
  enc = tokenizer(
@@ -80,7 +70,6 @@ def get_model_and_tokenizer():
80
  return model, tokenizer
81
 
82
 
83
- # ── Standalone test ───────────────────────────────────────
84
  if __name__ == "__main__":
85
  while True:
86
  text = input("\nEnter Hindi/Telugu grievance (or 'exit'): ")
 
12
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
13
  MODEL_DIR = os.path.join(BASE_DIR, "artifacts", "indic_urgency_model")
14
 
15
+ # ── Load tokenizer + model from HF Hub ───────────────────
16
+ tokenizer = AutoTokenizer.from_pretrained("mohanbot799s/civicconnect-urgency-indic")
17
+ model = AutoModelForSequenceClassification.from_pretrained("mohanbot799s/civicconnect-urgency-indic")
18
  model.eval()
19
 
20
  with open(os.path.join(MODEL_DIR, "label_encoder.pkl"), "rb") as f:
 
34
  # ── Predict ───────────────────────────────────────────────
35
  def predict(
36
  text: str,
37
+ input_ids=None,
38
+ attention_mask=None,
39
  ) -> dict:
 
 
 
 
 
 
 
 
 
 
 
40
  if input_ids is None:
41
  cleaned = clean_text(text)
42
  enc = tokenizer(
 
70
  return model, tokenizer
71
 
72
 
 
73
  if __name__ == "__main__":
74
  while True:
75
  text = input("\nEnter Hindi/Telugu grievance (or 'exit'): ")