samithcs committed on
Commit
a647fb1
·
1 Parent(s): 6597336

updated models

Browse files
src/components/model_nlp_intent.py CHANGED
@@ -2,6 +2,8 @@ import pandas as pd
2
  import numpy as np
3
  import tensorflow as tf
4
  from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
 
 
5
  from sklearn.model_selection import train_test_split
6
  from sklearn.preprocessing import LabelEncoder
7
  import joblib
@@ -121,22 +123,34 @@ def main():
121
  logger.info(f"Query: '{query}' -> Intent: {intent} (Confidence: {confidence:.3f})")
122
 
123
 
 
 
124
  def predict_intent(text: str) -> dict:
125
-
126
 
 
 
 
 
 
 
127
 
128
- model_dir = Path(__file__).resolve().parents[2] / "artifacts" / "models" / "nlp_intent"
129
- model = TFDistilBertForSequenceClassification.from_pretrained(model_dir / "intent_model")
130
- tokenizer = DistilBertTokenizer.from_pretrained(model_dir / "intent_tokenizer")
131
- label_encoder = joblib.load(model_dir / "label_encoder.joblib")
132
 
 
133
  inputs = tokenizer(text, return_tensors="tf", truncation=True, padding=True, max_length=128)
 
 
134
  outputs = model(inputs)
135
  predicted_class = tf.argmax(outputs.logits, axis=1).numpy()[0]
136
  intent = label_encoder.inverse_transform([predicted_class])[0]
137
  confidence = float(tf.nn.softmax(outputs.logits)[0][predicted_class].numpy())
 
138
  return {"intent": intent, "confidence": confidence}
139
 
140
 
 
141
  if __name__ == "__main__":
142
  main()
 
2
  import numpy as np
3
  import tensorflow as tf
4
  from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
5
+ import requests
6
+ from io import BytesIO
7
  from sklearn.model_selection import train_test_split
8
  from sklearn.preprocessing import LabelEncoder
9
  import joblib
 
123
  logger.info(f"Query: '{query}' -> Intent: {intent} (Confidence: {confidence:.3f})")
124
 
125
 
126
+
127
+
128
def predict_intent(text: str) -> dict:
    """Classify a user query into an intent label.

    Args:
        text: Raw query string.

    Returns:
        dict with keys ``intent`` (decoded label string) and ``confidence``
        (softmax probability of the predicted class, as a float).
    """
    # Memoize the heavyweight artifacts on the function object so repeated
    # calls do not re-download the model/tokenizer/encoder from the Hub.
    if not hasattr(predict_intent, "_artifacts"):
        # NOTE: dropped `from_tf=True` — that flag belongs to the PyTorch
        # model classes; TF* classes detect the checkpoint format themselves.
        model = TFDistilBertForSequenceClassification.from_pretrained(
            "samithcs/nlp_intent_model"
        )
        tokenizer = DistilBertTokenizer.from_pretrained("samithcs/nlp_intent_model")

        # Fail loudly on HTTP errors instead of handing an error page to joblib.
        label_url = "https://huggingface.co/samithcs/nlp_intent_model/resolve/main/label_encoder.joblib"
        response = requests.get(label_url, timeout=30)
        response.raise_for_status()
        label_encoder = joblib.load(BytesIO(response.content))

        predict_intent._artifacts = (model, tokenizer, label_encoder)

    model, tokenizer, label_encoder = predict_intent._artifacts

    inputs = tokenizer(text, return_tensors="tf", truncation=True, padding=True, max_length=128)
    outputs = model(inputs)
    predicted_class = tf.argmax(outputs.logits, axis=1).numpy()[0]
    intent = label_encoder.inverse_transform([predicted_class])[0]
    confidence = float(tf.nn.softmax(outputs.logits)[0][predicted_class].numpy())
    return {"intent": intent, "confidence": confidence}
152
 
153
 
154
+
155
  if __name__ == "__main__":
156
  main()
src/components/model_nlp_ner.py CHANGED
@@ -1,5 +1,8 @@
1
  import tensorflow as tf
2
  from transformers import DistilBertTokenizerFast, TFDistilBertForTokenClassification, pipeline
 
 
 
3
  from sklearn.model_selection import train_test_split
4
  import numpy as np
5
  import joblib
@@ -178,22 +181,44 @@ def train_ner_model():
178
  logger.info(f"NER (TF) model, tokenizer, and label map saved to {out_dir}")
179
 
180
 
 
 
181
  def extract_entities_pipeline(text: str) -> dict:
182
- model_dir = Path(__file__).resolve().parents[2] / "artifacts" / "models" / "nlp_ner"
183
- custom_model = TFDistilBertForTokenClassification.from_pretrained(model_dir / "ner_model")
184
- custom_tokenizer = DistilBertTokenizerFast.from_pretrained(model_dir / "ner_tokenizer")
185
- label2id = joblib.load(model_dir / "label2id.joblib")
 
 
 
 
 
 
186
  id2label = {i: t for t, i in label2id.items()}
 
 
187
  max_len = 32
188
  tokens = text.split()
189
- encoding = custom_tokenizer([tokens], is_split_into_words=True, return_tensors='tf', padding='max_length', truncation=True, max_length=max_len)
 
 
 
 
 
 
 
 
 
190
  outputs = custom_model({k: v for k, v in encoding.items() if k != "labels"})
191
  logits = outputs.logits.numpy()[0]
192
  pred_ids = np.argmax(logits, axis=-1)
 
 
193
  custom_entities = {"location": [], "event": []}
194
  current_loc, current_evt = [], []
195
  for w, id in zip(tokens, pred_ids[:len(tokens)]):
196
  label = id2label[id]
 
197
  if label == "B-LOC":
198
  if current_loc:
199
  custom_entities["location"].append(" ".join(current_loc))
@@ -205,6 +230,7 @@ def extract_entities_pipeline(text: str) -> dict:
205
  if current_loc:
206
  custom_entities["location"].append(" ".join(current_loc))
207
  current_loc = []
 
208
  if label == "B-EVENT":
209
  if current_evt:
210
  custom_entities["event"].append(" ".join(current_evt))
@@ -216,17 +242,21 @@ def extract_entities_pipeline(text: str) -> dict:
216
  if current_evt:
217
  custom_entities["event"].append(" ".join(current_evt))
218
  current_evt = []
 
219
  if current_loc:
220
  custom_entities["location"].append(" ".join(current_loc))
221
  if current_evt:
222
  custom_entities["event"].append(" ".join(current_evt))
223
 
 
224
  hf_ner = pipeline("ner", grouped_entities=True, model="dbmdz/bert-large-cased-finetuned-conll03-english")
225
  hf_results = hf_ner(text)
226
  hf_locations = [ent['word'] for ent in hf_results if ent['entity_group'] == "LOC"]
227
 
 
228
  all_locations = set(custom_entities["location"]) | set(hf_locations)
229
  all_events = custom_entities["event"]
 
230
  return {"location": list(all_locations), "event": all_events}
231
 
232
 
 
1
  import tensorflow as tf
2
  from transformers import DistilBertTokenizerFast, TFDistilBertForTokenClassification, pipeline
3
+ import requests
4
+ from io import BytesIO
5
+ import numpy as np
6
  from sklearn.model_selection import train_test_split
7
  import numpy as np
8
  import joblib
 
181
  logger.info(f"NER (TF) model, tokenizer, and label map saved to {out_dir}")
182
 
183
 
184
+
185
+
186
def extract_entities_pipeline(text: str) -> dict:
    """Extract location and event entities from *text*.

    Combines a custom DistilBERT token-classification model hosted on the
    Hugging Face Hub with a general-purpose pretrained NER pipeline
    (locations from both are merged further below).
    """
    # NOTE: dropped `from_tf=True` — that flag belongs to the PyTorch model
    # classes; TF* classes detect the checkpoint format themselves.
    custom_model = TFDistilBertForTokenClassification.from_pretrained(
        "samithcs/nlp_ner"
    )
    custom_tokenizer = DistilBertTokenizerFast.from_pretrained("samithcs/nlp_ner")

    # Fetch the label map; fail loudly on HTTP errors instead of handing an
    # error page to joblib.
    label_url = "https://huggingface.co/samithcs/nlp_ner/resolve/main/label2id.joblib"
    response = requests.get(label_url, timeout=30)
    response.raise_for_status()
    label2id = joblib.load(BytesIO(response.content))
    id2label = {i: t for t, i in label2id.items()}

    # Tokenize on whitespace-split words so predictions align 1:1 with tokens.
    max_len = 32
    tokens = text.split()
    encoding = custom_tokenizer(
        [tokens],
        is_split_into_words=True,
        return_tensors='tf',
        padding='max_length',
        truncation=True,
        max_length=max_len,
    )
210
+
211
+
212
  outputs = custom_model({k: v for k, v in encoding.items() if k != "labels"})
213
  logits = outputs.logits.numpy()[0]
214
  pred_ids = np.argmax(logits, axis=-1)
215
+
216
+
217
  custom_entities = {"location": [], "event": []}
218
  current_loc, current_evt = [], []
219
  for w, id in zip(tokens, pred_ids[:len(tokens)]):
220
  label = id2label[id]
221
+
222
  if label == "B-LOC":
223
  if current_loc:
224
  custom_entities["location"].append(" ".join(current_loc))
 
230
  if current_loc:
231
  custom_entities["location"].append(" ".join(current_loc))
232
  current_loc = []
233
+
234
  if label == "B-EVENT":
235
  if current_evt:
236
  custom_entities["event"].append(" ".join(current_evt))
 
242
  if current_evt:
243
  custom_entities["event"].append(" ".join(current_evt))
244
  current_evt = []
245
+
246
  if current_loc:
247
  custom_entities["location"].append(" ".join(current_loc))
248
  if current_evt:
249
  custom_entities["event"].append(" ".join(current_evt))
250
 
251
+
252
  hf_ner = pipeline("ner", grouped_entities=True, model="dbmdz/bert-large-cased-finetuned-conll03-english")
253
  hf_results = hf_ner(text)
254
  hf_locations = [ent['word'] for ent in hf_results if ent['entity_group'] == "LOC"]
255
 
256
+
257
  all_locations = set(custom_entities["location"]) | set(hf_locations)
258
  all_events = custom_entities["event"]
259
+
260
  return {"location": list(all_locations), "event": all_events}
261
 
262
 
src/components/model_risk_predictor.py CHANGED
@@ -1,5 +1,7 @@
1
  import pandas as pd
2
  import numpy as np
 
 
3
  from sklearn.model_selection import train_test_split
4
  from sklearn.ensemble import HistGradientBoostingClassifier
5
  from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
@@ -154,71 +156,60 @@ def calculate_rule_based_risk(region, days, incidents):
154
  return min(1.0, rule_risk)
155
 
156
 
 
 
157
  def predict_risk(region: str, days: int = 5, origin=None, destination=None,
158
  event_type=None, incidents=None, shipping_mode=None):
159
 
160
  try:
161
- import joblib
162
- import pandas as pd
163
- from pathlib import Path
164
-
165
- model_dir = Path(__file__).resolve().parents[2] / "artifacts" / "models" / "risk_predictor"
166
- model_path = model_dir / "hist_gradient_boosting_risk_predictor.joblib"
167
-
168
-
169
  if shipping_mode is None:
170
  shipping_mode = "Standard Class"
171
-
172
-
173
  rule_risk = calculate_rule_based_risk(region, days, incidents or [])
174
  logger.info(f"Rule-based risk for {region}: {rule_risk:.3f}")
175
-
176
-
177
- ml_risk = 0.40
178
-
179
- if model_path.exists():
180
- try:
181
- model = joblib.load(model_path)
182
- logger.debug(f"Loaded ML model from {model_path}")
183
-
184
- data_dir = Path(__file__).resolve().parents[2] / "artifacts" / "data" / "processed"
185
- feature_csv_path = data_dir / "supply_chain_disruptions_features.csv"
186
-
187
- if feature_csv_path.exists():
188
- feature_csv = pd.read_csv(feature_csv_path)
189
- feature_cols = list(model.feature_names_in_) if hasattr(model, "feature_names_in_") else list(feature_csv.columns)
190
-
191
- reference_row = feature_csv[feature_cols].median()
192
-
193
- query_dict = {
194
- "region": region,
195
- "days": days,
196
- "origin": origin,
197
- "destination": destination,
198
- "shipping_mode": shipping_mode,
199
- }
200
-
201
- test_features = pd.DataFrame([build_feature_row(feature_cols, query_dict, reference_row)])
202
- ml_risk = float(model.predict_proba(test_features)[0, 1])
203
- logger.info(f"ML model risk for {region}: {ml_risk:.3f}")
204
- except Exception as e:
205
- logger.warning(f"Could not get ML prediction: {e}")
206
-
207
-
208
  if incidents and len(incidents) > 0:
209
-
210
  final_risk = (ml_risk * 0.40) + (rule_risk * 0.60)
211
  logger.info(f"Hybrid risk (with incidents): ML={ml_risk:.3f}*0.4 + Rule={rule_risk:.3f}*0.6 = {final_risk:.3f}")
212
  else:
213
-
214
  final_risk = (ml_risk * 0.70) + (rule_risk * 0.30)
215
  logger.info(f"Hybrid risk (no incidents): ML={ml_risk:.3f}*0.7 + Rule={rule_risk:.3f}*0.3 = {final_risk:.3f}")
216
-
217
-
218
  final_risk = float(np.clip(final_risk, 0.0, 1.0))
219
-
220
  return round(final_risk, 2)
221
-
222
  except Exception as e:
223
  logger.error(f"Error in predict_risk: {e}", exc_info=True)
224
  return 0.50
 
1
  import pandas as pd
2
  import numpy as np
3
+ import requests
4
+ from io import BytesIO
5
  from sklearn.model_selection import train_test_split
6
  from sklearn.ensemble import HistGradientBoostingClassifier
7
  from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
 
156
  return min(1.0, rule_risk)
157
 
158
 
159
+
160
+
161
def predict_risk(region: str, days: int = 5, origin=None, destination=None,
                 event_type=None, incidents=None, shipping_mode=None):
    """Predict a disruption risk score in [0, 1] for a region/route.

    Blends an ML probability (HistGradientBoosting model pulled from the
    Hugging Face Hub) with a rule-based score; the weighting shifts toward
    the rules when incidents are supplied. Returns 0.50 on unexpected
    failure.
    """
    try:
        # BUG FIX: this commit removed the local `import joblib` but still
        # calls joblib.load below — without it the inner try raised a
        # NameError that was silently downgraded to the 0.40 fallback.
        import joblib

        if shipping_mode is None:
            shipping_mode = "Standard Class"

        rule_risk = calculate_rule_based_risk(region, days, incidents or [])
        logger.info(f"Rule-based risk for {region}: {rule_risk:.3f}")

        ml_risk = 0.40  # fallback when the ML model is unavailable

        try:
            # Cache the downloaded artifacts on the function object so
            # repeated calls do not re-fetch the model/CSV from the Hub.
            if not hasattr(predict_risk, "_artifacts"):
                model_url = "https://huggingface.co/samithcs/risk_predictor/resolve/main/hist_gradient_boosting_risk_predictor.joblib"
                response = requests.get(model_url, timeout=60)
                response.raise_for_status()  # don't unpickle an error page
                model = joblib.load(BytesIO(response.content))
                logger.debug(f"Loaded ML model from HF Hub: {model_url}")

                # Reference feature rows used to fill unspecified features.
                data_url = "https://huggingface.co/samithcs/risk_predictor/resolve/main/supply_chain_disruptions_features.csv"
                feature_csv = pd.read_csv(data_url)
                predict_risk._artifacts = (model, feature_csv)

            model, feature_csv = predict_risk._artifacts

            feature_cols = list(model.feature_names_in_) if hasattr(model, "feature_names_in_") else list(feature_csv.columns)
            reference_row = feature_csv[feature_cols].median()

            query_dict = {
                "region": region,
                "days": days,
                "origin": origin,
                "destination": destination,
                "shipping_mode": shipping_mode,
            }

            test_features = pd.DataFrame([build_feature_row(feature_cols, query_dict, reference_row)])
            ml_risk = float(model.predict_proba(test_features)[0, 1])
            logger.info(f"ML model risk for {region}: {ml_risk:.3f}")
        except Exception as e:
            # Best-effort: fall back to the default ml_risk on any failure.
            logger.warning(f"Could not get ML prediction: {e}")

        # Blend: with incidents present, trust the rules more; otherwise
        # trust the model more.
        if incidents and len(incidents) > 0:
            final_risk = (ml_risk * 0.40) + (rule_risk * 0.60)
            logger.info(f"Hybrid risk (with incidents): ML={ml_risk:.3f}*0.4 + Rule={rule_risk:.3f}*0.6 = {final_risk:.3f}")
        else:
            final_risk = (ml_risk * 0.70) + (rule_risk * 0.30)
            logger.info(f"Hybrid risk (no incidents): ML={ml_risk:.3f}*0.7 + Rule={rule_risk:.3f}*0.3 = {final_risk:.3f}")

        final_risk = float(np.clip(final_risk, 0.0, 1.0))
        return round(final_risk, 2)
    except Exception as e:
        logger.error(f"Error in predict_risk: {e}", exc_info=True)
        return 0.50
src/components/model_timeseries_risk.py CHANGED
@@ -5,22 +5,32 @@ from sklearn.preprocessing import StandardScaler
5
  from sklearn.model_selection import train_test_split
6
  from sklearn.utils import class_weight
7
  import joblib
8
- from pathlib import Path
 
9
  import logging
10
 
11
  logger = logging.getLogger(__name__)
12
  logging.basicConfig(level=logging.INFO)
13
 
 
 
 
14
 
15
- base_dir = Path(__file__).resolve().parents[2]
16
- data_path = base_dir / "artifacts" / "data" / "processed" / "supply_chain_disruptions_features.csv"
 
17
 
 
 
 
 
18
 
19
- df = pd.read_csv(data_path)
 
 
20
  region_col = "Order City"
21
  region_name = "Shanghai"
22
 
23
-
24
  df_region = df[df[region_col] == region_name].copy()
25
  if len(df_region) < 100:
26
  logger.warning("Region sample is small, upsampling/cropping to 200 rows from full dataset.")
@@ -36,8 +46,7 @@ seq_length = 7
36
  X_all = df_region[feature_cols].fillna(0).astype(float).values
37
  y_all = df_region[label_col].fillna(0).astype(int).values
38
 
39
- scaler = StandardScaler()
40
- X_scaled = scaler.fit_transform(X_all)
41
 
42
  X_seq, y_seq = [], []
43
  for i in range(len(X_scaled) - seq_length):
@@ -51,26 +60,15 @@ if len(X_seq) < 2:
51
  logger.error("Not enough sequences. Add more data or lower seq_length.")
52
  exit()
53
 
54
-
55
  test_size = int(0.2 * len(X_seq))
56
  X_train, X_test = X_seq[:-test_size], X_seq[-test_size:]
57
  y_train, y_test = y_seq[:-test_size], y_seq[-test_size:]
58
 
59
-
60
  weights = class_weight.compute_class_weight(class_weight="balanced",
61
  classes=np.unique(y_train),
62
  y=y_train)
63
  class_weight_dict = dict(zip(np.unique(y_train), weights))
64
 
65
-
66
- model = tf.keras.Sequential([
67
- tf.keras.layers.Input(shape=(seq_length, len(feature_cols))),
68
- tf.keras.layers.LSTM(64, return_sequences=True),
69
- tf.keras.layers.Dropout(0.25),
70
- tf.keras.layers.LSTM(32),
71
- tf.keras.layers.Dropout(0.25),
72
- tf.keras.layers.Dense(1, activation="sigmoid")
73
- ])
74
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
75
 
76
  logger.info("Training LSTM risk model with weighted loss and dropout.")
@@ -80,21 +78,4 @@ model.fit(X_train, y_train, epochs=12, batch_size=8,
80
  test_loss, test_acc = model.evaluate(X_test, y_test)
81
  logger.info(f"Test Accuracy: {test_acc:.4f}")
82
 
83
-
84
- model_dir = base_dir / "artifacts" / "models" / "timeseries_risk"
85
- model_dir.mkdir(parents=True, exist_ok=True)
86
- model.save(model_dir / "lstm_risk_model.keras")
87
- joblib.dump(scaler, model_dir / "scaler.joblib")
88
- logger.info(f"Saved LSTM model and scaler to {model_dir}")
89
-
90
- def predict_risk_for_next_day(sequence, threshold=0.5):
91
- seq = scaler.transform(sequence)
92
- seq_window = np.expand_dims(seq, axis=0)
93
- pred_prob = model.predict(seq_window)[0][0]
94
- pred_label = int(pred_prob > threshold)
95
- logger.info(f"Predicted next-day risk score: {pred_prob:.3f} (region: {region_name}), label: {pred_label}")
96
- return pred_prob, pred_label
97
-
98
- if X_test.shape[0] > 0:
99
- logger.info("Demo prediction for next-day risk using last window of test set:")
100
- predict_risk_for_next_day(X_test[0], threshold=0.5)
 
5
  from sklearn.model_selection import train_test_split
6
  from sklearn.utils import class_weight
7
  import joblib
8
+ import requests
9
+ from io import BytesIO
10
  import logging
11
 
12
  logger = logging.getLogger(__name__)
13
  logging.basicConfig(level=logging.INFO)
14
 
15
import os
import tempfile

# Hub locations of the trained LSTM model and its fitted feature scaler.
model_url = "https://huggingface.co/samithcs/timeseries_risk/resolve/main/lstm_risk_model.keras"
scaler_url = "https://huggingface.co/samithcs/timeseries_risk/resolve/main/scaler.joblib"

# BUG FIX: tf.keras.models.load_model() takes a local filepath, not an HTTP
# URL — download the .keras archive to a temp file first, then load it.
logger.info("Loading LSTM model from Hugging Face Hub...")
response = requests.get(model_url, timeout=120)
response.raise_for_status()
with tempfile.NamedTemporaryFile(suffix=".keras", delete=False) as tmp:
    tmp.write(response.content)
    model_path = tmp.name
model = tf.keras.models.load_model(model_path)

# Load the scaler; fail loudly on HTTP errors instead of unpickling an
# error page.
logger.info("Loading scaler from Hugging Face Hub...")
response = requests.get(scaler_url, timeout=60)
response.raise_for_status()
scaler = joblib.load(BytesIO(response.content))

# Feature CSV stays local; the path is overridable via an environment
# variable because the committed default is a placeholder.
csv_path = os.environ.get(
    "FEATURES_CSV",
    "path_to_your_csv/supply_chain_disruptions_features.csv",  # TODO: set real path
)
df = pd.read_csv(csv_path)
31
  region_col = "Order City"
32
  region_name = "Shanghai"
33
 
 
34
  df_region = df[df[region_col] == region_name].copy()
35
  if len(df_region) < 100:
36
  logger.warning("Region sample is small, upsampling/cropping to 200 rows from full dataset.")
 
46
  X_all = df_region[feature_cols].fillna(0).astype(float).values
47
  y_all = df_region[label_col].fillna(0).astype(int).values
48
 
49
+ X_scaled = scaler.transform(X_all)
 
50
 
51
  X_seq, y_seq = [], []
52
  for i in range(len(X_scaled) - seq_length):
 
60
  logger.error("Not enough sequences. Add more data or lower seq_length.")
61
  exit()
62
 
 
63
  test_size = int(0.2 * len(X_seq))
64
  X_train, X_test = X_seq[:-test_size], X_seq[-test_size:]
65
  y_train, y_test = y_seq[:-test_size], y_seq[-test_size:]
66
 
 
67
  weights = class_weight.compute_class_weight(class_weight="balanced",
68
  classes=np.unique(y_train),
69
  y=y_train)
70
  class_weight_dict = dict(zip(np.unique(y_train), weights))
71
 
 
 
 
 
 
 
 
 
 
72
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
73
 
74
  logger.info("Training LSTM risk model with weighted loss and dropout.")
 
78
  test_loss, test_acc = model.evaluate(X_test, y_test)
79
  logger.info(f"Test Accuracy: {test_acc:.4f}")
80
 
81
+ logger.info("Finished training/evaluation with model loaded from Hugging Face Hub.")