COCODEDE04 committed on
Commit
dfe0810
·
verified ·
1 Parent(s): 7894cc4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +183 -41
app.py CHANGED
@@ -551,9 +551,15 @@ async def predict(req: Request):
551
 
552
  # ============================================================
553
  # CORAL ORDINAL HELPERS (from training script)
 
 
554
  # ============================================================
555
 
556
  def to_cumulative_targets_tf(y_true_int, K_):
 
 
 
 
557
  y = tf.reshape(y_true_int, [-1])
558
  y = tf.cast(y, tf.int32)
559
  thresholds = tf.range(1, K_, dtype=tf.int32)
@@ -562,30 +568,37 @@ def to_cumulative_targets_tf(y_true_int, K_):
562
 
563
 
564
  def coral_loss_tf(y_true, logits):
 
 
 
 
 
565
  y_true = tf.reshape(y_true, [-1])
566
  y_true = tf.cast(y_true, tf.int32)
567
- T = to_cumulative_targets_tf(y_true, len(CLASSES))
568
  bce = tf.nn.sigmoid_cross_entropy_with_logits(labels=T, logits=logits)
569
  return tf.reduce_mean(tf.reduce_sum(bce, axis=1))
570
 
571
 
572
- # ---------- TF helper & numpy wrapper (unified version) ----------
573
  def _coral_probs_from_logits_tf(logits_tf: tf.Tensor) -> tf.Tensor:
574
- """Pure TF CORAL probability transform."""
 
 
 
 
575
  sig = tf.math.sigmoid(logits_tf)
576
  left = tf.concat([tf.ones_like(sig[:, :1]), sig], axis=1)
577
  right = tf.concat([sig, tf.zeros_like(sig[:, :1])], axis=1)
578
- return tf.clip_by_value(left - right, 1e-12, 1.0)
579
-
580
-
581
- def coral_probs_from_logits(logits_np: np.ndarray) -> np.ndarray:
582
- """Numpy wrapper used by decode_logits + SHAP."""
583
- logits_tf = tf.convert_to_tensor(logits_np, dtype=tf.float32)
584
- return _coral_probs_from_logits_tf(logits_tf).numpy()
585
 
586
 
587
  @tf.function
588
  def ordinal_accuracy_metric(y_true, y_pred_logits):
 
 
 
589
  y_true = tf.reshape(y_true, [-1])
590
  y_true = tf.cast(y_true, tf.int32)
591
  probs = _coral_probs_from_logits_tf(y_pred_logits)
@@ -593,11 +606,75 @@ def ordinal_accuracy_metric(y_true, y_pred_logits):
593
  return tf.reduce_mean(tf.cast(tf.equal(y_true, y_pred), tf.float32))
594
 
595
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
596
  # ============================================================
597
  # RECREATE MODEL FROM BEST HYPERPARAMETERS
598
  # ============================================================
599
 
600
  def build_model_from_hparams(hp: dict):
 
 
 
 
601
  inputs = tf.keras.Input(shape=(len(FEATURES),))
602
  x = inputs
603
 
@@ -622,6 +699,7 @@ def build_model_from_hparams(hp: dict):
622
  if drop > 0:
623
  x = tf.keras.layers.Dropout(drop)(x)
624
 
 
625
  outputs = tf.keras.layers.Dense(len(CLASSES) - 1, activation=None)(x)
626
 
627
  model = tf.keras.Model(inputs, outputs)
@@ -637,37 +715,81 @@ def build_model_from_hparams(hp: dict):
637
  # RETRAINING LOGIC + DATASET MGMT
638
  # ============================================================
639
 
640
- FINGERPRINT_CSV = "fingerprints_db.csv"
641
- BEST_HP_JSON = "best_params_and_metrics.json"
642
 
643
 
644
  def load_best_hparams():
 
 
 
 
645
  with open(BEST_HP_JSON, "r") as f:
646
  js = json.load(f)
647
  return js["best_hyperparams"]
648
 
649
 
650
  def load_fingerprint_dataset():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
651
  df = pd.read_csv(FINGERPRINT_CSV)
652
 
653
- y = df["rating"].map({c: i for i, c in enumerate(CLASSES)}).astype("int32").to_numpy()
 
 
 
 
 
 
 
 
 
 
654
  X_raw = df[FEATURES].to_numpy().astype("float32")
655
 
 
656
  imp = SimpleImputer(strategy="median")
657
  sc = StandardScaler()
658
 
659
  X_imp = imp.fit_transform(X_raw)
660
- X_sc = sc.fit_transform(X_imp)
661
 
662
  return X_sc, y, imp, sc
663
 
664
 
665
  def retrain_model():
 
 
 
 
 
 
 
 
 
 
666
  hp = load_best_hparams()
667
  X, y, imp, sc = load_fingerprint_dataset()
668
 
 
669
  model_new = build_model_from_hparams(hp)
670
 
 
671
  es = tf.keras.callbacks.EarlyStopping(
672
  monitor="loss",
673
  patience=15,
@@ -683,13 +805,13 @@ def retrain_model():
683
  verbose=1,
684
  )
685
 
686
- # Update global model + preprocessors
687
  global model, imputer, scaler
688
  model = model_new
689
  imputer = imp
690
  scaler = sc
691
 
692
- # Rebuild SHAP explainer to match new model
693
  global EXPLAINER
694
  if SHAP_AVAILABLE:
695
  try:
@@ -700,6 +822,7 @@ def retrain_model():
700
  EXPLAINER = None
701
  print("⚠️ Failed to rebuild SHAP explainer:", repr(e))
702
 
 
703
  return True
704
 
705
 
@@ -710,48 +833,67 @@ def retrain_model():
710
  @app.post("/append_and_retrain")
711
  def append_and_retrain(payload: dict):
712
  """
713
- payload format:
 
 
 
714
  {
715
- "company": "...",
716
- "date": "2025-Q1",
717
- "rating": "Mid",
 
718
  "features": {
719
- "autosuf_oper": ...,
720
- "improductiva": ...,
 
 
721
  ...
 
722
  }
723
  }
 
 
 
 
724
  """
725
 
726
- company = payload.get("company")
727
- date = payload.get("date")
728
- rating = payload.get("rating")
729
- feats = payload.get("features", {})
 
730
 
731
- if not company or not date or not rating:
732
- return {"ok": False, "error": "Missing company/date/rating"}
733
 
734
  if set(feats.keys()) != set(FEATURES):
735
- return {"ok": False, "error": "Features missing or incorrect"}
736
-
737
- # Append row
738
- new_row = {
739
- "company": company,
740
- "date": date,
741
- "rating": rating,
742
- **feats
 
 
 
 
 
 
 
743
  }
744
- df_new = pd.DataFrame([new_row])
745
 
 
746
  if os.path.exists(FINGERPRINT_CSV):
747
  df = pd.read_csv(FINGERPRINT_CSV)
748
- df = pd.concat([df, df_new], ignore_index=True)
749
  else:
750
- df = df_new
751
 
752
  df.to_csv(FINGERPRINT_CSV, index=False)
753
 
754
- # Retrain model
755
  retrain_model()
756
 
757
- return {"ok": True, "message": "Fingerprint appended + model retrained"}
 
551
 
552
  # ============================================================
553
  # CORAL ORDINAL HELPERS (from training script)
554
+ # (we do NOT redefine coral_probs_from_logits here to avoid
555
+ # clashing with the one already used by decode_logits)
556
  # ============================================================
557
 
558
  def to_cumulative_targets_tf(y_true_int, K_):
559
+ """
560
+ y_true_int: (N,) integer targets 0..K-1
561
+ returns (N, K_-1) with t_k = 1[y >= k], k = 1..K-1
562
+ """
563
  y = tf.reshape(y_true_int, [-1])
564
  y = tf.cast(y, tf.int32)
565
  thresholds = tf.range(1, K_, dtype=tf.int32)
 
568
 
569
 
570
def coral_loss_tf(y_true, logits):
    """CORAL ordinal loss, implemented purely in TensorFlow.

    Args:
        y_true: (N,) or (N, 1) tensor of integer labels in 0..K-1.
        logits: (N, K-1) tensor of cumulative-threshold logits.

    Returns:
        Scalar: mean over the batch of the per-sample sum of binary
        cross-entropies against the cumulative targets.
    """
    labels = tf.cast(tf.reshape(y_true, [-1]), tf.int32)
    targets = to_cumulative_targets_tf(labels, len(CLASSES))  # (N, K-1)
    per_threshold = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=targets, logits=logits
    )
    per_sample = tf.reduce_sum(per_threshold, axis=1)
    return tf.reduce_mean(per_sample)
581
 
582
 
583
# ---------- TF helper (pure TF CORAL probs) ----------
def _coral_probs_from_logits_tf(logits_tf: tf.Tensor) -> tf.Tensor:
    """Convert CORAL logits to class probabilities, entirely in TF.

    Args:
        logits_tf: (N, K-1) cumulative-threshold logits.

    Returns:
        (N, K) class probabilities, computed as adjacent differences of
        the cumulative sigmoids and clipped away from zero for numeric
        safety (used inside the metric, so it must stay graph-friendly).
    """
    cum = tf.math.sigmoid(logits_tf)
    ones = tf.ones_like(cum[:, :1])
    zeros = tf.zeros_like(cum[:, :1])
    upper = tf.concat([ones, cum], axis=1)
    lower = tf.concat([cum, zeros], axis=1)
    return tf.clip_by_value(upper - lower, 1e-12, 1.0)
 
 
 
 
 
595
 
596
 
597
  @tf.function
598
  def ordinal_accuracy_metric(y_true, y_pred_logits):
599
+ """
600
+ Exact class accuracy for CORAL outputs (same idea as training script).
601
+ """
602
  y_true = tf.reshape(y_true, [-1])
603
  y_true = tf.cast(y_true, tf.int32)
604
  probs = _coral_probs_from_logits_tf(y_pred_logits)
 
606
  return tf.reduce_mean(tf.cast(tf.equal(y_true, y_pred), tf.float32))
607
 
608
 
609
+ # ============================================================
610
+ # IMPORTS FOR RETRAINING / DATA MGMT
611
+ # (Ok to import here; Python allows imports anywhere in file)
612
+ # ============================================================
613
+
614
+ import pandas as pd
615
+ from sklearn.impute import SimpleImputer
616
+ from sklearn.preprocessing import StandardScaler
617
+
618
+
619
+ # ============================================================
620
+ # LETTER → 5-CLASS GROUP MAPPING (same logic as training code)
621
+ # ============================================================
622
+
623
def letter_to_group(letter: str):
    """
    Converts raw rating letters (AAA, A-, BBB+, BB-, etc.)
    into the 5 ordinal groups used by the model:
        Top, Mid-Top, Mid, Mid-Low, Low

    Returns None when the input is None/empty or cannot be mapped.
    """
    if letter is None:
        return None

    # Normalise once: uppercase and drop ALL whitespace, so inputs like
    # "AA +" or " bbb- " map the same as "AA+"/"BBB-". (Previously spaces
    # were only stripped on the dual-rating path, so "AA +" fell through
    # to None.)
    s = str(letter).strip().upper().replace(" ", "")
    if s == "":
        return None

    # Duals like "AA/AA+": take the stronger (higher-ranked) side.
    if "/" in s:
        order = [
            "E", "D", "C-", "C", "C+",
            "B-", "B", "B+", "BB-", "BB", "BB+",
            "BBB-", "BBB", "BBB+",
            "A-", "A", "A+",
            "AA-", "AA", "AA+",
            "AAA-", "AAA",
        ]
        parts = [p for p in s.split("/") if p]
        if not parts:
            # e.g. "/" or "//" — nothing to map (previously raised
            # IndexError via parts[0])
            return None
        idxs = [order.index(p) for p in parts if p in order]
        if idxs:
            s = order[max(idxs)]  # stronger (higher index)
        else:
            s = parts[0]

    # Group boundaries (as in the training script)
    g1 = {"AAA", "AAA-", "AA+", "AA"}      # Top
    g2 = {"AA-", "A+", "A", "A-"}          # Mid-Top
    g3 = {"BBB+", "BBB", "BBB-", "BB+"}    # Mid
    g4 = {"BB", "BB-", "B+", "B", "B-"}    # Mid-Low
    g5 = {"C+", "C", "C-", "D", "E"}       # Low

    if s in g1: return "Top"
    if s in g2: return "Mid-Top"
    if s in g3: return "Mid"
    if s in g4: return "Mid-Low"
    if s in g5: return "Low"
    return None
667
+
668
+
669
  # ============================================================
670
  # RECREATE MODEL FROM BEST HYPERPARAMETERS
671
  # ============================================================
672
 
673
  def build_model_from_hparams(hp: dict):
674
+ """
675
+ Rebuilds the CORAL DNN with the same structure & hyperparameters
676
+ as in your training script.
677
+ """
678
  inputs = tf.keras.Input(shape=(len(FEATURES),))
679
  x = inputs
680
 
 
699
  if drop > 0:
700
  x = tf.keras.layers.Dropout(drop)(x)
701
 
702
+ # CORAL output: K-1 logits (K = len(CLASSES))
703
  outputs = tf.keras.layers.Dense(len(CLASSES) - 1, activation=None)(x)
704
 
705
  model = tf.keras.Model(inputs, outputs)
 
715
  # RETRAINING LOGIC + DATASET MGMT
716
  # ============================================================
717
 
718
+ FINGERPRINT_CSV = "fingerprints_db.csv" # master DB file
719
+ BEST_HP_JSON = "best_params_and_metrics.json" # hyperparams JSON
720
 
721
 
722
def load_best_hparams(path=None):
    """
    Loads best hyperparameters from the tuning JSON.

    Args:
        path: optional path override; defaults to BEST_HP_JSON. The
              default keeps the original zero-argument call sites working.

    Returns:
        The dict stored under the "best_hyperparams" key.

    Raises:
        FileNotFoundError if the file is missing; KeyError if the JSON
        does not contain "best_hyperparams".
    """
    if path is None:
        path = BEST_HP_JSON
    with open(path, "r") as f:
        js = json.load(f)
    return js["best_hyperparams"]
730
 
731
 
732
def load_fingerprint_dataset():
    """
    Loads the full fingerprint DB from FINGERPRINT_CSV.

    Expected columns (at minimum):
        - QTR, COMPANY, Supervisor, RATING_RAW
        - the 21 ratio features named exactly as in FEATURES
        - rating_score (ignored for training)

    Steps:
        - derive RATING_GROUP (Top/Mid-Top/...) from RATING_RAW if missing
        - drop rows whose RATING_GROUP is missing OR not a known class
        - impute missing feature values with the median
        - scale with a freshly fitted StandardScaler

    Returns:
        (X_sc, y, imp, sc): scaled float32 features, int32 labels 0..K-1,
        and the fitted SimpleImputer / StandardScaler (kept so /predict
        can reuse the exact same preprocessing).
    """
    df = pd.read_csv(FINGERPRINT_CSV)

    # Derive the 5-class group if not already present
    if "RATING_GROUP" not in df.columns:
        df["RATING_GROUP"] = df["RATING_RAW"].apply(letter_to_group)

    # Keep only rows whose group is one of CLASSES. Filtering with isin()
    # also covers NaN, and guards against a pre-existing RATING_GROUP
    # column containing unexpected labels — those would map to NaN below
    # and crash the int32 cast.
    class_to_id = {c: i for i, c in enumerate(CLASSES)}
    df = df[df["RATING_GROUP"].isin(class_to_id)].copy()

    # y labels 0..K-1
    y = df["RATING_GROUP"].map(class_to_id).astype("int32").to_numpy()

    # X features
    X_raw = df[FEATURES].to_numpy().astype("float32")

    # Fit a fresh imputer + scaler on the full dataset
    imp = SimpleImputer(strategy="median")
    sc = StandardScaler()

    X_imp = imp.fit_transform(X_raw)
    X_sc = sc.fit_transform(X_imp).astype("float32")

    return X_sc, y, imp, sc
773
 
774
 
775
  def retrain_model():
776
+ """
777
+ Retrains the model on the current fingerprints_db.csv
778
+ using the fixed best hyperparameters.
779
+
780
+ - Rebuilds the model
781
+ - Fits on full (X_sc, y)
782
+ - Updates global model/imputer/scaler
783
+ - Rebuilds SHAP explainer to stay in sync
784
+ """
785
+ print(">>> RETRAIN: loading dataset")
786
  hp = load_best_hparams()
787
  X, y, imp, sc = load_fingerprint_dataset()
788
 
789
+ print(">>> RETRAIN: building model from best hparams")
790
  model_new = build_model_from_hparams(hp)
791
 
792
+ print(">>> RETRAIN: fitting on fingerprint DB")
793
  es = tf.keras.callbacks.EarlyStopping(
794
  monitor="loss",
795
  patience=15,
 
805
  verbose=1,
806
  )
807
 
808
+ # Update global model + preprocessors used by /predict
809
  global model, imputer, scaler
810
  model = model_new
811
  imputer = imp
812
  scaler = sc
813
 
814
+ # Rebuild SHAP explainer so explanations match new model
815
  global EXPLAINER
816
  if SHAP_AVAILABLE:
817
  try:
 
822
  EXPLAINER = None
823
  print("⚠️ Failed to rebuild SHAP explainer:", repr(e))
824
 
825
+ print(">>> RETRAIN COMPLETE")
826
  return True
827
 
828
 
 
833
@app.post("/append_and_retrain")
def append_and_retrain(payload: dict):
    """
    Appends a new fingerprint row to fingerprints_db.csv
    and retrains the model.

    Expected payload:
        {
          "qtr": "2014Q4",
          "company": "COAC Ambato Ltda",
          "supervisor": "SEPS",
          "rating_raw": "B",
          "features": { "<each name in FEATURES>": <float or null>, ... }
        }

    - rating_raw is the letter rating (AAA, A-, BBB+, BB-, ...);
      it is mapped to RATING_GROUP (Top / Mid-Top / Mid / Mid-Low / Low)
      with the same logic as the training script.

    Returns:
        {"ok": True, "message": ...} on success,
        {"ok": False, "error": ...} on validation failure.
    """
    qtr = payload.get("qtr")
    company = payload.get("company")
    supervisor = payload.get("supervisor")
    rating_raw = payload.get("rating_raw")
    feats = payload.get("features", {})

    if not qtr or not company or not rating_raw:
        return {"ok": False, "error": "Missing qtr/company/rating_raw"}

    # The feature dict must match FEATURES exactly (no missing, no extras).
    # Report the offending keys instead of a hard-coded count ("21"),
    # which would silently go stale if FEATURES ever changes.
    if set(feats.keys()) != set(FEATURES):
        missing = sorted(set(FEATURES) - set(feats))
        extra = sorted(set(feats) - set(FEATURES))
        return {
            "ok": False,
            "error": (
                f"features must contain exactly the {len(FEATURES)} ratio "
                f"names; missing={missing}, unexpected={extra}"
            ),
        }

    rating_group = letter_to_group(rating_raw)
    if rating_group is None:
        return {"ok": False, "error": f"Cannot map rating_raw '{rating_raw}' to 5-class group"}

    # Build the new row matching the CSV schema
    row = {
        "QTR": qtr,
        "COMPANY": company,
        "Supervisor": supervisor,
        "RATING_RAW": rating_raw,
        "RATING_GROUP": rating_group,
        **feats,
        "rating_score": None,  # optional, can be filled later
    }

    # Append the row to the CSV (create the file on first use)
    if os.path.exists(FINGERPRINT_CSV):
        df = pd.read_csv(FINGERPRINT_CSV)
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    else:
        df = pd.DataFrame([row])

    df.to_csv(FINGERPRINT_CSV, index=False)

    # Retrain model on the full updated DB
    retrain_model()

    return {"ok": True, "message": "Fingerprint appended and model retrained"}