Spaces:

iharshyadav
/

SmartCertify-ML

Sleeping

App Files Files Community

Harsh Yadav committed on 27 days ago

Commit

ba6d7cd

1 Parent(s): 2a34453

fix: remove XGBoost use_label_encoder (removed in v2+), reduce CNN to 3k imgs/5 epochs for HF timeout, pre-download ResNet18

Browse files

Files changed (2) hide show

Dockerfile +11 -0
app/models/train_all.py +3 -3

Dockerfile CHANGED Viewed

@@ -60,11 +60,22 @@ tamp = load_tampered_images(n_max=150); \
 print(f'Cached {len(auth)} authentic + {len(tamp)} tampered images'); \
 "
 # ─────────────────────────────────────────────────────────────────────────────
 # BUILD STEP 4: Train all models (uses cached data — no network calls)
 # ─────────────────────────────────────────────────────────────────────────────
 RUN python -m app.models.train_all
 # ─────────────────────────────────────────────────────────────────────────────
 # BUILD STEP 5: Verify all required model files exist — fail build if missing
 # ─────────────────────────────────────────────────────────────────────────────

 print(f'Cached {len(auth)} authentic + {len(tamp)} tampered images'); \
 "
+# ─────────────────────────────────────────────────────────────────────────────
+# BUILD STEP 3.5: Pre-download ResNet18 weights
+# ─────────────────────────────────────────────────────────────────────────────
+RUN python -c "\
+import torchvision.models as tv_models; \
+print('Downloading ResNet18 weights...'); \
+tv_models.resnet18(weights=tv_models.ResNet18_Weights.DEFAULT); \
+print('ResNet18 weights downloaded.') \
+"
 # ─────────────────────────────────────────────────────────────────────────────
 # BUILD STEP 4: Train all models (uses cached data — no network calls)
 # ─────────────────────────────────────────────────────────────────────────────
 RUN python -m app.models.train_all
 # ─────────────────────────────────────────────────────────────────────────────
 # BUILD STEP 5: Verify all required model files exist — fail build if missing
 # ─────────────────────────────────────────────────────────────────────────────

app/models/train_all.py CHANGED Viewed

@@ -94,7 +94,7 @@ def train_fraud_model(df: pd.DataFrame) -> None:
     print("  Training XGBClassifier...")
     xgb_model = xgb.XGBClassifier(
         n_estimators=200, max_depth=6, learning_rate=0.1,
-        use_label_encoder=False, eval_metric="mlogloss",
         random_state=42, verbosity=0,
     )
     xgb_model.fit(X_train, y_train)
@@ -208,7 +208,7 @@ def train_image_model() -> None:
     print(f"  Created {len(tampered_from_real)} tampered versions of real certs")
     # ── Step 2: Generate synthetic PIL images to fill volume ──────────────────
-    N_SYNTHETIC_PER_CLASS = 2_500  # 5,000 synthetic images
     print(f"\n  [Phase 2] Generating {N_SYNTHETIC_PER_CLASS * 2} synthetic images...")
     all_images = []   # PIL Images
@@ -343,7 +343,7 @@ def train_image_model() -> None:
     )
     best_val_acc = 0.0
-    N_EPOCHS = 10  # more epochs for hybrid dataset
     print("\n  Training ResNet-18...")
     for epoch in range(N_EPOCHS):

     print("  Training XGBClassifier...")
     xgb_model = xgb.XGBClassifier(
         n_estimators=200, max_depth=6, learning_rate=0.1,
+        eval_metric="mlogloss",
         random_state=42, verbosity=0,
     )
     xgb_model.fit(X_train, y_train)
     print(f"  Created {len(tampered_from_real)} tampered versions of real certs")
     # ── Step 2: Generate synthetic PIL images to fill volume ──────────────────
+    N_SYNTHETIC_PER_CLASS = 1_500  # 3,000 synthetic images — fits in HF build timeout
     print(f"\n  [Phase 2] Generating {N_SYNTHETIC_PER_CLASS * 2} synthetic images...")
     all_images = []   # PIL Images
     )
     best_val_acc = 0.0
+    N_EPOCHS = 5  # 5 epochs fits within HF Spaces 30-min build timeout
     print("\n  Training ResNet-18...")
     for epoch in range(N_EPOCHS):