Harsh Yadav committed on
Commit
ba6d7cd
·
1 Parent(s): 2a34453

fix: remove XGBoost use_label_encoder (removed in v2+), reduce CNN to 3k imgs/5 epochs for HF timeout, pre-download ResNet18

Browse files
Files changed (2) hide show
  1. Dockerfile +11 -0
  2. app/models/train_all.py +3 -3
Dockerfile CHANGED
@@ -60,11 +60,22 @@ tamp = load_tampered_images(n_max=150); \
60
  print(f'Cached {len(auth)} authentic + {len(tamp)} tampered images'); \
61
  "
62
 
 
 
 
 
 
 
 
 
 
 
63
  # ─────────────────────────────────────────────────────────────────────────────
64
  # BUILD STEP 4: Train all models (uses cached data — no network calls)
65
  # ─────────────────────────────────────────────────────────────────────────────
66
  RUN python -m app.models.train_all
67
 
 
68
  # ─────────────────────────────────────────────────────────────────────────────
69
  # BUILD STEP 5: Verify all required model files exist — fail build if missing
70
  # ─────────────────────────────────────────────────────────────────────────────
 
60
  print(f'Cached {len(auth)} authentic + {len(tamp)} tampered images'); \
61
  "
62
 
63
+ # ─────────────────────────────────────────────────────────────────────────────
64
+ # BUILD STEP 3.5: Pre-download ResNet18 weights
65
+ # ─────────────────────────────────────────────────────────────────────────────
66
+ RUN python -c "\
67
+ import torchvision.models as tv_models; \
68
+ print('Downloading ResNet18 weights...'); \
69
+ tv_models.resnet18(weights=tv_models.ResNet18_Weights.DEFAULT); \
70
+ print('ResNet18 weights downloaded.') \
71
+ "
72
+
73
  # ─────────────────────────────────────────────────────────────────────────────
74
  # BUILD STEP 4: Train all models (uses cached data — no network calls)
75
  # ─────────────────────────────────────────────────────────────────────────────
76
  RUN python -m app.models.train_all
77
 
78
+
79
  # ─────────────────────────────────────────────────────────────────────────────
80
  # BUILD STEP 5: Verify all required model files exist — fail build if missing
81
  # ─────────────────────────────────────────────────────────────────────────────
app/models/train_all.py CHANGED
@@ -94,7 +94,7 @@ def train_fraud_model(df: pd.DataFrame) -> None:
94
  print(" Training XGBClassifier...")
95
  xgb_model = xgb.XGBClassifier(
96
  n_estimators=200, max_depth=6, learning_rate=0.1,
97
- use_label_encoder=False, eval_metric="mlogloss",
98
  random_state=42, verbosity=0,
99
  )
100
  xgb_model.fit(X_train, y_train)
@@ -208,7 +208,7 @@ def train_image_model() -> None:
208
  print(f" Created {len(tampered_from_real)} tampered versions of real certs")
209
 
210
  # ── Step 2: Generate synthetic PIL images to fill volume ──────────────────
211
- N_SYNTHETIC_PER_CLASS = 2_500 # 5,000 synthetic images
212
  print(f"\n [Phase 2] Generating {N_SYNTHETIC_PER_CLASS * 2} synthetic images...")
213
 
214
  all_images = [] # PIL Images
@@ -343,7 +343,7 @@ def train_image_model() -> None:
343
  )
344
 
345
  best_val_acc = 0.0
346
- N_EPOCHS = 10 # more epochs for hybrid dataset
347
 
348
  print("\n Training ResNet-18...")
349
  for epoch in range(N_EPOCHS):
 
94
  print(" Training XGBClassifier...")
95
  xgb_model = xgb.XGBClassifier(
96
  n_estimators=200, max_depth=6, learning_rate=0.1,
97
+ eval_metric="mlogloss",
98
  random_state=42, verbosity=0,
99
  )
100
  xgb_model.fit(X_train, y_train)
 
208
  print(f" Created {len(tampered_from_real)} tampered versions of real certs")
209
 
210
  # ── Step 2: Generate synthetic PIL images to fill volume ──────────────────
211
+ N_SYNTHETIC_PER_CLASS = 1_500 # 3,000 synthetic images — fits in HF build timeout
212
  print(f"\n [Phase 2] Generating {N_SYNTHETIC_PER_CLASS * 2} synthetic images...")
213
 
214
  all_images = [] # PIL Images
 
343
  )
344
 
345
  best_val_acc = 0.0
346
+ N_EPOCHS = 5 # 5 epochs fits within HF Spaces 30-min build timeout
347
 
348
  print("\n Training ResNet-18...")
349
  for epoch in range(N_EPOCHS):