Commit
b25a5e5
·
1 Parent(s): 3c27eba

Bring Rami utility scripts and configs

Browse files
bioflow/api/deeppurpose_api.py CHANGED
@@ -18,7 +18,7 @@ sys.path.insert(0, ROOT_DIR)
18
 
19
  logger = logging.getLogger(__name__)
20
 
21
- router = APIRouter(prefix="/api/dp", tags=["deeppurpose"])
22
 
23
  # Global state for DeepPurpose model
24
  _dp_model = None
 
18
 
19
  logger = logging.getLogger(__name__)
20
 
21
+ router = APIRouter(prefix="/api", tags=["deeppurpose"])
22
 
23
  # Global state for DeepPurpose model
24
  _dp_model = None
scripts/deeppurpose002.py CHANGED
@@ -5,7 +5,8 @@ import time
5
  import argparse
6
  from datetime import datetime
7
  import matplotlib.pyplot as plt
8
- import torch
 
9
 
10
  import numpy as np
11
  import pandas as pd
@@ -120,11 +121,16 @@ def detect_cols(df):
120
  return drug_col, target_col, y_col
121
 
122
  def label_transform(y, mode, dataset_name):
 
 
 
 
 
 
123
  y = np.asarray(y, dtype=float)
124
 
125
- # Force auto to paffinity_nm for standard datasets
126
  if mode == "auto":
127
- if dataset_name.lower() in ["davis", "kiba"] or dataset_name.startswith("BindingDB"):
128
  mode = "paffinity_nm"
129
  else:
130
  mode = "none"
@@ -133,11 +139,7 @@ def label_transform(y, mode, dataset_name):
133
  return y, "none"
134
 
135
  if mode == "paffinity_nm":
136
- # SAFETY CHECK: Clip values to avoid log(0) or log(negative)
137
- y = np.where(y < 1e-9, 1e-9, y)
138
- # Convert nM to a p-scale affinity, i.e. -log10( Molar )
139
- # Value 100 nM = 100e-9 M = 1e-7 M -> -log10(1e-7) = 7.0
140
- # Formula: 9 - log10(nM)
141
  y = 9.0 - np.log10(y)
142
  return y, "paffinity_nm"
143
 
@@ -150,24 +152,8 @@ def make_run_dir(base_dir, dataset):
150
  os.makedirs(run_dir, exist_ok=True)
151
  return run_id, run_dir
152
 
153
- def check_gpu():
154
- """Check GPU availability and return CUDA ID for DeepPurpose."""
155
- if torch.cuda.is_available():
156
- device_name = torch.cuda.get_device_name(0)
157
- gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
158
- print(f"\n[SYSTEM] ✅ CUDA GPU Detected: {device_name}")
159
- print(f"[SYSTEM] Memory: {gpu_memory:.1f} GB")
160
- print(f"[SYSTEM] CUDA Version: {torch.version.cuda}")
161
- return 0 # Use GPU 0
162
- else:
163
- print("\n[SYSTEM] ⚠️ No GPU detected. Running on CPU (will be slow).")
164
- return -1 # Use CPU
165
-
166
 
167
  def main():
168
- # --- GPU DETECTION FIRST ---
169
- use_cuda = check_gpu()
170
-
171
  ap = argparse.ArgumentParser()
172
  ap.add_argument("--dataset", default="DAVIS", help="DAVIS | KIBA | BindingDB_Kd | BindingDB_Ki | BindingDB_IC50")
173
  ap.add_argument("--drug_enc", default="Morgan")
@@ -180,18 +166,12 @@ def main():
180
  ap.add_argument("--frac_train", type=float, default=0.8)
181
  ap.add_argument("--frac_val", type=float, default=0.1)
182
  ap.add_argument("--frac_test", type=float, default=0.1)
183
- ap.add_argument("--label_transform", default="paffinity_nm", help="Force log transform! (auto | none | paffinity_nm)")
184
  ap.add_argument("--harmonize", default="none", help="none | mean | max_affinity (utile surtout BindingDB_*)")
185
  ap.add_argument("--runs_dir", default="runs")
186
  ap.add_argument("--dry_run", action="store_true", help="Charge dataset + prints info, sans training")
187
- ap.add_argument("--gpu", type=int, default=None, help="Override GPU ID (default: auto-detect)")
188
  args = ap.parse_args()
189
 
190
- # Override GPU if specified
191
- if args.gpu is not None:
192
- use_cuda = args.gpu
193
- print(f"[SYSTEM] Using specified GPU: {use_cuda}")
194
-
195
  np.random.seed(args.seed)
196
 
197
  run_id, run_dir = make_run_dir(args.runs_dir, args.dataset)
@@ -291,7 +271,7 @@ def main():
291
  print(f"[ENC] drug_encoding={args.drug_enc} | target_encoding={args.target_enc}")
292
 
293
  # -------------------------
294
- # 4) MODEL INIT + TRAIN (WITH GPU CONFIG)
295
  # -------------------------
296
  log_section("[4] MODEL INIT + TRAIN")
297
  config = utils.generate_config(
@@ -301,21 +281,14 @@ def main():
301
  train_epoch=args.epochs,
302
  batch_size=args.batch,
303
  LR=args.lr,
304
- result_folder=run_dir,
305
- cuda_id=use_cuda # <<< GPU CONFIG HERE
306
  )
307
 
308
  print("[MODEL] config:")
309
  print(f" epochs={args.epochs} | batch={args.batch} | lr={args.lr}")
310
  print(f" hidden=[1024,1024,512] | result_dir={run_dir}")
311
- print(f" cuda_id={use_cuda} {'(GPU)' if use_cuda >= 0 else '(CPU)'}")
312
 
313
  model = dp_models.model_initialize(**config)
314
-
315
- # Verify device placement
316
- if use_cuda >= 0:
317
- device = next(model.model.parameters()).device
318
- print(f"[MODEL] ✓ Model loaded on device: {device}")
319
 
320
  t_train0 = time.time()
321
  try:
@@ -330,19 +303,8 @@ def main():
330
  # -------------------------
331
  log_section("[5] EVAL + EXPORT")
332
  print("[PREDICT] predicting on test...")
333
-
334
- # [FIX] Reset index to ensure alignment between DataFrame and Model Output
335
- test = test.reset_index(drop=True)
336
-
337
  y_true = np.asarray(test.Label.values, dtype=float).reshape(-1)
338
-
339
- # [DEBUG] Check what the model is actually predicting
340
- raw_pred = model.predict(test)
341
- y_pred = np.asarray(raw_pred, dtype=float).reshape(-1)
342
-
343
- # [DEBUG] Print first 5 comparisons to verify scaling matches
344
- print(f"[DEBUG] First 5 True: {y_true[:5]}")
345
- print(f"[DEBUG] First 5 Pred: {y_pred[:5]}")
346
 
347
  m_mse = mse(y_true, y_pred)
348
  m_rmse = float(math.sqrt(m_mse))
@@ -377,9 +339,6 @@ def main():
377
  "label_transform": used_transform,
378
  "harmonize": args.harmonize,
379
  "n_rows_after_clean": int(len(df)),
380
- "cuda_id": use_cuda,
381
- "gpu_used": use_cuda >= 0,
382
- "gpu_name": torch.cuda.get_device_name(0) if use_cuda >= 0 else "CPU",
383
  "metrics_test": {
384
  "mse": m_mse,
385
  "rmse": m_rmse,
@@ -400,56 +359,74 @@ def main():
400
  json.dump(summary, f, indent=2)
401
  print(f"[FILE] saved summary: {summary_path}")
402
 
403
- # -------------------------
404
- # 6) VISUALISATION
405
- # -------------------------
406
- log_section("[6] VISUALISATION")
407
-
408
- # Scatter plot
409
- scatter_png = os.path.join(run_dir, "scatter.png")
410
- plt.figure(figsize=(8, 6))
411
- plt.scatter(y_true, y_pred, s=8, alpha=0.5)
412
- plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', label='Perfect fit')
413
- plt.xlabel("y_true")
414
- plt.ylabel("y_pred")
415
- plt.title("Test: y_true vs y_pred")
416
- plt.legend()
417
- plt.tight_layout()
418
- plt.savefig(scatter_png, dpi=200)
419
- plt.close()
420
- print(f"[PLOT] saved: {scatter_png}")
421
-
422
- # Sorted curves
423
- curves_png = os.path.join(run_dir, "curves_sorted.png")
424
- order = np.argsort(y_true)
425
- plt.figure(figsize=(10, 6))
426
- plt.plot(y_true[order], label="y_true", alpha=0.7)
427
- plt.plot(y_pred[order], label="y_pred", alpha=0.7)
428
- plt.xlabel("samples (sorted by y_true)")
429
- plt.ylabel("value")
430
- plt.title("Test: curves (sorted)")
431
- plt.legend()
432
- plt.tight_layout()
433
- plt.savefig(curves_png, dpi=200)
434
- plt.close()
435
- print(f"[PLOT] saved: {curves_png}")
436
-
437
- # Residuals
438
- res_png = os.path.join(run_dir, "residuals.png")
439
- res = y_pred - y_true
440
- plt.figure(figsize=(8, 6))
441
- plt.scatter(y_true, res, s=8, alpha=0.5)
442
- plt.axhline(0, color='r', linestyle='--')
443
- plt.xlabel("y_true")
444
- plt.ylabel("y_pred - y_true")
445
- plt.title("Test: residuals")
446
- plt.tight_layout()
447
- plt.savefig(res_png, dpi=200)
448
- plt.close()
449
- print(f"[PLOT] saved: {res_png}")
450
-
451
  print("\n[DONE]")
452
 
453
 
454
  if __name__ == "__main__":
455
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import argparse
6
  from datetime import datetime
7
  import matplotlib.pyplot as plt
8
+ from datetime import datetime
9
+
10
 
11
  import numpy as np
12
  import pandas as pd
 
121
  return drug_col, target_col, y_col
122
 
123
  def label_transform(y, mode, dataset_name):
124
+ """
125
+ mode:
126
+ - none
127
+ - paffinity_nm : assumes y is in nM -> p = 9 - log10(nM)
128
+ - auto : BindingDB_* -> paffinity_nm, otherwise none
129
+ """
130
  y = np.asarray(y, dtype=float)
131
 
 
132
  if mode == "auto":
133
+ if dataset_name.startswith("BindingDB_"):
134
  mode = "paffinity_nm"
135
  else:
136
  mode = "none"
 
139
  return y, "none"
140
 
141
  if mode == "paffinity_nm":
142
+ y = np.where(y <= 0, np.nan, y)
 
 
 
 
143
  y = 9.0 - np.log10(y)
144
  return y, "paffinity_nm"
145
 
 
152
  os.makedirs(run_dir, exist_ok=True)
153
  return run_id, run_dir
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
  def main():
 
 
 
157
  ap = argparse.ArgumentParser()
158
  ap.add_argument("--dataset", default="DAVIS", help="DAVIS | KIBA | BindingDB_Kd | BindingDB_Ki | BindingDB_IC50")
159
  ap.add_argument("--drug_enc", default="Morgan")
 
166
  ap.add_argument("--frac_train", type=float, default=0.8)
167
  ap.add_argument("--frac_val", type=float, default=0.1)
168
  ap.add_argument("--frac_test", type=float, default=0.1)
169
+ ap.add_argument("--label_transform", default="auto", help="auto | none | paffinity_nm")
170
  ap.add_argument("--harmonize", default="none", help="none | mean | max_affinity (utile surtout BindingDB_*)")
171
  ap.add_argument("--runs_dir", default="runs")
172
  ap.add_argument("--dry_run", action="store_true", help="Charge dataset + prints info, sans training")
 
173
  args = ap.parse_args()
174
 
 
 
 
 
 
175
  np.random.seed(args.seed)
176
 
177
  run_id, run_dir = make_run_dir(args.runs_dir, args.dataset)
 
271
  print(f"[ENC] drug_encoding={args.drug_enc} | target_encoding={args.target_enc}")
272
 
273
  # -------------------------
274
+ # 4) MODEL INIT + TRAIN
275
  # -------------------------
276
  log_section("[4] MODEL INIT + TRAIN")
277
  config = utils.generate_config(
 
281
  train_epoch=args.epochs,
282
  batch_size=args.batch,
283
  LR=args.lr,
284
+ result_folder=run_dir
 
285
  )
286
 
287
  print("[MODEL] config:")
288
  print(f" epochs={args.epochs} | batch={args.batch} | lr={args.lr}")
289
  print(f" hidden=[1024,1024,512] | result_dir={run_dir}")
 
290
 
291
  model = dp_models.model_initialize(**config)
 
 
 
 
 
292
 
293
  t_train0 = time.time()
294
  try:
 
303
  # -------------------------
304
  log_section("[5] EVAL + EXPORT")
305
  print("[PREDICT] predicting on test...")
 
 
 
 
306
  y_true = np.asarray(test.Label.values, dtype=float).reshape(-1)
307
+ y_pred = np.asarray(model.predict(test), dtype=float).reshape(-1)
 
 
 
 
 
 
 
308
 
309
  m_mse = mse(y_true, y_pred)
310
  m_rmse = float(math.sqrt(m_mse))
 
339
  "label_transform": used_transform,
340
  "harmonize": args.harmonize,
341
  "n_rows_after_clean": int(len(df)),
 
 
 
342
  "metrics_test": {
343
  "mse": m_mse,
344
  "rmse": m_rmse,
 
359
  json.dump(summary, f, indent=2)
360
  print(f"[FILE] saved summary: {summary_path}")
361
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  print("\n[DONE]")
363
 
364
 
365
  if __name__ == "__main__":
366
+ main()
367
+ # =========================
368
+ # VISUALISATION + EXPORTS
369
+ # =========================
370
+
371
+ # 0) Output directory (stable)
372
+ BASE_DIR = os.path.dirname(__file__)
373
+ RUNS_DIR = os.path.join(BASE_DIR, "runs")
374
+ os.makedirs(RUNS_DIR, exist_ok=True)
375
+
376
+ # if a run_dir is already defined in your code it is reused; otherwise a new one is created
377
+ try:
378
+ run_dir
379
+ except NameError:
380
+ run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
381
+ run_dir = os.path.join(RUNS_DIR, run_id)
382
+ os.makedirs(run_dir, exist_ok=True)
383
+
384
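+ # 1) Fetch y_true / y_pred (no retraining). NOTE(review): test and model appear to be locals of main() - confirm they are in scope here, otherwise this raises NameError.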
385
+ y_true = np.asarray(test.Label.values, dtype=float).reshape(-1)
386
+ y_pred = np.asarray(model.predict(test), dtype=float).reshape(-1)
387
+
388
+ # 2) Save predictions to CSV
389
+ pred_csv = os.path.join(run_dir, "predictions_test.csv")
390
+ pd.DataFrame({"y_true": y_true, "y_pred": y_pred}).to_csv(pred_csv, index=False)
391
+ print("[FILE] saved predictions:", pred_csv)
392
+
393
+ # 3) Scatter: y_true vs y_pred
394
+ scatter_png = os.path.join(run_dir, "scatter.png")
395
+ plt.figure()
396
+ plt.scatter(y_true, y_pred, s=8)
397
+ plt.xlabel("y_true")
398
+ plt.ylabel("y_pred")
399
+ plt.title("Test: y_true vs y_pred")
400
+ plt.tight_layout()
401
+ plt.savefig(scatter_png, dpi=200)
402
+ plt.close()
403
+ print("[PLOT] saved:", scatter_png)
404
+
405
+ # 4) Sorted curves: y_true and y_pred (sorted by y_true)
406
+ curves_png = os.path.join(run_dir, "curves_sorted.png")
407
+ order = np.argsort(y_true)
408
+ plt.figure()
409
+ plt.plot(y_true[order], label="y_true")
410
+ plt.plot(y_pred[order], label="y_pred")
411
+ plt.xlabel("samples (sorted by y_true)")
412
+ plt.ylabel("value")
413
+ plt.title("Test: curves (sorted)")
414
+ plt.legend()
415
+ plt.tight_layout()
416
+ plt.savefig(curves_png, dpi=200)
417
+ plt.close()
418
+ print("[PLOT] saved:", curves_png)
419
+
420
+ # 5) Residuals: (y_pred - y_true) vs y_true
421
+ res_png = os.path.join(run_dir, "residuals.png")
422
+ res = y_pred - y_true
423
+ plt.figure()
424
+ plt.scatter(y_true, res, s=8)
425
+ plt.axhline(0)
426
+ plt.xlabel("y_true")
427
+ plt.ylabel("y_pred - y_true")
428
+ plt.title("Test: residuals")
429
+ plt.tight_layout()
430
+ plt.savefig(res_png, dpi=200)
431
+ plt.close()
432
+ print("[PLOT] saved:", res_png)