Valmbd committed on
Commit
c8ad6f1
·
verified ·
1 Parent(s): 8bdea01

Update data_loader to read directly from zip

Browse files
Files changed (1) hide show
  1. app/utils/data_loader.py +52 -4
app/utils/data_loader.py CHANGED
@@ -6,8 +6,16 @@ from pathlib import Path
6
  from functools import lru_cache
7
 
8
 
 
 
 
 
 
 
9
  def find_predictions_dir(root: str) -> str | None:
10
- """Find the predictions directory (most recent model)."""
 
 
11
  pred_root = os.path.join(root, "predictions")
12
  if not os.path.isdir(pred_root):
13
  return None
@@ -22,17 +30,38 @@ def find_predictions_dir(root: str) -> str | None:
22
  def load_prediction_index(pred_dir: str) -> pd.DataFrame:
23
  """Build index of all predicted proteins with metadata."""
24
  rows = []
25
- mode_files = glob.glob(os.path.join(pred_dir, "*_mode_0.txt"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
 
 
 
 
 
27
  for mf in mode_files:
28
  base = os.path.basename(mf).replace("_mode_0.txt", "")
29
- # Load mode 0 for stats
30
  try:
31
  vecs = np.loadtxt(mf)
32
  n_res = len(vecs)
33
  mag = np.linalg.norm(vecs, axis=1)
34
 
35
- # Count available modes
36
  n_modes = 0
37
  for k in range(10):
38
  if os.path.exists(os.path.join(pred_dir, f"{base}_mode_{k}.txt")):
@@ -57,6 +86,25 @@ def load_prediction_index(pred_dir: str) -> pd.DataFrame:
57
  def load_modes(pred_dir: str, name: str) -> dict[int, np.ndarray]:
58
  """Load all mode files for a protein."""
59
  modes = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  for k in range(10):
61
  for pfx in [f"extracted_{name}", name]:
62
  mf = os.path.join(pred_dir, f"{pfx}_mode_{k}.txt")
 
6
  from functools import lru_cache
7
 
8
 
9
+ def get_predictions_zip(root: str) -> str | None:
10
+ """Find predictions.zip in the root directory."""
11
+ zip_path = os.path.join(root, "predictions.zip")
12
+ return zip_path if os.path.exists(zip_path) else None
13
+
14
+
15
  def find_predictions_dir(root: str) -> str | None:
16
+ """Find the predictions directory (most recent model) or zip."""
17
+ if get_predictions_zip(root):
18
+ return root # Signal that we have a zip
19
  pred_root = os.path.join(root, "predictions")
20
  if not os.path.isdir(pred_root):
21
  return None
 
30
  def load_prediction_index(pred_dir: str) -> pd.DataFrame:
31
  """Build index of all predicted proteins with metadata."""
32
  rows = []
33
+ zip_path = get_predictions_zip(pred_dir)
34
+
35
+ if zip_path:
36
+ import zipfile
37
+ with zipfile.ZipFile(zip_path, 'r') as zf:
38
+ idx_file = next((f for f in zf.namelist() if f.endswith("index.json")), None)
39
+ if idx_file:
40
+ with zf.open(idx_file) as f:
41
+ index_dict = json.load(f)
42
+ for k, v in index_dict.items():
43
+ rows.append({
44
+ "name": k,
45
+ "seq_len": v.get("seq_len", 0),
46
+ "n_modes": v.get("n_modes", 0),
47
+ "mean_disp_m0": v.get("mean_disp", 0.0),
48
+ "max_disp_m0": v.get("max_disp", 0.0),
49
+ "top_residue": -1,
50
+ })
51
+ return pd.DataFrame(rows).sort_values("name").reset_index(drop=True)
52
 
53
+ # Fallback to loose files
54
+ if not os.path.isdir(pred_dir):
55
+ return pd.DataFrame()
56
+
57
+ mode_files = glob.glob(os.path.join(pred_dir, "*_mode_0.txt"))
58
  for mf in mode_files:
59
  base = os.path.basename(mf).replace("_mode_0.txt", "")
 
60
  try:
61
  vecs = np.loadtxt(mf)
62
  n_res = len(vecs)
63
  mag = np.linalg.norm(vecs, axis=1)
64
 
 
65
  n_modes = 0
66
  for k in range(10):
67
  if os.path.exists(os.path.join(pred_dir, f"{base}_mode_{k}.txt")):
 
86
  def load_modes(pred_dir: str, name: str) -> dict[int, np.ndarray]:
87
  """Load all mode files for a protein."""
88
  modes = {}
89
+ zip_path = get_predictions_zip(pred_dir)
90
+
91
+ if zip_path:
92
+ import zipfile
93
+ with zipfile.ZipFile(zip_path, 'r') as zf:
94
+ namelist = zf.namelist()
95
+ for k in range(10):
96
+ for pfx in [f"extracted_{name}", name]:
97
+ suffix = f"{pfx}_mode_{k}.txt"
98
+ # Fast check if any path ends with suffix
99
+ matched = next((f for f in namelist if f.endswith(f"/{suffix}") or f == suffix), None)
100
+ if matched:
101
+ with zf.open(matched) as f:
102
+ modes[k] = np.loadtxt(f)
103
+ break
104
+ if modes:
105
+ return modes
106
+
107
+ # Fallback for loose files
108
  for k in range(10):
109
  for pfx in [f"extracted_{name}", name]:
110
  mf = os.path.join(pred_dir, f"{pfx}_mode_{k}.txt")