Valmbd committed on
Commit
539c642
·
verified ·
1 Parent(s): f1e15b2

Fix: robust zip reading + handle empty DataFrame

Browse files
Files changed (1) hide show
  1. app/utils/data_loader.py +124 -72
app/utils/data_loader.py CHANGED
@@ -1,9 +1,26 @@
1
  """Data loading utilities for pre-computed PETIMOT predictions."""
2
- import os, json, glob, torch
3
  import numpy as np
4
  import pandas as pd
5
  from pathlib import Path
6
  from functools import lru_cache
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
 
9
  def get_predictions_zip(root: str) -> str | None:
@@ -13,9 +30,12 @@ def get_predictions_zip(root: str) -> str | None:
13
 
14
 
15
  def find_predictions_dir(root: str) -> str | None:
16
- """Find the predictions directory (most recent model) or zip."""
 
 
 
17
  if get_predictions_zip(root):
18
- return root # Signal that we have a zip
19
  pred_root = os.path.join(root, "predictions")
20
  if not os.path.isdir(pred_root):
21
  return None
@@ -30,101 +50,133 @@ def find_predictions_dir(root: str) -> str | None:
30
  def load_prediction_index(pred_dir: str) -> pd.DataFrame:
31
  """Build index of all predicted proteins with metadata."""
32
  rows = []
 
 
33
  zip_path = get_predictions_zip(pred_dir)
34
-
35
  if zip_path:
36
- import zipfile
37
- with zipfile.ZipFile(zip_path, 'r') as zf:
38
- idx_file = next((f for f in zf.namelist() if f.endswith("index.json")), None)
39
- if idx_file:
40
- with zf.open(idx_file) as f:
41
- index_dict = json.load(f)
42
- for k, v in index_dict.items():
43
- rows.append({
44
- "name": k,
45
- "seq_len": v.get("seq_len", 0),
46
- "n_modes": v.get("n_modes", 0),
47
- "mean_disp_m0": v.get("mean_disp", 0.0),
48
- "max_disp_m0": v.get("max_disp", 0.0),
49
- "top_residue": -1,
50
- })
51
- return pd.DataFrame(rows).sort_values("name").reset_index(drop=True)
52
-
53
- # Fallback to loose files
54
- if not os.path.isdir(pred_dir):
55
- return pd.DataFrame()
56
-
57
- mode_files = glob.glob(os.path.join(pred_dir, "*_mode_0.txt"))
58
- for mf in mode_files:
59
- base = os.path.basename(mf).replace("_mode_0.txt", "")
60
  try:
61
- vecs = np.loadtxt(mf)
62
- n_res = len(vecs)
63
- mag = np.linalg.norm(vecs, axis=1)
64
-
65
- n_modes = 0
66
- for k in range(10):
67
- if os.path.exists(os.path.join(pred_dir, f"{base}_mode_{k}.txt")):
68
- n_modes += 1
 
 
 
 
 
 
 
69
  else:
70
- break
71
-
72
- rows.append({
73
- "name": base,
74
- "seq_len": n_res,
75
- "n_modes": n_modes,
76
- "mean_disp_m0": float(mag.mean()),
77
- "max_disp_m0": float(mag.max()),
78
- "top_residue": int(np.argmax(mag)) + 1,
79
- })
80
- except Exception:
81
- continue
82
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  return pd.DataFrame(rows).sort_values("name").reset_index(drop=True)
84
 
85
 
86
  def load_modes(pred_dir: str, name: str) -> dict[int, np.ndarray]:
87
  """Load all mode files for a protein."""
88
  modes = {}
 
 
89
  zip_path = get_predictions_zip(pred_dir)
90
-
91
  if zip_path:
92
- import zipfile
93
- with zipfile.ZipFile(zip_path, 'r') as zf:
94
- namelist = zf.namelist()
95
- for k in range(10):
96
- for pfx in [f"extracted_{name}", name]:
97
- suffix = f"{pfx}_mode_{k}.txt"
98
- # Fast check if any path ends with suffix
99
- matched = next((f for f in namelist if f.endswith(f"/{suffix}") or f == suffix), None)
100
- if matched:
101
- with zf.open(matched) as f:
102
- modes[k] = np.loadtxt(f)
103
- break
 
 
 
 
 
 
104
  if modes:
105
  return modes
106
 
107
- # Fallback for loose files
108
  for k in range(10):
 
109
  for pfx in [f"extracted_{name}", name]:
110
  mf = os.path.join(pred_dir, f"{pfx}_mode_{k}.txt")
111
  if os.path.exists(mf):
112
  modes[k] = np.loadtxt(mf)
 
113
  break
 
 
114
  return modes
115
 
116
 
117
  def load_ground_truth(gt_dir: str, name: str) -> dict | None:
118
  """Load ground truth data for a protein."""
119
- path = os.path.join(gt_dir, f"{name}.pt")
120
- if not os.path.exists(path):
121
- return None
122
- try:
123
- data = torch.load(path, map_location="cpu", weights_only=True)
124
- return {k: v.numpy() if isinstance(v, torch.Tensor) else v
125
- for k, v in data.items()}
126
- except Exception:
127
- return None
 
 
128
 
129
 
130
  def load_pdb_text(pdb_path: str) -> str | None:
 
1
  """Data loading utilities for pre-computed PETIMOT predictions."""
2
+ import os, json, glob, torch, zipfile, io
3
  import numpy as np
4
  import pandas as pd
5
  from pathlib import Path
6
  from functools import lru_cache
7
+ import logging
8
+
9
logger = logging.getLogger(__name__)

# ── Cache the zip namelist for fast lookups ──
# Maps a zip path to the list of member names read from it. Only successful
# reads are stored, so an archive that was missing or unreadable on the first
# attempt is retried on the next call.
_zip_namelist_cache: dict[str, list[str]] = {}


def _get_zip_namelist(zip_path: str) -> list[str]:
    """Return the member names of the archive at ``zip_path``.

    The first successful read is cached per path so later calls do not reopen
    the archive. A failed read is logged and yields an empty list, but is NOT
    cached: caching the failure would hide the archive for the rest of the
    process even after it becomes readable.

    Args:
        zip_path: Filesystem path of the zip archive.

    Returns:
        The archive's member names, or ``[]`` if it could not be read.
    """
    cached = _zip_namelist_cache.get(zip_path)
    if cached is not None:
        return cached

    try:
        with zipfile.ZipFile(zip_path, "r") as zf:
            names = zf.namelist()
    except Exception as e:  # best-effort: a broken archive must not crash the caller
        logger.warning("Failed to read zip %s: %s", zip_path, e)
        return []

    _zip_namelist_cache[zip_path] = names
    return names
24
 
25
 
26
  def get_predictions_zip(root: str) -> str | None:
 
30
 
31
 
32
  def find_predictions_dir(root: str) -> str | None:
33
+ """Find the predictions directory (most recent model) or zip.
34
+
35
+ Returns root if predictions.zip exists, or the latest predictions subdir.
36
+ """
37
  if get_predictions_zip(root):
38
+ return root
39
  pred_root = os.path.join(root, "predictions")
40
  if not os.path.isdir(pred_root):
41
  return None
 
50
# Column order of the index DataFrame; also used to build the empty frame.
_INDEX_COLUMNS = ["name", "seq_len", "n_modes", "mean_disp_m0", "max_disp_m0", "top_residue"]
_INDEX_MODE0_SUFFIX = "_mode_0.txt"
# Mode files are probed for k = 0 .. _INDEX_MAX_MODES - 1.
_INDEX_MAX_MODES = 10


def _index_row_from_mode0(base: str, vecs: np.ndarray, n_modes: int) -> dict | None:
    """Build one index row from a 2-D mode-0 array; return None if it is empty."""
    if vecs.size == 0:
        return None
    mag = np.linalg.norm(vecs, axis=1)
    return {
        "name": base,
        "seq_len": len(vecs),
        "n_modes": n_modes,
        "mean_disp_m0": float(mag.mean()),
        "max_disp_m0": float(mag.max()),
        "top_residue": int(np.argmax(mag)) + 1,  # 1-based row of the largest norm
    }


def _append_index_rows_from_zip(zip_path: str, rows: list[dict]) -> None:
    """Append index rows read from the archive at ``zip_path`` to ``rows``.

    Rows are appended in place so that rows collected before an error are
    kept by the caller.
    """
    with zipfile.ZipFile(zip_path, "r") as zf:
        names = zf.namelist()

        # Preferred source: a pre-computed index.json inside the archive.
        idx_file = next((n for n in names if n.endswith("index.json")), None)
        if idx_file:
            with zf.open(idx_file) as f:
                index_dict = json.load(f)
            for k, v in index_dict.items():
                rows.append({
                    "name": k,
                    "seq_len": v.get("seq_len", 0),
                    "n_modes": v.get("n_modes", 0),
                    "mean_disp_m0": v.get("mean_disp", 0.0),
                    "max_disp_m0": v.get("max_disp", 0.0),
                    "top_residue": -1,  # not computed on the index.json path
                })
            return

        # No index.json: derive the rows from the *_mode_0.txt members.
        members = set(names)
        for mf in names:
            if not mf.endswith(_INDEX_MODE0_SUFFIX):
                continue
            stem = mf[: -len(_INDEX_MODE0_SUFFIX)]
            try:
                with zf.open(mf) as f:
                    # ndmin=2 keeps a one-row file 2-D so norm(axis=1) works.
                    vecs = np.loadtxt(f, ndmin=2)
                # Count the sibling mode files actually present in the archive.
                n_modes = sum(
                    f"{stem}_mode_{k}.txt" in members for k in range(_INDEX_MAX_MODES)
                )
                row = _index_row_from_mode0(os.path.basename(stem), vecs, n_modes)
            except Exception as e:
                logger.warning("Skipping unreadable mode file %s in zip: %s", mf, e)
                continue
            if row is not None:
                rows.append(row)


def _append_index_rows_from_dir(pred_dir: str, rows: list[dict]) -> None:
    """Append index rows built from loose ``*_mode_0.txt`` files in ``pred_dir``."""
    for mf in glob.glob(os.path.join(pred_dir, f"*{_INDEX_MODE0_SUFFIX}")):
        base = os.path.basename(mf)[: -len(_INDEX_MODE0_SUFFIX)]
        try:
            vecs = np.loadtxt(mf, ndmin=2)
            n_modes = sum(
                os.path.exists(os.path.join(pred_dir, f"{base}_mode_{k}.txt"))
                for k in range(_INDEX_MAX_MODES)
            )
            row = _index_row_from_mode0(base, vecs, n_modes)
        except Exception as e:
            logger.warning("Skipping unreadable mode file %s: %s", mf, e)
            continue
        if row is not None:
            rows.append(row)


def load_prediction_index(pred_dir: str) -> pd.DataFrame:
    """Build index of all predicted proteins with metadata.

    Sources are tried in order:

    1. The archive reported by ``get_predictions_zip(pred_dir)``: its
       ``index.json`` if present, otherwise its ``*_mode_0.txt`` members.
    2. Loose ``*_mode_0.txt`` files directly inside ``pred_dir``.

    Args:
        pred_dir: Directory that holds the predictions archive and/or the
            loose mode files.

    Returns:
        A DataFrame with the columns ``name``, ``seq_len``, ``n_modes``,
        ``mean_disp_m0``, ``max_disp_m0`` and ``top_residue``, sorted by
        ``name``. When nothing is found the frame is empty but still has
        those columns.
    """
    rows: list[dict] = []

    # ── Try reading from predictions.zip ──
    zip_path = get_predictions_zip(pred_dir)
    if zip_path:
        try:
            _append_index_rows_from_zip(zip_path, rows)
        except Exception as e:
            logger.warning(f"Failed to load predictions from zip: {e}")

        if rows:
            return pd.DataFrame(rows).sort_values("name").reset_index(drop=True)

    # ── Fallback to loose files on disk ──
    if os.path.isdir(pred_dir):
        _append_index_rows_from_dir(pred_dir, rows)

    if not rows:
        # sort_values("name") would fail on a column-less frame.
        return pd.DataFrame(columns=_INDEX_COLUMNS)
    return pd.DataFrame(rows).sort_values("name").reset_index(drop=True)
123
 
124
 
125
# Mode files are probed for k = 0 .. _MAX_MODE_FILES - 1.
_MAX_MODE_FILES = 10


def _mode_file_names(name: str, k: int) -> tuple[str, str]:
    """Return the accepted file names for mode ``k``, in lookup priority order."""
    return (f"extracted_{name}_mode_{k}.txt", f"{name}_mode_{k}.txt")


def load_modes(pred_dir: str, name: str) -> dict[int, np.ndarray]:
    """Load all mode files for a protein.

    The archive reported by ``get_predictions_zip(pred_dir)`` is tried first;
    if it yields nothing, loose files inside ``pred_dir`` are used. For each
    mode index the ``extracted_``-prefixed file name takes priority over the
    plain one. Scanning stops at the first missing index after mode 0.

    A read or parse failure is logged and ends loading from that source; the
    modes loaded up to that point are kept.

    Args:
        pred_dir: Directory holding the predictions archive and/or loose files.
        name: Protein identifier used in the mode file names.

    Returns:
        Mapping from mode index to the array parsed by ``np.loadtxt``; empty
        when no mode file is found.
    """
    modes: dict[int, np.ndarray] = {}

    # ── Try from zip ──
    zip_path = get_predictions_zip(pred_dir)
    if zip_path:
        # Both accepted file names contain this marker, so one pass over the
        # full namelist replaces a rescan per (mode, prefix) pair.
        marker = f"{name}_mode_"
        relevant = [m for m in _get_zip_namelist(zip_path) if marker in m]

        members: dict[int, str] = {}
        for k in range(_MAX_MODE_FILES):
            hit = None
            for file_name in _mode_file_names(name, k):
                hit = next(
                    (m for m in relevant if m == file_name or m.endswith(f"/{file_name}")),
                    None,
                )
                if hit:
                    break
            if hit:
                members[k] = hit
            elif k > 0:
                break  # No more modes

        # Open the archive only when there is something to read from it.
        if members:
            try:
                with zipfile.ZipFile(zip_path, "r") as zf:
                    for k, member in members.items():
                        with zf.open(member) as f:
                            modes[k] = np.loadtxt(f)
            except Exception as e:
                logger.warning(f"Failed to load modes from zip for {name}: {e}")

        if modes:
            return modes

    # ── Fallback for loose files ──
    for k in range(_MAX_MODE_FILES):
        path = next(
            (
                p
                for p in (os.path.join(pred_dir, fn) for fn in _mode_file_names(name, k))
                if os.path.exists(p)
            ),
            None,
        )
        if path is None:
            if k > 0:
                break
            continue
        try:
            modes[k] = np.loadtxt(path)
        except (OSError, ValueError) as e:
            # Same policy as the zip path: log, keep what loaded, stop.
            logger.warning("Failed to load mode file %s for %s: %s", path, name, e)
            break
    return modes
165
 
166
 
167
  def load_ground_truth(gt_dir: str, name: str) -> dict | None:
168
  """Load ground truth data for a protein."""
169
+ # Search in subdirectories too
170
+ for search_dir in [gt_dir] + [os.path.join(gt_dir, d) for d in os.listdir(gt_dir) if os.path.isdir(os.path.join(gt_dir, d))]:
171
+ path = os.path.join(search_dir, f"{name}.pt")
172
+ if os.path.exists(path):
173
+ try:
174
+ data = torch.load(path, map_location="cpu", weights_only=True)
175
+ return {k: v.numpy() if isinstance(v, torch.Tensor) else v
176
+ for k, v in data.items()}
177
+ except Exception:
178
+ return None
179
+ return None
180
 
181
 
182
  def load_pdb_text(pdb_path: str) -> str | None: