Valmbd committed on
Commit
7bc226a
·
verified ·
1 Parent(s): 0dc3724

Fix: LFS pointer detection + debug logging for predictions

Browse files
Files changed (1) hide show
  1. app/utils/data_loader.py +13 -30
app/utils/data_loader.py CHANGED
@@ -5,7 +5,6 @@ import pandas as pd
5
  from pathlib import Path
6
  from functools import lru_cache
7
  import logging
8
- import streamlit as st
9
 
10
  logger = logging.getLogger(__name__)
11
 
@@ -19,7 +18,7 @@ def _get_zip_namelist(zip_path: str) -> list[str]:
19
  with zipfile.ZipFile(zip_path, 'r') as zf:
20
  _zip_namelist_cache[zip_path] = zf.namelist()
21
  except Exception as e:
22
- logger.error(f"Failed to read zip {zip_path}: {e}")
23
  _zip_namelist_cache[zip_path] = []
24
  return _zip_namelist_cache[zip_path]
25
 
@@ -27,19 +26,14 @@ def _get_zip_namelist(zip_path: str) -> list[str]:
27
  def get_predictions_zip(root: str) -> str | None:
28
  """Find predictions.zip in the root directory."""
29
  zip_path = os.path.join(root, "predictions.zip")
30
- if os.path.exists(zip_path):
31
- sz = os.path.getsize(zip_path)
32
- logger.info(f"Found predictions.zip: {sz} bytes at {zip_path}")
33
- # LFS pointer files are ~134 bytes. Real zip is ~279MB
34
- if sz < 1000:
35
- logger.warning(f"predictions.zip looks like an LFS pointer ({sz} bytes), ignoring")
36
- return None
37
- return zip_path
38
- return None
39
 
40
 
41
  def find_predictions_dir(root: str) -> str | None:
42
- """Find the predictions directory (most recent model) or zip."""
 
 
 
43
  if get_predictions_zip(root):
44
  return root
45
  pred_root = os.path.join(root, "predictions")
@@ -61,17 +55,12 @@ def load_prediction_index(pred_dir: str) -> pd.DataFrame:
61
  zip_path = get_predictions_zip(pred_dir)
62
  if zip_path:
63
  try:
64
- logger.info(f"Opening zip: {zip_path}")
65
  with zipfile.ZipFile(zip_path, 'r') as zf:
66
- all_names = zf.namelist()
67
- logger.info(f"Zip contains {len(all_names)} entries")
68
  # Look for index.json inside the zip
69
- idx_file = next((f for f in all_names if f.endswith("index.json")), None)
70
- logger.info(f"Index file: {idx_file}")
71
  if idx_file:
72
  with zf.open(idx_file) as f:
73
  index_dict = json.load(f)
74
- logger.info(f"Index has {len(index_dict)} entries")
75
  for k, v in index_dict.items():
76
  rows.append({
77
  "name": k,
@@ -83,9 +72,8 @@ def load_prediction_index(pred_dir: str) -> pd.DataFrame:
83
  })
84
  else:
85
  # No index.json — scan zip for _mode_0.txt files
86
- mode0_files = [f for f in all_names if f.endswith("_mode_0.txt")]
87
- logger.info(f"No index.json, found {len(mode0_files)} mode_0 files")
88
- for mf in mode0_files[:100]: # limit for speed
89
  base = os.path.basename(mf).replace("_mode_0.txt", "")
90
  try:
91
  with zf.open(mf) as f:
@@ -94,7 +82,7 @@ def load_prediction_index(pred_dir: str) -> pd.DataFrame:
94
  rows.append({
95
  "name": base,
96
  "seq_len": len(vecs),
97
- "n_modes": 4,
98
  "mean_disp_m0": float(mag.mean()),
99
  "max_disp_m0": float(mag.max()),
100
  "top_residue": int(np.argmax(mag)) + 1,
@@ -102,17 +90,14 @@ def load_prediction_index(pred_dir: str) -> pd.DataFrame:
102
  except Exception:
103
  continue
104
  except Exception as e:
105
- logger.error(f"Failed to load predictions from zip: {e}")
106
 
107
  if rows:
108
  return pd.DataFrame(rows).sort_values("name").reset_index(drop=True)
109
- else:
110
- logger.warning("Zip was found but produced no rows")
111
 
112
  # ── Fallback to loose files on disk ──
113
  if os.path.isdir(pred_dir):
114
  mode_files = glob.glob(os.path.join(pred_dir, "*_mode_0.txt"))
115
- logger.info(f"Fallback: found {len(mode_files)} loose mode_0 files in {pred_dir}")
116
  for mf in mode_files:
117
  base = os.path.basename(mf).replace("_mode_0.txt", "")
118
  try:
@@ -133,7 +118,6 @@ def load_prediction_index(pred_dir: str) -> pd.DataFrame:
133
  continue
134
 
135
  if not rows:
136
- logger.warning(f"No predictions found at all for pred_dir={pred_dir}")
137
  return pd.DataFrame(columns=["name", "seq_len", "n_modes", "mean_disp_m0", "max_disp_m0", "top_residue"])
138
  return pd.DataFrame(rows).sort_values("name").reset_index(drop=True)
139
 
@@ -159,7 +143,7 @@ def load_modes(pred_dir: str, name: str) -> dict[int, np.ndarray]:
159
  found = True
160
  break
161
  if not found and k > 0:
162
- break
163
  except Exception as e:
164
  logger.warning(f"Failed to load modes from zip for {name}: {e}")
165
 
@@ -182,8 +166,7 @@ def load_modes(pred_dir: str, name: str) -> dict[int, np.ndarray]:
182
 
183
  def load_ground_truth(gt_dir: str, name: str) -> dict | None:
184
  """Load ground truth data for a protein."""
185
- if not os.path.isdir(gt_dir):
186
- return None
187
  for search_dir in [gt_dir] + [os.path.join(gt_dir, d) for d in os.listdir(gt_dir) if os.path.isdir(os.path.join(gt_dir, d))]:
188
  path = os.path.join(search_dir, f"{name}.pt")
189
  if os.path.exists(path):
 
5
  from pathlib import Path
6
  from functools import lru_cache
7
  import logging
 
8
 
9
  logger = logging.getLogger(__name__)
10
 
 
18
  with zipfile.ZipFile(zip_path, 'r') as zf:
19
  _zip_namelist_cache[zip_path] = zf.namelist()
20
  except Exception as e:
21
+ logger.warning(f"Failed to read zip {zip_path}: {e}")
22
  _zip_namelist_cache[zip_path] = []
23
  return _zip_namelist_cache[zip_path]
24
 
 
26
  def get_predictions_zip(root: str) -> str | None:
27
  """Find predictions.zip in the root directory."""
28
  zip_path = os.path.join(root, "predictions.zip")
29
+ return zip_path if os.path.exists(zip_path) else None
 
 
 
 
 
 
 
 
30
 
31
 
32
  def find_predictions_dir(root: str) -> str | None:
33
+ """Find the predictions directory (most recent model) or zip.
34
+
35
+ Returns root if predictions.zip exists, or the latest predictions subdir.
36
+ """
37
  if get_predictions_zip(root):
38
  return root
39
  pred_root = os.path.join(root, "predictions")
 
55
  zip_path = get_predictions_zip(pred_dir)
56
  if zip_path:
57
  try:
 
58
  with zipfile.ZipFile(zip_path, 'r') as zf:
 
 
59
  # Look for index.json inside the zip
60
+ idx_file = next((f for f in zf.namelist() if f.endswith("index.json")), None)
 
61
  if idx_file:
62
  with zf.open(idx_file) as f:
63
  index_dict = json.load(f)
 
64
  for k, v in index_dict.items():
65
  rows.append({
66
  "name": k,
 
72
  })
73
  else:
74
  # No index.json — scan zip for _mode_0.txt files
75
+ mode0_files = [f for f in zf.namelist() if f.endswith("_mode_0.txt")]
76
+ for mf in mode0_files:
 
77
  base = os.path.basename(mf).replace("_mode_0.txt", "")
78
  try:
79
  with zf.open(mf) as f:
 
82
  rows.append({
83
  "name": base,
84
  "seq_len": len(vecs),
85
+ "n_modes": 4, # assume default
86
  "mean_disp_m0": float(mag.mean()),
87
  "max_disp_m0": float(mag.max()),
88
  "top_residue": int(np.argmax(mag)) + 1,
 
90
  except Exception:
91
  continue
92
  except Exception as e:
93
+ logger.warning(f"Failed to load predictions from zip: {e}")
94
 
95
  if rows:
96
  return pd.DataFrame(rows).sort_values("name").reset_index(drop=True)
 
 
97
 
98
  # ── Fallback to loose files on disk ──
99
  if os.path.isdir(pred_dir):
100
  mode_files = glob.glob(os.path.join(pred_dir, "*_mode_0.txt"))
 
101
  for mf in mode_files:
102
  base = os.path.basename(mf).replace("_mode_0.txt", "")
103
  try:
 
118
  continue
119
 
120
  if not rows:
 
121
  return pd.DataFrame(columns=["name", "seq_len", "n_modes", "mean_disp_m0", "max_disp_m0", "top_residue"])
122
  return pd.DataFrame(rows).sort_values("name").reset_index(drop=True)
123
 
 
143
  found = True
144
  break
145
  if not found and k > 0:
146
+ break # No more modes
147
  except Exception as e:
148
  logger.warning(f"Failed to load modes from zip for {name}: {e}")
149
 
 
166
 
167
  def load_ground_truth(gt_dir: str, name: str) -> dict | None:
168
  """Load ground truth data for a protein."""
169
+ # Search in subdirectories too
 
170
  for search_dir in [gt_dir] + [os.path.join(gt_dir, d) for d in os.listdir(gt_dir) if os.path.isdir(os.path.join(gt_dir, d))]:
171
  path = os.path.join(search_dir, f"{name}.pt")
172
  if os.path.exists(path):