Spaces:

Valmbd
/

Petimot

Running

App Files Files Community

Valmbd committed on Mar 24

Commit

539c642

verified ·

1 Parent(s): f1e15b2

Fix: robust zip reading + handle empty DataFrame

Browse files

Files changed (1) hide show

app/utils/data_loader.py +124 -72

app/utils/data_loader.py CHANGED Viewed

@@ -1,9 +1,26 @@
 """Data loading utilities for pre-computed PETIMOT predictions."""
-import os, json, glob, torch
 import numpy as np
 import pandas as pd
 from pathlib import Path
 from functools import lru_cache
 def get_predictions_zip(root: str) -> str | None:
@@ -13,9 +30,12 @@ def get_predictions_zip(root: str) -> str | None:
 def find_predictions_dir(root: str) -> str | None:
-    """Find the predictions directory (most recent model) or zip."""
     if get_predictions_zip(root):
-        return root  # Signal that we have a zip
     pred_root = os.path.join(root, "predictions")
     if not os.path.isdir(pred_root):
         return None
@@ -30,101 +50,133 @@ def find_predictions_dir(root: str) -> str | None:
 def load_prediction_index(pred_dir: str) -> pd.DataFrame:
     """Build index of all predicted proteins with metadata."""
     rows = []
     zip_path = get_predictions_zip(pred_dir)
     if zip_path:
-        import zipfile
-        with zipfile.ZipFile(zip_path, 'r') as zf:
-            idx_file = next((f for f in zf.namelist() if f.endswith("index.json")), None)
-            if idx_file:
-                with zf.open(idx_file) as f:
-                    index_dict = json.load(f)
-                    for k, v in index_dict.items():
-                        rows.append({
-                            "name": k,
-                            "seq_len": v.get("seq_len", 0),
-                            "n_modes": v.get("n_modes", 0),
-                            "mean_disp_m0": v.get("mean_disp", 0.0),
-                            "max_disp_m0": v.get("max_disp", 0.0),
-                            "top_residue": -1,
-                        })
-                    return pd.DataFrame(rows).sort_values("name").reset_index(drop=True)
-    # Fallback to loose files
-    if not os.path.isdir(pred_dir):
-        return pd.DataFrame()
-    mode_files = glob.glob(os.path.join(pred_dir, "*_mode_0.txt"))
-    for mf in mode_files:
-        base = os.path.basename(mf).replace("_mode_0.txt", "")
         try:
-            vecs = np.loadtxt(mf)
-            n_res = len(vecs)
-            mag = np.linalg.norm(vecs, axis=1)
-            n_modes = 0
-            for k in range(10):
-                if os.path.exists(os.path.join(pred_dir, f"{base}_mode_{k}.txt")):
-                    n_modes += 1
                 else:
-                    break
-            rows.append({
-                "name": base,
-                "seq_len": n_res,
-                "n_modes": n_modes,
-                "mean_disp_m0": float(mag.mean()),
-                "max_disp_m0": float(mag.max()),
-                "top_residue": int(np.argmax(mag)) + 1,
-            })
-        except Exception:
-            continue
     return pd.DataFrame(rows).sort_values("name").reset_index(drop=True)
 def load_modes(pred_dir: str, name: str) -> dict[int, np.ndarray]:
     """Load all mode files for a protein."""
     modes = {}
     zip_path = get_predictions_zip(pred_dir)
     if zip_path:
-        import zipfile
-        with zipfile.ZipFile(zip_path, 'r') as zf:
-            namelist = zf.namelist()
-            for k in range(10):
-                for pfx in [f"extracted_{name}", name]:
-                    suffix = f"{pfx}_mode_{k}.txt"
-                    # Fast check if any path ends with suffix
-                    matched = next((f for f in namelist if f.endswith(f"/{suffix}") or f == suffix), None)
-                    if matched:
-                        with zf.open(matched) as f:
-                            modes[k] = np.loadtxt(f)
-                        break
         if modes:
             return modes
-    # Fallback for loose files
     for k in range(10):
         for pfx in [f"extracted_{name}", name]:
             mf = os.path.join(pred_dir, f"{pfx}_mode_{k}.txt")
             if os.path.exists(mf):
                 modes[k] = np.loadtxt(mf)
                 break
     return modes
 def load_ground_truth(gt_dir: str, name: str) -> dict | None:
     """Load ground truth data for a protein."""
-    path = os.path.join(gt_dir, f"{name}.pt")
-    if not os.path.exists(path):
-        return None
-    try:
-        data = torch.load(path, map_location="cpu", weights_only=True)
-        return {k: v.numpy() if isinstance(v, torch.Tensor) else v
-                for k, v in data.items()}
-    except Exception:
-        return None
 def load_pdb_text(pdb_path: str) -> str | None:

 """Data loading utilities for pre-computed PETIMOT predictions."""
+import os, json, glob, torch, zipfile, io
 import numpy as np
 import pandas as pd
 from pathlib import Path
 from functools import lru_cache
+import logging
logger = logging.getLogger(__name__)

# ── Cache the zip namelist for fast lookups ──
# Maps zip path -> list of member names read from that archive.
_zip_namelist_cache = {}
# Maps zip path -> (mtime_ns, size) of the archive at the time its namelist
# was cached, so a replaced or rewritten archive is detected and re-read.
_zip_namelist_stamp = {}


def _get_zip_namelist(zip_path: str) -> list[str]:
    """Return the member names of the archive at ``zip_path``.

    The namelist is cached per path and reused for as long as the file's
    modification time and size are unchanged, which avoids reopening the
    archive on every call.

    Failures are logged and reported as an empty list, but they are never
    cached: a missing or corrupt archive is retried on the next call, so an
    archive that is created or repaired later is picked up.

    Args:
        zip_path: Filesystem path of the zip archive.

    Returns:
        The archive's member names, or ``[]`` if it could not be read.
    """
    try:
        st = os.stat(zip_path)
        stamp = (st.st_mtime_ns, st.st_size)
        cached = _zip_namelist_cache.get(zip_path)
        if cached is not None and _zip_namelist_stamp.get(zip_path) == stamp:
            return cached
        with zipfile.ZipFile(zip_path, "r") as zf:
            names = zf.namelist()
    except Exception as e:
        # Deliberately broad: this lookup is best-effort and must not raise.
        logger.warning("Failed to read zip %s: %s", zip_path, e)
        # Drop any stale entry instead of remembering the failure.
        _zip_namelist_cache.pop(zip_path, None)
        _zip_namelist_stamp.pop(zip_path, None)
        return []
    _zip_namelist_cache[zip_path] = names
    _zip_namelist_stamp[zip_path] = stamp
    return names
 def get_predictions_zip(root: str) -> str | None:
 def find_predictions_dir(root: str) -> str | None:
+    """Find the predictions directory (most recent model) or zip.
+    Returns root if predictions.zip exists, or the latest predictions subdir.
+    """
     if get_predictions_zip(root):
+        return root
     pred_root = os.path.join(root, "predictions")
     if not os.path.isdir(pred_root):
         return None
 def load_prediction_index(pred_dir: str) -> pd.DataFrame:
     """Build index of all predicted proteins with metadata."""
     rows = []
+    # ── Try reading from predictions.zip ──
     zip_path = get_predictions_zip(pred_dir)
     if zip_path:
         try:
+            with zipfile.ZipFile(zip_path, 'r') as zf:
+                # Look for index.json inside the zip
+                idx_file = next((f for f in zf.namelist() if f.endswith("index.json")), None)
+                if idx_file:
+                    with zf.open(idx_file) as f:
+                        index_dict = json.load(f)
+                        for k, v in index_dict.items():
+                            rows.append({
+                                "name": k,
+                                "seq_len": v.get("seq_len", 0),
+                                "n_modes": v.get("n_modes", 0),
+                                "mean_disp_m0": v.get("mean_disp", 0.0),
+                                "max_disp_m0": v.get("max_disp", 0.0),
+                                "top_residue": -1,
+                            })
                 else:
+                    # No index.json — scan zip for _mode_0.txt files
+                    mode0_files = [f for f in zf.namelist() if f.endswith("_mode_0.txt")]
+                    for mf in mode0_files:
+                        base = os.path.basename(mf).replace("_mode_0.txt", "")
+                        try:
+                            with zf.open(mf) as f:
+                                vecs = np.loadtxt(f)
+                            mag = np.linalg.norm(vecs, axis=1)
+                            rows.append({
+                                "name": base,
+                                "seq_len": len(vecs),
+                                "n_modes": 4,  # assume default
+                                "mean_disp_m0": float(mag.mean()),
+                                "max_disp_m0": float(mag.max()),
+                                "top_residue": int(np.argmax(mag)) + 1,
+                            })
+                        except Exception:
+                            continue
+        except Exception as e:
+            logger.warning(f"Failed to load predictions from zip: {e}")
+        if rows:
+            return pd.DataFrame(rows).sort_values("name").reset_index(drop=True)
+    # ── Fallback to loose files on disk ──
+    if os.path.isdir(pred_dir):
+        mode_files = glob.glob(os.path.join(pred_dir, "*_mode_0.txt"))
+        for mf in mode_files:
+            base = os.path.basename(mf).replace("_mode_0.txt", "")
+            try:
+                vecs = np.loadtxt(mf)
+                n_res = len(vecs)
+                mag = np.linalg.norm(vecs, axis=1)
+                n_modes = sum(1 for k in range(10)
+                              if os.path.exists(os.path.join(pred_dir, f"{base}_mode_{k}.txt")))
+                rows.append({
+                    "name": base,
+                    "seq_len": n_res,
+                    "n_modes": n_modes,
+                    "mean_disp_m0": float(mag.mean()),
+                    "max_disp_m0": float(mag.max()),
+                    "top_residue": int(np.argmax(mag)) + 1,
+                })
+            except Exception:
+                continue
+    if not rows:
+        return pd.DataFrame(columns=["name", "seq_len", "n_modes", "mean_disp_m0", "max_disp_m0", "top_residue"])
     return pd.DataFrame(rows).sort_values("name").reset_index(drop=True)
 def load_modes(pred_dir: str, name: str) -> dict[int, np.ndarray]:
     """Load all mode files for a protein."""
     modes = {}
+    # ── Try from zip ──
     zip_path = get_predictions_zip(pred_dir)
     if zip_path:
+        namelist = _get_zip_namelist(zip_path)
+        try:
+            with zipfile.ZipFile(zip_path, 'r') as zf:
+                for k in range(10):
+                    found = False
+                    for pfx in [f"extracted_{name}", name]:
+                        suffix = f"{pfx}_mode_{k}.txt"
+                        matched = next((f for f in namelist if f.endswith(f"/{suffix}") or f == suffix), None)
+                        if matched:
+                            with zf.open(matched) as f:
+                                modes[k] = np.loadtxt(f)
+                            found = True
+                            break
+                    if not found and k > 0:
+                        break  # No more modes
+        except Exception as e:
+            logger.warning(f"Failed to load modes from zip for {name}: {e}")
         if modes:
             return modes
+    # ── Fallback for loose files ──
     for k in range(10):
+        found = False
         for pfx in [f"extracted_{name}", name]:
             mf = os.path.join(pred_dir, f"{pfx}_mode_{k}.txt")
             if os.path.exists(mf):
                 modes[k] = np.loadtxt(mf)
+                found = True
                 break
+        if not found and k > 0:
+            break
     return modes
 def load_ground_truth(gt_dir: str, name: str) -> dict | None:
     """Load ground truth data for a protein."""
+    # Search in subdirectories too
+    for search_dir in [gt_dir] + [os.path.join(gt_dir, d) for d in os.listdir(gt_dir) if os.path.isdir(os.path.join(gt_dir, d))]:
+        path = os.path.join(search_dir, f"{name}.pt")
+        if os.path.exists(path):
+            try:
+                data = torch.load(path, map_location="cpu", weights_only=True)
+                return {k: v.numpy() if isinstance(v, torch.Tensor) else v
+                        for k, v in data.items()}
+            except Exception:
+                return None
+    return None
 def load_pdb_text(pdb_path: str) -> str | None: