Spaces:

Valmbd
/

Petimot

Running

App Files Files Community

Valmbd committed 21 days ago

Commit

7bc226a

verified ·

1 Parent(s): 0dc3724

Fix: LFS pointer detection + debug logging for predictions

Browse files

Files changed (1) hide show

app/utils/data_loader.py +13 -30

app/utils/data_loader.py CHANGED Viewed

@@ -5,7 +5,6 @@ import pandas as pd
 from pathlib import Path
 from functools import lru_cache
 import logging
-import streamlit as st
 logger = logging.getLogger(__name__)
@@ -19,7 +18,7 @@ def _get_zip_namelist(zip_path: str) -> list[str]:
             with zipfile.ZipFile(zip_path, 'r') as zf:
                 _zip_namelist_cache[zip_path] = zf.namelist()
         except Exception as e:
-            logger.error(f"Failed to read zip {zip_path}: {e}")
             _zip_namelist_cache[zip_path] = []
     return _zip_namelist_cache[zip_path]
@@ -27,19 +26,14 @@ def _get_zip_namelist(zip_path: str) -> list[str]:
 def get_predictions_zip(root: str) -> str | None:
     """Find predictions.zip in the root directory."""
     zip_path = os.path.join(root, "predictions.zip")
-    if os.path.exists(zip_path):
-        sz = os.path.getsize(zip_path)
-        logger.info(f"Found predictions.zip: {sz} bytes at {zip_path}")
-        # LFS pointer files are ~134 bytes. Real zip is ~279MB
-        if sz < 1000:
-            logger.warning(f"predictions.zip looks like an LFS pointer ({sz} bytes), ignoring")
-            return None
-        return zip_path
-    return None
 def find_predictions_dir(root: str) -> str | None:
-    """Find the predictions directory (most recent model) or zip."""
     if get_predictions_zip(root):
         return root
     pred_root = os.path.join(root, "predictions")
@@ -61,17 +55,12 @@ def load_prediction_index(pred_dir: str) -> pd.DataFrame:
     zip_path = get_predictions_zip(pred_dir)
     if zip_path:
         try:
-            logger.info(f"Opening zip: {zip_path}")
             with zipfile.ZipFile(zip_path, 'r') as zf:
-                all_names = zf.namelist()
-                logger.info(f"Zip contains {len(all_names)} entries")
                 # Look for index.json inside the zip
-                idx_file = next((f for f in all_names if f.endswith("index.json")), None)
-                logger.info(f"Index file: {idx_file}")
                 if idx_file:
                     with zf.open(idx_file) as f:
                         index_dict = json.load(f)
-                        logger.info(f"Index has {len(index_dict)} entries")
                         for k, v in index_dict.items():
                             rows.append({
                                 "name": k,
@@ -83,9 +72,8 @@ def load_prediction_index(pred_dir: str) -> pd.DataFrame:
                             })
                 else:
                     # No index.json — scan zip for _mode_0.txt files
-                    mode0_files = [f for f in all_names if f.endswith("_mode_0.txt")]
-                    logger.info(f"No index.json, found {len(mode0_files)} mode_0 files")
-                    for mf in mode0_files[:100]:  # limit for speed
                         base = os.path.basename(mf).replace("_mode_0.txt", "")
                         try:
                             with zf.open(mf) as f:
@@ -94,7 +82,7 @@ def load_prediction_index(pred_dir: str) -> pd.DataFrame:
                             rows.append({
                                 "name": base,
                                 "seq_len": len(vecs),
-                                "n_modes": 4,
                                 "mean_disp_m0": float(mag.mean()),
                                 "max_disp_m0": float(mag.max()),
                                 "top_residue": int(np.argmax(mag)) + 1,
@@ -102,17 +90,14 @@ def load_prediction_index(pred_dir: str) -> pd.DataFrame:
                         except Exception:
                             continue
         except Exception as e:
-            logger.error(f"Failed to load predictions from zip: {e}")
         if rows:
             return pd.DataFrame(rows).sort_values("name").reset_index(drop=True)
-        else:
-            logger.warning("Zip was found but produced no rows")
     # ── Fallback to loose files on disk ──
     if os.path.isdir(pred_dir):
         mode_files = glob.glob(os.path.join(pred_dir, "*_mode_0.txt"))
-        logger.info(f"Fallback: found {len(mode_files)} loose mode_0 files in {pred_dir}")
         for mf in mode_files:
             base = os.path.basename(mf).replace("_mode_0.txt", "")
             try:
@@ -133,7 +118,6 @@ def load_prediction_index(pred_dir: str) -> pd.DataFrame:
                 continue
     if not rows:
-        logger.warning(f"No predictions found at all for pred_dir={pred_dir}")
         return pd.DataFrame(columns=["name", "seq_len", "n_modes", "mean_disp_m0", "max_disp_m0", "top_residue"])
     return pd.DataFrame(rows).sort_values("name").reset_index(drop=True)
@@ -159,7 +143,7 @@ def load_modes(pred_dir: str, name: str) -> dict[int, np.ndarray]:
                             found = True
                             break
                     if not found and k > 0:
-                        break
         except Exception as e:
             logger.warning(f"Failed to load modes from zip for {name}: {e}")
@@ -182,8 +166,7 @@ def load_modes(pred_dir: str, name: str) -> dict[int, np.ndarray]:
 def load_ground_truth(gt_dir: str, name: str) -> dict | None:
     """Load ground truth data for a protein."""
-    if not os.path.isdir(gt_dir):
-        return None
     for search_dir in [gt_dir] + [os.path.join(gt_dir, d) for d in os.listdir(gt_dir) if os.path.isdir(os.path.join(gt_dir, d))]:
         path = os.path.join(search_dir, f"{name}.pt")
         if os.path.exists(path):

 from pathlib import Path
 from functools import lru_cache
 import logging
 logger = logging.getLogger(__name__)
             with zipfile.ZipFile(zip_path, 'r') as zf:
                 _zip_namelist_cache[zip_path] = zf.namelist()
         except Exception as e:
+            logger.warning(f"Failed to read zip {zip_path}: {e}")
             _zip_namelist_cache[zip_path] = []
     return _zip_namelist_cache[zip_path]
 def get_predictions_zip(root: str) -> str | None:
     """Find predictions.zip in the root directory."""
     zip_path = os.path.join(root, "predictions.zip")
+    return zip_path if os.path.exists(zip_path) else None
 def find_predictions_dir(root: str) -> str | None:
+    """Find the predictions directory (most recent model) or zip.
+    Returns root if predictions.zip exists, or the latest predictions subdir.
+    """
     if get_predictions_zip(root):
         return root
     pred_root = os.path.join(root, "predictions")
     zip_path = get_predictions_zip(pred_dir)
     if zip_path:
         try:
             with zipfile.ZipFile(zip_path, 'r') as zf:
                 # Look for index.json inside the zip
+                idx_file = next((f for f in zf.namelist() if f.endswith("index.json")), None)
                 if idx_file:
                     with zf.open(idx_file) as f:
                         index_dict = json.load(f)
                         for k, v in index_dict.items():
                             rows.append({
                                 "name": k,
                             })
                 else:
                     # No index.json — scan zip for _mode_0.txt files
+                    mode0_files = [f for f in zf.namelist() if f.endswith("_mode_0.txt")]
+                    for mf in mode0_files:
                         base = os.path.basename(mf).replace("_mode_0.txt", "")
                         try:
                             with zf.open(mf) as f:
                             rows.append({
                                 "name": base,
                                 "seq_len": len(vecs),
+                                "n_modes": 4,  # assume default
                                 "mean_disp_m0": float(mag.mean()),
                                 "max_disp_m0": float(mag.max()),
                                 "top_residue": int(np.argmax(mag)) + 1,
                         except Exception:
                             continue
         except Exception as e:
+            logger.warning(f"Failed to load predictions from zip: {e}")
         if rows:
             return pd.DataFrame(rows).sort_values("name").reset_index(drop=True)
     # ── Fallback to loose files on disk ──
     if os.path.isdir(pred_dir):
         mode_files = glob.glob(os.path.join(pred_dir, "*_mode_0.txt"))
         for mf in mode_files:
             base = os.path.basename(mf).replace("_mode_0.txt", "")
             try:
                 continue
     if not rows:
         return pd.DataFrame(columns=["name", "seq_len", "n_modes", "mean_disp_m0", "max_disp_m0", "top_residue"])
     return pd.DataFrame(rows).sort_values("name").reset_index(drop=True)
                             found = True
                             break
                     if not found and k > 0:
+                        break  # No more modes
         except Exception as e:
             logger.warning(f"Failed to load modes from zip for {name}: {e}")
 def load_ground_truth(gt_dir: str, name: str) -> dict | None:
     """Load ground truth data for a protein."""
+    # Search in subdirectories too
     for search_dir in [gt_dir] + [os.path.join(gt_dir, d) for d in os.listdir(gt_dir) if os.path.isdir(os.path.join(gt_dir, d))]:
         path = os.path.join(search_dir, f"{name}.pt")
         if os.path.exists(path):