Spaces:

wi-lab
/

dataset-distancing-lab

Running

App Files Files Community

wi-lab committed on Oct 18, 2025

Commit

f439c65

1 Parent(s): 7589a7e

Update embed_lwm.py

Browse files

Files changed (1) hide show

embed_lwm.py +108 -115

embed_lwm.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import os
 import sys
 from typing import List, Optional, Tuple
@@ -7,110 +8,63 @@ import torch
 def _log(msg: str) -> None:
     """Print ``msg`` to standard output and flush the stream immediately."""
     print(msg, flush=True)
def _candidate_repo_dirs():
    """Return the directories that may hold the LWM checkout.

    The first entry is the ``LWM_REPO_DIR`` environment variable with
    surrounding whitespace removed (an empty string when it is unset); the
    remaining two entries are fixed fallback locations.
    """
    env_override = os.environ.get("LWM_REPO_DIR", "")
    search_dirs = [env_override.strip()]
    search_dirs.extend(("./LWM-v1.1", "/home/user/app/LWM-v1.1"))
    return search_dirs
def _ensure_repo_on_path() -> Optional[str]:
    """Put the first existing candidate repo directory on ``sys.path``.

    Returns:
        The directory that was selected, or ``None`` when no non-empty
        candidate from ``_candidate_repo_dirs()`` is an existing directory.
        The directory is inserted at the front of ``sys.path`` unless it is
        already listed there.
    """
    repo_dir = next(
        (path for path in _candidate_repo_dirs() if path and os.path.isdir(path)),
        None,
    )
    if repo_dir is None:
        return None
    if repo_dir not in sys.path:
        sys.path.insert(0, repo_dir)
    return repo_dir
def _ensure_pretrained_model_shim(repo_dir: str) -> None:
    """Create ``pretrained_model.py`` in ``repo_dir`` when it is missing.

    Code that does ``from pretrained_model import lwm`` needs that module to
    exist. If ``repo_dir`` has no ``pretrained_model.py`` but does contain
    ``lwm_model.py``, a small generated module exposing ``lwm()`` (which
    returns ``LWM()``) is written there. An existing ``pretrained_model.py``
    is never overwritten, and nothing is written when ``lwm_model.py`` is
    absent. Failures while writing are logged, not raised.
    """
    shim_file = os.path.join(repo_dir, "pretrained_model.py")
    model_file = os.path.join(repo_dir, "lwm_model.py")
    # Nothing to do if the module already exists or there is no LWM source to wrap.
    if os.path.isfile(shim_file) or not os.path.isfile(model_file):
        return
    shim_source = """# Auto-generated shim to satisfy `from pretrained_model import lwm`
import torch
try:
    from lwm_model import LWM
except Exception as e:
    raise ImportError(f"Shim could not import LWM from lwm_model.py: {e}")
def lwm():
    # Build a default LWM encoder (adjust constructor args if your repo requires them)
    return LWM()
"""
    try:
        with open(shim_file, "w", encoding="utf-8") as handle:
            handle.write(shim_source)
        _log(f"[INFO] Created shim: {shim_file}")
    except Exception as e:
        _log(f"[WARN] Could not create pretrained_model shim: {e}")
def _maybe_load_weights(model, repo_dir: str):
    """Load a checkpoint into ``model`` from the first usable file in ``repo_dir``.

    Looks for ``models/model.pth`` and then ``model.pth`` inside ``repo_dir``.
    A checkpoint dict that carries a ``"state_dict"`` or ``"model"`` entry is
    unwrapped before being applied with ``strict=False``. A failed load is
    logged and the next location is tried; a warning is logged when no file
    was loaded.
    """
    weight_files = (
        os.path.join(repo_dir, "models", "model.pth"),
        os.path.join(repo_dir, "model.pth"),
    )
    for weight_file in weight_files:
        if not os.path.isfile(weight_file):
            continue
        try:
            state = torch.load(weight_file, map_location="cpu")
            if isinstance(state, dict):
                # Unwrap checkpoints stored as {"state_dict": ...} or {"model": ...};
                # "state_dict" takes precedence when both keys are present.
                for wrapper_key in ("state_dict", "model"):
                    if wrapper_key in state:
                        state = state[wrapper_key]
                        break
            model.load_state_dict(state, strict=False)
            _log(f"[INFO] Loaded LWM weights from {weight_file}")
            return
        except Exception as e:
            _log(f"[WARN] Failed to load weights from {weight_file}: {e}")
    _log("[WARN] No weights file found; using randomly-initialized LWM.")
 def get_lwm_encoder():
     """
     Build the LWM encoder from the cloned repository.

     Returns the model switched to eval mode, or None when the repository
     cannot be located or any step of importing/building/loading raises.
     """
     repo_dir = _ensure_repo_on_path()
     if not repo_dir:
         _log("[WARN] LWM repo not found; set LWM_REPO_DIR or clone to ./LWM-v1.1")
         return None
     # Make sure `pretrained_model` is importable before trying to use it.
     _ensure_pretrained_model_shim(repo_dir)
     try:
         import pretrained_model  # type: ignore
         if hasattr(pretrained_model, "lwm"):
             # Preferred entry point: pretrained_model.lwm()
             builder = pretrained_model.lwm
         else:
             # Otherwise look for a builder directly in lwm_model.
             import lwm_model  # type: ignore
             for builder_name in ("LWM", "build_model"):
                 if hasattr(lwm_model, builder_name):
                     builder = getattr(lwm_model, builder_name)
                     break
             else:
                 raise ImportError("No LWM builder found in lwm_model or pretrained_model")
         model = builder()
         _maybe_load_weights(model, repo_dir)
         model.eval()
         return model
     except Exception as e:
         _log(f"[WARN] Failed to load LWM encoder: {e}")
         return None
 @torch.no_grad()
 def build_lwm_embeddings(
@@ -120,51 +74,90 @@ def build_lwm_embeddings(
     label_aware: bool
 ) -> Tuple[torch.Tensor, Optional[List[torch.Tensor]]]:
     """
-    Generic embedding builder:
-    - Flattens each complex channel (concat real/imag),
-    - Forwards through the model if it accepts a flat vector,
-    - Pads to a common embedding dim.
-    If forward fails, falls back to the raw flattened vector.
     """
     all_feats = []
     labels_per_ds = [] if label_aware else None
     try:
-        device = next(model.parameters()).device
-    except StopIteration:
         device = torch.device("cpu")
-    model = model.to(device).eval()
-    for chs, y, _name in datasets:
         n = min(int(n_per_dataset), int(chs.shape[0]))
         idx = torch.randperm(chs.shape[0])[:n]
         sub = chs[idx]
         feats_this = []
         for x in sub:
-            if x.ndim > 2:
-                x = x.squeeze(0)
-            vec = x.reshape(-1)
-            if torch.is_complex(vec):
-                vec = torch.cat([vec.real, vec.imag], dim=0)
-            vec = vec.to(torch.float32).unsqueeze(0).to(device)  # [1, d]
             try:
-                out = model(vec)  # adapt here if your model expects another shape
-                out = out.reshape(1, -1).detach().cpu()
             except Exception:
-                # If the model forward signature mismatches, use the raw vector
-                out = vec.detach().cpu()
-            feats_this.append(out)
         embs_this = torch.cat(feats_this, dim=0)  # [n, d’]
         all_feats.append(embs_this)
-        if label_aware and y is not None and y.numel() > 0:
-            labels_per_ds.append(y[idx].clone())
-    # Pad to common dim
     max_d = max(t.shape[1] for t in all_feats)
     padded = []
     for t in all_feats:
@@ -173,7 +166,7 @@ def build_lwm_embeddings(
             t = torch.cat([t, pad], dim=1)
         padded.append(t)
-    embs = torch.stack(padded, dim=0)  # [D, n, d]
     if label_aware:
         return embs, labels_per_ds if labels_per_ds is not None else []
     return embs, None

+# embed_lwm.py
 import os
 import sys
 from typing import List, Optional, Tuple
 def _log(msg: str) -> None:
     """Print ``msg`` to standard output and flush the stream immediately."""
     print(msg, flush=True)
def _maybe_add_lwm_repo_to_path():
    """
    Ensure the HF-cloned LWM repo is importable.

    Every candidate directory that exists and is not yet on ``sys.path`` is
    added to the front of it. Candidates, from highest to lowest import
    priority:

    1. the ``LWM_REPO_DIR`` environment variable (surrounding whitespace is
       ignored),
    2. ``./LWM-v1.1``,
    3. ``/home/user/app/LWM-v1.1``.
    """
    candidates = [
        os.getenv("LWM_REPO_DIR", "").strip(),  # user override
        "./LWM-v1.1",                           # local default
        "/home/user/app/LWM-v1.1",              # HF Space default path
    ]
    # Each hit is inserted at index 0, so the last insert wins. Walk the list
    # from lowest to highest priority so the user override ends up first.
    for c in reversed(candidates):
        if c and os.path.isdir(c) and c not in sys.path:
            sys.path.insert(0, c)
 def get_lwm_encoder():
     """
     Try to load the encoder from pretrained_model.py in the HF repo.

     The repo is first made importable, then ``pretrained_model.lwm()`` is
     called to build the model. If a ``models/model.pth`` checkpoint is found
     (relative to the working directory, ``./LWM-v1.1``, ``LWM_REPO_DIR`` or
     the HF Space path) it is loaded into the model; a checkpoint wrapped as
     ``{"state_dict": ...}`` or ``{"model": ...}`` is unwrapped if loading it
     directly fails. A checkpoint that cannot be loaded is logged and the
     model is returned as built.

     Returns a torch.nn.Module in eval mode, or None if the import or the
     model construction fails.
     """
     _maybe_add_lwm_repo_to_path()
     try:
         # HF repo exports a builder called `lwm`
         from pretrained_model import lwm  # type: ignore
     except Exception as e:
         _log(f"[WARN] Failed to import pretrained_model.lwm: {e}")
         return None
     try:
         model = lwm()
     except Exception as e:
         _log(f"[WARN] pretrained_model.lwm() failed to build model: {e}")
         return None
     # Look for weights in the original two locations first, then in the same
     # repo locations the import search uses (env override, HF Space path).
     weight_candidates = ["models/model.pth", "./LWM-v1.1/models/model.pth"]
     for repo_dir in (os.getenv("LWM_REPO_DIR", "").strip(), "/home/user/app/LWM-v1.1"):
         if repo_dir:
             weight_candidates.append(os.path.join(repo_dir, "models", "model.pth"))
     weights = next((cand for cand in weight_candidates if os.path.isfile(cand)), None)
     if weights is None:
         _log("[INFO] No models/model.pth found; using the model as returned by lwm().")
     else:
         try:
             # NOTE: torch.load unpickles the file; only use trusted checkpoints.
             sd = torch.load(weights, map_location="cpu")
             try:
                 model.load_state_dict(sd)
             except Exception:
                 # Sometimes saved as {"state_dict": ...} or {"model": ...}
                 wrapped = None
                 if isinstance(sd, dict):
                     for key in ("state_dict", "model"):
                         if key in sd:
                             wrapped = sd[key]
                             break
                 if wrapped is None:
                     raise
                 model.load_state_dict(wrapped)
             _log(f"[INFO] Loaded LWM weights from {weights}")
         except Exception as e:
             _log(f"[WARN] Could not load weights from {weights}: {e}")
     model.eval()
     return model
 @torch.no_grad()
 def build_lwm_embeddings(
     label_aware: bool
 ) -> Tuple[torch.Tensor, Optional[List[torch.Tensor]]]:
     """
+    Build per-dataset embeddings using the LWM encoder.
+    Strategy:
+    1) If `utils.tokenizer` exists in the repo, try tokenizing each channel sample
+       and pass the tokenized tensor to the model.
+    2) If that fails, try feeding a flattened real-valued vector to the model.
+    3) If the forward still fails, fall back to using the flattened vector as the “embedding”.
+    Returns:
+      embs: [D, n, d]
+      labels_per_ds (optional)
     """
+    _maybe_add_lwm_repo_to_path()
+    # Try to import tokenizer if present; fall back to identity
+    def _identity(x): return x
+    try:
+        from utils import tokenizer as lwm_tokenizer  # type: ignore
+    except Exception:
+        lwm_tokenizer = _identity  # type: ignore
     all_feats = []
     labels_per_ds = [] if label_aware else None
     try:
+        params = list(model.parameters())
+        device = next(p.device for p in params) if params else torch.device("cpu")
+    except Exception:
         device = torch.device("cpu")
+    model = model.to(device)
+    model.eval()
+    for chs, labels, _name in datasets:
         n = min(int(n_per_dataset), int(chs.shape[0]))
         idx = torch.randperm(chs.shape[0])[:n]
         sub = chs[idx]
         feats_this = []
         for x in sub:
+            # Ensure 2D (e.g., [N_ant, SC]) if possible
+            x_proc = x
+            if x_proc.ndim > 2:
+                x_proc = x_proc.squeeze(0)
+            # First, try tokenizer-based forward
+            did_forward = False
             try:
+                tok = lwm_tokenizer(x_proc)  # repo-specific; often returns a tensor
+                tok = tok.to(device)
+                y = model(tok)
+                y = torch.as_tensor(y).reshape(1, -1).detach().cpu()
+                feats_this.append(y)
+                did_forward = True
             except Exception:
+                # If tokenizer-based call fails, try flat-vector forward
+                pass
+            if not did_forward:
+                try:
+                    # Flatten to real vector
+                    vec = x_proc.reshape(-1)
+                    if torch.is_complex(vec):
+                        vec = torch.cat([vec.real, vec.imag], dim=0)
+                    vec = vec.to(torch.float32).unsqueeze(0).to(device)  # [1, d]
+                    y2 = model(vec)
+                    y2 = torch.as_tensor(y2).reshape(1, -1).detach().cpu()
+                    feats_this.append(y2)
+                    did_forward = True
+                except Exception:
+                    # Last resort: use the flattened vector as the embedding
+                    vec = x_proc.reshape(-1)
+                    if torch.is_complex(vec):
+                        vec = torch.cat([vec.real, vec.imag], dim=0)
+                    vec = vec.to(torch.float32).unsqueeze(0).cpu()
+                    feats_this.append(vec)
         embs_this = torch.cat(feats_this, dim=0)  # [n, d’]
         all_feats.append(embs_this)
+        if label_aware and labels is not None and labels.numel() > 0:
+            labels_per_ds.append(labels[idx].clone())
+    # Pad to common dimension
     max_d = max(t.shape[1] for t in all_feats)
     padded = []
     for t in all_feats:
             t = torch.cat([t, pad], dim=1)
         padded.append(t)
+    embs = torch.stack(padded, dim=0)  # [D, n, d’]
     if label_aware:
         return embs, labels_per_ds if labels_per_ds is not None else []
     return embs, None