Namhyun Kim committed on
Commit
2a6ccf4
·
1 Parent(s): 0275ff2

Harden demo data loading (token, LFS, schema)

Browse files
Files changed (1) hide show
  1. app.py +85 -2
app.py CHANGED
@@ -22,7 +22,19 @@ APP_DIR = Path(__file__).resolve().parent
22
  DEMO_DATA_PATH = APP_DIR / "demo_data.pt"
23
  MOE_DATA_PATH = APP_DIR / "demo_data_moe.pt"
24
  HUB_REPO_ID = "wi-lab/lwm-spectro"
25
- HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HF_HUB_TOKEN")
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  # Fixed ordering for the 14 joint SNR/Doppler labels
28
  JOINT_LABELS = [
@@ -72,6 +84,62 @@ def _safe_load_tensor(path: Path):
72
  return torch.load(path, weights_only=False)
73
 
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  def _create_dummy_dataset(base_path: Path, moe_path: Path) -> None:
76
  """Create a tiny synthetic dataset so the Space can start even if hub download fails."""
77
  print(f"[WARN] Creating synthetic demo dataset at {base_path}")
@@ -109,7 +177,7 @@ def _create_dummy_dataset(base_path: Path, moe_path: Path) -> None:
109
 
110
  def _ensure_local_file(local_path: Path, hub_filename: str) -> Optional[Path]:
111
  """Ensure a file exists locally; try Hub download if missing."""
112
- if local_path.exists():
113
  return local_path
114
  try:
115
  cached = hf_hub_download(
@@ -145,7 +213,20 @@ def load_data(mapping: Dict[str, object]):
145
  pair_to_id = mapping["pair_to_id"]
146
 
147
  records = []
 
148
  for i, sample in enumerate(data):
 
 
 
 
 
 
 
 
 
 
 
 
149
  embedding = sample["embedding"]
150
  if isinstance(embedding, torch.Tensor):
151
  base_embedding = embedding.detach().cpu().numpy()
@@ -212,6 +293,8 @@ def load_data(mapping: Dict[str, object]):
212
  )
213
 
214
  df = pd.DataFrame(records)
 
 
215
  print(f"[INFO] Loaded {len(df)} samples (MoE embeddings: {has_moe})")
216
  return df, has_moe
217
 
 
22
  DEMO_DATA_PATH = APP_DIR / "demo_data.pt"
23
  MOE_DATA_PATH = APP_DIR / "demo_data_moe.pt"
24
  HUB_REPO_ID = "wi-lab/lwm-spectro"
25
+
26
+
27
+ def _get_hf_token() -> str | None:
28
+ # Spaces / HF Hub tooling uses a few common names.
29
+ return (
30
+ os.getenv("HF_TOKEN")
31
+ or os.getenv("HF_HUB_TOKEN")
32
+ or os.getenv("HUGGINGFACEHUB_API_TOKEN")
33
+ or os.getenv("HF_API_TOKEN")
34
+ )
35
+
36
+
37
+ HF_TOKEN = _get_hf_token()
38
 
39
  # Fixed ordering for the 14 joint SNR/Doppler labels
40
  JOINT_LABELS = [
 
84
  return torch.load(path, weights_only=False)
85
 
86
 
87
def _is_git_lfs_pointer(path: Path) -> bool:
    """Return True when *path* holds a Git LFS pointer stub, not real data.

    Only the first 256 bytes are inspected (pointer files are tiny).  Files
    that are missing or unreadable count as not-a-pointer so callers can
    fall through to a normal download attempt.
    """
    marker = b"git-lfs.github.com/spec"
    try:
        with path.open("rb") as fh:
            return marker in fh.read(256)
    except OSError:
        return False
94
+
95
+
96
def _normalize_tech_label(value: object) -> object:
    """Map free-form technology labels onto the canonical WiFi/LTE/5G names.

    ``None`` and blank values are returned unchanged; unrecognized labels are
    returned in their stripped text form.
    """
    if value is None:
        return value
    text = str(value).strip()
    if not text:
        return value
    # Compare on a fully collapsed key: lowercase with spaces, hyphens AND
    # underscores removed.  (The old code stripped only spaces/hyphens, so
    # set members like "wi-fi" were dead and "_"-separated variants such as
    # "L_TE" were never matched.)
    key = text.lower().replace(" ", "").replace("-", "").replace("_", "")
    if key == "wifi":
        return "WiFi"
    if key == "lte":
        return "LTE"
    if key in {"5g", "nr", "5gnr", "sub6", "sub6ghz", "5gsub6", "5gsub6ghz"}:
        return "5G"
    return text
110
+
111
+
112
def _normalize_mobility_label(value: object) -> object:
    """Collapse mobility label variants onto 'pedestrian' / 'vehicular'.

    ``None`` and blank values pass through unchanged; anything unrecognized
    is returned as its stripped text form.
    """
    if value is None:
        return value
    text = str(value).strip()
    if not text:
        return value
    key = text.lower().replace(" ", "").replace("-", "")
    alias_table = (
        ("pedestrian", ("ped", "pedestrian", "walking")),
        ("vehicular", ("veh", "vehicular", "vehicle", "driving", "car")),
    )
    for canonical, aliases in alias_table:
        if key in aliases:
            return canonical
    return text
124
+
125
+
126
def _normalize_sample(sample: Dict[str, object]) -> Dict[str, object]:
    """Return a shallow copy of *sample* with canonical short keys and
    normalized tech/mobility labels.

    The input mapping is never mutated.
    """
    out = dict(sample)
    # Schema aliases (some artifacts use longer names): backfill the short
    # key only when it is absent and the long-form key is present.
    alias_pairs = (
        ("tech", "technology"),
        ("mod", "modulation"),
        ("mob", "mobility"),
        ("snr", "snr_label"),
    )
    for short_key, long_key in alias_pairs:
        if short_key not in out and long_key in out:
            out[short_key] = out.get(long_key)

    out["tech"] = _normalize_tech_label(out.get("tech"))
    out["mob"] = _normalize_mobility_label(out.get("mob"))
    return out
141
+
142
+
143
  def _create_dummy_dataset(base_path: Path, moe_path: Path) -> None:
144
  """Create a tiny synthetic dataset so the Space can start even if hub download fails."""
145
  print(f"[WARN] Creating synthetic demo dataset at {base_path}")
 
177
 
178
  def _ensure_local_file(local_path: Path, hub_filename: str) -> Optional[Path]:
179
  """Ensure a file exists locally; try Hub download if missing."""
180
+ if local_path.exists() and not _is_git_lfs_pointer(local_path):
181
  return local_path
182
  try:
183
  cached = hf_hub_download(
 
213
  pair_to_id = mapping["pair_to_id"]
214
 
215
  records = []
216
+ skipped = 0
217
  for i, sample in enumerate(data):
218
+ if not isinstance(sample, dict):
219
+ skipped += 1
220
+ continue
221
+ sample = _normalize_sample(sample)
222
+
223
+ if not sample.get("tech") or not sample.get("snr") or not sample.get("mob") or not sample.get("mod"):
224
+ skipped += 1
225
+ continue
226
+ if "embedding" not in sample or "data" not in sample:
227
+ skipped += 1
228
+ continue
229
+
230
  embedding = sample["embedding"]
231
  if isinstance(embedding, torch.Tensor):
232
  base_embedding = embedding.detach().cpu().numpy()
 
293
  )
294
 
295
  df = pd.DataFrame(records)
296
+ if skipped:
297
+ print(f"[WARN] Skipped {skipped} malformed samples while loading demo data")
298
  print(f"[INFO] Loaded {len(df)} samples (MoE embeddings: {has_moe})")
299
  return df, has_moe
300