stevekor committed on
Commit
97d5381
·
1 Parent(s): 5849182

Fix demo data download auth + avoid synthetic shadowing

Browse files
Files changed (1) hide show
  1. app.py +55 -48
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  import shutil
 
3
  from pathlib import Path
4
  from typing import Dict, List, Tuple, Optional
5
 
@@ -26,12 +27,25 @@ HUB_REPO_ID = "wi-lab/lwm-spectro"
26
 
27
  def _get_hf_token() -> str | None:
28
  # Spaces / HF Hub tooling uses a few common names.
29
- return (
30
  os.getenv("HF_TOKEN")
31
  or os.getenv("HF_HUB_TOKEN")
32
  or os.getenv("HUGGINGFACEHUB_API_TOKEN")
33
  or os.getenv("HF_API_TOKEN")
 
 
34
  )
 
 
 
 
 
 
 
 
 
 
 
35
 
36
 
37
  HF_TOKEN = _get_hf_token()
@@ -141,38 +155,8 @@ def _normalize_sample(sample: Dict[str, object]) -> Dict[str, object]:
141
 
142
 
143
  def _create_dummy_dataset(base_path: Path, moe_path: Path) -> None:
144
- """Create a tiny synthetic dataset so the Space can start even if hub download fails."""
145
- print(f"[WARN] Creating synthetic demo dataset at {base_path}")
146
- rng = np.random.default_rng(42)
147
- samples: List[Dict[str, object]] = []
148
- techs = ["LTE", "WiFi", "5G"]
149
- snrs = ["SNR0dB", "SNR10dB", "SNR20dB"]
150
- mods = ["QPSK", "16QAM", "64QAM"]
151
- mobs = ["pedestrian", "vehicular"]
152
-
153
- for i in range(30):
154
- tech = techs[i % len(techs)]
155
- snr = snrs[i % len(snrs)]
156
- mob = mobs[i % len(mobs)]
157
- mod = mods[i % len(mods)]
158
- spectrogram = rng.normal(size=(128, 128)).astype(np.float32)
159
- embedding = rng.normal(size=(128,)).astype(np.float32)
160
- moe_embedding = rng.normal(size=(128,)).astype(np.float32)
161
- samples.append(
162
- {
163
- "tech": tech,
164
- "snr": snr,
165
- "mod": mod,
166
- "mob": mob,
167
- "data": spectrogram,
168
- "embedding": embedding,
169
- "moe_embedding": moe_embedding,
170
- }
171
- )
172
-
173
- torch.save(samples, base_path)
174
- torch.save(samples, moe_path)
175
- print(f"[INFO] Synthetic dataset written to {base_path} and {moe_path}")
176
 
177
 
178
  def _create_dummy_samples() -> List[Dict[str, object]]:
@@ -210,20 +194,41 @@ def _ensure_local_file(local_path: Path, hub_filename: str) -> Optional[Path]:
210
  """Ensure a file exists locally; try Hub download if missing."""
211
  if local_path.exists() and not _is_git_lfs_pointer(local_path):
212
  return local_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  try:
214
  cached = hf_hub_download(
215
- repo_id=HUB_REPO_ID,
216
  filename=hub_filename,
217
- token=HF_TOKEN,
218
- repo_type="model",
219
  )
220
  cached_path = Path(cached)
221
- # Prefer using the cached path directly to avoid large file copies or
222
- # failures if the repo directory is not writable.
223
- print(f"[INFO] Using cached Hub file for {hub_filename}: {cached_path}")
224
  return cached_path
225
  except Exception as exc:
226
- print(f"[WARN] Could not download {hub_filename} from Hub ({exc}); continuing without it.")
 
 
227
  return None
228
 
229
 
@@ -239,14 +244,13 @@ def load_augmented_samples() -> Tuple[List[Dict[str, object]], bool]:
239
  print(f"[WARN] MoE data missing; falling back to base data: {base_path}")
240
  return _safe_load_tensor(base_path), False
241
 
242
- # Last resort: attempt on-disk synthetic data, otherwise in-memory.
243
- try:
244
- _create_dummy_dataset(DEMO_DATA_PATH, MOE_DATA_PATH)
245
- print(f"[WARN] Using synthetic on-disk dataset in {APP_DIR}")
246
- return _safe_load_tensor(DEMO_DATA_PATH), False
247
- except Exception as exc:
248
- print(f"[WARN] Could not create synthetic files ({exc}); using in-memory synthetic dataset")
249
- return _create_dummy_samples(), False
250
 
251
 
252
  def load_data(mapping: Dict[str, object]):
@@ -890,6 +894,8 @@ mapping_info = load_joint_mapping()
890
  df, has_moe_embeddings = load_data(mapping_info)
891
  CLASS_LABELS = mapping_info["label_names"]
892
 
 
 
893
  has_moe_column = df["moe_embedding"].apply(lambda x: x is not None)
894
  joint_eval_df = df[has_moe_column & df["joint_label_id"].notna()]
895
 
@@ -922,6 +928,7 @@ def update_modulation_choices(selected_tech: Optional[str]):
922
 
923
  with gr.Blocks(title="LWM-Spectro Lab") as demo:
924
  gr.Markdown("# 🔬 LWM-Spectro Interactive Demo")
 
925
  gr.Markdown(
926
  """
927
  **Having trouble seeing plots/images?**
 
1
  import os
2
  import shutil
3
+ import netrc
4
  from pathlib import Path
5
  from typing import Dict, List, Tuple, Optional
6
 
 
27
 
28
  def _get_hf_token() -> str | None:
29
  # Spaces / HF Hub tooling uses a few common names.
30
+ token = (
31
  os.getenv("HF_TOKEN")
32
  or os.getenv("HF_HUB_TOKEN")
33
  or os.getenv("HUGGINGFACEHUB_API_TOKEN")
34
  or os.getenv("HF_API_TOKEN")
35
+ or os.getenv("HUGGINGFACE_TOKEN")
36
+ or os.getenv("HUGGINGFACE_ACCESS_TOKEN")
37
  )
38
+ if token:
39
+ return token
40
+
41
+ # If a token exists in ~/.netrc (common in some environments), use it.
42
+ try:
43
+ auth = netrc.netrc().authenticators("huggingface.co")
44
+ if auth and auth[2]:
45
+ return auth[2]
46
+ except Exception:
47
+ return None
48
+ return None
49
 
50
 
51
  HF_TOKEN = _get_hf_token()
 
155
 
156
 
157
  def _create_dummy_dataset(base_path: Path, moe_path: Path) -> None:
158
+ """Deprecated: kept for backward compatibility, but avoided in production."""
159
+ raise RuntimeError("Synthetic on-disk dataset generation disabled")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
161
 
162
  def _create_dummy_samples() -> List[Dict[str, object]]:
 
194
  """Ensure a file exists locally; try Hub download if missing."""
195
  if local_path.exists() and not _is_git_lfs_pointer(local_path):
196
  return local_path
197
+
198
+ # Prefer a stored token if present (Spaces sometimes have credentials available
199
+ # even when HF_TOKEN env var is not explicitly set).
200
+ token = HF_TOKEN or True
201
+
202
+ # Try multiple repo types because the artifact may live under a model or dataset repo.
203
+ for repo_type in ("model", "dataset"):
204
+ try:
205
+ cached = hf_hub_download(
206
+ repo_id=HUB_REPO_ID,
207
+ filename=hub_filename,
208
+ token=token,
209
+ repo_type=repo_type,
210
+ )
211
+ cached_path = Path(cached)
212
+ print(f"[INFO] Using cached Hub file for {hub_filename}: {cached_path} (repo_type={repo_type})")
213
+ return cached_path
214
+ except Exception as exc:
215
+ last_exc = exc
216
+
217
+ # Final fallback: try downloading from the Space repo itself (useful when artifacts are stored in Space).
218
  try:
219
  cached = hf_hub_download(
220
+ repo_id="wi-lab/LWM-Spectro",
221
  filename=hub_filename,
222
+ token=token,
223
+ repo_type="space",
224
  )
225
  cached_path = Path(cached)
226
+ print(f"[INFO] Using cached Space file for {hub_filename}: {cached_path}")
 
 
227
  return cached_path
228
  except Exception as exc:
229
+ print(
230
+ f"[WARN] Could not download {hub_filename} from Hub ({last_exc}) or Space repo ({exc}); continuing without it."
231
+ )
232
  return None
233
 
234
 
 
244
  print(f"[WARN] MoE data missing; falling back to base data: {base_path}")
245
  return _safe_load_tensor(base_path), False
246
 
247
+ # Last resort: in-memory synthetic data (keeps app alive, but clearly not the full demo dataset).
248
+ print(
249
+ "[WARN] Falling back to a tiny synthetic dataset (30 samples). "
250
+ "This usually means the real demo_data*.pt could not be downloaded. "
251
+ "If the Hub repo is private, add a Space secret named HF_TOKEN with read access."
252
+ )
253
+ return _create_dummy_samples(), False
 
254
 
255
 
256
  def load_data(mapping: Dict[str, object]):
 
894
  df, has_moe_embeddings = load_data(mapping_info)
895
  CLASS_LABELS = mapping_info["label_names"]
896
 
897
+ DATASET_STATUS = f"Dataset loaded: {len(df)} samples | MoE embeddings: {'yes' if has_moe_embeddings else 'no'}"
898
+
899
  has_moe_column = df["moe_embedding"].apply(lambda x: x is not None)
900
  joint_eval_df = df[has_moe_column & df["joint_label_id"].notna()]
901
 
 
928
 
929
  with gr.Blocks(title="LWM-Spectro Lab") as demo:
930
  gr.Markdown("# 🔬 LWM-Spectro Interactive Demo")
931
+ gr.Markdown(f"**{DATASET_STATUS}**")
932
  gr.Markdown(
933
  """
934
  **Having trouble seeing plots/images?**