cjc0013 committed on
Commit
b576e9f
·
verified ·
1 Parent(s): f0573fb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -17
app.py CHANGED
@@ -72,7 +72,13 @@ except Exception:
72
  hf_hub_download = None # type: ignore
73
 
74
 
75
- APP_VERSION = "2026-02-11_app_g_signals"
 
 
 
 
 
 
76
 
77
 
78
  # -----------------------------
@@ -101,6 +107,8 @@ def _parse_dataset_ref(repo_like: str) -> Tuple[str, Optional[str]]:
101
  Accept either:
102
  - "user/repo"
103
  - "https://huggingface.co/datasets/user/repo/blob/main/file.ext"
 
 
104
 
105
  Returns: (repo_id, inferred_filename_or_None)
106
  """
@@ -113,17 +121,21 @@ def _parse_dataset_ref(repo_like: str) -> Tuple[str, Optional[str]]:
113
  p = (u.path or "").strip("/")
114
  parts = p.split("/")
115
 
116
- if len(parts) >= 3 and parts[0] == "datasets":
 
117
  repo_id = f"{parts[1]}/{parts[2]}"
118
  inferred_file: Optional[str] = None
119
 
120
- if "blob" in parts:
121
- try:
122
- i = parts.index("blob")
123
- if i + 2 < len(parts):
124
- inferred_file = "/".join(parts[i + 2 :])
125
- except Exception:
126
- inferred_file = None
 
 
 
127
 
128
  return repo_id, inferred_file
129
 
@@ -200,10 +212,10 @@ def ensure_db_file() -> Path:
200
  print(f"[db] using local file: {p}")
201
  return p
202
 
203
- ds_repo_raw = _env("DATASET_REPO_ID", "")
204
  ds_repo, inferred_file = _parse_dataset_ref(ds_repo_raw)
205
 
206
- ds_file_raw = _env("DATASET_FILENAME", "corpus.sqlite")
207
  ds_file = _clean_env_value(ds_file_raw)
208
 
209
  if inferred_file and (not os.environ.get("DATASET_FILENAME") or not ds_file):
@@ -214,7 +226,7 @@ def ensure_db_file() -> Path:
214
 
215
  local_dir = Path(_env("DB_LOCAL_DIR", "./data")).expanduser().resolve()
216
  local_dir.mkdir(parents=True, exist_ok=True)
217
- target = (local_dir / (ds_file if ds_file else "corpus.sqlite")).resolve()
218
 
219
  print(f"[db] DATASET_REPO_ID={ds_repo!r}")
220
  print(f"[db] DATASET_FILENAME={ds_file!r}")
@@ -225,7 +237,7 @@ def ensure_db_file() -> Path:
225
  raise RuntimeError("DATASET_REPO_ID is set, but huggingface_hub is not installed. Add it to requirements.txt.")
226
 
227
  if not ds_file:
228
- ds_file = "corpus.sqlite"
229
 
230
  cached_path = hf_hub_download(
231
  repo_id=ds_repo,
@@ -303,11 +315,11 @@ def ensure_signals_file() -> Optional[Path]:
303
  return p
304
 
305
  # optional dataset download
306
- ds_repo_raw = _env("METHOD_SIGNALS_DATASET_REPO_ID", "")
307
  ds_repo, inferred_file = _parse_dataset_ref(ds_repo_raw)
308
  ds_repo = _clean_env_value(ds_repo)
309
 
310
- ds_file_raw = _env("METHOD_SIGNALS_FILENAME", "public_method_sanitized_topN.jsonl")
311
  ds_file = _clean_env_value(ds_file_raw)
312
  if inferred_file and (not os.environ.get("METHOD_SIGNALS_FILENAME") or not ds_file):
313
  ds_file = inferred_file
@@ -322,7 +334,7 @@ def ensure_signals_file() -> Optional[Path]:
322
 
323
  local_dir = Path(_env("METHOD_SIGNALS_LOCAL_DIR", "./data")).expanduser().resolve()
324
  local_dir.mkdir(parents=True, exist_ok=True)
325
- target = (local_dir / (ds_file if ds_file else "public_method_sanitized_topN.jsonl")).resolve()
326
 
327
  print(f"[signals] METHOD_SIGNALS_DATASET_REPO_ID={ds_repo!r}")
328
  print(f"[signals] METHOD_SIGNALS_FILENAME={ds_file!r}")
@@ -1301,7 +1313,8 @@ def ui_signals_pick(choice: str, state: Dict[str, Dict[str, Any]]):
1301
  order_index=_safe_int(r.get("order_index"), -1),
1302
  )
1303
 
1304
- details = _signal_details_text(r, SIGNALS_PATH)
 
1305
  return uid, details, f"✅ Picked {cid}", ""
1306
  except Exception as e:
1307
  return "", "", f"⚠️ {type(e).__name__}: {e}", _fmt_debug(e)
@@ -1329,6 +1342,12 @@ def build_ui() -> gr.Blocks:
1329
  - **Search:** type words -> Search -> pick result -> Open
1330
  - **Clusters:** Load clusters -> pick one -> Load chunks -> pick chunk -> Open
1331
  - **Signals (optional):** Load signal cards -> pick card -> Open linked chunk
 
 
 
 
 
 
1332
  """
1333
  )
1334
 
 
72
  hf_hub_download = None # type: ignore
73
 
74
 
75
+ APP_VERSION = "2026-02-11_app_h_dataset_defaults"
76
+
77
+ # Defaults for your separated dataset + space setup
78
+ DEFAULT_DATASET_REPO_ID = "cjc0013/EpsteinWithAnomScore"
79
+ DEFAULT_DATASET_FILENAME = "corpus.sqlite"
80
+ DEFAULT_SIGNALS_DATASET_REPO_ID = "cjc0013/EpsteinWithAnomScore"
81
+ DEFAULT_SIGNALS_FILENAME = "public_method_sanitized_topN.jsonl"
82
 
83
 
84
  # -----------------------------
 
107
  Accept either:
108
  - "user/repo"
109
  - "https://huggingface.co/datasets/user/repo/blob/main/file.ext"
110
+ - "https://huggingface.co/datasets/user/repo/resolve/main/file.ext"
111
+ - "https://huggingface.co/spaces/user/repo"
112
 
113
  Returns: (repo_id, inferred_filename_or_None)
114
  """
 
121
  p = (u.path or "").strip("/")
122
  parts = p.split("/")
123
 
124
+ # Accept both datasets and spaces URL shapes; return owner/repo
125
+ if len(parts) >= 3 and parts[0] in ("datasets", "spaces"):
126
  repo_id = f"{parts[1]}/{parts[2]}"
127
  inferred_file: Optional[str] = None
128
 
129
+ # Try extracting filename from blob/resolve URL forms
130
+ for marker in ("blob", "resolve"):
131
+ if marker in parts:
132
+ try:
133
+ i = parts.index(marker)
134
+ if i + 2 < len(parts):
135
+ inferred_file = "/".join(parts[i + 2 :])
136
+ break
137
+ except Exception:
138
+ pass
139
 
140
  return repo_id, inferred_file
141
 
 
212
  print(f"[db] using local file: {p}")
213
  return p
214
 
215
+ ds_repo_raw = _env("DATASET_REPO_ID", DEFAULT_DATASET_REPO_ID)
216
  ds_repo, inferred_file = _parse_dataset_ref(ds_repo_raw)
217
 
218
+ ds_file_raw = _env("DATASET_FILENAME", DEFAULT_DATASET_FILENAME)
219
  ds_file = _clean_env_value(ds_file_raw)
220
 
221
  if inferred_file and (not os.environ.get("DATASET_FILENAME") or not ds_file):
 
226
 
227
  local_dir = Path(_env("DB_LOCAL_DIR", "./data")).expanduser().resolve()
228
  local_dir.mkdir(parents=True, exist_ok=True)
229
+ target = (local_dir / (ds_file if ds_file else DEFAULT_DATASET_FILENAME)).resolve()
230
 
231
  print(f"[db] DATASET_REPO_ID={ds_repo!r}")
232
  print(f"[db] DATASET_FILENAME={ds_file!r}")
 
237
  raise RuntimeError("DATASET_REPO_ID is set, but huggingface_hub is not installed. Add it to requirements.txt.")
238
 
239
  if not ds_file:
240
+ ds_file = DEFAULT_DATASET_FILENAME
241
 
242
  cached_path = hf_hub_download(
243
  repo_id=ds_repo,
 
315
  return p
316
 
317
  # optional dataset download
318
+ ds_repo_raw = _env("METHOD_SIGNALS_DATASET_REPO_ID", DEFAULT_SIGNALS_DATASET_REPO_ID)
319
  ds_repo, inferred_file = _parse_dataset_ref(ds_repo_raw)
320
  ds_repo = _clean_env_value(ds_repo)
321
 
322
+ ds_file_raw = _env("METHOD_SIGNALS_FILENAME", DEFAULT_SIGNALS_FILENAME)
323
  ds_file = _clean_env_value(ds_file_raw)
324
  if inferred_file and (not os.environ.get("METHOD_SIGNALS_FILENAME") or not ds_file):
325
  ds_file = inferred_file
 
334
 
335
  local_dir = Path(_env("METHOD_SIGNALS_LOCAL_DIR", "./data")).expanduser().resolve()
336
  local_dir.mkdir(parents=True, exist_ok=True)
337
+ target = (local_dir / (ds_file if ds_file else DEFAULT_SIGNALS_FILENAME)).resolve()
338
 
339
  print(f"[signals] METHOD_SIGNALS_DATASET_REPO_ID={ds_repo!r}")
340
  print(f"[signals] METHOD_SIGNALS_FILENAME={ds_file!r}")
 
1313
  order_index=_safe_int(r.get("order_index"), -1),
1314
  )
1315
 
1316
+ src_path = SIGNALS_PATH if SIGNALS_PATH is not None else ensure_signals_file()
1317
+ details = _signal_details_text(r, src_path)
1318
  return uid, details, f"✅ Picked {cid}", ""
1319
  except Exception as e:
1320
  return "", "", f"⚠️ {type(e).__name__}: {e}", _fmt_debug(e)
 
1342
  - **Search:** type words -> Search -> pick result -> Open
1343
  - **Clusters:** Load clusters -> pick one -> Load chunks -> pick chunk -> Open
1344
  - **Signals (optional):** Load signal cards -> pick card -> Open linked chunk
1345
+
1346
+ **Default dataset repo (override with env vars):**
1347
+ - <code>DATASET_REPO_ID={DEFAULT_DATASET_REPO_ID}</code>
1348
+ - <code>DATASET_FILENAME={DEFAULT_DATASET_FILENAME}</code>
1349
+ - <code>METHOD_SIGNALS_DATASET_REPO_ID={DEFAULT_SIGNALS_DATASET_REPO_ID}</code>
1350
+ - <code>METHOD_SIGNALS_FILENAME={DEFAULT_SIGNALS_FILENAME}</code>
1351
  """
1352
  )
1353