Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -72,7 +72,13 @@ except Exception:
|
|
| 72 |
hf_hub_download = None # type: ignore
|
| 73 |
|
| 74 |
|
| 75 |
-
APP_VERSION = "2026-02-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
|
| 78 |
# -----------------------------
|
|
@@ -101,6 +107,8 @@ def _parse_dataset_ref(repo_like: str) -> Tuple[str, Optional[str]]:
|
|
| 101 |
Accept either:
|
| 102 |
- "user/repo"
|
| 103 |
- "https://huggingface.co/datasets/user/repo/blob/main/file.ext"
|
|
|
|
|
|
|
| 104 |
|
| 105 |
Returns: (repo_id, inferred_filename_or_None)
|
| 106 |
"""
|
|
@@ -113,17 +121,21 @@ def _parse_dataset_ref(repo_like: str) -> Tuple[str, Optional[str]]:
|
|
| 113 |
p = (u.path or "").strip("/")
|
| 114 |
parts = p.split("/")
|
| 115 |
|
| 116 |
-
|
|
|
|
| 117 |
repo_id = f"{parts[1]}/{parts[2]}"
|
| 118 |
inferred_file: Optional[str] = None
|
| 119 |
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
return repo_id, inferred_file
|
| 129 |
|
|
@@ -200,10 +212,10 @@ def ensure_db_file() -> Path:
|
|
| 200 |
print(f"[db] using local file: {p}")
|
| 201 |
return p
|
| 202 |
|
| 203 |
-
ds_repo_raw = _env("DATASET_REPO_ID",
|
| 204 |
ds_repo, inferred_file = _parse_dataset_ref(ds_repo_raw)
|
| 205 |
|
| 206 |
-
ds_file_raw = _env("DATASET_FILENAME",
|
| 207 |
ds_file = _clean_env_value(ds_file_raw)
|
| 208 |
|
| 209 |
if inferred_file and (not os.environ.get("DATASET_FILENAME") or not ds_file):
|
|
@@ -214,7 +226,7 @@ def ensure_db_file() -> Path:
|
|
| 214 |
|
| 215 |
local_dir = Path(_env("DB_LOCAL_DIR", "./data")).expanduser().resolve()
|
| 216 |
local_dir.mkdir(parents=True, exist_ok=True)
|
| 217 |
-
target = (local_dir / (ds_file if ds_file else
|
| 218 |
|
| 219 |
print(f"[db] DATASET_REPO_ID={ds_repo!r}")
|
| 220 |
print(f"[db] DATASET_FILENAME={ds_file!r}")
|
|
@@ -225,7 +237,7 @@ def ensure_db_file() -> Path:
|
|
| 225 |
raise RuntimeError("DATASET_REPO_ID is set, but huggingface_hub is not installed. Add it to requirements.txt.")
|
| 226 |
|
| 227 |
if not ds_file:
|
| 228 |
-
ds_file =
|
| 229 |
|
| 230 |
cached_path = hf_hub_download(
|
| 231 |
repo_id=ds_repo,
|
|
@@ -303,11 +315,11 @@ def ensure_signals_file() -> Optional[Path]:
|
|
| 303 |
return p
|
| 304 |
|
| 305 |
# optional dataset download
|
| 306 |
-
ds_repo_raw = _env("METHOD_SIGNALS_DATASET_REPO_ID",
|
| 307 |
ds_repo, inferred_file = _parse_dataset_ref(ds_repo_raw)
|
| 308 |
ds_repo = _clean_env_value(ds_repo)
|
| 309 |
|
| 310 |
-
ds_file_raw = _env("METHOD_SIGNALS_FILENAME",
|
| 311 |
ds_file = _clean_env_value(ds_file_raw)
|
| 312 |
if inferred_file and (not os.environ.get("METHOD_SIGNALS_FILENAME") or not ds_file):
|
| 313 |
ds_file = inferred_file
|
|
@@ -322,7 +334,7 @@ def ensure_signals_file() -> Optional[Path]:
|
|
| 322 |
|
| 323 |
local_dir = Path(_env("METHOD_SIGNALS_LOCAL_DIR", "./data")).expanduser().resolve()
|
| 324 |
local_dir.mkdir(parents=True, exist_ok=True)
|
| 325 |
-
target = (local_dir / (ds_file if ds_file else
|
| 326 |
|
| 327 |
print(f"[signals] METHOD_SIGNALS_DATASET_REPO_ID={ds_repo!r}")
|
| 328 |
print(f"[signals] METHOD_SIGNALS_FILENAME={ds_file!r}")
|
|
@@ -1301,7 +1313,8 @@ def ui_signals_pick(choice: str, state: Dict[str, Dict[str, Any]]):
|
|
| 1301 |
order_index=_safe_int(r.get("order_index"), -1),
|
| 1302 |
)
|
| 1303 |
|
| 1304 |
-
|
|
|
|
| 1305 |
return uid, details, f"✅ Picked {cid}", ""
|
| 1306 |
except Exception as e:
|
| 1307 |
return "", "", f"⚠️ {type(e).__name__}: {e}", _fmt_debug(e)
|
|
@@ -1329,6 +1342,12 @@ def build_ui() -> gr.Blocks:
|
|
| 1329 |
- **Search:** type words -> Search -> pick result -> Open
|
| 1330 |
- **Clusters:** Load clusters -> pick one -> Load chunks -> pick chunk -> Open
|
| 1331 |
- **Signals (optional):** Load signal cards -> pick card -> Open linked chunk
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1332 |
"""
|
| 1333 |
)
|
| 1334 |
|
|
|
|
| 72 |
hf_hub_download = None # type: ignore
|
| 73 |
|
| 74 |
|
| 75 |
+
APP_VERSION = "2026-02-11_app_h_dataset_defaults"
|
| 76 |
+
|
| 77 |
+
# Defaults for your separated dataset + space setup
|
| 78 |
+
DEFAULT_DATASET_REPO_ID = "cjc0013/EpsteinWithAnomScore"
|
| 79 |
+
DEFAULT_DATASET_FILENAME = "corpus.sqlite"
|
| 80 |
+
DEFAULT_SIGNALS_DATASET_REPO_ID = "cjc0013/EpsteinWithAnomScore"
|
| 81 |
+
DEFAULT_SIGNALS_FILENAME = "public_method_sanitized_topN.jsonl"
|
| 82 |
|
| 83 |
|
| 84 |
# -----------------------------
|
|
|
|
| 107 |
Accept either:
|
| 108 |
- "user/repo"
|
| 109 |
- "https://huggingface.co/datasets/user/repo/blob/main/file.ext"
|
| 110 |
+
- "https://huggingface.co/datasets/user/repo/resolve/main/file.ext"
|
| 111 |
+
- "https://huggingface.co/spaces/user/repo"
|
| 112 |
|
| 113 |
Returns: (repo_id, inferred_filename_or_None)
|
| 114 |
"""
|
|
|
|
| 121 |
p = (u.path or "").strip("/")
|
| 122 |
parts = p.split("/")
|
| 123 |
|
| 124 |
+
# Accept both datasets and spaces URL shapes; return owner/repo
|
| 125 |
+
if len(parts) >= 3 and parts[0] in ("datasets", "spaces"):
|
| 126 |
repo_id = f"{parts[1]}/{parts[2]}"
|
| 127 |
inferred_file: Optional[str] = None
|
| 128 |
|
| 129 |
+
# Try extracting filename from blob/resolve URL forms
|
| 130 |
+
for marker in ("blob", "resolve"):
|
| 131 |
+
if marker in parts:
|
| 132 |
+
try:
|
| 133 |
+
i = parts.index(marker)
|
| 134 |
+
if i + 2 < len(parts):
|
| 135 |
+
inferred_file = "/".join(parts[i + 2 :])
|
| 136 |
+
break
|
| 137 |
+
except Exception:
|
| 138 |
+
pass
|
| 139 |
|
| 140 |
return repo_id, inferred_file
|
| 141 |
|
|
|
|
| 212 |
print(f"[db] using local file: {p}")
|
| 213 |
return p
|
| 214 |
|
| 215 |
+
ds_repo_raw = _env("DATASET_REPO_ID", DEFAULT_DATASET_REPO_ID)
|
| 216 |
ds_repo, inferred_file = _parse_dataset_ref(ds_repo_raw)
|
| 217 |
|
| 218 |
+
ds_file_raw = _env("DATASET_FILENAME", DEFAULT_DATASET_FILENAME)
|
| 219 |
ds_file = _clean_env_value(ds_file_raw)
|
| 220 |
|
| 221 |
if inferred_file and (not os.environ.get("DATASET_FILENAME") or not ds_file):
|
|
|
|
| 226 |
|
| 227 |
local_dir = Path(_env("DB_LOCAL_DIR", "./data")).expanduser().resolve()
|
| 228 |
local_dir.mkdir(parents=True, exist_ok=True)
|
| 229 |
+
target = (local_dir / (ds_file if ds_file else DEFAULT_DATASET_FILENAME)).resolve()
|
| 230 |
|
| 231 |
print(f"[db] DATASET_REPO_ID={ds_repo!r}")
|
| 232 |
print(f"[db] DATASET_FILENAME={ds_file!r}")
|
|
|
|
| 237 |
raise RuntimeError("DATASET_REPO_ID is set, but huggingface_hub is not installed. Add it to requirements.txt.")
|
| 238 |
|
| 239 |
if not ds_file:
|
| 240 |
+
ds_file = DEFAULT_DATASET_FILENAME
|
| 241 |
|
| 242 |
cached_path = hf_hub_download(
|
| 243 |
repo_id=ds_repo,
|
|
|
|
| 315 |
return p
|
| 316 |
|
| 317 |
# optional dataset download
|
| 318 |
+
ds_repo_raw = _env("METHOD_SIGNALS_DATASET_REPO_ID", DEFAULT_SIGNALS_DATASET_REPO_ID)
|
| 319 |
ds_repo, inferred_file = _parse_dataset_ref(ds_repo_raw)
|
| 320 |
ds_repo = _clean_env_value(ds_repo)
|
| 321 |
|
| 322 |
+
ds_file_raw = _env("METHOD_SIGNALS_FILENAME", DEFAULT_SIGNALS_FILENAME)
|
| 323 |
ds_file = _clean_env_value(ds_file_raw)
|
| 324 |
if inferred_file and (not os.environ.get("METHOD_SIGNALS_FILENAME") or not ds_file):
|
| 325 |
ds_file = inferred_file
|
|
|
|
| 334 |
|
| 335 |
local_dir = Path(_env("METHOD_SIGNALS_LOCAL_DIR", "./data")).expanduser().resolve()
|
| 336 |
local_dir.mkdir(parents=True, exist_ok=True)
|
| 337 |
+
target = (local_dir / (ds_file if ds_file else DEFAULT_SIGNALS_FILENAME)).resolve()
|
| 338 |
|
| 339 |
print(f"[signals] METHOD_SIGNALS_DATASET_REPO_ID={ds_repo!r}")
|
| 340 |
print(f"[signals] METHOD_SIGNALS_FILENAME={ds_file!r}")
|
|
|
|
| 1313 |
order_index=_safe_int(r.get("order_index"), -1),
|
| 1314 |
)
|
| 1315 |
|
| 1316 |
+
src_path = SIGNALS_PATH if SIGNALS_PATH is not None else ensure_signals_file()
|
| 1317 |
+
details = _signal_details_text(r, src_path)
|
| 1318 |
return uid, details, f"✅ Picked {cid}", ""
|
| 1319 |
except Exception as e:
|
| 1320 |
return "", "", f"⚠️ {type(e).__name__}: {e}", _fmt_debug(e)
|
|
|
|
| 1342 |
- **Search:** type words -> Search -> pick result -> Open
|
| 1343 |
- **Clusters:** Load clusters -> pick one -> Load chunks -> pick chunk -> Open
|
| 1344 |
- **Signals (optional):** Load signal cards -> pick card -> Open linked chunk
|
| 1345 |
+
|
| 1346 |
+
**Default dataset repo (override with env vars):**
|
| 1347 |
+
- <code>DATASET_REPO_ID={DEFAULT_DATASET_REPO_ID}</code>
|
| 1348 |
+
- <code>DATASET_FILENAME={DEFAULT_DATASET_FILENAME}</code>
|
| 1349 |
+
- <code>METHOD_SIGNALS_DATASET_REPO_ID={DEFAULT_SIGNALS_DATASET_REPO_ID}</code>
|
| 1350 |
+
- <code>METHOD_SIGNALS_FILENAME={DEFAULT_SIGNALS_FILENAME}</code>
|
| 1351 |
"""
|
| 1352 |
)
|
| 1353 |
|