Deploy THIRAWAT mapper app
- .gitattributes +2 -35
- .gitignore +11 -0
- AGENTS.md +39 -0
- README.md +122 -14
- app.py +43 -0
- docs/upstream-instruction-issues.md +73 -0
- pyproject.toml +30 -0
- requirements.txt +6 -0
- scripts/offline/build_duckdb.py +85 -0
- scripts/offline/build_lancedb_index.py +179 -0
- scripts/offline/publish_index_hf.py +83 -0
- spec/athena +0 -0
- spec/example.py +327 -0
- spec/spec.md +11 -0
- src/thirawat_demo/__init__.py +2 -0
- src/thirawat_demo/runtime/__init__.py +8 -0
- src/thirawat_demo/runtime/config.py +138 -0
- src/thirawat_demo/runtime/index_loader.py +69 -0
- src/thirawat_demo/runtime/peft_reranker.py +227 -0
- src/thirawat_demo/runtime/search_service.py +307 -0
- src/thirawat_demo/space_ui.py +310 -0
- tests/conftest.py +12 -0
- tests/test_build_lancedb_index.py +53 -0
- tests/test_peft_reranker.py +87 -0
- tests/test_runtime_config.py +82 -0
- tests/test_search_service.py +152 -0
- tests/test_space_ui.py +97 -0
- uv.lock +0 -0
.gitattributes
CHANGED
@@ -1,35 +1,2 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Auto detect text files and perform LF normalization
+* text=auto
.gitignore
ADDED
@@ -0,0 +1,11 @@
.venv/
__pycache__/
.pytest_cache/
.mypy_cache/
.ruff_cache/
.DS_Store

data/
!data/.gitkeep

temp/*.tmp
AGENTS.md
ADDED
@@ -0,0 +1,39 @@
# Repository Guidelines

## Project Structure & Module Organization
- `src/thirawat_demo/runtime/`: runtime configuration, index loading, reranking, and search orchestration.
- `src/thirawat_demo/space_ui.py`: Gradio interface logic; `app.py` is the local/Space entrypoint.
- `scripts/offline/`: offline pipeline scripts (`build_duckdb.py`, `build_lancedb_index.py`, `publish_index_hf.py`).
- `tests/`: pytest suite for runtime behavior and indexing workflow.
- `data/`: local/generated artifacts (DuckDB + LanceDB); avoid treating as source code.
- `docs/` and `spec/`: reference notes and specification examples.

## Build, Test, and Development Commands
- `uv sync --python 3.11`: install dependencies from `uv.lock`.
- `uv run pytest`: run all tests in `tests/`.
- `uv run pytest tests/test_search_service.py -q`: run a focused test file.
- `uv run python app.py`: start the Gradio runtime app locally.
- `uv run python scripts/offline/build_duckdb.py --help`: inspect offline build options before running data jobs.

## Coding Style & Naming Conventions
- Target Python 3.11+, 4-space indentation, and PEP 8-compatible formatting.
- Prefer explicit type hints and small, single-purpose functions.
- Naming: `snake_case` for modules/functions/variables, `PascalCase` for classes, `UPPER_SNAKE_CASE` for constants.
- Keep the architecture boundary strict: heavy indexing stays in `scripts/offline/`; runtime code in `src/thirawat_demo/runtime/`.

## Testing Guidelines
- Framework: `pytest` (configured in `pyproject.toml` with `testpaths = ["tests"]`).
- File naming: `tests/test_*.py`; test function naming: `test_*`.
- Use `monkeypatch` for environment-variable behavior and `tmp_path` for filesystem-dependent cases.
- Add or update tests with every behavioral change, especially around config parsing, retrieval/reranking flow, and UI integration points.

## Commit & Pull Request Guidelines
- Current history is minimal and uses short subject lines (example: `Initial commit`).
- Use concise, imperative commit subjects (prefer <= 72 chars), e.g., `Add post-score validation for runtime config`.
- PRs should include: purpose, key changes, verification commands run, and any required env var/data changes.
- Include screenshots for UI-impacting changes (`space_ui.py`) and note any model/index compatibility implications.

## Security & Configuration Tips
- Never commit secrets (for example `HF_TOKEN`).
- Keep runtime config in environment variables (`INDEX_REPO`, `DEVICE`, `LANCEDB_TABLE`, etc.).
- Publish large index artifacts through the HF dataset workflow instead of Git.
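A minimal sketch of the testing pattern the guidelines above call for (`monkeypatch` for environment variables, `tmp_path` for filesystem cases). `RuntimeConfig.from_env` exists in this repo; the test name and the specific values asserted here are illustrative assumptions, not copied from the repository's test suite.

```python
# Illustrative pytest sketch only: env-driven config test using monkeypatch and tmp_path.
from pathlib import Path

from thirawat_demo.runtime.config import RuntimeConfig


def test_from_env_reads_index_repo(monkeypatch, tmp_path: Path) -> None:
    # Point LOCAL_INDEX_PATH at a non-existent temp dir so INDEX_REPO must be used.
    monkeypatch.setenv("LOCAL_INDEX_PATH", str(tmp_path / "missing"))
    monkeypatch.setenv("INDEX_REPO", "your-org/thirawat-mapper-demo-index")
    monkeypatch.setenv("LANCEDB_TABLE", "concepts_drug")

    config = RuntimeConfig.from_env()

    assert config.index_repo == "your-org/thirawat-mapper-demo-index"
    assert config.lancedb_table == "concepts_drug"
```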
README.md
CHANGED
@@ -1,14 +1,122 @@
# THIRAWAT Mapper Demo

This repository is intentionally split into:

1. **Offline indexing pipeline** (Athena -> DuckDB -> LanceDB -> HF dataset upload)
2. **Hugging Face Space runtime app** (download prebuilt index + serve Gradio UI)

## Separation Rule

The Space runtime must **not** build indexes.
All heavy data preparation happens offline via scripts in `scripts/offline/`.

## Project Layout

- `scripts/offline/build_duckdb.py`
- `scripts/offline/build_lancedb_index.py`
- `scripts/offline/publish_index_hf.py`
- `src/thirawat_demo/runtime/config.py`
- `src/thirawat_demo/runtime/index_loader.py`
- `src/thirawat_demo/runtime/search_service.py`
- `src/thirawat_demo/space_ui.py`
- `app.py`

## Setup

```bash
uv sync --python 3.11
```

## Offline Build Flow

### 1) Athena -> DuckDB

```bash
uv run python scripts/offline/build_duckdb.py \
  --athena-dir /path/to/athena-export \
  --out data/derived/concepts.duckdb \
  --overwrite
```

### 2) DuckDB -> LanceDB index

`--device auto` behavior:
- macOS + MPS available: `mps`
- otherwise if CUDA available: `cuda`
- otherwise: `cpu`

```bash
uv run python scripts/offline/build_lancedb_index.py \
  --duckdb data/derived/concepts.duckdb \
  --out-db data/lancedb/db \
  --table concepts_drug \
  --device auto
```

Default concept classes used by the builder:

- Clinical Drug
- Quant Clinical Drug
- Clinical Drug Comp
- Clinical Drug Form
- Branded Drug
- Quant Branded Drug
- Branded Drug Comp
- Branded Drug Form
- Ingredient

### 3) Publish index artifact to HF dataset repo

```bash
export HF_TOKEN=hf_xxx
uv run python scripts/offline/publish_index_hf.py \
  --repo-id your-org/thirawat-mapper-demo-index \
  --source-dir data/lancedb \
  --revision main
```

## HF Space Deployment Flow

In Space settings, configure env vars as needed:

- `INDEX_REPO` (required unless a local index is baked into the image)
- `INDEX_REVISION` (default `main`)
- `LANCEDB_TABLE` (default `concepts_drug`)
- `DEVICE` (default `auto`)
- `TOP_K_DEFAULT`, `CANDIDATE_TOPK`, `RETRIEVAL_TOPK` (optional)
- `POST_MODE` (default `tiebreak`; supported: `blend|tiebreak|lex`)
- `POST_WEIGHT` (default `0.05`; used for `blend`)
- `TIEBREAK_EPS` (default `0.01`)
- `TIEBREAK_TOPN` (default `50`)
- `POST_STRENGTH_WEIGHT` (default `0.6`)
- `POST_JACCARD_WEIGHT` (default `0.4`)
- `POST_BRAND_PENALTY` (default `0.3`)
- `POST_MINMAX` (default `true`)
- `BRAND_STRICT` (default `false`)
- `HF_TOKEN` (optional for private repos)

Space entrypoint:

```bash
python app.py
```

## Local Runtime Test

```bash
export INDEX_REPO=your-org/thirawat-mapper-demo-index
export DEVICE=auto
uv run python app.py
```

## Notes

- Runtime search enforces two-stage retrieval: SapBERT vector retrieval + THIRAWAT reranking.
- The runtime reranker loads PEFT adapters directly (trainer-style) and does not merge checkpoints to disk at startup.
- Runtime ranking applies THIRAWAT's deterministic post/tie-break rules.
- The warning `No sentence-transformers model found with name cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR.` can appear during reranker startup. It is informational in the current setup: PyLate ColBERT first checks for a Sentence-Transformers-packaged model and then falls back to building from the base Hugging Face encoder.
- Retrieval embeddings use Hugging Face `transformers` directly, while reranking uses PyLate ColBERT (which relies on Sentence-Transformers internals). So `sentence-transformers` is still required for reranking even though retrieval itself does not depend on it.
- The Gradio UI includes concept-class multi-select filters (include-only). Defaults come from the index manifest when present.
- Domain is fixed to `Drug` in this v1 app.
- Upstream doc issues are tracked in:
  - `docs/upstream-instruction-issues.md`
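The tie-break knobs listed above (`POST_MODE=tiebreak`, `TIEBREAK_EPS`, `TIEBREAK_TOPN`) are easiest to read as a small sorting rule. The sketch below is a simplified illustration of that idea only, not the `thirawat_mapper` implementation; the secondary sort keys and the `post_score` field name are assumptions made for the example.

```python
# Simplified illustration of eps-window tie-breaking (not thirawat_mapper's code).
# Within the top `topn` candidates, anything scoring within `eps` of the best
# reranker score is re-ordered by deterministic secondary keys; the rest keep
# their score order.
def tiebreak_order(candidates: list[dict], eps: float = 0.01, topn: int = 50) -> list[dict]:
    ranked = sorted(candidates, key=lambda c: c["score"], reverse=True)
    head, tail = ranked[:topn], ranked[topn:]
    if not head:
        return ranked
    best = head[0]["score"]
    tied = [c for c in head if best - c["score"] <= eps]
    rest = [c for c in head if best - c["score"] > eps]
    # Assumed keys: higher lexical post score, then shorter name, then smaller concept_id.
    tied.sort(key=lambda c: (-c.get("post_score", 0.0), len(c["concept_name"]), c["concept_id"]))
    return tied + rest + tail


print(tiebreak_order([
    {"concept_id": 1, "concept_name": "aspirin 81 MG Oral Tablet [Brand]", "score": 0.925, "post_score": 0.2},
    {"concept_id": 2, "concept_name": "aspirin 81 MG Oral Tablet", "score": 0.920, "post_score": 0.4},
    {"concept_id": 3, "concept_name": "aspirin", "score": 0.800, "post_score": 0.9},
]))
```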
app.py
ADDED
@@ -0,0 +1,43 @@
"""Hugging Face Space entrypoint."""

from __future__ import annotations

import os
from pathlib import Path
import socket
import sys

ROOT = Path(__file__).resolve().parent
SRC = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

from thirawat_demo.space_ui import build_demo


def _is_port_free(port: int) -> bool:
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        return sock.connect_ex(("127.0.0.1", port)) != 0


def _resolve_server_port() -> int:
    """Use fixed PORT in managed environments; choose first free local port."""
    port_env = os.getenv("PORT")
    if port_env:
        return int(port_env)
    gradio_port_env = os.getenv("GRADIO_SERVER_PORT")
    if gradio_port_env:
        return int(gradio_port_env)
    for candidate in range(7860, 7871):
        if _is_port_free(candidate):
            return candidate
    return 7860


demo = build_demo()

if __name__ == "__main__":
    port = _resolve_server_port()
    print(f"Starting THIRAWAT Mapper Demo on http://127.0.0.1:{port}", flush=True)
    demo.launch(server_name="0.0.0.0", server_port=port, share=False)
docs/upstream-instruction-issues.md
ADDED
@@ -0,0 +1,73 @@
# Upstream Instruction Issues

This note captures discrepancies found while implementing the demo so they can be fixed upstream.

## 1) `athena2duckdb` README default output mismatch
- Repo: <https://github.com/sidataplus/athena2duckdb>
- README says `--out` default is `vocab.duckdb`.
- CLI source (`src/athena2duckdb/cli.py`) uses `omop_vocab.duckdb`.
- Suggested fix:
  - Align README default value with code, or
  - Change code default to match README.

## 2) `THIRAWAT-mapper` README says reranker is gated
- Repo: <https://github.com/sidataplus/THIRAWAT-mapper>
- README notes `sidataplus/THIRAWAT-SapBERT` is gated.
- Verified on **February 10, 2026** via HF model API that `gated=false`, `private=false`.
- Suggested fix:
  - Update README access notes and include verification date.

## 3) `THIRAWAT-mapper` Cloudflare markdown fence issue
- Repo: <https://github.com/sidataplus/THIRAWAT-mapper>
- Cloudflare section has a duplicated fenced code opener in README, causing broken rendering.
- Suggested fix:
  - Remove the extra code fence and validate markdown rendering.

## 4) `THIRAWAT-mapper` index build docs under-document MPS usage
- Repo: <https://github.com/sidataplus/THIRAWAT-mapper>
- README examples mostly show `--device cuda`.
- Apple Silicon users can run with `--device mps`; CPU fallback is also valid.
- Suggested fix:
  - Add a short device matrix and one MPS example command.

## 5) `THIRAWAT-mapper` docs can confuse on `--profiles-table`
- Repo: <https://github.com/sidataplus/THIRAWAT-mapper>
- README examples require `--profiles-table concept_profiles`.
- Athena-to-DuckDB outputs typically do not contain `concept_profiles`.
- Current builder code falls back to inline profile creation if `profiles_table` is absent.
- Suggested fix:
  - Document fallback behavior clearly and show an Athena-only example path.

## 6) `athena2duckdb==0.1.0` import-time syntax error in `loader.py`
- Repo: <https://github.com/sidataplus/athena2duckdb>
- In our local run on **February 10, 2026**, importing `athena2duckdb` fails:
  - `SyntaxError: f-string expression part cannot include a backslash`
- Affected code pattern in `loader.py`:
  - `f" {',\\n '.join(columns_sql)}\\n"` inside an f-string expression.
- Impact:
  - `build_duckdb.py` cannot execute with published `athena2duckdb==0.1.0`.
- Suggested fix:
  - Refactor string assembly to avoid a backslash-containing expression inside the f-string and release `0.1.1`.

## 7) `THIRAWAT-mapper` docs mix `transformers` defaults with `--backend st` examples
- Repo: <https://github.com/sidataplus/thirawat-mapper>
- README states the embedding backend default is `transformers` and says `--backend transformers` can be omitted.
- But `docs/retrieval_reranker.md` examples repeatedly use `--backend st` (vector build/eval/RAG examples).
- Suggested fix:
  - Standardize examples to the default `transformers`, or
  - Add a clear "when to use `st` vs `transformers`" note and keep examples consistent with that guidance.

## 8) `THIRAWAT-mapper` lacks troubleshooting note for SapBERT sentence-transformers warning
- Repo: <https://github.com/sidataplus/thirawat-mapper>
- During ColBERT/PyLate startup with SapBERT, users can see:
  - `No sentence-transformers model found with name cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR.`
- This is often benign fallback behavior, but the docs do not explain it, so users may treat it as a hard failure.
- Suggested fix:
  - Add a short troubleshooting note clarifying expected behavior and when it is truly an error (actual load failure).

## 9) `THIRAWAT-mapper` docs under-document deterministic post/tie-break inference rules
- Repo: <https://github.com/sidataplus/thirawat-mapper>
- Installed package behavior (`thirawat_mapper==0.1.4`) includes deterministic tie-break/post ranking (`tiebreak_rerank`, `enrich_with_post_scores`, `eps`, `topn`, and ordered tie-break keys).
- Public markdown docs focus on blend-style post scoring but do not document tie-break mode semantics and knobs.
- Suggested fix:
  - Add an inference-ranking section documenting `blend|lex|tiebreak` modes, default parameters, and deterministic sorting behavior.
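For item 6, the failure mode and one possible refactor look like this in isolation; the snippet reproduces the reported pattern only and is not copied from `athena2duckdb`.

```python
# Python versions before 3.12 reject backslashes inside f-string expressions,
# which matches the SyntaxError reported for athena2duckdb 0.1.0. A common
# refactor builds the backslash-containing separator outside the f-string.
columns_sql = ["concept_id BIGINT", "concept_name VARCHAR"]

# Fails to compile on Python 3.11:
#   sql = f" {',\n '.join(columns_sql)}\n"

# Works on all supported versions:
joined = ",\n ".join(columns_sql)
sql = f" {joined}\n"
print(sql)
```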
pyproject.toml
ADDED
@@ -0,0 +1,30 @@
[project]
name = "thirawat-mapper-demo"
version = "0.1.0"
description = "THIRAWAT mapper demo split into offline index pipeline and HF Space runtime app."
readme = "README.md"
requires-python = ">=3.11,<3.13"
dependencies = [
    "athena2duckdb==0.1.0",
    "gradio==6.5.1",
    "huggingface_hub>=0.27.0",
    "lancedb==0.29.2",
    "pandas>=2.2.0",
    "peft>=0.17.0",
    "thirawat-mapper==0.1.4",
]

[dependency-groups]
dev = [
    "pytest>=8.3.0",
]

[tool.pytest.ini_options]
testpaths = ["tests"]

[build-system]
requires = ["hatchling>=1.24.0"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/thirawat_demo"]
requirements.txt
ADDED
@@ -0,0 +1,6 @@
gradio==6.5.1
huggingface_hub>=0.27.0
lancedb==0.29.2
pandas>=2.2.0
peft>=0.17.0
thirawat-mapper==0.1.4
scripts/offline/build_duckdb.py
ADDED
@@ -0,0 +1,85 @@
#!/usr/bin/env python3
"""Offline: convert Athena vocabulary export to DuckDB."""

from __future__ import annotations

import argparse
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parents[2]
DEFAULT_DUCKDB_PATH = REPO_ROOT / "data" / "derived" / "concepts.duckdb"


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Build DuckDB from Athena vocabulary export.")
    parser.add_argument("--athena-dir", required=True, help="Directory containing Athena CSV/TSV files.")
    parser.add_argument(
        "--out",
        default=str(DEFAULT_DUCKDB_PATH),
        help=f"Output DuckDB path (default: {DEFAULT_DUCKDB_PATH}).",
    )
    parser.add_argument("--overwrite", action="store_true", help="Overwrite existing DuckDB file.")
    parser.add_argument("--threads", type=int, default=None, help="DuckDB threads (default: auto).")
    parser.add_argument("--schema", default="main", help="Target schema name (default: main).")
    parser.add_argument("--sep", default="\t", help="Input delimiter (default: tab).")
    parser.add_argument("--encoding", default="UTF-8", help="Input encoding (default: UTF-8).")
    return parser.parse_args()


def run(args: argparse.Namespace) -> int:
    try:
        from athena2duckdb import CSVOptions, load_vocab_dir, verify_row_counts
    except SyntaxError as exc:
        raise RuntimeError(
            "Failed to import athena2duckdb due to an upstream syntax error. "
            "Current workaround: use an already-built DuckDB file (for example a previous vocab.duckdb) "
            "and continue with scripts/offline/build_lancedb_index.py."
        ) from exc

    athena_dir = Path(args.athena_dir).expanduser().resolve()
    out_path = Path(args.out).expanduser().resolve()
    out_path.parent.mkdir(parents=True, exist_ok=True)

    if not athena_dir.exists():
        raise FileNotFoundError(f"Athena directory does not exist: {athena_dir}")

    csv_options = CSVOptions(sep=args.sep, encoding=args.encoding)

    summary = load_vocab_dir(
        input_dir=athena_dir,
        out_path=out_path,
        csv_options=csv_options,
        overwrite=bool(args.overwrite),
        threads=args.threads,
        schema=args.schema,
    )
    print(f"Loaded {len(summary.vocab_files)} tables into {summary.db_path} (schema {summary.schema}).")

    results = verify_row_counts(
        db_path=summary.db_path,
        vocab_files=summary.vocab_files,
        csv_options=csv_options,
        threads=args.threads,
        schema=summary.schema,
    )
    mismatches = [result for result in results if not result.matches]
    for result in results:
        status = "OK" if result.matches else "MISMATCH"
        print(
            f"{status:9s} table={result.table_name:<25s} "
            f"csv_rows={result.csv_rows:,} table_rows={result.table_rows:,}"
        )

    if mismatches:
        print(f"Found {len(mismatches)} row-count mismatches.")
        return 2
    return 0


def main() -> int:
    args = parse_args()
    return run(args)


if __name__ == "__main__":
    raise SystemExit(main())
scripts/offline/build_lancedb_index.py
ADDED
@@ -0,0 +1,179 @@
#!/usr/bin/env python3
"""Offline: build LanceDB index for THIRAWAT mapper."""

from __future__ import annotations

import argparse
from pathlib import Path
import platform

REPO_ROOT = Path(__file__).resolve().parents[2]
DEFAULT_DUCKDB_PATH = REPO_ROOT / "data" / "derived" / "concepts.duckdb"
DEFAULT_LANCEDB_DIR = REPO_ROOT / "data" / "lancedb" / "db"

DEFAULT_DOMAIN_IDS = ["Drug"]
DEFAULT_CONCEPT_CLASSES = [
    "Clinical Drug",
    "Quant Clinical Drug",
    "Clinical Drug Comp",
    "Clinical Drug Form",
    "Branded Drug",
    "Quant Branded Drug",
    "Branded Drug Comp",
    "Branded Drug Form",
    "Ingredient",
]
DEFAULT_EXCLUDED_CONCEPT_CLASSES = [
    "Clinical Drug Box",
    "Branded Drug Box",
    "Branded Pack Box",
    "Clinical Pack Box",
    "Marketed Product",
    "Quant Branded Box",
    "Quant Clinical Box",
]
DEFAULT_EXTRA_COLUMNS = ["concept_name", "concept_code", "domain_id", "vocabulary_id", "concept_class_id"]
DEFAULT_MODEL_ID = "cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR"


def _split_multi(values: list[str] | None) -> list[str]:
    if not values:
        return []
    items: list[str] = []
    for value in values:
        parts = [part.strip() for part in str(value).split(",")]
        items.extend([part for part in parts if part])
    return items


def resolve_device(device: str) -> str:
    requested = device.strip().lower()
    if requested != "auto":
        return requested
    try:
        import torch  # type: ignore
    except Exception:
        return "cpu"

    is_darwin = platform.system().lower() == "darwin"
    has_mps = bool(getattr(torch.backends, "mps", None)) and torch.backends.mps.is_available()
    has_cuda = bool(torch.cuda.is_available())

    # Policy requested for this repo:
    # auto => macOS MPS first, then CUDA, then CPU.
    if is_darwin and has_mps:
        return "mps"
    if has_cuda:
        return "cuda"
    return "cpu"


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Build LanceDB index using thirawat-mapper.")
    parser.add_argument("--duckdb", default=str(DEFAULT_DUCKDB_PATH), help="Path to DuckDB vocabulary file.")
    parser.add_argument("--profiles-table", default="concept_profiles", help="Profiles table name.")
    parser.add_argument("--concepts-table", default="concept", help="Concepts table name.")
    parser.add_argument("--out-db", default=str(DEFAULT_LANCEDB_DIR), help="Output LanceDB directory.")
    parser.add_argument("--table", default="concepts_drug", help="Output LanceDB table name.")

    parser.add_argument(
        "--domain-id",
        action="append",
        default=None,
        help="Domain filters, comma-separated or repeated (default: Drug).",
    )
    parser.add_argument(
        "--concept-class-id",
        action="append",
        default=None,
        help="Concept class filters, comma-separated or repeated.",
    )
    parser.add_argument(
        "--exclude-concept-class-id",
        action="append",
        default=None,
        help="Concept class exclusions, comma-separated or repeated.",
    )
    parser.add_argument(
        "--extra-column",
        action="append",
        default=None,
        help="Extra columns to carry into index table (comma-separated or repeated).",
    )

    parser.add_argument("--batch-size", type=int, default=256, help="Embedding batch size.")
    parser.add_argument("--model-id", default=DEFAULT_MODEL_ID, help="Encoder model id.")
    parser.add_argument("--pooling", choices=["cls", "mean"], default="cls", help="Pooling type.")
    parser.add_argument("--max-length", type=int, default=128, help="Encoder max token length.")
    parser.add_argument(
        "--device",
        choices=["auto", "mps", "cuda", "cpu"],
        default="auto",
        help="Index build device; auto resolves to mps (Darwin), then cuda, then cpu.",
    )
    parser.add_argument("--trust-remote-code", action="store_true", help="Pass trust_remote_code to encoder.")
    return parser.parse_args()


def run(args: argparse.Namespace) -> int:
    from thirawat_mapper.index.build import main as thirawat_index_build_main

    duckdb_path = Path(args.duckdb).expanduser().resolve()
    out_db = Path(args.out_db).expanduser().resolve()
    out_db.mkdir(parents=True, exist_ok=True)

    if not duckdb_path.exists():
        raise FileNotFoundError(f"DuckDB file does not exist: {duckdb_path}")

    resolved_device = resolve_device(args.device)
    domain_ids = _split_multi(args.domain_id) or DEFAULT_DOMAIN_IDS
    concept_classes = _split_multi(args.concept_class_id) or DEFAULT_CONCEPT_CLASSES
    excluded_classes = _split_multi(args.exclude_concept_class_id) or DEFAULT_EXCLUDED_CONCEPT_CLASSES
    extra_columns = _split_multi(args.extra_column) or DEFAULT_EXTRA_COLUMNS

    cli_args = [
        "--duckdb",
        str(duckdb_path),
        "--profiles-table",
        args.profiles_table,
        "--concepts-table",
        args.concepts_table,
        "--domain-id",
        ",".join(domain_ids),
        "--concept-class-id",
        ",".join(concept_classes),
        "--exclude-concept-class-id",
        ",".join(excluded_classes),
        "--extra-column",
        ",".join(extra_columns),
        "--out-db",
        str(out_db),
        "--table",
        args.table,
        "--batch-size",
        str(args.batch_size),
        "--model-id",
        args.model_id,
        "--pooling",
        args.pooling,
        "--max-length",
        str(args.max_length),
        "--device",
        resolved_device,
    ]
    if args.trust_remote_code:
        cli_args.append("--trust-remote-code")

    print(f"Resolved build device: {resolved_device}")
    print("Invoking: python -m thirawat_mapper.index.build " + " ".join(cli_args))
    thirawat_index_build_main(cli_args)
    return 0


def main() -> int:
    args = parse_args()
    return run(args)


if __name__ == "__main__":
    raise SystemExit(main())
scripts/offline/publish_index_hf.py
ADDED
@@ -0,0 +1,83 @@
#!/usr/bin/env python3
"""Offline: publish built index artifact to Hugging Face dataset repo."""

from __future__ import annotations

import argparse
from pathlib import Path
import os

from huggingface_hub import HfApi, create_repo, upload_folder

REPO_ROOT = Path(__file__).resolve().parents[2]
DEFAULT_SOURCE_DIR = REPO_ROOT / "data" / "lancedb"


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Upload index artifact to Hugging Face dataset repo.")
    parser.add_argument("--repo-id", required=True, help="Target HF dataset repo (e.g. org/name).")
    parser.add_argument(
        "--source-dir",
        default=str(DEFAULT_SOURCE_DIR),
        help=f"Directory to upload (default: {DEFAULT_SOURCE_DIR}).",
    )
    parser.add_argument("--revision", default="main", help="Target branch/revision (default: main).")
    parser.add_argument("--private", action="store_true", help="Create repo as private if it does not exist.")
    parser.add_argument(
        "--token-env",
        default="HF_TOKEN",
        help="Environment variable name holding HF write token (default: HF_TOKEN).",
    )
    return parser.parse_args()


def run(args: argparse.Namespace) -> int:
    source_dir = Path(args.source_dir).expanduser().resolve()
    if not source_dir.exists():
        raise FileNotFoundError(f"Source directory does not exist: {source_dir}")

    token = os.getenv(args.token_env)
    if not token:
        raise ValueError(f"Missing Hugging Face token in environment variable: {args.token_env}")

    create_repo(
        repo_id=args.repo_id,
        repo_type="dataset",
        private=bool(args.private),
        token=token,
        exist_ok=True,
    )

    api = HfApi(token=token)
    if args.revision != "main":
        try:
            api.create_branch(repo_id=args.repo_id, repo_type="dataset", branch=args.revision, exist_ok=True)
        except TypeError:
            # Backward-compatible path for clients without exist_ok support.
            try:
                api.create_branch(repo_id=args.repo_id, repo_type="dataset", branch=args.revision)
            except Exception:
                pass

    commit_info = upload_folder(
        repo_id=args.repo_id,
        repo_type="dataset",
        folder_path=str(source_dir),
        path_in_repo=".",
        revision=args.revision,
        commit_message="Update THIRAWAT mapper demo index artifact",
        token=token,
    )
    print(f"Uploaded index artifact to {args.repo_id}@{args.revision}")
    print(f"Commit URL: {commit_info.commit_url}")
    return 0


def main() -> int:
    args = parse_args()
    return run(args)


if __name__ == "__main__":
    raise SystemExit(main())
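The runtime side consumes the artifact this script publishes. A minimal sketch of that download-and-open step with `huggingface_hub.snapshot_download` and `lancedb`; the repo id, revision, and table name are placeholders, and the `db/` subdirectory assumption follows the layout produced by the build script above.

```python
# Minimal consumer-side sketch: fetch the published LanceDB artifact and open the table.
from pathlib import Path

import lancedb
from huggingface_hub import snapshot_download

index_root = Path(
    snapshot_download(
        repo_id="your-org/thirawat-mapper-demo-index",  # placeholder repo id
        repo_type="dataset",
        revision="main",
    )
)
db = lancedb.connect(str(index_root / "db"))
table = db.open_table("concepts_drug")
print(table.count_rows())
```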
spec/athena
ADDED
Binary file (948 Bytes).
spec/example.py
ADDED
@@ -0,0 +1,327 @@
"""Drug Concept Entity Linking - HuggingFace Space"""

import os
import tempfile
import traceback
from pathlib import Path
import gradio as gr
import lancedb
from sentence_transformers import SentenceTransformer
import pandas as pd

# ===== CONFIG =====
# Pulled from Space Secrets (set in Settings > Secrets)
def _get_env(name: str, default: str | None = None) -> str | None:
    value = os.environ.get(name)
    if value is None:
        return default
    value = value.strip()
    if not value or value.lower() == "none":
        return default
    return value


HF_TOKEN = _get_env("HF_TOKEN")  # optional if the dataset is public
INDEX_REPO = _get_env("INDEX_REPO", "amnnma/drug-concept-index")  # change to your repo name
LOCAL_INDEX_PATH = _get_env("LOCAL_INDEX_PATH", "data/lancedb")
DEBUG = _get_env("DEBUG", "0") == "1"

# Model
MODEL_ID = "cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR"
TOP_K = 10

class DrugConceptSearcher:
    def __init__(self):
        self.model = None
        self.db = None
        self.table = None
        self._load()

    def _load(self):
        """Load model and connect to LanceDB"""
        print("Loading model...")
        # Force slow tokenizer to avoid fast-tokenizer conversion issues on Space
        self.model = SentenceTransformer(MODEL_ID, tokenizer_kwargs={"use_fast": False})

        # Prefer local index when available (useful for local runs)
        local_root = Path(LOCAL_INDEX_PATH) if LOCAL_INDEX_PATH else None
        if local_root and local_root.exists() and (local_root / "db").exists():
            index_root = local_root
            print(f"Connecting to local LanceDB at {index_root}...")
        else:
            repo_id = INDEX_REPO or "amnnma/drug-concept-index"
            if not isinstance(repo_id, str):
                repo_id = str(repo_id)
            repo_id = repo_id.strip()
            if repo_id.startswith("http"):
                # Accept full HF URLs and extract the repo id
                parts = repo_id.split("/")
                if "datasets" in parts:
                    repo_id = "/".join(parts[parts.index("datasets") + 1 :]).strip("/")
                elif "spaces" in parts:
                    repo_id = "/".join(parts[parts.index("spaces") + 1 :]).strip("/")
                else:
                    repo_id = "/".join(parts[-2:]).strip("/")
            if repo_id.startswith("datasets/"):
                repo_id = repo_id[len("datasets/") :]
            print(f"Connecting to LanceDB from {repo_id}...")
            # Download and connect to the LanceDB index stored in the HF repo
            from huggingface_hub import snapshot_download

            # Download the index (cached under /data)
            download_root = Path(os.environ.get("HF_DATA_DIR", "/data")) / "lancedb"
            try:
                download_root.mkdir(parents=True, exist_ok=True)
            except OSError:
                download_root = Path("data/lancedb")
                download_root.mkdir(parents=True, exist_ok=True)

            # Avoid implicit token usage for public datasets
            os.environ["HF_HUB_DISABLE_IMPLICIT_TOKEN"] = "1"
            try:
                index_root = Path(
                    snapshot_download(
                        repo_id=repo_id,
                        repo_type="dataset",
                        token=False,
                        revision=os.environ.get("HF_DATASET_REVISION", "main"),
                        local_dir=str(download_root),
                    )
                )
            except Exception as e:
                if HF_TOKEN:
                    index_root = Path(
                        snapshot_download(
                            repo_id=repo_id,
                            repo_type="dataset",
                            token=HF_TOKEN,
                            local_dir=str(download_root),
                        )
                    )
                else:
                    raise e

        # Connect to LanceDB
        self.db = lancedb.connect(str(index_root / "db"))
        self.table = self.db.open_table("concepts_drug")
        print("✅ Ready!")

    def search(self, query: str, top_k: int = TOP_K):
        """Search drug concepts"""
        if not query or not query.strip():
            return pd.DataFrame()

        # Encode query
        query_emb = self.model.encode(query, normalize_embeddings=True)

        # Search
        results = self.table.search(query_emb).limit(top_k).to_pandas()

        # Format output
        if "_distance" in results.columns:
            results["score"] = 1 - results["_distance"]  # Convert distance to similarity
            results = results.sort_values("score", ascending=False)

        return results[["concept_id", "concept_name", "concept_code", "vocabulary_id", "score"]]

# Initialize
searcher = None

def get_searcher():
    global searcher
    if searcher is None:
        searcher = DrugConceptSearcher()
    return searcher

def _format_results(results: pd.DataFrame, query: str) -> tuple[str, pd.DataFrame]:
    if results.empty:
        return "No results found. Try a different search term.", results

    output = f"## Results for: \"{query}\"\n\n"
    best = results.iloc[0]
    output += f"**Top match:** {best['concept_name']} (score {best['score']:.4f})\n\n"
    return output, results


def search_drugs(query: str, top_k: int):
    """Gradio search function (single query)"""
    try:
        s = get_searcher()
        results = s.search(query, top_k)

        output, table = _format_results(results, query)
        return output, table
    except Exception as e:
        print("Search error:", e)
        print(traceback.format_exc())
        if DEBUG:
            return f"❌ Error: {str(e)}\n\n```\n{traceback.format_exc()}\n```", pd.DataFrame()
        return f"❌ Error: {str(e)}", pd.DataFrame()


def search_batch(queries_text: str, top_k: int):
    """Gradio search function (batch queries)"""
    try:
        if not queries_text or not queries_text.strip():
            return "Please enter clinical terms to search.", gr.update(visible=False)

        lines = [line.strip() for line in queries_text.splitlines() if line.strip()]
        if not lines:
            return "No valid queries found.", gr.update(visible=False)

        s = get_searcher()
        rows = []
        for q in lines:
            results = s.search(q, top_k)
            for i, (_, row) in enumerate(results.iterrows(), start=1):
                rows.append(
                    {
                        "query_text": q,
                        "rank": i,
                        "concept_id": row["concept_id"],
                        "concept_name": row["concept_name"],
                        "concept_code": row["concept_code"],
                        "vocabulary_id": row["vocabulary_id"],
                        "score": float(row["score"]),
                    }
                )

        if not rows:
            return "No results found.", gr.update(visible=False)

        df = pd.DataFrame(rows)
        tmp_dir = Path(tempfile.gettempdir()) / "thirawat_results"
        tmp_dir.mkdir(parents=True, exist_ok=True)
        out_path = tmp_dir / "batch_results.csv"
        df.to_csv(out_path, index=False)

        md = f"""## Batch Search Complete

- **Queries processed:** {len(lines)}
- **Rows returned:** {len(rows)}
- **Top-K per query:** {top_k}
"""
        return md, gr.update(value=str(out_path), visible=True)
    except Exception as e:
        print("Batch search error:", e)
        print(traceback.format_exc())
        if DEBUG:
            return f"❌ Error: {str(e)}\n\n```\n{traceback.format_exc()}\n```", gr.update(visible=False)
        return f"❌ Error: {str(e)}", gr.update(visible=False)

# ===== GRADIO INTERFACE =====
with gr.Blocks(title="THIRAWAT - Drug Concept Search") as demo:
    gr.HTML(
        """
        <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 10px; margin-bottom: 20px;">
            <h1 style="color: white; margin: 0; font-size: 2em;">THIRAWAT</h1>
            <p style="color: rgba(255,255,255,0.9); margin: 5px 0 0 0;">Drug Concept Entity Linking</p>
            <p style="color: rgba(255,255,255,0.8); margin: 5px 0 0 0;">Map drug names to OMOP concepts using SapBERT + LanceDB.</p>
        </div>
        """
    )

    with gr.Tabs():
        with gr.Tab("Single Query"):
            with gr.Row():
                with gr.Column(scale=3):
                    query_input = gr.Textbox(
                        label="Drug name or query",
                        placeholder="e.g., aspirin, paracetamol, amoxicillin 500mg...",
                        lines=2,
                    )
                with gr.Column(scale=1):
                    domain_hint = gr.Dropdown(
                        label="Domain",
                        choices=["Drug", "Condition", "Procedure", "Observation", "Device", "Unit"],
                        value="Drug",
                        interactive=False,
                    )
                    top_k = gr.Slider(
                        minimum=1,
                        maximum=50,
                        value=10,
                        step=1,
                        label="Number of results",
                    )

            with gr.Row():
                search_btn = gr.Button("Search", variant="primary")
                clear_btn = gr.Button("Clear", variant="secondary")

            output_md = gr.Markdown(label="Results")
            output_table = gr.Dataframe(label="Results Table", interactive=False)

        with gr.Tab("Batch Query"):
            with gr.Row():
                with gr.Column(scale=3):
                    batch_queries = gr.Textbox(
                        label="Drug names (one per line)",
                        placeholder="aspirin\nparacetamol\namoxicillin 500mg",
                        lines=10,
                    )
                with gr.Column(scale=1):
                    batch_domain_hint = gr.Dropdown(
                        label="Domain",
                        choices=["Drug", "Condition", "Procedure", "Observation", "Device", "Unit"],
                        value="Drug",
                        interactive=False,
                    )
                    batch_topk = gr.Slider(
                        minimum=1,
                        maximum=50,
                        value=10,
                        step=1,
                        label="Top-K per query",
                    )

            with gr.Row():
                batch_btn = gr.Button("Process Batch", variant="primary")
                batch_clear = gr.Button("Clear", variant="secondary")

            batch_output = gr.Markdown(label="Summary")
            batch_download = gr.DownloadButton(
                label="Download Results (CSV)",
                variant="secondary",
                visible=False,
            )

    def clear_single():
        return "", 10, "", pd.DataFrame()

    def clear_batch():
        return "", 10, "", gr.update(visible=False)

    search_btn.click(
        fn=search_drugs,
        inputs=[query_input, top_k],
        outputs=[output_md, output_table],
        api_name=False,
    )
    clear_btn.click(
        fn=clear_single,
        outputs=[query_input, top_k, output_md, output_table],
        api_name=False,
    )

    batch_btn.click(
        fn=search_batch,
        inputs=[batch_queries, batch_topk],
        outputs=[batch_output, batch_download],
        api_name=False,
    )
    batch_clear.click(
        fn=clear_batch,
        outputs=[batch_queries, batch_topk, batch_output, batch_download],
        api_name=False,
    )
    gr.Markdown(
        """
        ---

        **THIRAWAT** is a dense retrieval toolkit for mapping drug terminology to OMOP standard concepts.
        """
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
spec/spec.md
ADDED
@@ -0,0 +1,11 @@
# THIRAWAT-mapper-demo

End-to-end demo of THIRAWAT-mapper, a tool for mapping concepts from non-standard terminologies to standard terminologies in the OHDSI/OMOP CDM.

## Key steps

1. Turn the [vocab set downloaded from Athena](spec/athena) into DuckDB format using [`athena2duckdb`](https://pypi.org/project/athena2duckdb/) (`pip install athena2duckdb`).
2. Follow the instructions in [THIRAWAT-mapper](https://github.com/sidataplus/THIRAWAT-mapper).
3. Use <https://huggingface.co/cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR> for retrieval with CLS pooling and <https://huggingface.co/sidataplus/THIRAWAT-SapBERT> for the ColBERT reranker.
4. Build a complete Gradio app; see the preliminary example in [spec/example.py](spec/example.py).
5. Package everything into a Hugging Face Space: <https://huggingface.co/spaces/sidataplus/THIRAWAT-mapper-demo>.
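Step 3 above calls for SapBERT retrieval embeddings with CLS pooling. A minimal sketch of that encoding with plain `transformers`; the max length, batch handling, and L2 normalization here are assumptions made for cosine-style search, not requirements from the spec.

```python
# Minimal CLS-pooling encoder sketch for the retrieval step described in spec/spec.md.
import torch
from transformers import AutoModel, AutoTokenizer

MODEL_ID = "cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModel.from_pretrained(MODEL_ID)
model.eval()


@torch.no_grad()
def encode(texts: list[str]) -> torch.Tensor:
    batch = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
    hidden = model(**batch).last_hidden_state  # (batch, seq_len, dim)
    cls = hidden[:, 0, :]                      # CLS pooling: first token of each sequence
    return torch.nn.functional.normalize(cls, dim=-1)


print(encode(["aspirin 81 mg oral tablet"]).shape)
```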
src/thirawat_demo/__init__.py
ADDED
@@ -0,0 +1,2 @@
"""THIRAWAT mapper demo package."""
src/thirawat_demo/runtime/__init__.py
ADDED
@@ -0,0 +1,8 @@
"""Runtime modules for the Hugging Face Space app."""

from .config import RuntimeConfig
from .index_loader import resolve_lancedb_dir
from .peft_reranker import ThirawatPeftReranker
from .search_service import SearchService

__all__ = ["RuntimeConfig", "resolve_lancedb_dir", "ThirawatPeftReranker", "SearchService"]
src/thirawat_demo/runtime/config.py
ADDED
@@ -0,0 +1,138 @@
"""Runtime configuration for the Gradio app."""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
import os

DEFAULT_ENCODER_MODEL_ID = "cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR"
DEFAULT_RERANKER_ID = "sidataplus/THIRAWAT-SapBERT"


def _clean_env(name: str, default: str | None = None) -> str | None:
    value = os.getenv(name)
    if value is None:
        return default
    cleaned = value.strip()
    if cleaned == "" or cleaned.lower() == "none":
        return default
    return cleaned


def _env_int(name: str, default: int) -> int:
    value = _clean_env(name)
    if value is None:
        return default
    return int(value)


def _env_float(name: str, default: float) -> float:
    value = _clean_env(name)
    if value is None:
        return default
    return float(value)


def _env_bool(name: str, default: bool = False) -> bool:
    value = _clean_env(name)
    if value is None:
        return default
    return value.lower() in {"1", "true", "yes", "on"}


@dataclass(frozen=True)
class RuntimeConfig:
    local_index_path: Path
    index_repo: str | None
    index_revision: str
    hf_token: str | None
    hf_data_dir: Path
    lancedb_table: str
    top_k_default: int
    candidate_topk: int
    retrieval_topk: int
    device: str
    encoder_model_id: str
    reranker_id: str
    post_mode: str
    post_weight: float
    tiebreak_eps: float
    tiebreak_topn: int
    post_strength_weight: float
    post_jaccard_weight: float
    post_brand_penalty: float
    post_minmax: bool
    brand_strict: bool
    debug: bool

    @classmethod
    def from_env(cls) -> "RuntimeConfig":
        local_index_path = Path(_clean_env("LOCAL_INDEX_PATH", "data/lancedb") or "data/lancedb")
        index_repo = _clean_env("INDEX_REPO")
        index_revision = _clean_env("INDEX_REVISION", "main") or "main"
        hf_token = _clean_env("HF_TOKEN")
        hf_data_dir = Path(_clean_env("HF_DATA_DIR", "/data") or "/data")
        lancedb_table = _clean_env("LANCEDB_TABLE", "concepts_drug") or "concepts_drug"
        top_k_default = _env_int("TOP_K_DEFAULT", 10)
        candidate_topk = _env_int("CANDIDATE_TOPK", 100)
        retrieval_topk = _env_int("RETRIEVAL_TOPK", 200)
        device = (_clean_env("DEVICE", "auto") or "auto").lower()
        encoder_model_id = _clean_env("ENCODER_MODEL_ID", DEFAULT_ENCODER_MODEL_ID) or DEFAULT_ENCODER_MODEL_ID
        reranker_id = _clean_env("RERANKER_ID", DEFAULT_RERANKER_ID) or DEFAULT_RERANKER_ID
        post_mode = (_clean_env("POST_MODE", "tiebreak") or "tiebreak").strip().lower()
        post_weight = _env_float("POST_WEIGHT", 0.05)
        tiebreak_eps = _env_float("TIEBREAK_EPS", 0.01)
        tiebreak_topn = _env_int("TIEBREAK_TOPN", 50)
        post_strength_weight = _env_float("POST_STRENGTH_WEIGHT", 0.6)
        post_jaccard_weight = _env_float("POST_JACCARD_WEIGHT", 0.4)
        post_brand_penalty = _env_float("POST_BRAND_PENALTY", 0.3)
        post_minmax = _env_bool("POST_MINMAX", True)
        brand_strict = _env_bool("BRAND_STRICT", False)
        debug = _env_bool("DEBUG", False)

        if candidate_topk <= 0:
            raise ValueError("CANDIDATE_TOPK must be > 0.")
        if retrieval_topk < candidate_topk:
            retrieval_topk = candidate_topk
        if top_k_default <= 0:
            raise ValueError("TOP_K_DEFAULT must be > 0.")
        if device not in {"cpu", "cuda", "mps", "auto"}:
            raise ValueError("DEVICE must be one of: auto, cpu, cuda, mps.")
        if post_mode not in {"blend", "tiebreak", "lex"}:
            raise ValueError("POST_MODE must be one of: blend, tiebreak, lex.")
        if tiebreak_topn <= 0:
            raise ValueError("TIEBREAK_TOPN must be > 0.")

        local_has_index = (local_index_path / "db").exists() or (
            local_index_path.name == "db" and local_index_path.exists()
        )
        if not local_has_index and not index_repo:
            raise ValueError(
                "INDEX_REPO is required when LOCAL_INDEX_PATH does not point to an existing index."
            )

        return cls(
            local_index_path=local_index_path,
            index_repo=index_repo,
            index_revision=index_revision,
            hf_token=hf_token,
            hf_data_dir=hf_data_dir,
            lancedb_table=lancedb_table,
            top_k_default=top_k_default,
            candidate_topk=candidate_topk,
            retrieval_topk=retrieval_topk,
            device=device,
            encoder_model_id=encoder_model_id,
            reranker_id=reranker_id,
            post_mode=post_mode,
            post_weight=post_weight,
            tiebreak_eps=tiebreak_eps,
            tiebreak_topn=tiebreak_topn,
            post_strength_weight=post_strength_weight,
            post_jaccard_weight=post_jaccard_weight,
            post_brand_penalty=post_brand_penalty,
            post_minmax=post_minmax,
            brand_strict=brand_strict,
            debug=debug,
        )
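A short, hedged sketch of how this configuration is read from the environment; it assumes a local LanceDB index already exists under `data/lancedb/db`, so `INDEX_REPO` can stay unset:

```python
# Illustrative usage of RuntimeConfig.from_env(); values here are examples only.
import os

os.environ["LOCAL_INDEX_PATH"] = "data/lancedb"  # assumes data/lancedb/db exists
os.environ["DEVICE"] = "cpu"
os.environ["POST_MODE"] = "tiebreak"

from thirawat_demo.runtime.config import RuntimeConfig

config = RuntimeConfig.from_env()
print(config.encoder_model_id, config.retrieval_topk, config.candidate_topk)
```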
src/thirawat_demo/runtime/index_loader.py
ADDED
@@ -0,0 +1,69 @@
"""Index download/load helpers for runtime app."""

from __future__ import annotations

from pathlib import Path

from huggingface_hub import snapshot_download

from .config import RuntimeConfig


def _can_connect_lancedb(path: Path) -> bool:
    try:
        import lancedb

        lancedb.connect(str(path))
        return True
    except Exception:
        return False


def _find_lancedb_dir(root: Path) -> Path:
    candidates = []
    if root.name == "db":
        candidates.append(root)
    candidates.append(root / "db")
    candidates.append(root)

    seen: set[Path] = set()
    for candidate in candidates:
        if candidate in seen:
            continue
        seen.add(candidate)
        if candidate.exists() and candidate.is_dir() and _can_connect_lancedb(candidate):
            return candidate

    for candidate in root.rglob("db"):
        if candidate.is_dir() and _can_connect_lancedb(candidate):
            return candidate

    raise FileNotFoundError(f"Could not find a valid LanceDB directory under: {root}")


def resolve_lancedb_dir(config: RuntimeConfig) -> Path:
    local = config.local_index_path
    if local.exists():
        return _find_lancedb_dir(local)

    if not config.index_repo:
        raise ValueError("INDEX_REPO must be configured when local index is unavailable.")

    download_root = config.hf_data_dir / "lancedb"
    try:
        download_root.mkdir(parents=True, exist_ok=True)
    except OSError:
        download_root = Path("data/lancedb")
        download_root.mkdir(parents=True, exist_ok=True)

    snapshot_path = Path(
        snapshot_download(
            repo_id=config.index_repo,
            repo_type="dataset",
            revision=config.index_revision,
            token=config.hf_token or False,
            local_dir=str(download_root),
        )
    )
    return _find_lancedb_dir(snapshot_path)
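A hedged sketch of how this loader is meant to be driven when no local index is present; the dataset repo id below is a placeholder, not a published dataset:

```python
# Illustrative only: resolve a LanceDB directory, downloading a Hub dataset
# snapshot when the local path is missing. "your-org/your-index-dataset" is a
# placeholder repo id.
import os

os.environ["INDEX_REPO"] = "your-org/your-index-dataset"
os.environ["LOCAL_INDEX_PATH"] = "missing-local-index"  # forces the download path

from thirawat_demo.runtime import RuntimeConfig, resolve_lancedb_dir

config = RuntimeConfig.from_env()
db_dir = resolve_lancedb_dir(config)  # a directory lancedb.connect() accepts
print(db_dir)
```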
src/thirawat_demo/runtime/peft_reranker.py
ADDED
@@ -0,0 +1,227 @@
"""PEFT-aware THIRAWAT reranker compatible with LanceDB rerank API."""

from __future__ import annotations

from os import PathLike
from pathlib import Path
from typing import List, Optional

import numpy as np
import pyarrow as pa
import torch

from thirawat_mapper.models.bms_pooling import bms_scores

DEFAULT_RERANKER_ID = "sidataplus/THIRAWAT-SapBERT"


def disable_mean_resizing() -> None:
    """Patch HF resize_token_embeddings default to mean_resizing=False."""
    try:
        from transformers import PreTrainedModel  # type: ignore
    except Exception:
        return
    if getattr(PreTrainedModel, "_mean_resizing_patched", False):
        return

    orig_resize = PreTrainedModel.resize_token_embeddings

    def patched(self, new_num_tokens=None, pad_to_multiple_of=None, mean_resizing=False, **kwargs):
        return orig_resize(
            self,
            new_num_tokens=new_num_tokens,
            pad_to_multiple_of=pad_to_multiple_of,
            mean_resizing=mean_resizing,
            **kwargs,
        )

    PreTrainedModel.resize_token_embeddings = patched  # type: ignore[assignment]
    PreTrainedModel._mean_resizing_patched = True  # type: ignore[attr-defined]


def _resolve_model_id(model_id: str | PathLike[str]) -> str:
    try:
        path = Path(model_id).expanduser()
    except TypeError:
        return str(model_id)
    if path.exists():
        return str(path.resolve())
    return str(model_id)


def _load_colbert_with_peft(
    model_id: str,
    device: Optional[str],
    max_query_len: int,
    max_doc_len: int,
):
    """Load ColBERT and attach PEFT adapter when the checkpoint is adapter-only."""

    disable_mean_resizing()

    try:
        from peft import PeftConfig, PeftModel  # type: ignore
    except Exception:
        return None

    try:
        peft_cfg = PeftConfig.from_pretrained(model_id)
    except Exception:
        return None

    try:
        from pylate.models import ColBERT  # type: ignore
        from safetensors.torch import load_file  # type: ignore
    except Exception:
        return None

    base_id = peft_cfg.base_model_name_or_path
    tok_kwargs = {"tokenizer_name_or_path": model_id}
    model = ColBERT(
        model_name_or_path=base_id,
        device=device or None,
        query_length=int(max_query_len),
        document_length=int(max_doc_len),
        tokenizer_kwargs=tok_kwargs,
    )

    encoder = model._first_module().auto_model  # type: ignore[attr-defined]
    try:
        encoder.resize_token_embeddings(len(model.tokenizer), mean_resizing=False)
    except Exception:
        pass

    try:
        encoder = PeftModel.from_pretrained(encoder, model_id, is_trainable=False)
        encoder.eval()
        model._first_module().auto_model = encoder  # type: ignore[attr-defined]
    except Exception:
        return None

    dense_dir = Path(model_id).expanduser() / "1_Dense"
    dense_weights = None
    if dense_dir.exists():
        safetensor_path = dense_dir / "model.safetensors"
        if safetensor_path.exists():
            try:
                dense_weights = load_file(safetensor_path)
            except Exception:
                dense_weights = None
        if dense_weights is None:
            legacy_bin = dense_dir / "pytorch_model.bin"
            if legacy_bin.exists():
                try:
                    dense_weights = torch.load(legacy_bin, map_location="cpu")
                except Exception:
                    dense_weights = None
    if dense_weights and len(model) > 1:
        try:
            model[1].load_state_dict(dense_weights, strict=False)
        except Exception:
            pass
    return model


class _PylateColbert:
    def __init__(self, model_id: str, device: Optional[str], max_query_len: int = 128, max_doc_len: int = 128):
        from pylate.models import ColBERT  # type: ignore

        peft_model = _load_colbert_with_peft(model_id, device, max_query_len, max_doc_len)
        if peft_model is not None:
            self.model = peft_model
        else:
            self.model = ColBERT(
                model_name_or_path=model_id,
                device=device or None,
                query_length=int(max_query_len),
                document_length=int(max_doc_len),
            )
        try:
            if device:
                self.model.to(torch.device(device))
        except Exception:
            pass

    def encode_query(self, text: str):
        tok = self.model.tokenize([text], is_query=True, pad=True)
        try:
            device = next(self.model.parameters()).device
            tok = {k: (v.to(device) if hasattr(v, "to") else v) for k, v in tok.items()}
        except Exception:
            pass
        out = self.model(tok)
        return out["token_embeddings"], out.get("attention_mask")

    def encode_docs(self, texts: List[str]):
        tok = self.model.tokenize(texts, is_query=False, pad=False)
        try:
            device = next(self.model.parameters()).device
            tok = {k: (v.to(device) if hasattr(v, "to") else v) for k, v in tok.items()}
        except Exception:
            pass
        out = self.model(tok)
        return out["token_embeddings"], out.get("attention_mask")


class ThirawatPeftReranker:
    """LanceDB-compatible THIRAWAT reranker with PEFT adapter support."""

    def __init__(
        self,
        model_id: str | PathLike[str] = DEFAULT_RERANKER_ID,
        *,
        device: str | None = None,
        return_score: str = "all",
        column: str = "profile_text",
    ) -> None:
        self.model_id = _resolve_model_id(model_id)
        self.device = device
        self.return_score = return_score
        self.score = return_score
        self.column = column
        self._scorer: Optional[_PylateColbert] = None

    @property
    def scorer(self) -> _PylateColbert:
        if self._scorer is None:
            self._scorer = _PylateColbert(self.model_id, self.device)
        return self._scorer

    def rerank(self, query: str | None, results: pa.Table) -> pa.Table:
        return self.rerank_vector(query, results)

    def rerank_vector(self, query: str | None, vector_results: pa.Table) -> pa.Table:
        col = self.column if self.column in vector_results.column_names else None
        if col is None and "profile_text_norm" in vector_results.column_names:
            col = "profile_text_norm"
        if col is None:
            raise ValueError(f"Candidate table missing '{self.column}' (or 'profile_text_norm') column.")

        texts: List[str] = vector_results[col].to_pylist()
        qtext = str(query or "").strip()
        if not qtext:
            raise ValueError("A text query is required for reranking.")

        q_emb, q_mask = self.scorer.encode_query(qtext)
        d_emb, d_mask = self.scorer.encode_docs(texts)
        with torch.no_grad():
            scores = bms_scores(q_emb, d_emb, q_mask, d_mask)
        scores = scores.detach().float().cpu().numpy().reshape(-1)
        df = vector_results.to_pandas()
        df["_relevance_score"] = scores.astype(np.float32)
        df = df.sort_values("_relevance_score", ascending=False, kind="mergesort").reset_index(drop=True)
        return pa.Table.from_pandas(df, preserve_index=False)

    def rerank_fts(self, query: str | None, fts_results: pa.Table) -> pa.Table:
        return self.rerank_vector(query, fts_results)

    def rerank_hybrid(self, query: str | None, vector_results: pa.Table, fts_results: pa.Table) -> pa.Table:
        return self.rerank_vector(query, vector_results)


__all__ = [
    "DEFAULT_RERANKER_ID",
    "ThirawatPeftReranker",
    "_load_colbert_with_peft",
    "disable_mean_resizing",
]
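A hedged sketch of how a LanceDB-compatible reranker like this one is attached to a vector search. It mirrors the call pattern used by `SearchService` below; the table name, query vector, and query text are placeholders:

```python
# Illustrative only: plug the reranker into a LanceDB vector search builder.
# Assumes the table has the "profile_text" column the reranker expects.
import lancedb
from thirawat_demo.runtime.peft_reranker import ThirawatPeftReranker

db = lancedb.connect("data/lancedb/db")
table = db.open_table("concepts_drug")
reranker = ThirawatPeftReranker(device="cpu")

query_vector = [0.0] * 768  # placeholder; normally the SapBERT CLS embedding of the query
results = (
    table.search(query_vector, query_type="vector")
    .limit(200)
    .rerank(reranker=reranker, query_string="amoxicillin 500 mg")
    .limit(100)
    .to_pandas()
)
```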
src/thirawat_demo/runtime/search_service.py
ADDED
@@ -0,0 +1,307 @@
"""Two-stage retrieval + reranking runtime service."""

from __future__ import annotations

import json
from pathlib import Path
from typing import Any, Sequence

import pandas as pd
from thirawat_mapper.infer.utils import enrich_with_post_scores, should_apply_post

from .config import RuntimeConfig
from .peft_reranker import ThirawatPeftReranker

DEFAULT_CONCEPT_CLASSES = [
    "Clinical Drug",
    "Quant Clinical Drug",
    "Clinical Drug Comp",
    "Clinical Drug Form",
    "Branded Drug",
    "Quant Branded Drug",
    "Branded Drug Comp",
    "Branded Drug Form",
    "Ingredient",
]


class SearchService:
    """Runtime search service with mandatory two-stage ranking."""

    def __init__(self, config: RuntimeConfig, lancedb_dir: Path) -> None:
        self.config = config
        self.lancedb_dir = lancedb_dir
        self._table = None
        self._vector_column: str | None = None
        self._embedder = None
        self._reranker = None
        self._normalize_text = None
        self._available_concept_classes = self._load_manifest_concept_classes()

    def startup(self) -> None:
        try:
            from thirawat_mapper.infer.utils import configure_torch_for_infer, resolve_device
            from thirawat_mapper.models import SapBERTEmbedder
            from thirawat_mapper.utils import connect_table, normalize_text_value
        except Exception as exc:  # pragma: no cover - import-time failure path
            raise RuntimeError(f"Failed to import thirawat-mapper runtime dependencies: {exc}") from exc

        device = self.config.device
        if device == "auto":
            device = resolve_device("auto")

        configure_torch_for_infer(device)

        try:
            table, vector_column = connect_table(self.lancedb_dir, self.config.lancedb_table)
        except Exception as exc:
            raise RuntimeError(
                f"Failed to connect LanceDB table '{self.config.lancedb_table}' in '{self.lancedb_dir}': {exc}"
            ) from exc

        try:
            embedder = SapBERTEmbedder(
                model_id=self.config.encoder_model_id,
                device=device,
                batch_size=64,
                max_length=128,
                pooling="cls",
            )
            reranker = ThirawatPeftReranker(
                model_id=self.config.reranker_id,
                device=device,
                return_score="all",
            )
        except Exception as exc:
            raise RuntimeError(f"Failed to initialize embedding/reranker models: {exc}") from exc

        # Force lazy components to load at startup so runtime errors fail fast.
        try:
            _ = embedder.encode(["startup warmup"])
            _ = reranker.scorer.encode_query("startup warmup")
        except Exception as exc:
            raise RuntimeError(f"Failed to warm up models at startup: {exc}") from exc

        self._table = table
        self._vector_column = vector_column
        self._embedder = embedder
        self._reranker = reranker
        self._normalize_text = normalize_text_value

    def available_concept_classes(self) -> list[str]:
        if self._available_concept_classes:
            return list(self._available_concept_classes)

        self._ensure_started()
        assert self._table is not None

        try:
            df = self._table.to_arrow().to_pandas()
        except Exception:
            return list(DEFAULT_CONCEPT_CLASSES)

        if "concept_class_id" not in df.columns:
            return list(DEFAULT_CONCEPT_CLASSES)

        values = sorted(
            {
                str(value).strip()
                for value in df["concept_class_id"].dropna().tolist()
                if str(value).strip()
            }
        )
        return values or list(DEFAULT_CONCEPT_CLASSES)

    def _load_manifest_concept_classes(self) -> list[str]:
        manifest_path = self.lancedb_dir / f"{self.config.lancedb_table}_manifest.json"
        if not manifest_path.exists():
            return list(DEFAULT_CONCEPT_CLASSES)
        try:
            payload = json.loads(manifest_path.read_text(encoding="utf-8"))
        except Exception:
            return list(DEFAULT_CONCEPT_CLASSES)
        raw_values = payload.get("concept_class_id")
        if not isinstance(raw_values, list):
            return list(DEFAULT_CONCEPT_CLASSES)
        values = [str(value).strip() for value in raw_values if str(value).strip()]
        return values or list(DEFAULT_CONCEPT_CLASSES)

    def _ensure_started(self) -> None:
        if self._table is None or self._vector_column is None or self._embedder is None or self._reranker is None:
            self.startup()

    def search(self, query: str, top_k: int, concept_class_ids: Sequence[str] | None = None) -> pd.DataFrame:
        where_clause = self._build_concept_class_where(concept_class_ids)
        if concept_class_ids is not None and where_clause is None:
            return pd.DataFrame(columns=self._ordered_output_columns())

        self._ensure_started()
        assert self._table is not None
        assert self._vector_column is not None
        assert self._embedder is not None
        assert self._reranker is not None
        assert self._normalize_text is not None

        if not query or not query.strip():
            return pd.DataFrame(columns=self._ordered_output_columns())

        normalized_query = self._normalize_text(query)
        query_emb = self._embedder.encode([normalized_query])[0]

        builder = self._table.search(
            query_emb.astype(float).tolist(),
            vector_column_name=self._vector_column,
            query_type="vector",
        )
        if where_clause:
            schema_names = set(getattr(getattr(self._table, "schema", None), "names", []))
            if "concept_class_id" not in schema_names:
                raise RuntimeError("Requested concept class filtering but table has no 'concept_class_id' column.")
            builder = builder.where(where_clause)

        builder = builder.distance_type("cosine").limit(self.config.retrieval_topk)

        # Mandatory two-stage retrieval: vector candidates then THIRAWAT reranking.
        result = builder.rerank(reranker=self._reranker, query_string=normalized_query).limit(
            self.config.candidate_topk
        )

        df = result.to_arrow().to_pandas()
        if df.empty:
            return pd.DataFrame(columns=self._ordered_output_columns())

        df = self._apply_post_scoring(df, normalized_query)
        df = self._finalize_scores(df)
        df = self._ensure_columns(
            df,
            required=[
                "concept_id",
                "concept_name",
                "concept_code",
                "vocabulary_id",
                "concept_class_id",
                "score",
                "retrieval_score",
            ],
        )
        df.insert(0, "rank", list(range(1, len(df) + 1)))
        return df[self._ordered_output_columns()].head(top_k).reset_index(drop=True)

    def search_batch(
        self,
        queries: list[str],
        top_k: int,
        concept_class_ids: Sequence[str] | None = None,
    ) -> pd.DataFrame:
        rows: list[dict[str, Any]] = []
        for query in queries:
            result = self.search(query, top_k, concept_class_ids=concept_class_ids)
            if result.empty:
                continue
            for _, row in result.iterrows():
                rows.append(
                    {
                        "query_text": query,
                        "rank": int(row["rank"]),
                        "concept_id": row["concept_id"],
                        "concept_name": row["concept_name"],
                        "concept_code": row["concept_code"],
                        "vocabulary_id": row["vocabulary_id"],
                        "concept_class_id": row.get("concept_class_id"),
                        "score": float(row["score"]),
                        "retrieval_score": float(row["retrieval_score"]),
                    }
                )
        if not rows:
            return pd.DataFrame(
                columns=[
                    "query_text",
                    "rank",
                    "concept_id",
                    "concept_name",
                    "concept_code",
                    "vocabulary_id",
                    "concept_class_id",
                    "score",
                    "retrieval_score",
                ]
            )
        return pd.DataFrame(rows)

    def _apply_post_scoring(self, df: pd.DataFrame, normalized_query: str) -> pd.DataFrame:
        if should_apply_post(self.config.post_mode, self.config.post_weight):
            return enrich_with_post_scores(
                df,
                normalized_query,
                post_strength_weight=self.config.post_strength_weight,
                post_jaccard_weight=self.config.post_jaccard_weight,
                post_brand_penalty=self.config.post_brand_penalty,
                post_minmax=self.config.post_minmax,
                post_weight=self.config.post_weight,
                prefer_brand=True,
                post_mode=self.config.post_mode,
                tiebreak_eps=self.config.tiebreak_eps,
                tiebreak_topn=self.config.tiebreak_topn,
                brand_strict=self.config.brand_strict,
            )

        base_col = "_relevance_score" if "_relevance_score" in df.columns else "score"
        if base_col in df.columns:
            return df.sort_values(base_col, ascending=False, kind="mergesort").reset_index(drop=True)
        return df.reset_index(drop=True)

    @staticmethod
    def _build_concept_class_where(concept_class_ids: Sequence[str] | None) -> str | None:
        if concept_class_ids is None:
            return None
        values = sorted({str(value).strip() for value in concept_class_ids if str(value).strip()})
        if not values:
            return None
        escaped = [value.replace("'", "''") for value in values]
        if len(escaped) == 1:
            return f"concept_class_id = '{escaped[0]}'"
        return "concept_class_id IN (" + ",".join(f"'{value}'" for value in escaped) + ")"

    @staticmethod
    def _ensure_columns(df: pd.DataFrame, required: list[str]) -> pd.DataFrame:
        for column in required:
            if column not in df.columns:
                df[column] = None
        return df

    @staticmethod
    def _finalize_scores(df: pd.DataFrame) -> pd.DataFrame:
        if "final_score" in df.columns:
            df["score"] = pd.to_numeric(df["final_score"], errors="coerce").fillna(0.0)
        elif "_relevance_score" in df.columns:
            df["score"] = pd.to_numeric(df["_relevance_score"], errors="coerce").fillna(0.0)
        elif "_distance" in df.columns:
            distance = pd.to_numeric(df["_distance"], errors="coerce").fillna(1.0)
            df["score"] = 1.0 - distance
        elif "score" in df.columns:
            df["score"] = pd.to_numeric(df["score"], errors="coerce").fillna(0.0)
        else:
            df["score"] = 0.0

        if "retrieval_score" in df.columns:
            df["retrieval_score"] = pd.to_numeric(df["retrieval_score"], errors="coerce").fillna(df["score"])
        elif "_distance" in df.columns:
            distance = pd.to_numeric(df["_distance"], errors="coerce").fillna(1.0)
            df["retrieval_score"] = 1.0 - distance
        else:
            df["retrieval_score"] = df["score"]

        return df.reset_index(drop=True)

    @staticmethod
    def _ordered_output_columns() -> list[str]:
        return [
            "rank",
            "concept_id",
            "concept_name",
            "concept_code",
            "vocabulary_id",
            "concept_class_id",
            "score",
            "retrieval_score",
        ]
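A hedged sketch of how the service is wired end to end, mirroring the pattern the Space UI uses (config, index resolution, startup, then a query); it assumes the models and a built index are available locally:

```python
# Illustrative only: config -> index -> service -> one query.
from thirawat_demo.runtime import RuntimeConfig, SearchService, resolve_lancedb_dir

config = RuntimeConfig.from_env()
db_dir = resolve_lancedb_dir(config)

service = SearchService(config=config, lancedb_dir=db_dir)
service.startup()  # fail fast: connects the table and warms up both models

hits = service.search("Augmentin 875/125", top_k=5, concept_class_ids=["Branded Drug"])
print(hits[["rank", "concept_id", "concept_name", "score"]])
```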
src/thirawat_demo/space_ui.py
ADDED
@@ -0,0 +1,310 @@
"""Gradio UI for Hugging Face Space runtime."""

from __future__ import annotations

import os
import tempfile
import traceback
from pathlib import Path
from typing import Sequence

import gradio as gr
import pandas as pd

from thirawat_demo.runtime import RuntimeConfig, SearchService, resolve_lancedb_dir

DEFAULT_CONCEPT_CLASSES = [
    "Clinical Drug",
    "Quant Clinical Drug",
    "Clinical Drug Comp",
    "Clinical Drug Form",
    "Branded Drug",
    "Quant Branded Drug",
    "Branded Drug Comp",
    "Branded Drug Form",
    "Ingredient",
]
DEFAULT_SINGLE_QUERY = "Augmentin 875/125"

_SERVICE: SearchService | None = None
_CONFIG: RuntimeConfig | None = None
_RESULT_COLUMNS = [
    "rank",
    "concept_id",
    "concept_name",
    "concept_code",
    "vocabulary_id",
    "concept_class",
]


def _get_service(*, startup: bool = True) -> tuple[SearchService, RuntimeConfig]:
    global _SERVICE, _CONFIG
    if _SERVICE is None or _CONFIG is None:
        _CONFIG = RuntimeConfig.from_env()
        db_dir = resolve_lancedb_dir(_CONFIG)
        _SERVICE = SearchService(config=_CONFIG, lancedb_dir=db_dir)
        if startup:
            _SERVICE.startup()
    return _SERVICE, _CONFIG


def _resolve_concept_class_choices() -> list[str]:
    try:
        config = RuntimeConfig.from_env()
    except Exception:
        return list(DEFAULT_CONCEPT_CLASSES)

    local = config.local_index_path
    local_has_index = (local / "db").exists() or (local.name == "db" and local.exists())
    if not local_has_index:
        return list(DEFAULT_CONCEPT_CLASSES)

    try:
        db_dir = resolve_lancedb_dir(config)
        service = SearchService(config=config, lancedb_dir=db_dir)
        return service.available_concept_classes()
    except Exception:
        return list(DEFAULT_CONCEPT_CLASSES)


def _top_k_default() -> int:
    raw = (os.getenv("TOP_K_DEFAULT") or "").strip()
    if not raw:
        return 10
    try:
        value = int(raw)
    except ValueError:
        return 10
    return value if value > 0 else 10


def _empty_results_df() -> pd.DataFrame:
    return pd.DataFrame(columns=_RESULT_COLUMNS)


def _to_display_results(results: pd.DataFrame) -> pd.DataFrame:
    if results is None or results.empty:
        return _empty_results_df()

    df = results.copy()
    concept_class = (
        df["concept_class_id"].astype(str)
        if "concept_class_id" in df.columns
        else pd.Series([""] * len(df), index=df.index, dtype=object)
    )
    concept_class = concept_class.where(~concept_class.isin(["None", "nan"]), "")
    df["concept_class"] = concept_class
    for column in _RESULT_COLUMNS:
        if column not in df.columns:
            df[column] = None
    return df[_RESULT_COLUMNS].reset_index(drop=True)


def _format_single_results(query: str, results: pd.DataFrame) -> tuple[str, pd.DataFrame]:
    if results.empty:
        return "No results found. Try a different query.", _empty_results_df()

    top = results.iloc[0]
    md = (
        f"## Results for: \"{query}\"\n\n"
        f"**Top match:** {top['concept_name']} (score {float(top['score']):.4f})\n\n"
        f"Rows returned: **{len(results)}**"
    )
    return md, _to_display_results(results)


def _validate_concept_classes(concept_class_ids: Sequence[str] | None) -> list[str] | None:
    if concept_class_ids is None:
        return None
    cleaned = [str(value).strip() for value in concept_class_ids if str(value).strip()]
    return cleaned


def search_single(query: str, top_k: int, concept_class_ids: Sequence[str]) -> tuple[str, pd.DataFrame]:
    try:
        selected = _validate_concept_classes(concept_class_ids)
        if selected is not None and not selected:
            return "Please select at least one concept class.", _empty_results_df()
        service, _ = _get_service()
        results = service.search(query=query, top_k=top_k, concept_class_ids=selected)
        return _format_single_results(query, results)
    except Exception as exc:  # pragma: no cover - runtime path
        stack = traceback.format_exc()
        print(stack)
        return f"Error: {exc}", _empty_results_df()


def search_batch(queries_text: str, top_k: int, concept_class_ids: Sequence[str]) -> tuple[str, gr.update]:
    try:
        lines = [line.strip() for line in (queries_text or "").splitlines() if line.strip()]
        if not lines:
            return "Please provide one query per line.", gr.update(visible=False)

        selected = _validate_concept_classes(concept_class_ids)
        if selected is not None and not selected:
            return "Please select at least one concept class.", gr.update(visible=False)

        service, _ = _get_service()
        results = service.search_batch(lines, top_k=top_k, concept_class_ids=selected)
        if results.empty:
            return "No batch results found.", gr.update(visible=False)

        out_dir = Path(tempfile.gettempdir()) / "thirawat_mapper_demo"
        out_dir.mkdir(parents=True, exist_ok=True)
        out_path = out_dir / "batch_results.csv"
        results.to_csv(out_path, index=False)

        md = (
            "## Batch Search Complete\n\n"
            f"- Queries processed: **{len(lines)}**\n"
            f"- Rows returned: **{len(results)}**\n"
            f"- Top-K per query: **{top_k}**"
        )
        return md, gr.update(value=str(out_path), visible=True)
    except Exception as exc:  # pragma: no cover - runtime path
        stack = traceback.format_exc()
        print(stack)
        return f"Error: {exc}", gr.update(visible=False)


def _clear_single(concept_classes: Sequence[str]) -> tuple[str, int, Sequence[str], str, pd.DataFrame]:
    return DEFAULT_SINGLE_QUERY, _top_k_default(), concept_classes, "", _empty_results_df()


def _clear_batch(concept_classes: Sequence[str]) -> tuple[str, int, Sequence[str], str, gr.update]:
    return "", _top_k_default(), concept_classes, "", gr.update(visible=False)


def build_demo() -> gr.Blocks:
    top_k_default = _top_k_default()
    concept_class_choices = _resolve_concept_class_choices()
    concept_class_default = list(concept_class_choices)

    with gr.Blocks(title="THIRAWAT Mapper Demo") as demo:
        gr.Markdown(
            """
            # THIRAWAT Mapper Demo
            Map non-standard drug terms to OMOP standard concepts using SapBERT retrieval + THIRAWAT reranking.
            """
        )
        with gr.Tabs():
            with gr.Tab("Single Query"):
                with gr.Row():
                    with gr.Column(scale=3):
                        query_input = gr.Textbox(
                            label="Drug query",
                            placeholder="e.g., aspirin, amoxicillin 500 mg, paracetamol",
                            value=DEFAULT_SINGLE_QUERY,
                            lines=1,
                        )
                    with gr.Column(scale=1):
                        domain = gr.Dropdown(
                            label="Domain",
                            choices=["Drug"],
                            value="Drug",
                            interactive=False,
                        )
                        top_k = gr.Slider(
                            minimum=1,
                            maximum=50,
                            value=top_k_default,
                            step=1,
                            label="Number of results",
                        )
                with gr.Row():
                    single_search = gr.Button("Search", variant="primary")
                    single_clear = gr.Button("Clear")
                with gr.Row():
                    single_concept_classes = gr.Dropdown(
                        label="Concept classes",
                        choices=concept_class_choices,
                        value=concept_class_default,
                        multiselect=True,
                        filterable=True,
                        allow_custom_value=False,
                        interactive=True,
                    )
                single_md = gr.Markdown(label="Summary")
                single_table = gr.Dataframe(
                    label="Results",
                    headers=_RESULT_COLUMNS,
                    value=_empty_results_df(),
                    interactive=False,
                )

            with gr.Tab("Batch Query"):
                with gr.Row():
                    with gr.Column(scale=3):
                        batch_input = gr.Textbox(
                            label="Queries (one per line)",
                            placeholder="aspirin\namoxicillin 500 mg\nparacetamol",
                            lines=10,
                        )
                    with gr.Column(scale=1):
                        batch_domain = gr.Dropdown(
                            label="Domain",
                            choices=["Drug"],
                            value="Drug",
                            interactive=False,
                        )
                        batch_top_k = gr.Slider(
                            minimum=1,
                            maximum=50,
                            value=top_k_default,
                            step=1,
                            label="Top-K per query",
                        )
                with gr.Row():
                    batch_search_btn = gr.Button("Process Batch", variant="primary")
                    batch_clear_btn = gr.Button("Clear")
                with gr.Row():
                    batch_concept_classes = gr.Dropdown(
                        label="Concept classes",
                        choices=concept_class_choices,
                        value=concept_class_default,
                        multiselect=True,
                        filterable=True,
                        allow_custom_value=False,
                        interactive=True,
                    )
                batch_md = gr.Markdown(label="Summary")
                batch_download = gr.DownloadButton(
                    label="Download CSV",
                    variant="secondary",
                    visible=False,
                )

        single_search.click(
            fn=search_single,
            inputs=[query_input, top_k, single_concept_classes],
            outputs=[single_md, single_table],
            api_name="search_single",
        )
        query_input.submit(
            fn=search_single,
            inputs=[query_input, top_k, single_concept_classes],
            outputs=[single_md, single_table],
            api_name=False,
        )
        single_clear.click(
            fn=lambda: _clear_single(concept_class_default),
            outputs=[query_input, top_k, single_concept_classes, single_md, single_table],
            api_name=False,
        )
        batch_search_btn.click(
            fn=search_batch,
            inputs=[batch_input, batch_top_k, batch_concept_classes],
            outputs=[batch_md, batch_download],
            api_name="search_batch",
        )
        batch_clear_btn.click(
            fn=lambda: _clear_batch(concept_class_default),
            outputs=[batch_input, batch_top_k, batch_concept_classes, batch_md, batch_download],
            api_name=False,
        )

        domain.change(lambda: "Drug", outputs=domain, api_name=False)
        batch_domain.change(lambda: "Drug", outputs=batch_domain, api_name=False)

    return demo
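The repository's own `app.py` is the actual Space entry point; a minimal sketch of what such an entry point looks like when it simply mounts this UI (illustrative, not the shipped file):

```python
# Sketch of a Space entry point that mounts the Blocks UI built above.
from thirawat_demo.space_ui import build_demo

demo = build_demo()

if __name__ == "__main__":
    demo.launch()
```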
tests/conftest.py
ADDED
@@ -0,0 +1,12 @@
"""Test bootstrap for src-layout imports."""

from __future__ import annotations

from pathlib import Path
import sys

ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))
tests/test_build_lancedb_index.py
ADDED
@@ -0,0 +1,53 @@
from __future__ import annotations

import importlib.util
from pathlib import Path
import sys
import types


def _load_script_module():
    root = Path(__file__).resolve().parents[1]
    script_path = root / "scripts" / "offline" / "build_lancedb_index.py"
    spec = importlib.util.spec_from_file_location("build_lancedb_index", script_path)
    assert spec is not None
    assert spec.loader is not None
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def test_default_concept_classes_exact():
    module = _load_script_module()
    expected = [
        "Clinical Drug",
        "Quant Clinical Drug",
        "Clinical Drug Comp",
        "Clinical Drug Form",
        "Branded Drug",
        "Quant Branded Drug",
        "Branded Drug Comp",
        "Branded Drug Form",
        "Ingredient",
    ]
    assert module.DEFAULT_CONCEPT_CLASSES == expected


def test_resolve_device_prefers_mps_on_darwin(monkeypatch):
    module = _load_script_module()
    monkeypatch.setattr(module.platform, "system", lambda: "Darwin")

    fake_torch = types.SimpleNamespace(
        cuda=types.SimpleNamespace(is_available=lambda: True),
        backends=types.SimpleNamespace(mps=types.SimpleNamespace(is_available=lambda: True)),
    )
    monkeypatch.setitem(sys.modules, "torch", fake_torch)
    assert module.resolve_device("auto") == "mps"


def test_resolve_device_explicit_passthrough():
    module = _load_script_module()
    assert module.resolve_device("cpu") == "cpu"
    assert module.resolve_device("cuda") == "cuda"
    assert module.resolve_device("mps") == "mps"
tests/test_peft_reranker.py
ADDED
@@ -0,0 +1,87 @@
from __future__ import annotations

import sys
import types

import thirawat_demo.runtime.peft_reranker as peft_mod


def test_load_colbert_with_peft_resizes_before_adapter(monkeypatch):
    events: list[tuple] = []

    class FakeEncoder:
        def resize_token_embeddings(self, size, mean_resizing=False):
            events.append(("resize", size, mean_resizing))

    class FakeFirstModule:
        def __init__(self):
            self.auto_model = FakeEncoder()

    class FakeDense:
        def load_state_dict(self, state_dict, strict=False):
            events.append(("dense_load", strict))

    class FakeColBERT:
        def __init__(self, *args, **kwargs):
            events.append(("colbert_init", kwargs.get("model_name_or_path"), kwargs.get("tokenizer_kwargs")))
            self._first = FakeFirstModule()
            self._dense = FakeDense()
            self.tokenizer = [0, 1, 2, 3]

        def _first_module(self):
            return self._first

        def __len__(self):
            return 2

        def __getitem__(self, idx):
            assert idx == 1
            return self._dense

    class FakePeftCfg:
        base_model_name_or_path = "cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR"

    class FakePeftConfig:
        @staticmethod
        def from_pretrained(model_id):
            events.append(("peft_config", model_id))
            return FakePeftCfg()

    class FakePeftModel:
        @staticmethod
        def from_pretrained(encoder, model_id, is_trainable=False):
            events.append(("peft_load", model_id, is_trainable))

            class Wrapped:
                def eval(self):
                    events.append(("peft_eval",))

            return Wrapped()

    pylate_module = types.ModuleType("pylate")
    pylate_models = types.ModuleType("pylate.models")
    pylate_models.ColBERT = FakeColBERT
    pylate_module.models = pylate_models

    safetensors_module = types.ModuleType("safetensors")
    safetensors_torch = types.ModuleType("safetensors.torch")
    safetensors_torch.load_file = lambda _: {}
    safetensors_module.torch = safetensors_torch

    monkeypatch.setattr(peft_mod, "disable_mean_resizing", lambda: events.append(("disable_mean_resizing",)))
    monkeypatch.setitem(
        sys.modules,
        "peft",
        types.SimpleNamespace(PeftConfig=FakePeftConfig, PeftModel=FakePeftModel),
    )
    monkeypatch.setitem(sys.modules, "pylate", pylate_module)
    monkeypatch.setitem(sys.modules, "pylate.models", pylate_models)
    monkeypatch.setitem(sys.modules, "safetensors", safetensors_module)
    monkeypatch.setitem(sys.modules, "safetensors.torch", safetensors_torch)

    model = peft_mod._load_colbert_with_peft("sidataplus/THIRAWAT-SapBERT", "cpu", 128, 128)
    assert model is not None

    resize_idx = next(idx for idx, entry in enumerate(events) if entry[0] == "resize")
    peft_idx = next(idx for idx, entry in enumerate(events) if entry[0] == "peft_load")
    assert resize_idx < peft_idx
tests/test_runtime_config.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from thirawat_demo.runtime.config import RuntimeConfig
+
+
+def test_from_env_requires_index_repo_when_local_missing(monkeypatch):
+    monkeypatch.delenv("INDEX_REPO", raising=False)
+    monkeypatch.setenv("LOCAL_INDEX_PATH", "missing-local-index")
+    with pytest.raises(ValueError):
+        RuntimeConfig.from_env()
+
+
+def test_from_env_accepts_local_db_without_repo(monkeypatch, tmp_path: Path):
+    local_index = tmp_path / "data" / "lancedb" / "db"
+    local_index.mkdir(parents=True)
+
+    monkeypatch.delenv("INDEX_REPO", raising=False)
+    monkeypatch.setenv("LOCAL_INDEX_PATH", str(local_index.parent))
+    config = RuntimeConfig.from_env()
+    assert config.local_index_path == local_index.parent
+    assert config.index_repo is None
+
+
+def test_retrieval_topk_is_raised_to_candidate_topk(monkeypatch, tmp_path: Path):
+    local_index = tmp_path / "idx" / "db"
+    local_index.mkdir(parents=True)
+
+    monkeypatch.setenv("LOCAL_INDEX_PATH", str(local_index.parent))
+    monkeypatch.setenv("CANDIDATE_TOPK", "200")
+    monkeypatch.setenv("RETRIEVAL_TOPK", "10")
+
+    config = RuntimeConfig.from_env()
+    assert config.retrieval_topk == 200
+
+
+def test_device_defaults_to_auto(monkeypatch, tmp_path: Path):
+    local_index = tmp_path / "idx" / "db"
+    local_index.mkdir(parents=True)
+    monkeypatch.setenv("LOCAL_INDEX_PATH", str(local_index.parent))
+    monkeypatch.delenv("DEVICE", raising=False)
+
+    config = RuntimeConfig.from_env()
+    assert config.device == "auto"
+
+
+def test_post_config_defaults(monkeypatch, tmp_path: Path):
+    local_index = tmp_path / "idx" / "db"
+    local_index.mkdir(parents=True)
+    monkeypatch.setenv("LOCAL_INDEX_PATH", str(local_index.parent))
+    monkeypatch.delenv("POST_MODE", raising=False)
+    monkeypatch.delenv("POST_WEIGHT", raising=False)
+    monkeypatch.delenv("TIEBREAK_EPS", raising=False)
+    monkeypatch.delenv("TIEBREAK_TOPN", raising=False)
+    monkeypatch.delenv("POST_STRENGTH_WEIGHT", raising=False)
+    monkeypatch.delenv("POST_JACCARD_WEIGHT", raising=False)
+    monkeypatch.delenv("POST_BRAND_PENALTY", raising=False)
+    monkeypatch.delenv("POST_MINMAX", raising=False)
+    monkeypatch.delenv("BRAND_STRICT", raising=False)
+
+    config = RuntimeConfig.from_env()
+    assert config.post_mode == "tiebreak"
+    assert config.post_weight == 0.05
+    assert config.tiebreak_eps == 0.01
+    assert config.tiebreak_topn == 50
+    assert config.post_strength_weight == 0.6
+    assert config.post_jaccard_weight == 0.4
+    assert config.post_brand_penalty == 0.3
+    assert config.post_minmax is True
+    assert config.brand_strict is False
+
+
+def test_invalid_post_mode_raises(monkeypatch, tmp_path: Path):
+    local_index = tmp_path / "idx" / "db"
+    local_index.mkdir(parents=True)
+    monkeypatch.setenv("LOCAL_INDEX_PATH", str(local_index.parent))
+    monkeypatch.setenv("POST_MODE", "unsupported")
+    with pytest.raises(ValueError):
+        RuntimeConfig.from_env()
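The assertions above pin down three behaviors of RuntimeConfig.from_env without showing the implementation: RETRIEVAL_TOPK is raised to at least CANDIDATE_TOPK, DEVICE defaults to "auto", and an unknown POST_MODE is rejected. A minimal sketch of env parsing consistent with those assertions follows; it is illustrative only, and the allowed mode set and numeric defaults are assumptions (the real logic lives in src/thirawat_demo/runtime/config.py):

import os

# Assumed mode set; only the "tiebreak" default and the rejection of unknown
# values are actually pinned down by the tests above.
_ALLOWED_POST_MODES = {"off", "blend", "tiebreak"}

def _read_env_sketch() -> dict:
    # Defaults here are assumptions for illustration.
    candidate_topk = int(os.environ.get("CANDIDATE_TOPK", "100"))
    retrieval_topk = int(os.environ.get("RETRIEVAL_TOPK", "200"))
    post_mode = os.environ.get("POST_MODE", "tiebreak")
    if post_mode not in _ALLOWED_POST_MODES:
        raise ValueError(f"Unsupported POST_MODE: {post_mode!r}")
    return {
        "candidate_topk": candidate_topk,
        # Never retrieve fewer rows than the reranker needs as candidates.
        "retrieval_topk": max(retrieval_topk, candidate_topk),
        "device": os.environ.get("DEVICE", "auto"),
        "post_mode": post_mode,
    }
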
tests/test_search_service.py
ADDED
|
@@ -0,0 +1,152 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import pyarrow as pa
+
+import thirawat_demo.runtime.search_service as service_mod
+from thirawat_demo.runtime.config import RuntimeConfig
+from thirawat_demo.runtime.search_service import SearchService
+
+
+def _make_config(tmp_path: Path) -> RuntimeConfig:
+    return RuntimeConfig(
+        local_index_path=tmp_path / "data" / "lancedb",
+        index_repo=None,
+        index_revision="main",
+        hf_token=None,
+        hf_data_dir=tmp_path / "data",
+        lancedb_table="concepts_drug",
+        top_k_default=10,
+        candidate_topk=100,
+        retrieval_topk=200,
+        device="cpu",
+        encoder_model_id="cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR",
+        reranker_id="sidataplus/THIRAWAT-SapBERT",
+        post_mode="tiebreak",
+        post_weight=0.05,
+        tiebreak_eps=0.01,
+        tiebreak_topn=50,
+        post_strength_weight=0.6,
+        post_jaccard_weight=0.4,
+        post_brand_penalty=0.3,
+        post_minmax=True,
+        brand_strict=False,
+        debug=False,
+    )
+
+
+def test_build_concept_class_where_sorted_and_escaped():
+    expr = SearchService._build_concept_class_where(["Ingredient", "Clinical Drug", "O'Reilly", "Ingredient"])
+    assert expr == "concept_class_id IN ('Clinical Drug','Ingredient','O''Reilly')"
+
+
+def test_apply_post_scoring_uses_mapper_tiebreak(monkeypatch, tmp_path: Path):
+    config = _make_config(tmp_path)
+    service = SearchService(config=config, lancedb_dir=tmp_path)
+    source = pd.DataFrame(
+        {
+            "concept_name": ["a", "b"],
+            "profile_text": ["", ""],
+            "_relevance_score": [0.9, 0.89],
+        }
+    )
+    called: dict[str, object] = {}
+
+    def fake_should_apply_post(mode: str, weight: float) -> bool:
+        called["mode"] = mode
+        called["weight"] = weight
+        return True
+
+    def fake_enrich(df: pd.DataFrame, query_text_norm: str, **kwargs):
+        called["query"] = query_text_norm
+        called["kwargs"] = kwargs
+        out = df.copy()
+        out["final_score"] = [0.1, 0.2]
+        return out.iloc[[1, 0]].reset_index(drop=True)
+
+    monkeypatch.setattr(service_mod, "should_apply_post", fake_should_apply_post)
+    monkeypatch.setattr(service_mod, "enrich_with_post_scores", fake_enrich)
+
+    out = service._apply_post_scoring(source, "aspirin 81 mg")
+    assert out["concept_name"].tolist() == ["b", "a"]
+    assert called["mode"] == "tiebreak"
+    assert called["weight"] == 0.05
+    assert called["query"] == "aspirin 81 mg"
+    assert called["kwargs"]["tiebreak_eps"] == 0.01
+    assert called["kwargs"]["tiebreak_topn"] == 50
+
+
+def test_search_applies_concept_class_filter(monkeypatch, tmp_path: Path):
+    config = _make_config(tmp_path)
+    service = SearchService(config=config, lancedb_dir=tmp_path)
+
+    class FakeBuilder:
+        def __init__(self):
+            self.where_clause = None
+
+        def where(self, clause):
+            self.where_clause = clause
+            return self
+
+        def distance_type(self, _):
+            return self
+
+        def limit(self, _):
+            return self
+
+        def rerank(self, reranker=None, query_string=None):
+            return self
+
+        def to_arrow(self):
+            frame = pd.DataFrame(
+                {
+                    "concept_id": [1191],
+                    "concept_name": ["aspirin"],
+                    "concept_code": ["1191"],
+                    "vocabulary_id": ["RxNorm"],
+                    "_relevance_score": [0.998],
+                    "_distance": [0.2],
+                    "concept_class_id": ["Ingredient"],
+                }
+            )
+            return pa.Table.from_pandas(frame, preserve_index=False)
+
+    class FakeSchema:
+        names = ["concept_id", "concept_name", "concept_code", "vocabulary_id", "concept_class_id", "vector"]
+
+    class FakeTable:
+        schema = FakeSchema()
+
+        def __init__(self, builder: FakeBuilder):
+            self._builder = builder
+
+        def search(self, *args, **kwargs):
+            return self._builder
+
+    class FakeEmbedder:
+        def encode(self, texts):
+            return np.array([[0.0, 1.0]], dtype=float)
+
+    builder = FakeBuilder()
+    service._table = FakeTable(builder)
+    service._vector_column = "vector"
+    service._embedder = FakeEmbedder()
+    service._reranker = object()
+    service._normalize_text = lambda value: value.strip()
+    monkeypatch.setattr(SearchService, "_apply_post_scoring", lambda self, df, q: df)
+
+    result = service.search("aspirin", top_k=5, concept_class_ids=["Ingredient", "Clinical Drug"])
+    assert builder.where_clause == "concept_class_id IN ('Clinical Drug','Ingredient')"
+    assert result["concept_name"].tolist() == ["aspirin"]
+    assert result["concept_class_id"].tolist() == ["Ingredient"]
+    assert result["retrieval_score"].tolist() == [0.8]
+
+
+def test_search_returns_empty_when_no_concept_class_selected(tmp_path: Path):
+    config = _make_config(tmp_path)
+    service = SearchService(config=config, lancedb_dir=tmp_path)
+    out = service.search("aspirin", top_k=10, concept_class_ids=[])
+    assert out.empty
|
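test_build_concept_class_where_sorted_and_escaped pins down the exact filter string: duplicates removed, values sorted, and embedded single quotes doubled so a value such as O'Reilly stays inside one SQL string literal. A standalone sketch that satisfies that assertion (illustrative only; in the Space this is the SearchService._build_concept_class_where static method):

def build_concept_class_where(concept_class_ids: list[str]) -> str:
    # Deduplicate and sort for a stable clause, then double single quotes
    # so each value survives as a single SQL literal.
    quoted = ",".join(
        "'{}'".format(value.replace("'", "''"))
        for value in sorted(set(concept_class_ids))
    )
    return f"concept_class_id IN ({quoted})"

# Reproduces the expected clause from the test above.
assert build_concept_class_where(
    ["Ingredient", "Clinical Drug", "O'Reilly", "Ingredient"]
) == "concept_class_id IN ('Clinical Drug','Ingredient','O''Reilly')"
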
tests/test_space_ui.py
ADDED
|
@@ -0,0 +1,97 @@
+from __future__ import annotations
+
+import pandas as pd
+
+from thirawat_demo import space_ui
+
+
+def test_to_display_results_replaces_scores_with_concept_class():
+    raw = pd.DataFrame(
+        {
+            "rank": [1],
+            "concept_id": [123],
+            "concept_name": ["aspirin"],
+            "concept_code": ["1191"],
+            "vocabulary_id": ["RxNorm"],
+            "concept_class_id": ["Ingredient"],
+            "score": [0.9],
+            "retrieval_score": [0.8],
+        }
+    )
+    out = space_ui._to_display_results(raw)
+    assert out.columns.tolist() == [
+        "rank",
+        "concept_id",
+        "concept_name",
+        "concept_code",
+        "vocabulary_id",
+        "concept_class",
+    ]
+    assert out.iloc[0]["concept_class"] == "Ingredient"
+    assert "score" not in out.columns
+    assert "retrieval_score" not in out.columns
+
+
+def test_to_display_results_handles_missing_concept_class_id():
+    raw = pd.DataFrame(
+        {
+            "rank": [1],
+            "concept_id": [123],
+            "concept_name": ["aspirin"],
+            "concept_code": ["1191"],
+            "vocabulary_id": ["RxNorm"],
+        }
+    )
+    out = space_ui._to_display_results(raw)
+    assert out.iloc[0]["concept_class"] == ""
+
+
+def test_format_single_results_keeps_score_text_but_returns_projected_table():
+    raw = pd.DataFrame(
+        {
+            "rank": [1],
+            "concept_id": [123],
+            "concept_name": ["aspirin"],
+            "concept_code": ["1191"],
+            "vocabulary_id": ["RxNorm"],
+            "concept_class_id": ["Ingredient"],
+            "score": [0.91],
+            "retrieval_score": [0.81],
+        }
+    )
+    md, table = space_ui._format_single_results("aspirin", raw)
+    assert "score 0.9100" in md
+    assert "concept_class" in table.columns
+    assert "score" not in table.columns
+
+
+def test_search_single_requires_at_least_one_concept_class():
+    md, table = space_ui.search_single("aspirin", 10, [])
+    assert md == "Please select at least one concept class."
+    assert table.empty
+
+
+def test_search_batch_requires_at_least_one_concept_class():
+    md, download = space_ui.search_batch("aspirin", 10, [])
+    assert md == "Please select at least one concept class."
+    assert download["visible"] is False
+
+
+def test_clear_single_resets_default_query_and_concept_classes():
+    concept_classes = ["Ingredient", "Clinical Drug"]
+    query, top_k, returned_classes, md, table = space_ui._clear_single(concept_classes)
+    assert query == space_ui.DEFAULT_SINGLE_QUERY
+    assert top_k == 10
+    assert returned_classes == concept_classes
+    assert md == ""
+    assert table.empty
+
+
+def test_clear_batch_resets_concept_classes_and_hides_download():
+    concept_classes = ["Ingredient", "Clinical Drug"]
+    queries, top_k, returned_classes, md, download = space_ui._clear_batch(concept_classes)
+    assert queries == ""
+    assert top_k == 10
+    assert returned_classes == concept_classes
+    assert md == ""
+    assert download["visible"] is False
|
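The UI tests constrain _to_display_results to a fixed six-column projection: score columns dropped, concept_class_id surfaced as concept_class, and an empty string when that column is missing. A minimal sketch consistent with those assertions (hypothetical; not necessarily the actual helper in src/thirawat_demo/space_ui.py):

import pandas as pd

_DISPLAY_COLUMNS = [
    "rank", "concept_id", "concept_name", "concept_code", "vocabulary_id", "concept_class",
]

def to_display_results(raw: pd.DataFrame) -> pd.DataFrame:
    out = raw.copy()
    # Fall back to an empty string when concept_class_id is absent, then keep
    # only the display columns, which drops score and retrieval_score.
    out["concept_class"] = out.get("concept_class_id", pd.Series("", index=out.index))
    return out[_DISPLAY_COLUMNS]
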
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff