Spaces:

arjun10g
/

zeroshotGPU

Running on Zero

Arjunvir Singh committed 23 days ago

Commit

52764bf

1 Parent(s): f2ac076

Fix ZeroGPU state-loss in EmbeddingRetriever; add einops + smoke surfaces

The @spaces.GPU decorator runs wrapped functions in a separate worker
process; mutations to `self` inside the worker do not propagate back to
the caller. Decorating EmbeddingRetriever.index() / .query() therefore
silently set the indexed vectors in the worker only, leaving the calling
instance with empty state and producing recall=0 even though the model
loaded and encoded successfully on GPU.

Refactor: GPU work moved to a stateless, module-level helper function
`_gpu_encode_batch(model_id, task, texts) -> vectors`. EmbeddingRetriever
methods stay in the main process and dispatch via a new `_encode` method
that picks the explicit injected embedder (test path) or the GPU helper
(production path). The same fix pattern will apply to TransformersClient once
we wire live GPU repair end-to-end (deferred: its current decorator covers
single-shot, bursty work, where losing state after one call does not matter).

Also lands:
- einops>=0.7.0 in requirements.txt (jina-v3's custom xlm_roberta_flash
modeling code needs it; sentence-transformers does not pull it in).
- pyproject.toml `embedding` extra updated to match.
- scripts/run_space_smoke.py honours ZSGDP_SMOKE_EMBEDDING_MODEL_ID env
var so operators can swap models without editing the script (e.g. to
sentence-transformers/all-MiniLM-L6-v2 when jina-v3 has transformers
compat issues).
- app.py exposes run_smokes_in_space as a callable function so the
smokes can be triggered from the Gradio API or a future button.

Test count: 250/250.

Files changed (6) hide show

app.py +30 -0
pyproject.toml +7 -1
requirements.txt +8 -3
scripts/run_space_smoke.py +25 -9
tests/test_space_smoke.py +22 -0
zsgdp/benchmarks/embedding_retriever.py +58 -10

app.py CHANGED Viewed

@@ -219,6 +219,36 @@ def runtime_status_for_mode(pipeline_mode: str) -> dict:
     return collect_gpu_runtime_status(load_config(_config_path_for_mode(pipeline_mode))).to_dict()
 with gr.Blocks(title="zeroshotGPU") as demo:
     gr.Markdown("# zeroshotGPU")
     with gr.Row():

     return collect_gpu_runtime_status(load_config(_config_path_for_mode(pipeline_mode))).to_dict()
+def run_smokes_in_space() -> dict:
+    """Run scripts/run_space_smoke.py inside the Space and return the JSON report.
+    Exposes the in-process smoke runner as a Gradio endpoint so it's callable
+    from the UI tab AND from `/gradio_api/call/run_smokes_in_space` remotely.
+    Same code path as the terminal `python -m scripts.run_space_smoke` — just
+    triggered through Gradio instead of an SSH session.
+    Returns the same dict shape as SmokeReport.to_dict(): per-smoke results
+    with status / elapsed / detail / skip_reason / install_hint, plus an
+    aggregate summary count block.
+    """
+    from scripts.run_space_smoke import run_smokes
+    _logger.info("space_smokes_requested", extra={"trigger": "gradio_endpoint"})
+    report = run_smokes()
+    payload = report.to_dict()
+    _logger.info(
+        "space_smokes_complete",
+        extra={
+            "passed": payload["summary"]["passed"],
+            "failed": payload["summary"]["failed"],
+            "skipped": payload["summary"]["skipped"],
+            "errored": payload["summary"]["errored"],
+        },
+    )
+    return payload
 with gr.Blocks(title="zeroshotGPU") as demo:
     gr.Markdown("# zeroshotGPU")
     with gr.Row():

pyproject.toml CHANGED Viewed

@@ -25,7 +25,13 @@ spaces = [
     "pyyaml>=6.0.1,<7.0.0",
     "docling>=2.0.0,<3.0.0",
 ]
-embedding = ["sentence-transformers>=3.0.0,<4.0.0", "transformers>=4.45.0,<6.0.0"]
 gpu_repair = ["transformers>=4.45.0,<6.0.0"]
 dev = ["pytest>=8.0.0"]

     "pyyaml>=6.0.1,<7.0.0",
     "docling>=2.0.0,<3.0.0",
 ]
+embedding = [
+    "sentence-transformers>=3.0.0,<4.0.0",
+    "transformers>=4.45.0,<6.0.0",
+    # jinaai/jina-embeddings-v3's custom modeling needs einops; not pulled
+    # in transitively by sentence-transformers.
+    "einops>=0.7.0",
+]
 gpu_repair = ["transformers>=4.45.0,<6.0.0"]
 dev = ["pytest>=8.0.0"]

requirements.txt CHANGED Viewed

@@ -21,12 +21,17 @@ docling>=2.0.0,<3.0.0
 # through to a passthrough decorator (see zsgdp/gpu/zero_gpu.py).
 spaces>=0.25.0
-# Optional GPU/embedding stack. Uncomment to enable the embedding retriever
-# (benchmarks.retriever.backend=embedding) and live GPU repair escalations
-# (repair.execute_gpu_escalations=true). Both are off by default.
 #
 transformers>=4.45.0,<6.0.0
 sentence-transformers>=3.0.0,<4.0.0
 # Optional external parser CLIs. Each adds a non-trivial install footprint;
 # enable only the ones the Space hardware can support. Adapter shells out to

 # through to a passthrough decorator (see zsgdp/gpu/zero_gpu.py).
 spaces>=0.25.0
+# Embedding retriever + live GPU repair stack. Enabled here because the
+# Space is provisioned for the full evaluation surface; comment out the
+# group if you want a CPU-only deploy with just the lexical retriever.
+#
+# einops is required by jinaai/jina-embeddings-v3's custom modeling code
+# (it ships a custom `xlm_roberta_flash` implementation that reshapes via
+# einops); pip-installing sentence-transformers alone does not pull it in.
 #
 transformers>=4.45.0,<6.0.0
 sentence-transformers>=3.0.0,<4.0.0
+einops>=0.7.0
 # Optional external parser CLIs. Each adds a non-trivial install footprint;
 # enable only the ones the Space hardware can support. Adapter shells out to

scripts/run_space_smoke.py CHANGED Viewed

@@ -179,6 +179,16 @@ def smoke_ablation() -> SmokeResult:
 def smoke_embedding() -> SmokeResult:
     started = time.perf_counter()
     if importlib.util.find_spec("sentence_transformers") is None:
         return SmokeResult(
@@ -189,15 +199,19 @@ def smoke_embedding() -> SmokeResult:
             install_hint="python -m pip install 'zero-shot-gpu-doc-parser[embedding]'",
         )
     from zsgdp.benchmarks.embedding_retriever import EmbeddingRetriever
     from zsgdp.benchmarks.parser_quality import run_parser_benchmark
     # Try to load the configured embedding model. If the load fails (no HF
     # token, download error, OOM at import time), we report it as a skip
     # with the exception text so the operator sees what to fix without the
     # whole smoke run blowing up.
     try:
-        retriever = EmbeddingRetriever()
         retriever._ensure_embedder()  # type: ignore[attr-defined]  # private but intentional
     except Exception as exc:
         return SmokeResult(
@@ -205,21 +219,23 @@ def smoke_embedding() -> SmokeResult:
             status="skip",
             elapsed_seconds=time.perf_counter() - started,
             skip_reason=f"embedding model failed to load: {exc}",
-            install_hint="Set HF_TOKEN if the model is gated, or downsize via "
-                        "benchmarks.retriever.model_id (e.g. sentence-transformers/all-MiniLM-L6-v2).",
         )
-    config_overrides = {"benchmarks": {"retriever": {"backend": "embedding"}}}
     with tempfile.TemporaryDirectory() as tmp:
         tmp_path = Path(tmp)
         src = _make_distinctive_corpus(tmp_path)
         out = tmp_path / "out"
         config_path = tmp_path / "config.yaml"
-        # Inline config write — keeps the smoke self-contained.
-        config_path.write_text(
-            "benchmarks:\n  retriever:\n    backend: embedding\n",
-            encoding="utf-8",
-        )
         try:
             summary = run_parser_benchmark(src, out, config_path=config_path, dataset_name="custom_folder")
         except Exception as exc:

 def smoke_embedding() -> SmokeResult:
+    """Validate the embedding-retriever wiring on a real Space.
+    Set ZSGDP_SMOKE_EMBEDDING_MODEL_ID to override the default model_id —
+    useful when the configured default (jinaai/jina-embeddings-v3) has
+    transformers-version compat issues with the running container. A
+    common safe fallback is `sentence-transformers/all-MiniLM-L6-v2`,
+    which has no custom remote modeling code and works with any
+    transformers version.
+    """
     started = time.perf_counter()
     if importlib.util.find_spec("sentence_transformers") is None:
         return SmokeResult(
             install_hint="python -m pip install 'zero-shot-gpu-doc-parser[embedding]'",
         )
+    import os
     from zsgdp.benchmarks.embedding_retriever import EmbeddingRetriever
     from zsgdp.benchmarks.parser_quality import run_parser_benchmark
+    override_model_id = os.environ.get("ZSGDP_SMOKE_EMBEDDING_MODEL_ID") or None
     # Try to load the configured embedding model. If the load fails (no HF
     # token, download error, OOM at import time), we report it as a skip
     # with the exception text so the operator sees what to fix without the
     # whole smoke run blowing up.
     try:
+        retriever = EmbeddingRetriever(model_id=override_model_id) if override_model_id else EmbeddingRetriever()
         retriever._ensure_embedder()  # type: ignore[attr-defined]  # private but intentional
     except Exception as exc:
         return SmokeResult(
             status="skip",
             elapsed_seconds=time.perf_counter() - started,
             skip_reason=f"embedding model failed to load: {exc}",
+            install_hint="Set HF_TOKEN if the model is gated, OR set "
+                        "ZSGDP_SMOKE_EMBEDDING_MODEL_ID=sentence-transformers/all-MiniLM-L6-v2 "
+                        "to use a smaller compat-friendly model.",
         )
     with tempfile.TemporaryDirectory() as tmp:
         tmp_path = Path(tmp)
         src = _make_distinctive_corpus(tmp_path)
         out = tmp_path / "out"
         config_path = tmp_path / "config.yaml"
+        # Inline config write — keeps the smoke self-contained. Honours the
+        # env-var model override so the operator can swap models without
+        # editing this script.
+        config_lines = ["benchmarks:", "  retriever:", "    backend: embedding"]
+        if override_model_id:
+            config_lines.append(f"    model_id: {override_model_id}")
+        config_path.write_text("\n".join(config_lines) + "\n", encoding="utf-8")
         try:
             summary = run_parser_benchmark(src, out, config_path=config_path, dataset_name="custom_folder")
         except Exception as exc:

tests/test_space_smoke.py CHANGED Viewed

@@ -138,6 +138,28 @@ class RunSmokesIntegrationTests(unittest.TestCase):
         self.assertIn("sentence-transformers", result.skip_reason)
         self.assertIn("pip install", result.install_hint)
     def test_marker_smoke_skips_when_binary_missing(self):
         with patch("scripts.run_space_smoke.shutil.which", return_value=None):
             result = smoke_marker()

         self.assertIn("sentence-transformers", result.skip_reason)
         self.assertIn("pip install", result.install_hint)
+    def test_embedding_smoke_install_hint_mentions_model_override(self):
+        # When the model fails to load (e.g. jina-v3 transformers compat),
+        # the install_hint must point at the env-var override path so the
+        # operator can immediately switch to a compat-friendly model.
+        # Patch where EmbeddingRetriever is *defined*, not where it's imported,
+        # because smoke_embedding does a function-local lazy import.
+        from unittest.mock import MagicMock
+        retriever_mock = MagicMock()
+        retriever_mock.return_value._ensure_embedder.side_effect = RuntimeError("synthetic load failure")
+        with patch("scripts.run_space_smoke.importlib.util.find_spec") as find_spec, patch(
+            "zsgdp.benchmarks.embedding_retriever.EmbeddingRetriever", retriever_mock
+        ):
+            find_spec.return_value = object()  # spec found, dep present
+            result = smoke_embedding()
+        self.assertEqual(result.status, "skip")
+        self.assertIn("synthetic load failure", result.skip_reason)
+        self.assertIn("ZSGDP_SMOKE_EMBEDDING_MODEL_ID", result.install_hint)
+        self.assertIn("all-MiniLM-L6-v2", result.install_hint)
     def test_marker_smoke_skips_when_binary_missing(self):
         with patch("scripts.run_space_smoke.shutil.which", return_value=None):
             result = smoke_marker()

zsgdp/benchmarks/embedding_retriever.py CHANGED Viewed

@@ -12,8 +12,7 @@ Definitions and contract (pinned):
   - Pass `embedder=...` directly (used by tests and any caller that wants
     full control over batching, device placement, or remote inference).
   - Pass `model_id=...` and let the retriever lazy-load
-    sentence-transformers. Selected through `build_retriever` from config
-    by setting `benchmarks.retriever.backend = "embedding"`.
 - Index and query both call the embedder. The retriever is stateless
   beyond the indexed chunk vectors; reusing across documents requires a
   fresh `index()` call, same contract as LexicalRetriever.
@@ -22,6 +21,14 @@ Definitions and contract (pinned):
   Other sentence-transformers models work as long as they accept the same
   encode signature; jina's task-prompt argument is optional and silently
   ignored by models that don't recognize it.
 """
 from __future__ import annotations
@@ -34,6 +41,38 @@ from zsgdp.schema import Chunk
 Embedder = Callable[[list[str]], list[list[float]]]
 class EmbeddingRetriever:
     def __init__(
         self,
@@ -51,17 +90,16 @@ class EmbeddingRetriever:
         self._chunk_ids: list[str] = []
         self._vectors: list[list[float]] = []
-    @zero_gpu_slot(duration=180)
     def index(self, chunks: Sequence[Chunk]) -> None:
-        # First call lazy-loads the model + encodes all chunks (the slow path);
-        # ZeroGPU slot covers both. No-op decorator off-Space.
-        embedder = self._ensure_embedder()
         texts = [chunk.text for chunk in chunks]
         if not texts:
             self._chunk_ids = []
             self._vectors = []
             return
-        vectors = embedder(texts)
         if len(vectors) != len(texts):
             raise RuntimeError(
                 f"EmbeddingRetriever embedder returned {len(vectors)} vectors for {len(texts)} chunks."
@@ -69,12 +107,10 @@ class EmbeddingRetriever:
         self._chunk_ids = [chunk.chunk_id for chunk in chunks]
         self._vectors = [_normalize(list(vector)) for vector in vectors]
-    @zero_gpu_slot(duration=30)
     def query(self, text: str, *, top_k: int) -> list[str]:
         if not self._vectors:
             return []
-        embedder = self._ensure_embedder()
-        query_vec = embedder([text])
         if not query_vec:
             return []
         query_vector = _normalize(list(query_vec[0]))
@@ -88,6 +124,18 @@ class EmbeddingRetriever:
         scored.sort(key=lambda item: (-item[0], item[1]))
         return [self._chunk_ids[index] for _score, index in scored[:top_k]]
     def _ensure_embedder(self) -> Embedder:
         if self._embedder is not None:
             return self._embedder

   - Pass `embedder=...` directly (used by tests and any caller that wants
     full control over batching, device placement, or remote inference).
   - Pass `model_id=...` and let the retriever lazy-load
+    sentence-transformers via the stateless `_gpu_encode_batch` helper.
 - Index and query both call the embedder. The retriever is stateless
   beyond the indexed chunk vectors; reusing across documents requires a
   fresh `index()` call, same contract as LexicalRetriever.
   Other sentence-transformers models work as long as they accept the same
   encode signature; jina's task-prompt argument is optional and silently
   ignored by models that don't recognize it.
+ZeroGPU note: the GPU slot decorator runs the wrapped function in a
+separate worker process. Mutations to `self` made inside the worker do
+NOT propagate back to the caller. So `index()` and `query()` are
+intentionally NOT decorated — the GPU work is offloaded to the free
+stateless `_gpu_encode_batch(model_id, task, texts) -> vectors` helper,
+and the calling EmbeddingRetriever instance (which holds chunk_ids and
+vectors) stays in the main process.
 """
 from __future__ import annotations
 Embedder = Callable[[list[str]], list[list[float]]]
+@zero_gpu_slot(duration=180)
+def _gpu_encode_batch(model_id: str, task: str | None, texts: list[str]) -> list[list[float]]:
+    """Load a sentence-transformers model and encode `texts` under a ZeroGPU slot.
+    Stateless by design: takes only picklable inputs (strings) and returns a
+    list-of-lists of floats. The model is loaded fresh inside the worker
+    process — that's where ZeroGPU has GPU access. Subsequent calls re-load
+    (acceptable for bursty workloads); for sustained-throughput workloads,
+    pin the Space to non-ZeroGPU hardware and inject an `embedder` callable
+    so the model stays warm in the main process.
+    """
+    try:
+        from sentence_transformers import SentenceTransformer  # type: ignore
+    except ImportError as exc:
+        raise RuntimeError(
+            "EmbeddingRetriever requires sentence-transformers. "
+            "Install with `pip install sentence-transformers` or pass `embedder=...` explicitly."
+        ) from exc
+    model = SentenceTransformer(model_id, trust_remote_code=True)
+    kwargs: dict[str, Any] = {"normalize_embeddings": True}
+    if task:
+        try:
+            vectors = model.encode(texts, task=task, **kwargs)
+        except TypeError:
+            vectors = model.encode(texts, **kwargs)
+    else:
+        vectors = model.encode(texts, **kwargs)
+    return [list(map(float, vector)) for vector in vectors]
 class EmbeddingRetriever:
     def __init__(
         self,
         self._chunk_ids: list[str] = []
         self._vectors: list[list[float]] = []
     def index(self, chunks: Sequence[Chunk]) -> None:
+        # NOT decorated with @zero_gpu_slot — see module docstring. The GPU
+        # work is offloaded to the stateless _gpu_encode_batch helper so
+        # mutations to self stay in the main process.
         texts = [chunk.text for chunk in chunks]
         if not texts:
             self._chunk_ids = []
             self._vectors = []
             return
+        vectors = self._encode(texts, task=self._task)
         if len(vectors) != len(texts):
             raise RuntimeError(
                 f"EmbeddingRetriever embedder returned {len(vectors)} vectors for {len(texts)} chunks."
         self._chunk_ids = [chunk.chunk_id for chunk in chunks]
         self._vectors = [_normalize(list(vector)) for vector in vectors]
     def query(self, text: str, *, top_k: int) -> list[str]:
         if not self._vectors:
             return []
+        query_vec = self._encode([text], task=self._query_task)
         if not query_vec:
             return []
         query_vector = _normalize(list(query_vec[0]))
         scored.sort(key=lambda item: (-item[0], item[1]))
         return [self._chunk_ids[index] for _score, index in scored[:top_k]]
+    def _encode(self, texts: list[str], *, task: str | None) -> list[list[float]]:
+        """Dispatch encode to the injected embedder or the GPU helper.
+        Test path: `embedder=...` was passed to __init__, runs in-process.
+        Production path: model_id was passed (default jina-v3), runs inside
+        the @spaces.GPU-decorated worker via _gpu_encode_batch.
+        """
+        if self._explicit_embedder is not None:
+            return self._explicit_embedder(texts)
+        return _gpu_encode_batch(self._model_id, task, texts)
     def _ensure_embedder(self) -> Embedder:
         if self._embedder is not None:
             return self._embedder