Spaces:
Running on Zero
Running on Zero
| """Tests for the local in-process backend β catalogue, gate, registry, router dispatch. | |
| The ``local`` backend (ADR-0033) runs a small ``transformers`` model in-process on the | |
| host GPU behind ``@spaces.GPU`` β hardware-agnostic (ZeroGPU or a dedicated GPU), with no | |
| HTTP endpoint. These tests cover the deterministic, offline-safe surface: the catalogue | |
| data, the capability gate (env signals + an injectable CUDA probe), the unified-registry | |
| integration, and that the router dispatches a ``local:`` key to the in-process provider | |
| rather than the HTTP gateway. The actual GPU forward pass is integration-only (it needs a | |
| GPU and weights), exactly as the HTTP provider's live call is β so nothing here downloads | |
| a model or touches CUDA. | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import pytest | |
| from src.models import inference, local_catalogue | |
| from src.models.local_provider import LocalTransformersProvider | |
| from src.models.router import ModelRouter | |
| # ββ catalogue βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def test_one_sponsor_model_per_tier_and_sizes_stay_small(): | |
| # Each tier maps to a *distinct* sponsor model (the multi-track cast), so one show spans | |
| # NVIDIA Β· OpenBMB Β· Cohere Β· JetBrains. Every model honours the β€32B rule and the tiny | |
| # default keeps the Tiny-Titan β€4B band. | |
| tagged = {m.profile: m for m in local_catalogue.LOCAL_MODELS if m.profile is not None} | |
| assert set(tagged) == {"tiny", "fast", "balanced", "strong"} | |
| assert all(m.params_b is None or m.params_b <= 32 for m in local_catalogue.LOCAL_MODELS) | |
| assert tagged["tiny"].params_b <= 4 # Tiny-Titan band | |
| assert len({m.source for m in tagged.values()}) == 4 # four sponsor families | |
| def test_every_tier_resolves_to_its_sponsor_model(): | |
| assert local_catalogue.default_key_for_profile("tiny") == "nvidia/Nemotron-Mini-4B-Instruct" | |
| # OpenBMB lane uses MiniCPM5 (native llama arch) β the MiniCPM 4.x custom code mis-computes | |
| # under the transformers 5.x floor (KV-cache crash / gibberish), so it is deliberately avoided. | |
| assert local_catalogue.default_key_for_profile("fast") == "openbmb/MiniCPM5-1B" | |
| assert local_catalogue.default_key_for_profile("balanced") == "CohereLabs/aya-expanse-8b" | |
| assert local_catalogue.default_key_for_profile("strong") == "JetBrains/Mellum2-12B-A2.5B-Instruct" | |
| # the tiny model is listed first, so an untagged/unknown tier falls back to the cheapest. | |
| assert local_catalogue.LOCAL_MODELS[0].profile == "tiny" | |
| def test_catalogue_cast_is_all_native_arch_and_field_still_plumbs_custom_code(): | |
| # The whole live cast loads with the stock AutoModelForCausalLM β no trust_remote_code, | |
| # no custom-code-only KV-cache workaround β which is why output is correct under | |
| # transformers 5.x (MiniCPM5 replaced the 4.x custom-code model for exactly this reason). | |
| for m in local_catalogue.LOCAL_MODELS: | |
| assert m.trust_remote_code is False, f"{m.repo_id} unexpectedly needs trust_remote_code" | |
| assert m.use_cache is True, f"{m.repo_id} unexpectedly disables the KV cache" | |
| assert local_catalogue.model_by_key("openbmb/MiniCPM5-1B").trust_remote_code is False | |
| assert local_catalogue.model_by_key("does/not-exist") is None | |
| # The fields still plumb a non-default value, so a future custom-code model is one append away. | |
| custom = local_catalogue.LocalModel(repo_id="acme/custom", trust_remote_code=True, use_cache=False) | |
| assert custom.trust_remote_code is True and custom.use_cache is False | |
| def test_binding_is_a_bare_repo_id_with_no_endpoint(): | |
| # In-process: the binding carries the raw transformers repo id (no openai/ prefix) and | |
| # neither a base_url nor an api_key β the router builds the in-process provider from it. | |
| binding = local_catalogue.binding_for("nvidia/Nemotron-Mini-4B-Instruct") | |
| assert binding["model"] == "nvidia/Nemotron-Mini-4B-Instruct" | |
| assert binding["base_url"] == "" | |
| assert binding["api_key"] == "" | |
| def test_binding_unknown_key_raises(): | |
| with pytest.raises(KeyError): | |
| local_catalogue.binding_for("nobody/here") | |
| # ββ capability gate βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def test_gate_explicit_env_is_deterministic_without_a_probe(): | |
| # An explicit env dict is the whole story β no torch import, no host probe. | |
| assert local_catalogue.has_credentials(env={}) is False | |
| assert local_catalogue.has_credentials(env={"SPACES_ZERO_GPU": "true"}) is True | |
| assert local_catalogue.has_credentials(env={"LOCAL_INFERENCE": "1"}) is True | |
| def test_gate_accepts_common_truthy_spellings(): | |
| for val in ("1", "true", "TRUE", "yes", "on"): | |
| assert local_catalogue.has_credentials(env={"LOCAL_INFERENCE": val}) is True | |
| for val in ("0", "false", "", "no"): | |
| assert local_catalogue.has_credentials(env={"LOCAL_INFERENCE": val}) is False | |
| def test_gate_uses_injected_cuda_probe_when_env_signals_absent(): | |
| # No env signal β fall through to the probe (auto-detect a dedicated GPU / local box). | |
| assert local_catalogue.has_credentials(env={}, cuda_probe=lambda: True) is True | |
| assert local_catalogue.has_credentials(env={}, cuda_probe=lambda: False) is False | |
| # An env signal short-circuits before the probe is ever consulted. | |
| assert local_catalogue.has_credentials(env={"SPACES_ZERO_GPU": "1"}, cuda_probe=lambda: False) is True | |
| def test_gate_auto_probes_only_against_the_real_environment(): | |
| # Passing os.environ itself opts into the host CUDA probe; an arbitrary dict does not, | |
| # keeping faΓ§ade/test calls deterministic. We assert the boolean, whatever the host is. | |
| assert isinstance(local_catalogue.has_credentials(env=os.environ), bool) | |
| # ββ unified registry integration βββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def test_local_backend_is_registered_and_qualified(): | |
| assert "local" in {b.key for b in inference.backends()} | |
| keys = {e["key"] for e in inference.entries("local")} | |
| assert keys and all(k.startswith("local:") for k in keys) | |
| def test_registry_default_and_binding_round_trip(): | |
| key = inference.default_key_for_profile("tiny", "local") | |
| assert key == "local:nvidia/Nemotron-Mini-4B-Instruct" | |
| binding = inference.binding_for(key) | |
| assert binding["model"] == "nvidia/Nemotron-Mini-4B-Instruct" | |
| assert binding["base_url"] == "" | |
| def test_backend_available_and_configured_backends_for_local(): | |
| assert inference.backend_available("local", env={"LOCAL_INFERENCE": "1"}) is True | |
| assert inference.backend_available("local", env={"SPACES_ZERO_GPU": "yes"}) is True | |
| assert inference.backend_available("local", env={}) is False | |
| configured = inference.configured_backends(env={"LOCAL_INFERENCE": "1"}) | |
| assert "local" in configured | |
| # ββ router dispatch ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def test_router_dispatches_local_key_to_in_process_provider(): | |
| # A live router resolving a local: key must build the in-process provider (not LiteLLM), | |
| # bound to the bare repo id. Construction only β no GPU is touched. | |
| router = ModelRouter(offline=False) | |
| provider = router.for_profile("local:nvidia/Nemotron-Mini-4B-Instruct") | |
| assert isinstance(provider, LocalTransformersProvider) | |
| assert provider.model == "nvidia/Nemotron-Mini-4B-Instruct" | |
| assert provider.model_id == "nvidia/Nemotron-Mini-4B-Instruct" | |
| def test_catalogue_spec_tags_local_kind_and_others_litellm(): | |
| router = ModelRouter(offline=False) | |
| local_spec = router._catalogue_spec("local:nvidia/Nemotron-Mini-4B-Instruct") | |
| assert local_spec is not None and local_spec.kind == "local" | |
| # An HF key resolves through the same path but stays on the HTTP transport. | |
| hf_spec = router._catalogue_spec("hf:katanemo/Arch-Router-1.5B") | |
| assert hf_spec is not None and hf_spec.kind == "litellm" | |
| # ββ provider (cheap, offline-safe surface) βββββββββββββββββββββββββββββββββββββββββββ | |
| def test_provider_reports_model_id_and_zeroed_usage_before_any_call(): | |
| provider = LocalTransformersProvider(model="nvidia/Nemotron-Mini-4B-Instruct") | |
| assert provider.model_id == "nvidia/Nemotron-Mini-4B-Instruct" | |
| assert provider.last_usage == {} # no call yet β matches the sibling providers | |
| provider._zero_usage() | |
| assert provider.last_usage == {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0} | |
| def test_provider_resolves_trust_remote_code_from_catalogue(): | |
| # The cast is all native-arch, so every catalogue model resolves to False; an | |
| # off-catalogue repo also defaults to the safe choice. | |
| assert LocalTransformersProvider(model="openbmb/MiniCPM5-1B")._trust_remote_code() is False | |
| assert LocalTransformersProvider(model="CohereLabs/aya-expanse-8b")._trust_remote_code() is False | |
| assert LocalTransformersProvider(model="some/random-repo")._trust_remote_code() is False | |
| def test_provider_resolves_use_cache_from_catalogue(): | |
| # Native-arch models keep the KV cache on (the fast path), and an off-catalogue repo | |
| # defaults to the cached path; no model in the current cast disables it. | |
| assert LocalTransformersProvider(model="openbmb/MiniCPM5-1B")._use_cache() is True | |
| assert LocalTransformersProvider(model="nvidia/Nemotron-Mini-4B-Instruct")._use_cache() is True | |
| assert LocalTransformersProvider(model="some/random-repo")._use_cache() is True | |
| # ββ ZeroGPU contract: CUDA only inside @spaces.GPU, never in the parent βββββββββββββββ | |
| # Regression guard for the production crash "Low-level CUDA init (torch._C._cuda_init) | |
| # reached β¦ ZeroGPU's emulation did not intercept": the parent process gets no GPU, so any | |
| # CUDA placement (or a model load that places onto a device) outside the @spaces.GPU window | |
| # kills the worker. The forward pass can only be exercised with a GPU + weights | |
| # (integration), so we pin the *structural* invariant β where CUDA may be touched, and how | |
| # the model reaches the device β by source contract. | |
| def test_parent_warm_only_downloads_never_loads_or_initialises_cuda(): | |
| import ast | |
| import inspect | |
| from src.models import local_provider | |
| # _ensure_downloaded runs in the parent. It must only fetch weights to disk β never touch | |
| # CUDA, and never materialise the model in RAM (a multi-model cast would pin tens of GB). | |
| # Check the executable body with the docstring stripped (the docstring explains the | |
| # invariant in prose, so it legitimately mentions CUDA/RAM); the banned ops are device | |
| # moves, torch.cuda.* and any model instantiation. | |
| fn = ast.parse(inspect.getsource(local_provider._ensure_downloaded)).body[0] | |
| if ast.get_docstring(fn): | |
| fn.body = fn.body[1:] | |
| code = ast.unparse(fn) | |
| assert 'to("cuda")' not in code and "torch.cuda" not in code and ".cuda(" not in code | |
| # No weight materialisation in the parent β only the on-disk fetch. | |
| assert "from_pretrained" not in code and "AutoModel" not in code | |
| assert "snapshot_download" in code | |
| def test_worker_loads_onto_device_via_device_map_no_meta_prone_move(): | |
| import ast | |
| import inspect | |
| from src.models import local_provider | |
| # Regression guard for the ZeroGPU crash "Cannot copy out of meta tensor; no data!". | |
| # transformers 5.x always builds on the meta device and streams the checkpoint onto the | |
| # target; a bare from_pretrained(...).to("cuda") can leave a non-persistent buffer (e.g. | |
| # rotary inv_freq) or a tied/"missing" head on meta, and the move then dies | |
| # (transformers#41038/#30703). low_cpu_mem_usage no longer changes this (5.x drops the | |
| # kwarg). The fix: hand transformers the device via device_map so it materialises AND | |
| # places everything on-device in one step β no fragile post-hoc .to("cuda"). | |
| fn = ast.parse(inspect.getsource(local_provider._ensure_loaded_on_device)).body[0] | |
| if ast.get_docstring(fn): | |
| fn.body = fn.body[1:] | |
| code = ast.unparse(fn) | |
| # The supported placement path is used⦠| |
| assert "device_map" in code | |
| # β¦the GPU window never re-downloads (the parent already fetched the weights)β¦ | |
| assert "local_files_only=True" in code | |
| # β¦and the meta-prone manual move / dead kwarg are gone. | |
| assert 'to("cuda")' not in code | |
| assert "low_cpu_mem_usage" not in code | |
| def test_v4_compat_shim_backfills_removed_remote_code_predicates(): | |
| # Regression guard for the ZeroGPU error "cannot import name 'is_torch_fx_available' | |
| # from transformers.utils.import_utils": transformers 5.x removed these predicates, but | |
| # MiniCPM's (and other) trust_remote_code modelling files still import them. The provider | |
| # back-fills them (all True at our torch floor) so the remote import succeeds. | |
| from src.models import local_provider | |
| local_provider._ensure_transformers_v4_symbols() | |
| from transformers.utils import import_utils | |
| # Every name the shim covers is importable from transformers.utils.import_utils and True. | |
| for name in local_provider._REMOVED_TORCH_PREDICATES: | |
| fn = getattr(import_utils, name) | |
| assert fn() is True | |
| # And the device loader runs the shim before touching any remote code. | |
| import inspect | |
| assert "_ensure_transformers_v4_symbols()" in inspect.getsource(local_provider._ensure_loaded_on_device) | |
| def test_device_placement_lives_inside_the_spaces_gpu_function(): | |
| from pathlib import Path | |
| from src.models import local_provider | |
| # _generate is wrapped by @spaces.GPU, so read the module source and isolate its block. | |
| module_src = Path(local_provider.__file__).read_text() | |
| gen_block = module_src.split("def _generate(", 1)[1].split("\ndef ", 1)[0] | |
| # The model reaches the device here (the one place ZeroGPU grants one) via the on-device | |
| # loader β never via a parent-side loadβ¦ | |
| assert "_ensure_loaded_on_device(" in gen_block | |
| # β¦and the function carries the decorator the platform registers. | |
| assert "@spaces.GPU" in module_src.split("def _generate(", 1)[0].rsplit("\n\n", 1)[-1] | |
| # The parent path (complete) warms the on-disk cache only β it must not load on-device. | |
| complete_block = module_src.split("def complete(", 1)[1].split("\n def ", 1)[0] | |
| assert "_ensure_downloaded(" in complete_block | |
| assert "_ensure_loaded_on_device(" not in complete_block | |
| def test_generate_unpacks_batchencoding_never_passes_a_positional_dict(): | |
| # Regression guard for the production AttributeError "inputs_tensor.shape[0]" in | |
| # transformers.generate: in transformers 5.x apply_chat_template(return_tensors="pt") | |
| # defaults to a BatchEncoding *dict*, and passing that dict positionally into | |
| # model.generate(inputs) makes generate() do .shape on a dict. The fix: request the | |
| # dict explicitly (return_dict=True) and unpack it with ** so input_ids + attention_mask | |
| # are fed as kwargs. Pinned by AST so the call shape can't silently regress. | |
| import ast | |
| from pathlib import Path | |
| from src.models import local_provider | |
| tree = ast.parse(Path(local_provider.__file__).read_text()) | |
| gen = next(n for n in ast.walk(tree) if isinstance(n, ast.FunctionDef) and n.name == "_generate") | |
| calls = [c for c in ast.walk(gen) if isinstance(c, ast.Call)] | |
| # apply_chat_template asks for the dict form explicitly (robust whatever the default). | |
| act = next(c for c in calls if isinstance(c.func, ast.Attribute) and c.func.attr == "apply_chat_template") | |
| assert any(k.arg == "return_dict" and k.value.value is True for k in act.keywords) | |
| # Reasoning models (e.g. MiniCPM5) are told not to think, so a <think> block can't eat the | |
| # token budget and leave an empty spoken line; harmlessly ignored by non-reasoning templates. | |
| assert any(k.arg == "enable_thinking" and k.value.value is False for k in act.keywords) | |
| # model.generate(**inputs, β¦): the encoding is unpacked, never a positional dict. | |
| gen_call = next(c for c in calls if isinstance(c.func, ast.Attribute) and c.func.attr == "generate") | |
| assert not gen_call.args, "generate() must take no positional arg (the old bug passed the dict positionally)" | |
| assert any(k.arg is None and isinstance(k.value, ast.Name) and k.value.id == "inputs" for k in gen_call.keywords) | |
| # use_cache is threaded through so a model with broken 5.x cache handling (MiniCPM) can | |
| # disable it ("Key and Value must have the same sequence length"). | |
| assert any(k.arg == "use_cache" for k in gen_call.keywords) | |