Spaces:

build-small-hackathon
/

multi-agent-lab

Running on Zero

multi-agent-lab / tests /test_local_backend.py

agharsallah

feat(media): introduce MediaRouter and stubs for image and speech generation

8400d8c 21 days ago

17.1 kB

	"""Tests for the local in-process backend — catalogue, gate, registry, router dispatch.

	The ``local`` backend (ADR-0033) runs a small ``transformers`` model in-process on the
	host GPU behind ``@spaces.GPU`` — hardware-agnostic (ZeroGPU or a dedicated GPU), with no
	HTTP endpoint. These tests cover the deterministic, offline-safe surface: the catalogue
	data, the capability gate (env signals + an injectable CUDA probe), the unified-registry
	integration, and that the router dispatches a ``local:`` key to the in-process provider
	rather than the HTTP gateway. The actual GPU forward pass is integration-only (it needs a
	GPU and weights), exactly as the HTTP provider's live call is — so nothing here downloads
	a model or touches CUDA.
	"""

	from __future__ import annotations

	import os

	import pytest

	from src.models import inference, local_catalogue
	from src.models.local_provider import LocalTransformersProvider
	from src.models.router import ModelRouter


	# ── catalogue ─────────────────────────────────────────────────────────────────────


	def test_one_sponsor_model_per_tier_and_sizes_stay_small():
	    """Every tier carries a sponsor model and the whole catalogue respects the size caps."""
	    models = local_catalogue.LOCAL_MODELS
	    # Index the tagged models by tier; entries whose profile is None are left out.
	    by_tier = {}
	    for model in models:
	        if model.profile is not None:
	            by_tier[model.profile] = model
	    assert set(by_tier) == {"tiny", "fast", "balanced", "strong"}
	    # The ≤32B rule covers every catalogue entry; a missing size (None) is tolerated.
	    assert all(model.params_b is None or model.params_b <= 32 for model in models)
	    # The tiny default additionally stays inside the Tiny-Titan ≤4B band.
	    assert by_tier["tiny"].params_b <= 4
	    # The multi-track cast spans four sponsor families: NVIDIA · OpenBMB · Cohere · JetBrains.
	    sponsor_families = {model.source for model in by_tier.values()}
	    assert len(sponsor_families) == 4


	def test_every_tier_resolves_to_its_sponsor_model():
	    """Each tier's default key is the sponsor model assigned to that tier."""
	    # The OpenBMB lane ("fast") is MiniCPM5 (native llama arch): the MiniCPM 4.x custom code
	    # mis-computes under the transformers 5.x floor (KV-cache crash / gibberish), so it is
	    # deliberately avoided.
	    expected_defaults = {
	        "tiny": "nvidia/Nemotron-Mini-4B-Instruct",
	        "fast": "openbmb/MiniCPM5-1B",
	        "balanced": "CohereLabs/aya-expanse-8b",
	        "strong": "JetBrains/Mellum2-12B-A2.5B-Instruct",
	    }
	    for tier, repo_id in expected_defaults.items():
	        assert local_catalogue.default_key_for_profile(tier) == repo_id
	    # The tiny model heads the list, so an untagged/unknown tier falls back to the cheapest.
	    first_entry = local_catalogue.LOCAL_MODELS[0]
	    assert first_entry.profile == "tiny"


	def test_catalogue_cast_is_all_native_arch_and_field_still_plumbs_custom_code():
	    """The live cast needs no custom code, yet the LocalModel fields still accept it."""
	    # Every cast member loads with the stock AutoModelForCausalLM — no trust_remote_code and
	    # no custom-code-only KV-cache workaround — which is why output is correct under
	    # transformers 5.x (MiniCPM5 replaced the 4.x custom-code model for exactly this reason).
	    for entry in local_catalogue.LOCAL_MODELS:
	        assert entry.trust_remote_code is False, f"{entry.repo_id} unexpectedly needs trust_remote_code"
	        assert entry.use_cache is True, f"{entry.repo_id} unexpectedly disables the KV cache"
	    minicpm = local_catalogue.model_by_key("openbmb/MiniCPM5-1B")
	    assert minicpm.trust_remote_code is False
	    # An unknown key yields None instead of raising.
	    unknown = local_catalogue.model_by_key("does/not-exist")
	    assert unknown is None
	    # Non-default values still plumb through, so a future custom-code model is one append away.
	    custom = local_catalogue.LocalModel(repo_id="acme/custom", trust_remote_code=True, use_cache=False)
	    assert custom.trust_remote_code is True
	    assert custom.use_cache is False


	def test_binding_is_a_bare_repo_id_with_no_endpoint():
	    """An in-process binding names the raw repo id and carries no endpoint or credential."""
	    binding = local_catalogue.binding_for("nvidia/Nemotron-Mini-4B-Instruct")
	    # The raw transformers repo id, with no openai/ prefix…
	    assert binding["model"] == "nvidia/Nemotron-Mini-4B-Instruct"
	    # …and neither a base_url nor an api_key: the router builds the in-process provider from it.
	    for empty_field in ("base_url", "api_key"):
	        assert binding[empty_field] == ""


	def test_binding_unknown_key_raises():
	    """Asking for the binding of a key outside the catalogue raises KeyError."""
	    unknown_key = "nobody/here"
	    with pytest.raises(KeyError):
	        local_catalogue.binding_for(unknown_key)


	# ── capability gate ─────────────────────────────────────────────────────────────────


	def test_gate_explicit_env_is_deterministic_without_a_probe():
	    """With an explicit env dict the gate depends on that dict alone."""
	    # An explicit env dict is the whole story — no torch import, no host probe.
	    cases = (
	        ({}, False),
	        ({"SPACES_ZERO_GPU": "true"}, True),
	        ({"LOCAL_INFERENCE": "1"}, True),
	    )
	    for env, expected in cases:
	        assert local_catalogue.has_credentials(env=env) is expected


	def test_gate_accepts_common_truthy_spellings():
	    """LOCAL_INFERENCE opens the gate for the usual truthy spellings and for nothing else."""
	    truthy = ("1", "true", "TRUE", "yes", "on")
	    falsy = ("0", "false", "", "no")
	    for expected, spellings in ((True, truthy), (False, falsy)):
	        for spelling in spellings:
	            assert local_catalogue.has_credentials(env={"LOCAL_INFERENCE": spelling}) is expected


	def test_gate_uses_injected_cuda_probe_when_env_signals_absent():
	    """The injected CUDA probe decides only when no env signal is present."""

	    def gpu_present():
	        return True

	    def gpu_absent():
	        return False

	    # No env signal → the gate falls through to the probe (auto-detecting a dedicated GPU
	    # or a local box).
	    assert local_catalogue.has_credentials(env={}, cuda_probe=gpu_present) is True
	    assert local_catalogue.has_credentials(env={}, cuda_probe=gpu_absent) is False
	    # An env signal short-circuits before the probe is ever consulted.
	    zero_gpu_env = {"SPACES_ZERO_GPU": "1"}
	    assert local_catalogue.has_credentials(env=zero_gpu_env, cuda_probe=gpu_absent) is True


	def test_gate_auto_probes_only_against_the_real_environment():
	    """Passing the real os.environ still yields a plain boolean, whatever the host is."""
	    # Handing over os.environ itself opts into the host CUDA probe; an arbitrary dict does
	    # not, which keeps façade/test calls deterministic. Only the result type is pinned here.
	    verdict = local_catalogue.has_credentials(env=os.environ)
	    assert isinstance(verdict, bool)


	# ── unified registry integration ─────────────────────────────────────────────────────


	def test_local_backend_is_registered_and_qualified():
	    """The registry lists a "local" backend whose entry keys all carry the local: prefix."""
	    backend_keys = {backend.key for backend in inference.backends()}
	    assert "local" in backend_keys
	    entry_keys = {entry["key"] for entry in inference.entries("local")}
	    # There must be at least one entry, and every one of them must be qualified.
	    assert entry_keys
	    assert all(entry_key.startswith("local:") for entry_key in entry_keys)


	def test_registry_default_and_binding_round_trip():
	    """The tiny tier's qualified default key resolves back to a bare, endpoint-less binding."""
	    qualified_key = inference.default_key_for_profile("tiny", "local")
	    assert qualified_key == "local:nvidia/Nemotron-Mini-4B-Instruct"
	    resolved = inference.binding_for(qualified_key)
	    expected_fields = (("model", "nvidia/Nemotron-Mini-4B-Instruct"), ("base_url", ""))
	    for field, value in expected_fields:
	        assert resolved[field] == value


	def test_backend_available_and_configured_backends_for_local():
	    """The registry-level availability checks agree with the local gate's env signals."""
	    availability_cases = (
	        ({"LOCAL_INFERENCE": "1"}, True),
	        ({"SPACES_ZERO_GPU": "yes"}, True),
	        ({}, False),
	    )
	    for env, expected in availability_cases:
	        assert inference.backend_available("local", env=env) is expected
	    # With the gate open, "local" also shows up among the configured backends.
	    assert "local" in inference.configured_backends(env={"LOCAL_INFERENCE": "1"})


	# ── router dispatch ──────────────────────────────────────────────────────────────────


	def test_router_dispatches_local_key_to_in_process_provider():
	    """A live router turns a local: key into the in-process provider bound to the bare repo id."""
	    # Construction only — no GPU is touched. The provider must be the in-process one
	    # (not LiteLLM).
	    live_router = ModelRouter(offline=False)
	    provider = live_router.for_profile("local:nvidia/Nemotron-Mini-4B-Instruct")
	    assert isinstance(provider, LocalTransformersProvider)
	    for bound_name in (provider.model, provider.model_id):
	        assert bound_name == "nvidia/Nemotron-Mini-4B-Instruct"


	def test_catalogue_spec_tags_local_kind_and_others_litellm():
	    """Catalogue specs are tagged "local" for local: keys and "litellm" for an HF key."""
	    router = ModelRouter(offline=False)
	    # An HF key resolves through the same path but stays on the HTTP transport.
	    expected_kinds = (
	        ("local:nvidia/Nemotron-Mini-4B-Instruct", "local"),
	        ("hf:katanemo/Arch-Router-1.5B", "litellm"),
	    )
	    for key, kind in expected_kinds:
	        spec = router._catalogue_spec(key)
	        assert spec is not None and spec.kind == kind


	# ── provider (cheap, offline-safe surface) ───────────────────────────────────────────


	def test_provider_reports_model_id_and_zeroed_usage_before_any_call():
	    """A fresh provider exposes its model id, empty usage, and zeroed usage after a reset."""
	    provider = LocalTransformersProvider(model="nvidia/Nemotron-Mini-4B-Instruct")
	    assert provider.model_id == "nvidia/Nemotron-Mini-4B-Instruct"
	    # No call has happened yet, so usage is empty — matching the sibling providers.
	    assert provider.last_usage == {}
	    provider._zero_usage()
	    zeroed_usage = dict.fromkeys(("prompt_tokens", "completion_tokens", "total_tokens"), 0)
	    assert provider.last_usage == zeroed_usage


	def test_provider_resolves_trust_remote_code_from_catalogue():
	    """trust_remote_code resolves to False for catalogue models and for unknown repos."""
	    # The cast is all native-arch, so catalogue models resolve to False; the last entry is
	    # an off-catalogue repo, which also defaults to the safe choice.
	    repo_ids = ("openbmb/MiniCPM5-1B", "CohereLabs/aya-expanse-8b", "some/random-repo")
	    for repo_id in repo_ids:
	        provider = LocalTransformersProvider(model=repo_id)
	        assert provider._trust_remote_code() is False


	def test_provider_resolves_use_cache_from_catalogue():
	    """use_cache resolves to True for catalogue models and for unknown repos."""
	    # Native-arch models keep the KV cache on (the fast path); the last entry is an
	    # off-catalogue repo, which defaults to the cached path too. No model in the current
	    # cast disables it.
	    repo_ids = ("openbmb/MiniCPM5-1B", "nvidia/Nemotron-Mini-4B-Instruct", "some/random-repo")
	    for repo_id in repo_ids:
	        provider = LocalTransformersProvider(model=repo_id)
	        assert provider._use_cache() is True


	# ── ZeroGPU contract: CUDA only inside @spaces.GPU, never in the parent ───────────────
	# Regression guard for the production crash "Low-level CUDA init (torch._C._cuda_init)
	# reached … ZeroGPU's emulation did not intercept": the parent process gets no GPU, so any
	# CUDA placement (or a model load that places onto a device) outside the @spaces.GPU window
	# kills the worker. The forward pass can only be exercised with a GPU + weights
	# (integration), so we pin the structural invariant — where CUDA may be touched, and how
	# the model reaches the device — by source contract.


	def test_parent_warm_only_downloads_never_loads_or_initialises_cuda():
	    """Pin, by source contract, that the parent-side warm-up only fetches weights to disk."""
	    import ast
	    import inspect

	    from src.models import local_provider

	    # _ensure_downloaded runs in the parent. It must only fetch weights to disk — never touch
	    # CUDA, and never materialise the model in RAM (a multi-model cast would pin tens of GB).
	    # Check the executable body with the docstring stripped (the docstring explains the
	    # invariant in prose, so it legitimately mentions CUDA/RAM); the banned ops are device
	    # moves, torch.cuda.* and any model instantiation.
	    fn = ast.parse(inspect.getsource(local_provider._ensure_downloaded)).body[0]
	    if ast.get_docstring(fn):
	        fn.body = fn.body[1:]
	    code = ast.unparse(fn)
	    # ast.unparse does not preserve the source's quote style (it emits single quotes for a
	    # plain string), so the device-move check has to cover both spellings — checking only
	    # the double-quoted form would never fire on unparsed code.
	    cuda_moves = ('to("cuda")', "to('cuda')")
	    assert not any(move in code for move in cuda_moves)
	    assert "torch.cuda" not in code and ".cuda(" not in code
	    # No weight materialisation in the parent — only the on-disk fetch.
	    assert "from_pretrained" not in code and "AutoModel" not in code
	    assert "snapshot_download" in code


	def test_worker_loads_onto_device_via_device_map_no_meta_prone_move():
	    """Pin, by source contract, that the on-device loader places the model via device_map."""
	    import ast
	    import inspect

	    from src.models import local_provider

	    # Regression guard for the ZeroGPU crash "Cannot copy out of meta tensor; no data!".
	    # transformers 5.x always builds on the meta device and streams the checkpoint onto the
	    # target; a bare from_pretrained(...).to("cuda") can leave a non-persistent buffer (e.g.
	    # rotary inv_freq) or a tied/"missing" head on meta, and the move then dies
	    # (transformers#41038/#30703). low_cpu_mem_usage no longer changes this (5.x drops the
	    # kwarg). The fix: hand transformers the device via device_map so it materialises AND
	    # places everything on-device in one step — no fragile post-hoc .to("cuda").
	    fn = ast.parse(inspect.getsource(local_provider._ensure_loaded_on_device)).body[0]
	    if ast.get_docstring(fn):
	        fn.body = fn.body[1:]
	    code = ast.unparse(fn)
	    # The supported placement path is used…
	    assert "device_map" in code
	    # …the GPU window never re-downloads (the parent already fetched the weights)…
	    assert "local_files_only=True" in code
	    # …and the meta-prone manual move / dead kwarg are gone. ast.unparse does not preserve
	    # the source's quote style (it emits single quotes for a plain string), so both
	    # spellings of the move are banned — the double-quoted form alone would never fire.
	    cuda_moves = ('to("cuda")', "to('cuda')")
	    assert not any(move in code for move in cuda_moves)
	    assert "low_cpu_mem_usage" not in code


	def test_v4_compat_shim_backfills_removed_remote_code_predicates():
	    """The v4 compatibility shim restores the removed predicates and the loader invokes it."""
	    # Regression guard for the ZeroGPU error "cannot import name 'is_torch_fx_available'
	    # from transformers.utils.import_utils": transformers 5.x removed these predicates, but
	    # MiniCPM's (and other) trust_remote_code modelling files still import them. The provider
	    # back-fills them (all True at our torch floor) so the remote import succeeds.
	    import inspect

	    from src.models import local_provider

	    # Run the shim first, then look the names up on the patched module.
	    local_provider._ensure_transformers_v4_symbols()
	    from transformers.utils import import_utils

	    # Every name the shim covers is reachable on transformers.utils.import_utils and
	    # returns True when called.
	    for predicate_name in local_provider._REMOVED_TORCH_PREDICATES:
	        predicate = getattr(import_utils, predicate_name)
	        assert predicate() is True
	    # And the device loader runs the shim before touching any remote code.
	    loader_source = inspect.getsource(local_provider._ensure_loaded_on_device)
	    assert "_ensure_transformers_v4_symbols()" in loader_source


	def test_device_placement_lives_inside_the_spaces_gpu_function():
	    """Pin where the model reaches the device: inside _generate, never in complete.

	    The function bodies are located through the module's AST rather than by splitting the
	    text on ``def`` markers, so each extracted block ends exactly where its function ends,
	    whatever the indentation of the definitions that follow it.
	    """
	    import ast
	    from pathlib import Path

	    from src.models import local_provider

	    # Python source is UTF-8 by default; read it as such instead of the locale encoding.
	    module_src = Path(local_provider.__file__).read_text(encoding="utf-8")
	    tree = ast.parse(module_src)

	    def first_def(name):
	        # Earliest definition with this name in file order (sync or async, any nesting).
	        found = [
	            node
	            for node in ast.walk(tree)
	            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name == name
	        ]
	        assert found, f"{name}() is not defined in {local_provider.__file__}"
	        return min(found, key=lambda node: node.lineno)

	    # _generate is wrapped by @spaces.GPU, so inspect its definition in the module source.
	    gen_node = first_def("_generate")
	    gen_block = ast.get_source_segment(module_src, gen_node)
	    # The model reaches the device here (the one place ZeroGPU grants one) via the on-device
	    # loader — never via a parent-side load…
	    assert "_ensure_loaded_on_device(" in gen_block
	    # …and the function carries the decorator the platform registers.
	    decorators = [ast.unparse(decorator) for decorator in gen_node.decorator_list]
	    assert any("spaces.GPU" in decorator for decorator in decorators)
	    # The parent path (complete) warms the on-disk cache only — it must not load on-device.
	    complete_block = ast.get_source_segment(module_src, first_def("complete"))
	    assert "_ensure_downloaded(" in complete_block
	    assert "_ensure_loaded_on_device(" not in complete_block


	def test_generate_unpacks_batchencoding_never_passes_a_positional_dict():
	    """Pin, by AST, the call shapes inside _generate so they cannot silently regress."""
	    # Regression guard for the production AttributeError "inputs_tensor.shape[0]" in
	    # transformers.generate: in transformers 5.x apply_chat_template(return_tensors="pt")
	    # defaults to a BatchEncoding dict, and passing that dict positionally into
	    # model.generate(inputs) makes generate() do .shape on a dict. The fix: request the
	    # dict explicitly (return_dict=True) and unpack it with ** so input_ids + attention_mask
	    # are fed as kwargs.
	    import ast
	    from pathlib import Path

	    from src.models import local_provider

	    # Python source is UTF-8 by default; read it as such instead of the locale encoding.
	    tree = ast.parse(Path(local_provider.__file__).read_text(encoding="utf-8"))
	    gen = next(n for n in ast.walk(tree) if isinstance(n, ast.FunctionDef) and n.name == "_generate")
	    calls = [c for c in ast.walk(gen) if isinstance(c, ast.Call)]

	    def passes_literal(call, keyword, expected):
	        # True when `keyword=<expected>` is passed as a literal. A non-literal value (a name,
	        # a call, …) counts as "not passed", so the assertion fails cleanly instead of
	        # raising AttributeError on a node that has no constant value.
	        return any(
	            k.arg == keyword and isinstance(k.value, ast.Constant) and k.value.value is expected
	            for k in call.keywords
	        )

	    # apply_chat_template asks for the dict form explicitly (robust whatever the default).
	    act = next(c for c in calls if isinstance(c.func, ast.Attribute) and c.func.attr == "apply_chat_template")
	    assert passes_literal(act, "return_dict", True)
	    # Reasoning models (e.g. MiniCPM5) are told not to think, so a <think> block can't eat the
	    # token budget and leave an empty spoken line; harmlessly ignored by non-reasoning templates.
	    assert passes_literal(act, "enable_thinking", False)

	    # model.generate(**inputs, …): the encoding is unpacked, never a positional dict.
	    gen_call = next(c for c in calls if isinstance(c.func, ast.Attribute) and c.func.attr == "generate")
	    assert not gen_call.args, "generate() must take no positional arg (the old bug passed the dict positionally)"
	    assert any(k.arg is None and isinstance(k.value, ast.Name) and k.value.id == "inputs" for k in gen_call.keywords)
	    # use_cache is threaded through so a model with broken 5.x cache handling (MiniCPM) can
	    # disable it ("Key and Value must have the same sequence length").
	    assert any(k.arg == "use_cache" for k in gen_call.keywords)