Spaces:

build-small-hackathon
/

Scrypt

Running on Zero

File size: 5,929 Bytes

9fca766

"""Inference layer tests: SSE parsing, fixtures, backend selection."""

import json
from pathlib import Path

import httpx
import pytest

from scrypt.inference import build_backend
from scrypt.inference.api import OpenAIChatBackend
from scrypt.inference.backend import (
    RecordingBackend,
    ReplayBackend,
    ScriptedBackend,
    complete,
)
from scrypt.inference.local import LocalSetupError, LlamaServer, preflight


def sse_response(*texts: str) -> bytes:
    """Build the raw body of a fake streaming chat-completion response.

    Each entry of ``texts`` becomes one ``data:`` line whose JSON payload
    carries the text under ``choices[0].delta.content``. A closing
    ``data: [DONE]`` line is always appended. Every line is terminated by a
    single newline and the result is encoded to bytes.
    """
    events = [
        "data: " + json.dumps({"choices": [{"delta": {"content": piece}}]})
        for piece in texts
    ]
    events.append("data: [DONE]")
    return "".join(f"{event}\n" for event in events).encode()


async def test_openai_backend_parses_sse_stream():
    """OpenAIChatBackend streams a request and joins the SSE deltas.

    The mock transport checks the outgoing request (``stream`` is true,
    thinking is disabled through ``chat_template_kwargs``, and the API key is
    sent as a bearer token) and answers with three content deltas followed by
    ``[DONE]``. ``complete`` must return the deltas concatenated in order.
    """

    def handler(request: httpx.Request) -> httpx.Response:
        # Validate what the backend put on the wire before answering.
        body = json.loads(request.content)
        assert body["stream"] is True
        assert body["chat_template_kwargs"] == {"enable_thinking": False}
        assert request.headers["authorization"] == "Bearer k"
        return httpx.Response(
            200,
            content=sse_response("The ", "scale ", "tips."),
            headers={"content-type": "text/event-stream"},
        )

    client = httpx.AsyncClient(transport=httpx.MockTransport(handler))
    try:
        backend = OpenAIChatBackend("http://test/v1", api_key="k", client=client)
        text = await complete(backend, [{"role": "user", "content": "hi"}])
    finally:
        # Close the client even when the request or an assertion fails, so
        # the test never leaves an open AsyncClient behind.
        await client.aclose()
    assert text == "The scale tips."


async def test_record_then_replay_roundtrip(tmp_path: Path):
    """A recorded completion replays verbatim; an unrecorded prompt fails.

    A RecordingBackend wraps a ScriptedBackend and writes to a fixture file
    under ``tmp_path``. A ReplayBackend built from that file must return the
    same text for the same messages and raise ``KeyError`` for messages that
    were never recorded.
    """
    fixture_path = tmp_path / "fixtures.jsonl"
    seen_prompt = [{"role": "user", "content": "moment"}]
    unseen_prompt = [{"role": "user", "content": "unseen"}]

    # Record phase: run the prompt once through the recording wrapper.
    scripted = ScriptedBackend(default="recorded line")
    recording = RecordingBackend(scripted, fixture_path)
    recorded_text = await complete(recording, seen_prompt)
    assert recorded_text == "recorded line"

    # Replay phase: the fixture file alone must reproduce the answer.
    replayer = ReplayBackend(fixture_path)
    replayed_text = await complete(replayer, seen_prompt)
    assert replayed_text == "recorded line"

    with pytest.raises(KeyError):
        await complete(replayer, unseen_prompt)


def test_preflight_reports_missing_pieces(monkeypatch, tmp_path):
    """preflight() mentions both the missing server and the missing model.

    The binary lookup and the llama_cpp fallback are stubbed to "not found",
    and SCRYPT_HOME points at an empty temporary directory.
    """
    stubs = (
        ("scrypt.inference.local.find_binary", lambda: None),
        ("scrypt.inference.local.llama_cpp_available", lambda: False),
        ("scrypt.inference.local.SCRYPT_HOME", tmp_path),
    )
    for target, replacement in stubs:
        monkeypatch.setattr(target, replacement)

    reported = preflight()

    for keyword in ("llama-server", "model"):
        assert any(keyword in problem for problem in reported)


def test_installed_model_finds_any_gguf_name(monkeypatch, tmp_path):
    """A hand-downloaded file with a custom name still counts.

    A single ``.gguf`` file with an arbitrary name is placed in
    ``<SCRYPT_HOME>/models``; installed_model() must return that file.
    """
    import scrypt.inference.local as local

    custom_name = "my-cool-quant-q4.gguf"
    model_dir = tmp_path / "models"
    model_dir.mkdir()
    model_dir.joinpath(custom_name).write_bytes(b"x" * 10)
    monkeypatch.setattr(local, "SCRYPT_HOME", tmp_path)

    found = local.installed_model()
    assert found.name == custom_name


def test_server_command_falls_back_to_llama_cpp_python(monkeypatch, tmp_path):
    """server_command() uses ``python -m llama_cpp.server`` without a binary.

    With the binary lookup returning None, the command must start with the
    current interpreter running the ``llama_cpp.server`` module when
    llama_cpp is reported as available, and must be None when it is not.
    """
    import sys

    import scrypt.inference.local as local

    call_args = (tmp_path / "m.gguf", 8731, 8192)
    monkeypatch.setattr(local, "find_binary", lambda: None)

    # llama_cpp reported present: fall back to the Python server module.
    monkeypatch.setattr(local, "llama_cpp_available", lambda: True)
    fallback_cmd = local.server_command(*call_args)
    assert fallback_cmd[:3] == [sys.executable, "-m", "llama_cpp.server"]

    # Neither route available: no command can be built.
    monkeypatch.setattr(local, "llama_cpp_available", lambda: False)
    no_cmd = local.server_command(*call_args)
    assert no_cmd is None


def test_llama_server_start_refuses_without_setup(monkeypatch):
    """LlamaServer.start() raises LocalSetupError when nothing can serve.

    Both launch routes known to this module are stubbed out: the
    ``llama-server`` binary lookup and the ``llama_cpp`` Python fallback.
    Stubbing only the binary lookup would leave the result dependent on
    whether llama-cpp-python happens to be installed on the machine running
    the tests.
    """
    monkeypatch.setattr("scrypt.inference.local.find_binary", lambda: None)
    # Keep the test hermetic: never let a host-installed llama_cpp count as a
    # usable server runtime.
    monkeypatch.setattr("scrypt.inference.local.llama_cpp_available", lambda: False)
    with pytest.raises(LocalSetupError):
        LlamaServer().start()


def test_build_backend_falls_back_to_scripted(monkeypatch):
    """In "auto" mode with no API key and a failing preflight, use scripted.

    build_backend() must report mode ``"scripted"``, return no server, and
    hand back a ScriptedBackend.
    """
    monkeypatch.delenv("SCRYPT_API_KEY", raising=False)
    monkeypatch.setenv("SCRYPT_BACKEND", "auto")
    # A non-empty problem list stands in for an unusable local setup.
    monkeypatch.setattr("scrypt.inference.preflight", lambda: ["no model"])

    backend, server, mode = build_backend()

    assert mode == "scripted"
    assert server is None
    assert isinstance(backend, ScriptedBackend)


def test_build_backend_api_mode(monkeypatch):
    """SCRYPT_BACKEND=api with an API key yields an OpenAIChatBackend."""
    for variable, value in (("SCRYPT_BACKEND", "api"), ("SCRYPT_API_KEY", "sk-test")):
        monkeypatch.setenv(variable, value)

    # The second element (the server) is not inspected by this test.
    backend, _server, mode = build_backend()

    assert mode == "api"
    assert isinstance(backend, OpenAIChatBackend)


def test_quant_ladder_tiers(monkeypatch):
    """choose_quant() maps each RAM figure to the expected quantisation.

    SCRYPT_QUANT is removed from the environment first so that only the
    argument decides the result.
    """
    from scrypt.inference.local import choose_quant

    monkeypatch.delenv("SCRYPT_QUANT", raising=False)

    ladder = (
        (128, "Q8_0"),
        (96, "Q8_0"),
        (64, "Q5_K_M"),
        (48, "Q4_K_S"),
        (32, "Q3_K_S"),
    )
    for ram, expected_quant in ladder:
        assert choose_quant(ram) == expected_quant

    # Special cases outside the regular ladder.
    assert choose_quant(16) is None  # booted to API mode
    assert choose_quant(0) == "Q4_K_S"  # unknown RAM -> safe default


def test_quant_env_override(monkeypatch):
    """A SCRYPT_QUANT environment value is returned instead of the tier pick."""
    from scrypt.inference.local import choose_quant

    forced_quant = "Q6_K"
    monkeypatch.setenv("SCRYPT_QUANT", forced_quant)

    chosen = choose_quant(32)
    assert chosen == forced_quant


def test_preflight_names_machine_tier(monkeypatch, tmp_path):
    """preflight() messages reflect the reported amount of system RAM.

    With 33.0 reported, some problem mentions ``Q3_K_S``. With 16.0 reported,
    some problem mentions ``API mode`` and none mentions ``Q3_K_S``.
    """
    import scrypt.inference.local as local

    def problems_for(ram_gb):
        # Re-stub the RAM probe, then collect a fresh preflight report.
        monkeypatch.setattr(local, "system_ram_gb", lambda: ram_gb)
        return local.preflight()

    monkeypatch.delenv("SCRYPT_QUANT", raising=False)
    monkeypatch.setattr(local, "SCRYPT_HOME", tmp_path)
    monkeypatch.setattr(local, "find_binary", lambda: None)

    larger_machine = problems_for(33.0)
    assert any("Q3_K_S" in problem for problem in larger_machine)

    smaller_machine = problems_for(16.0)
    assert any("API mode" in problem for problem in smaller_machine)
    assert all("Q3_K_S" not in problem for problem in smaller_machine)


def test_installed_model_prefers_heaviest(monkeypatch, tmp_path):
    """With Q3_K_S and Q5_K_M files both present, Q5_K_M is returned.

    Both files are created empty, so the two candidates do not differ in
    size on disk.
    """
    import scrypt.inference.local as local

    monkeypatch.setattr(local, "SCRYPT_HOME", tmp_path)
    model_dir = tmp_path / "models"
    model_dir.mkdir()
    for quant in ("Q3_K_S", "Q5_K_M"):
        model_dir.joinpath(local.model_file(quant)).touch()

    picked = local.installed_model()
    assert picked.name == local.model_file("Q5_K_M")