OffGridSchedula

Running

File size: 12,286 Bytes

0366d65

"""Load the fine-tuned Gemma 4 GGUF and run inference via llama.cpp.

Llama Champion: all generation goes through llama-cpp-python — no cloud AI API.
The GGUF is downloaded from HF at startup so the Space image stays small.

Two inference locations, selected by env:
- in-process llama.cpp, GPU-offloaded inside an @spaces.GPU lease (ZeroGPU), or
- a remote OpenAI-compatible / llama.cpp server via INFERENCE_BASE_URL
  (e.g. a llama-server on the phone itself, or a backend).
"""
from __future__ import annotations

import os
import threading
import time

from huggingface_hub import hf_hub_download

from . import events

# Model served by the platform: the gemma-cal EDGE fine-tune (Gemma-4 E4B, ~5GB
# Q4) — our own calendar-native model, eval-gated before every publish
# (docs/eval-roadmap.md).
# MODEL SIZE (hackathon hard constraint, <= 32B): E4B = ~4B effective params.
# Every generation goes through llama.cpp (no cloud AI).
MODEL_REPO = os.getenv("MODEL_REPO", "ParetoOptimal/gemma-4-cal-gguf")
MODEL_FILE = os.getenv("MODEL_FILE", "gemma-cal-e4b-Q4_K_M.gguf")
# Vision projector (mmproj). A non-empty MMPROJ_FILE turns image input on; an
# empty value keeps the model text-only. MMPROJ_REPO allows the projector to live
# in a different repo than the LLM (the E4B edge model pairs with the base E4B's
# projector, not a projector in our repo); unset or empty falls back to MODEL_REPO.
MMPROJ_REPO = os.getenv("MMPROJ_REPO") or MODEL_REPO
MMPROJ_FILE = os.getenv("MMPROJ_FILE", "")
# Name of the llama-cpp-python vision handler class, looked up in
# llama_cpp.llama_chat_format. Gemma 4 vision may ship a dedicated handler; the
# generic clip/Llava handler is the default.
CHAT_HANDLER = os.getenv("CHAT_HANDLER", "Llava15ChatHandler")

N_CTX = int(os.getenv("N_CTX", "8192"))  # context window passed to llama.cpp
N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "-1"))  # -1 = offload all (GPU)
GPU_DURATION = int(os.getenv("GPU_DURATION", "120"))  # ZeroGPU lease seconds

# Inference location switch. A non-empty INFERENCE_BASE_URL delegates generation
# to a remote OpenAI-compatible / llama.cpp server (e.g. a llama-server running on
# the phone itself, or a backend) instead of loading the GGUF in-process. This is
# how the same agent runs on-device OR thin-client — selected by env.
INFERENCE_BASE_URL = os.getenv("INFERENCE_BASE_URL", "")
INFERENCE_API_KEY = os.getenv("INFERENCE_API_KEY", "")
INFERENCE_MODEL = os.getenv("INFERENCE_MODEL", "local")
# Let a tool-calling model (Hermes) write its own long-term memory mid-run.
# Only applies to the remote path (server/tools.py); enabled solely by the exact
# value "1", so it is off by default.
HERMES_TOOLS = os.getenv("HERMES_TOOLS") == "1"

# Process-wide llama.cpp model handle, populated lazily by get_llm(); _lock
# serialises that one-time load.
_llm = None
_lock = threading.Lock()


# ZeroGPU attaches the GPU only for the duration of an @spaces.GPU call, so all
# GPU-bound work has to be wrapped by it. Locally / in CI the `spaces` package is
# absent; the import failure is recorded as None and `gpu` then hands the function
# back untouched, so stub mode never touches this path.
try:
    from spaces import GPU as _spaces_gpu
except Exception:  # noqa: BLE001 - spaces not installed (local/CI)
    _spaces_gpu = None


def gpu(fn):
    """Decorate `fn` with a ZeroGPU lease of GPU_DURATION seconds when the
    `spaces` package imported successfully; otherwise return `fn` unchanged."""
    if _spaces_gpu is None:
        return fn
    return _spaces_gpu(duration=GPU_DURATION)(fn)


def _preload_cuda_libs():
    """Best-effort preload of the CUDA userspace libraries shipped as pip wheels.

    The prebuilt CUDA llama-cpp-python wheel needs libcudart.so.12 and friends,
    which the ZeroGPU/Gradio-SDK env does not expose on the default loader path;
    the nvidia-*-cu12 pip packages provide them. Each matching shared object is
    opened with RTLD_GLOBAL so the llama .so's NEEDED deps resolve, without
    guessing at LD_LIBRARY_PATH. Returns None; does nothing when the `nvidia`
    package cannot be imported, and skips any library that fails to load."""
    import contextlib
    import ctypes
    import glob
    import os

    try:
        import nvidia  # namespace package from nvidia-*-cu12 wheels
    except Exception:  # noqa: BLE001
        return

    # nvidia is a PEP 420 namespace package: __file__ is None, so the install
    # roots have to come from __path__.
    roots = list(getattr(nvidia, "__path__", []) or [])
    # No need to order cublas ahead of its dependents ($ORIGIN RPATH resolves
    # siblings); roots are walked outermost, component packages within each.
    patterns = [
        os.path.join(root, component, "lib", "*.so*")
        for root in roots
        for component in ("cuda_runtime", "cuda_nvrtc", "cublas")
    ]
    for pattern in patterns:
        for lib_path in sorted(glob.glob(pattern)):
            # A library that cannot be opened is skipped, not fatal.
            with contextlib.suppress(OSError):
                ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL)


def _build_chat_handler():
    """Build the vision chat handler, or return None for a text-only model.

    With MMPROJ_FILE empty nothing is imported or downloaded. Otherwise the
    projector file is fetched from MMPROJ_REPO via hf_hub_download and handed, as
    `clip_model_path`, to the class named by CHAT_HANDLER in
    llama_cpp.llama_chat_format. An unknown CHAT_HANDLER name surfaces as the
    AttributeError raised by the lookup."""
    if MMPROJ_FILE:
        import llama_cpp.llama_chat_format as chat_formats

        projector_path = hf_hub_download(repo_id=MMPROJ_REPO, filename=MMPROJ_FILE)
        make_handler = getattr(chat_formats, CHAT_HANDLER)
        return make_handler(clip_model_path=projector_path, verbose=False)
    return None


def get_llm():
    """Return the shared Llama instance, downloading and loading it on first use.

    The module-level `_llm` is checked once without the lock (fast path for every
    call after the first) and again under `_lock`, so concurrent first callers
    load the GGUF only once."""
    global _llm
    if _llm is not None:
        return _llm
    with _lock:
        if _llm is None:
            _preload_cuda_libs()  # satisfy libcudart.so.12 etc. before loading
            from llama_cpp import Llama  # imported lazily so tests can stub

            gguf_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
            vision_handler = _build_chat_handler()  # enables image_url inputs
            _llm = Llama(
                model_path=gguf_path,
                n_ctx=N_CTX,
                n_gpu_layers=N_GPU_LAYERS,
                chat_handler=vision_handler,
                verbose=False,
            )
        return _llm


# --- GPU-scoped inner functions (run inside the ZeroGPU lease) ---
# These do the actual in-process llama.cpp work; emits stay in the main-process
# wrappers below because in-memory state (the events bus) isn't shared back from
# the ZeroGPU subprocess.
@gpu
def _infer_text(messages: list[dict], temperature: float, max_tokens: int) -> str:
    """Run one unconstrained chat completion on the in-process model and return
    the `content` of the first choice's message."""
    llm = get_llm()
    completion = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]


@gpu
def _infer_json(messages: list[dict], json_schema: dict, temperature: float, max_tokens: int):
    """Run one schema-constrained chat completion on the in-process model.

    Returns a `(content, completion_tokens)` tuple: the first choice's message
    content, and the `completion_tokens` entry of the response's `usage` block
    (None when usage is missing or lacks that key)."""
    constraint = {"type": "json_object", "schema": json_schema}
    completion = get_llm().create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        response_format=constraint,
    )
    token_stats = completion.get("usage") or {}
    text = completion["choices"][0]["message"]["content"]
    return text, token_stats.get("completion_tokens")


@gpu
def _infer_stream(messages: list[dict], json_schema: dict, temperature: float, max_tokens: int):
    """Generator form of the schema-constrained completion: requests a streamed
    response from the in-process model and yields each non-empty `content` delta
    of the first choice, in arrival order."""
    constraint = {"type": "json_object", "schema": json_schema}
    chunks = get_llm().create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        response_format=constraint,
        stream=True,
    )
    for chunk in chunks:
        piece = chunk["choices"][0].get("delta", {}).get("content")
        if not piece:
            # Chunks without text (missing delta/content or empty string) are skipped.
            continue
        yield piece


# --- remote inference seam (on-device / thin-client via INFERENCE_BASE_URL) ---
def _remote_payload(messages, json_schema, temperature, max_tokens, stream):
    """Assemble the JSON body for a remote `/chat/completions` request.

    The model name comes from INFERENCE_MODEL; the output is constrained to
    `json_schema` under the schema name "ActionPlan"."""
    # llama-server accepts json_schema (OpenAI-style); the in-process path uses
    # the json_object+schema form. Both grammar-constrain the output.
    schema_spec = {"name": "ActionPlan", "schema": json_schema, "strict": True}
    body = {
        "model": INFERENCE_MODEL,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    body["response_format"] = {"type": "json_schema", "json_schema": schema_spec}
    body["stream"] = stream
    return body


def _remote_headers() -> dict:
    """Return the HTTP headers for remote inference requests: always a JSON
    content type, plus a Bearer `Authorization` header when INFERENCE_API_KEY is
    non-empty."""
    auth = {"Authorization": f"Bearer {INFERENCE_API_KEY}"} if INFERENCE_API_KEY else {}
    return {"Content-Type": "application/json", **auth}


def _remote_complete_json(messages, json_schema, temperature, max_tokens) -> str:
    """Non-streaming constrained completion against the remote server.

    POSTs to `{INFERENCE_BASE_URL}/chat/completions` and returns the assistant
    content. With HERMES_TOOLS enabled the request is driven through
    `run_with_tools` (server/tools.py) so the model may call `remember` to update
    memory before returning the final ActionPlan JSON. A "model" event carrying
    latency and completion-token count is emitted once the response is in hand.
    HTTP error statuses propagate from `raise_for_status()`."""
    import requests  # already a dependency; imported here to keep import light

    started = time.perf_counter()

    def _chat(msgs, **extra):
        # One round-trip; `extra` carries additional top-level body fields
        # (the tool specs on the tool-calling path).
        body = _remote_payload(msgs, json_schema, temperature, max_tokens, False)
        body.update(extra)
        reply = requests.post(
            f"{INFERENCE_BASE_URL.rstrip('/')}/chat/completions",
            json=body,
            headers=_remote_headers(),
            timeout=120,
        )
        reply.raise_for_status()
        return reply.json()

    def _report(label, result):
        # Latency is measured from function entry, so it spans every round-trip.
        token_stats = result.get("usage") or {}
        events.emit(
            "model",
            label,
            latency_ms=round((time.perf_counter() - started) * 1000),
            tokens=token_stats.get("completion_tokens"),
        )

    if not HERMES_TOOLS:
        result = _chat(messages)
        _report("remote inference complete", result)
        return result["choices"][0]["message"]["content"]

    from .tools import TOOL_SPECS, run_with_tools

    # run_with_tools gets a copy of the conversation and a one-argument poster.
    content, result = run_with_tools(
        list(messages), lambda msgs: _chat(msgs, tools=TOOL_SPECS)
    )
    _report("remote inference complete (tools)", result)
    return content


def _sse_delta(raw):
    """Extract the assistant text delta from one raw streamed line.

    Args:
        raw: one non-empty line of the response body, as bytes.

    Returns:
        The `content` value of the first choice's `delta`, or None when the line
        carries no usable text: the `[DONE]` sentinel, a non-JSON line, or a
        chunk whose shape is not `{"choices": [{"delta": {...}}]}`.
    """
    import json as _json

    text = raw.decode("utf-8").strip()
    # In the SSE format the space after "data:" is optional, so strip the bare
    # field name and then any whitespace. Lines without the prefix are still
    # parsed as JSON as-is.
    payload = text.removeprefix("data:").strip()
    if not payload or payload == "[DONE]":
        return None
    try:
        choice = _json.loads(payload)["choices"][0]
        # `delta` may be absent or null (e.g. on a final chunk); treat as empty.
        delta = choice.get("delta") or {}
        return delta.get("content")
    except (ValueError, KeyError, IndexError, TypeError, AttributeError):
        # Malformed or unexpectedly shaped chunk: skip it, keep the stream alive.
        return None


def _remote_stream_json(messages, json_schema, temperature, max_tokens):
    """Streaming constrained completion against the remote server.

    Generator: POSTs a streaming request to `{INFERENCE_BASE_URL}/chat/completions`
    and yields each non-empty content delta as it arrives. Emits a "model" event
    when the stream starts and, after the response has been fully consumed, one
    with the total latency. HTTP error statuses propagate from
    `raise_for_status()`; individual unparseable lines are skipped.
    """
    import requests

    t0 = time.perf_counter()
    events.emit("model", "remote inference started")
    with requests.post(
        f"{INFERENCE_BASE_URL.rstrip('/')}/chat/completions",
        json=_remote_payload(messages, json_schema, temperature, max_tokens, True),
        headers=_remote_headers(),
        timeout=120,
        stream=True,
    ) as resp:
        resp.raise_for_status()
        for raw in resp.iter_lines():
            if not raw:
                continue  # blank separator line between events
            delta = _sse_delta(raw)
            if delta:
                yield delta
    events.emit(
        "model", "remote stream complete", latency_ms=round((time.perf_counter() - t0) * 1000)
    )


# --- main-process wrappers (own the activity-bus emits; pick local vs remote) ---
def complete(messages: list[dict], temperature: float = 0.2, max_tokens: int = 1024) -> str:
    """Chat-completion helper returning the assistant text (no output schema).

    Honors the same inference-location switch as the JSON helpers: when
    INFERENCE_BASE_URL is set the request goes to the remote server's
    `/chat/completions` endpoint, so a thin-client deployment never downloads or
    loads the GGUF; otherwise the GPU-offloaded in-process llama.cpp path runs.

    Args:
        messages: chat messages in OpenAI `role`/`content` form.
        temperature: sampling temperature.
        max_tokens: generation cap.

    Returns:
        The `content` of the first choice's message.

    Raises:
        requests.HTTPError: remote path only, on an HTTP error status.
    """
    if INFERENCE_BASE_URL:
        import requests  # lazy, matching the other remote helpers

        resp = requests.post(
            f"{INFERENCE_BASE_URL.rstrip('/')}/chat/completions",
            # Plain text completion: no response_format constraint is sent.
            json={
                "model": INFERENCE_MODEL,
                "messages": messages,
                "temperature": temperature,
                "max_tokens": max_tokens,
                "stream": False,
            },
            headers=_remote_headers(),
            timeout=120,
        )
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"]
    return _infer_text(messages, temperature, max_tokens)


def complete_json(
    messages: list[dict],
    json_schema: dict,
    temperature: float = 0.2,
    max_tokens: int = 2048,
) -> str:
    """Schema-constrained completion returning the raw JSON text.

    Generation is grammar-constrained so the output always parses. With
    INFERENCE_BASE_URL set the call is handed to the remote server (which emits
    its own model event); otherwise the GPU-offloaded in-process llama.cpp path
    runs and a "model" event with latency and token count is emitted here."""
    if not INFERENCE_BASE_URL:
        started = time.perf_counter()
        text, tokens = _infer_json(messages, json_schema, temperature, max_tokens)
        elapsed_ms = round((time.perf_counter() - started) * 1000)
        events.emit("model", "inference complete", latency_ms=elapsed_ms, tokens=tokens)
        return text
    return _remote_complete_json(messages, json_schema, temperature, max_tokens)


def stream_complete_json(
    messages: list[dict],
    json_schema: dict,
    temperature: float = 0.2,
    max_tokens: int = 2048,
):
    """Streaming schema-constrained completion.

    Generator yielding text deltas so the UI can show the model 'thinking'. With
    INFERENCE_BASE_URL set the deltas come from the remote seam (which emits its
    own model events); otherwise they come from the GPU-offloaded in-process
    llama.cpp path, bracketed here by "inference started" and "stream complete"
    model events."""
    if not INFERENCE_BASE_URL:
        started = time.perf_counter()
        events.emit("model", "inference started")
        yield from _infer_stream(messages, json_schema, temperature, max_tokens)
        elapsed_ms = round((time.perf_counter() - started) * 1000)
        events.emit("model", "stream complete", latency_ms=elapsed_ms)
        return
    yield from _remote_stream_json(messages, json_schema, temperature, max_tokens)