File size: 12,286 Bytes
0366d65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
"""Load the fine-tuned Gemma 4 GGUF and run inference via llama.cpp.

Llama Champion: all generation goes through llama-cpp-python — no cloud AI API.
The GGUF is downloaded from HF at startup so the Space image stays small.

Two inference locations, selected by env:
- in-process llama.cpp, GPU-offloaded inside an @spaces.GPU lease (ZeroGPU), or
- a remote OpenAI-compatible / llama.cpp server via INFERENCE_BASE_URL
  (e.g. a llama-server on the phone itself, or a backend).
"""
from __future__ import annotations

import os
import threading
import time

from huggingface_hub import hf_hub_download

from . import events

# Model selection. The platform serves the gemma-cal EDGE fine-tune (Gemma-4 E4B,
# ~5GB at Q4): our own calendar-native model, eval-gated before every publish
# (docs/eval-roadmap.md).
# MODEL SIZE (hackathon hard constraint, <= 32B): E4B = ~4B effective params.
# Inference is local through llama.cpp (no cloud AI).
MODEL_REPO = os.getenv("MODEL_REPO", "ParetoOptimal/gemma-4-cal-gguf")
MODEL_FILE = os.getenv("MODEL_FILE", "gemma-cal-e4b-Q4_K_M.gguf")
# Vision projector (mmproj). Setting MMPROJ_FILE turns image input on; an empty
# value keeps the model text-only. MMPROJ_REPO may point at a different repo than
# the LLM (the E4B edge model pairs with the base E4B's projector, not one in our
# repo); when it is unset or empty the projector is looked up in MODEL_REPO.
MMPROJ_REPO = os.getenv("MMPROJ_REPO", "") or MODEL_REPO
MMPROJ_FILE = os.getenv("MMPROJ_FILE", "")
# Name of the llama-cpp-python vision handler class, resolved by attribute lookup
# in llama_cpp.llama_chat_format. Gemma 4 vision may ship a dedicated handler; the
# generic clip/Llava handler is the default.
CHAT_HANDLER = os.getenv("CHAT_HANDLER", "Llava15ChatHandler")

N_CTX = int(os.getenv("N_CTX", "8192"))
N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", "-1"))  # -1 = offload all (GPU)
GPU_DURATION = int(os.getenv("GPU_DURATION", "120"))  # ZeroGPU lease seconds

# Where inference runs. A non-empty INFERENCE_BASE_URL delegates generation to a
# remote OpenAI-compatible / llama.cpp server (e.g. a llama-server running on the
# phone itself, or a backend) instead of loading the GGUF in-process. The same
# agent therefore runs on-device OR as a thin client, chosen purely by env.
INFERENCE_BASE_URL = os.getenv("INFERENCE_BASE_URL", "")
INFERENCE_API_KEY = os.getenv("INFERENCE_API_KEY", "")
INFERENCE_MODEL = os.getenv("INFERENCE_MODEL", "local")
# Lets a tool-calling model (Hermes) write its own long-term memory mid-run.
# Remote path only (server/tools.py); enabled solely by HERMES_TOOLS=1.
HERMES_TOOLS = os.getenv("HERMES_TOOLS") == "1"

# Process-wide Llama instance, created on first use by get_llm() under _lock.
_llm = None
_lock = threading.Lock()


# ZeroGPU: GPU-bound work must run inside an @spaces.GPU function (the GPU is
# attached only for that call). Locally / in CI the `spaces` package is absent,
# so `gpu` degrades to a no-op decorator and stub mode never touches this path.
try:
    from spaces import GPU as _spaces_gpu
except Exception:  # noqa: BLE001 - spaces not installed (local/CI)
    _spaces_gpu = None


def gpu(fn):
    """Decorate ``fn`` with ``spaces.GPU(duration=GPU_DURATION)`` when available.

    If the ``spaces`` package could not be imported, ``fn`` is returned unchanged,
    so the decorator is a no-op.
    """
    if _spaces_gpu is None:
        return fn
    return _spaces_gpu(duration=GPU_DURATION)(fn)


def _preload_cuda_libs():
    """Best-effort preload of the CUDA userspace libraries shipped as pip wheels.

    The prebuilt CUDA llama-cpp-python wheel needs libcudart.so.12 and friends,
    which the ZeroGPU/Gradio-SDK env does not expose on the default loader path;
    the nvidia-*-cu12 pip packages provide them. Each shared object found under
    the ``nvidia`` package is opened with RTLD_GLOBAL so the llama .so's NEEDED
    deps resolve, without guessing an LD_LIBRARY_PATH.

    Does nothing when the ``nvidia`` package cannot be imported; libraries that
    fail to load are skipped.
    """
    import ctypes
    import glob

    try:
        import nvidia  # namespace package from nvidia-*-cu12 wheels
    except Exception:  # noqa: BLE001
        return

    # PEP 420 namespace package: __file__ is None, so every __path__ entry is a root.
    roots = list(getattr(nvidia, "__path__", []) or [])
    # No special ordering for cublas vs. its dependents: $ORIGIN RPATH resolves siblings.
    patterns = [
        os.path.join(root, component, "lib", "*.so*")
        for root in roots
        for component in ("cuda_runtime", "cuda_nvrtc", "cublas")
    ]
    for pattern in patterns:
        for lib_path in sorted(glob.glob(pattern)):
            try:
                ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL)
            except OSError:
                continue  # unloadable library: skip it, keep going


def _build_chat_handler():
    """Build the vision chat handler, or return None for text-only operation.

    When MMPROJ_FILE is non-empty, the projector file is fetched from MMPROJ_REPO
    and passed as ``clip_model_path`` to the class named by CHAT_HANDLER, looked
    up in ``llama_cpp.llama_chat_format``.
    """
    if MMPROJ_FILE:
        import llama_cpp.llama_chat_format as chat_formats

        projector = hf_hub_download(repo_id=MMPROJ_REPO, filename=MMPROJ_FILE)
        make_handler = getattr(chat_formats, CHAT_HANDLER)
        return make_handler(clip_model_path=projector, verbose=False)
    return None


def get_llm():
    """Return the shared Llama instance, downloading and loading the GGUF on first use.

    The instance is cached in the module-level ``_llm``. Creation happens while
    holding ``_lock``, and ``_llm`` is checked again once the lock is held, so
    callers that arrive together build the model only once.
    """
    global _llm
    if _llm is not None:
        return _llm

    with _lock:
        if _llm is None:
            _preload_cuda_libs()  # satisfy libcudart.so.12 etc. before loading
            from llama_cpp import Llama  # imported lazily so tests can stub

            gguf_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
            vision_handler = _build_chat_handler()  # enables image_url inputs
            _llm = Llama(
                model_path=gguf_path,
                n_ctx=N_CTX,
                n_gpu_layers=N_GPU_LAYERS,
                chat_handler=vision_handler,
                verbose=False,
            )
    return _llm


# --- GPU-scoped inner functions (run inside the ZeroGPU lease) ---
# These do the actual in-process llama.cpp work; emits stay in the main-process
# wrappers below because in-memory state (the events bus) isn't shared back from
# the ZeroGPU subprocess.
@gpu
def _infer_text(messages: list[dict], temperature: float, max_tokens: int) -> str:
    """Run one unconstrained chat completion in-process and return its text.

    Returns the ``content`` of the first choice's message.
    """
    completion = get_llm().create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]


@gpu
def _infer_json(messages: list[dict], json_schema: dict, temperature: float, max_tokens: int):
    """Run one schema-constrained chat completion in-process.

    ``json_schema`` is passed to llama.cpp as a ``json_object`` response format.

    Returns:
        A ``(content, completion_tokens)`` tuple: the first choice's message
        content, and the ``completion_tokens`` usage figure (None when the
        result carries no usage data).
    """
    llm = get_llm()
    completion = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        response_format={"type": "json_object", "schema": json_schema},
    )
    token_usage = completion.get("usage") or {}
    content = completion["choices"][0]["message"]["content"]
    return content, token_usage.get("completion_tokens")


@gpu
def _infer_stream(messages: list[dict], json_schema: dict, temperature: float, max_tokens: int):
    """Stream a schema-constrained chat completion in-process.

    Yields each non-empty ``content`` fragment found in the ``delta`` of a
    chunk's first choice; chunks without content are skipped.
    """
    chunks = get_llm().create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        response_format={"type": "json_object", "schema": json_schema},
        stream=True,
    )
    for chunk in chunks:
        piece = chunk["choices"][0].get("delta", {}).get("content")
        if not piece:
            continue
        yield piece


# --- remote inference seam (on-device / thin-client via INFERENCE_BASE_URL) ---
def _remote_payload(messages, json_schema, temperature, max_tokens, stream):
    """Assemble the JSON body for a remote ``/chat/completions`` request.

    The schema is sent in the OpenAI-style ``json_schema`` response format that
    llama-server accepts (the in-process path uses the json_object+schema form);
    both grammar-constrain the output.
    """
    response_format = {
        "type": "json_schema",
        "json_schema": {"name": "ActionPlan", "schema": json_schema, "strict": True},
    }
    body = {"model": INFERENCE_MODEL, "messages": messages}
    body["temperature"] = temperature
    body["max_tokens"] = max_tokens
    body["response_format"] = response_format
    body["stream"] = stream
    return body


def _remote_headers() -> dict:
    """Return the HTTP headers for remote inference requests.

    Always sets a JSON content type; adds a Bearer ``Authorization`` header only
    when INFERENCE_API_KEY is non-empty.
    """
    headers = {"Content-Type": "application/json"}
    if not INFERENCE_API_KEY:
        return headers
    return {**headers, "Authorization": f"Bearer {INFERENCE_API_KEY}"}


def _remote_complete_json(messages, json_schema, temperature, max_tokens) -> str:
    """Run a non-streaming, schema-constrained completion on the remote server.

    POSTs to ``{INFERENCE_BASE_URL}/chat/completions`` with a 120 s timeout and
    emits a "model" event carrying latency and completion-token count.

    With HERMES_TOOLS enabled the request goes through the tool-calling loop in
    server/tools.py (``run_with_tools``), where the model may call `remember` to
    update memory before returning the final ActionPlan JSON; that loop supplies
    the returned content. Otherwise the first choice's message content is returned.

    Raises:
        requests.HTTPError: via ``raise_for_status()`` on a non-success response.
    """
    import requests  # already a dependency; imported here to keep import light

    started = time.perf_counter()

    def _chat(msgs, extra=None):
        # One POST to the chat endpoint; `extra` adds top-level keys to the payload.
        body = _remote_payload(msgs, json_schema, temperature, max_tokens, False)
        body.update(extra or {})
        reply = requests.post(
            f"{INFERENCE_BASE_URL.rstrip('/')}/chat/completions",
            json=body,
            headers=_remote_headers(),
            timeout=120,
        )
        reply.raise_for_status()
        return reply.json()

    def _report(label, result):
        # Publish latency since `started` and the reported completion tokens.
        token_usage = result.get("usage") or {}
        events.emit(
            "model",
            label,
            latency_ms=round((time.perf_counter() - started) * 1000),
            tokens=token_usage.get("completion_tokens"),
        )

    if not HERMES_TOOLS:
        result = _chat(messages)
        _report("remote inference complete", result)
        return result["choices"][0]["message"]["content"]

    from .tools import TOOL_SPECS, run_with_tools

    content, result = run_with_tools(
        list(messages), lambda msgs: _chat(msgs, {"tools": TOOL_SPECS})
    )
    _report("remote inference complete (tools)", result)
    return content


def _remote_stream_json(messages, json_schema, temperature, max_tokens):
    """Stream a schema-constrained completion from the remote server.

    POSTs to ``{INFERENCE_BASE_URL}/chat/completions`` with ``stream=True`` and
    yields each non-empty ``content`` fragment from the ``delta`` of a chunk's
    first choice. Emits a "model" event before the request and another, with the
    elapsed latency, once the stream has been fully consumed.

    Line handling:
      * a leading ``data:`` field name is stripped whether or not a space follows
        the colon (the space is optional in server-sent events);
      * the ``[DONE]`` sentinel ends the stream;
      * anything that is not a JSON chunk of the expected shape (comments,
        keep-alives, other SSE fields, malformed or null fields) is skipped.

    Raises:
        requests.HTTPError: via ``raise_for_status()`` on a non-success response.
    """
    import json as _json

    import requests

    t0 = time.perf_counter()
    events.emit("model", "remote inference started")
    with requests.post(
        f"{INFERENCE_BASE_URL.rstrip('/')}/chat/completions",
        json=_remote_payload(messages, json_schema, temperature, max_tokens, True),
        headers=_remote_headers(),
        timeout=120,
        stream=True,
    ) as resp:
        resp.raise_for_status()
        for raw in resp.iter_lines():
            if not raw:
                continue
            line = raw.decode("utf-8").strip()
            # Accept both "data: {...}" and "data:{...}"; lines without the field
            # name are still tried as bare JSON, as before.
            if line.startswith("data:"):
                line = line[len("data:"):].strip()
            if not line:
                continue
            if line == "[DONE]":
                break  # end-of-stream sentinel: nothing useful follows
            try:
                choice = _json.loads(line)["choices"][0]
                # `or {}` tolerates an explicit `"delta": null`.
                delta = (choice.get("delta") or {}).get("content")
            except (ValueError, KeyError, IndexError, TypeError, AttributeError):
                # Not a well-formed chunk (SSE comment, non-object JSON, ...): skip.
                continue
            if delta:
                yield delta
    events.emit(
        "model", "remote stream complete", latency_ms=round((time.perf_counter() - t0) * 1000)
    )


# --- main-process wrappers (own the activity-bus emits; pick local vs remote) ---
def complete(messages: list[dict], temperature: float = 0.2, max_tokens: int = 1024) -> str:
    """Chat-completion helper returning the assistant text.

    Follows the same local/remote selection as the JSON helpers: when
    INFERENCE_BASE_URL is set, the request is POSTed to
    ``{INFERENCE_BASE_URL}/chat/completions`` (no response_format, so the reply
    is free text) instead of loading the GGUF in-process; otherwise the
    GPU-offloaded in-process llama.cpp path is used.

    Args:
        messages: Chat messages in OpenAI/llama.cpp format.
        temperature: Sampling temperature.
        max_tokens: Upper bound on generated tokens.

    Returns:
        The first choice's message content.

    Raises:
        requests.HTTPError: on the remote path, via ``raise_for_status()`` on a
            non-success response.
    """
    if INFERENCE_BASE_URL:
        import requests  # already a dependency; imported here to keep import light

        resp = requests.post(
            f"{INFERENCE_BASE_URL.rstrip('/')}/chat/completions",
            json={
                "model": INFERENCE_MODEL,
                "messages": messages,
                "temperature": temperature,
                "max_tokens": max_tokens,
                "stream": False,
            },
            headers=_remote_headers(),
            timeout=120,
        )
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"]
    return _infer_text(messages, temperature, max_tokens)


def complete_json(
    messages: list[dict],
    json_schema: dict,
    temperature: float = 0.2,
    max_tokens: int = 2048,
) -> str:
    """Return a completion constrained to ``json_schema`` as a string.

    A non-empty INFERENCE_BASE_URL hands the call to the remote server helper.
    Otherwise the GPU-offloaded in-process llama.cpp path runs and an
    "inference complete" model event is emitted with latency and token count.
    """
    if not INFERENCE_BASE_URL:
        started = time.perf_counter()
        text, tokens = _infer_json(messages, json_schema, temperature, max_tokens)
        elapsed_ms = round((time.perf_counter() - started) * 1000)
        events.emit("model", "inference complete", latency_ms=elapsed_ms, tokens=tokens)
        return text
    return _remote_complete_json(messages, json_schema, temperature, max_tokens)


def stream_complete_json(
    messages: list[dict],
    json_schema: dict,
    temperature: float = 0.2,
    max_tokens: int = 2048,
):
    """Yield text deltas of a completion constrained to ``json_schema``.

    Lets the UI show the model 'thinking' as output arrives. A non-empty
    INFERENCE_BASE_URL streams from the remote server helper; otherwise the
    GPU-offloaded in-process llama.cpp path is streamed, bracketed by
    "inference started" and "stream complete" model events.
    """
    if INFERENCE_BASE_URL:
        yield from _remote_stream_json(messages, json_schema, temperature, max_tokens)
    else:
        started = time.perf_counter()
        events.emit("model", "inference started")
        for piece in _infer_stream(messages, json_schema, temperature, max_tokens):
            yield piece
        elapsed_ms = round((time.perf_counter() - started) * 1000)
        events.emit("model", "stream complete", latency_ms=elapsed_ms)