File size: 4,600 Bytes
7f9dfed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from __future__ import annotations

import shutil
from collections.abc import Callable
from dataclasses import dataclass
from pathlib import Path
from typing import Any

import requests

from models.base import BackendStatus
from models.model_catalog import ModelInfo
from models.response_parsing import extract_chat_response


@dataclass(frozen=True)
class LlamaCppConfig:
    """Immutable settings describing a local llama.cpp server.

    Every field is a plain string with a default, so ``LlamaCppConfig()``
    is a valid configuration. An empty string means "not configured".

    Attributes:
        server_url: Base URL of the llama-server HTTP endpoint.
        server_path: Filesystem path of the llama-server executable.
        model_path: Filesystem path of the model file.
        mmproj_path: Filesystem path of the mmproj file.
    """

    # Field order is part of the positional constructor signature.
    server_url: str = "http://127.0.0.1:8080"
    server_path: str = ""
    model_path: str = ""
    mmproj_path: str = ""


class LlamaCppService:
    """llama.cpp HTTP client for local GGUF inference.

    Probes ``<server_url>/health`` and posts chat requests to
    ``<server_url>/v1/chat/completions``. Availability, transport, HTTP and
    response-decoding failures are returned to the caller as bracket-tagged
    message strings instead of being raised.
    """

    def __init__(
        self,
        model: ModelInfo,
        config: LlamaCppConfig | None = None,
        timeout_seconds: float = 60,
    ) -> None:
        """Store the model descriptor, configuration and request timeout.

        Args:
            model: Catalog entry this service was created for.
            config: Server settings; a default ``LlamaCppConfig`` is used
                when omitted.
            timeout_seconds: Timeout passed to the chat completion request.
        """
        self.model = model
        self.config = config or LlamaCppConfig()
        self.timeout_seconds = timeout_seconds

    @staticmethod
    def status(
        which_func: Callable[[str], str | None] = shutil.which,
        get_func: Callable[..., requests.Response] = requests.get,
        server_url: str = "http://127.0.0.1:8080",
        server_path: str = "",
    ) -> BackendStatus:
        """Report whether llama-server is installed and answering health checks.

        Args:
            which_func: Executable lookup used when ``server_path`` is empty.
                Overridable; defaults to ``shutil.which``.
            get_func: HTTP GET callable. Overridable; defaults to
                ``requests.get``.
            server_url: Base URL of the server. Trailing slashes are ignored.
            server_path: Explicit path to the llama-server binary. When
                empty, ``"llama-server"`` is looked up via ``which_func``.

        Returns:
            A ``BackendStatus`` for ``"llama.cpp"`` whose flag is true only
            when the binary was found and ``/health`` answered with a
            successful HTTP status.
        """
        if server_path:
            # An explicit path must exist on disk; PATH is not consulted.
            if not Path(server_path).exists():
                return BackendStatus(
                    "llama.cpp",
                    False,
                    f"Configured llama-server was not found: {server_path}",
                )
        elif which_func("llama-server") is None:
            return BackendStatus("llama.cpp", False, "llama-server was not found on PATH.")

        # Strip trailing slashes so "http://host:8080/" does not yield "//health".
        base_url = server_url.rstrip("/")
        try:
            response = get_func(f"{base_url}/health", timeout=2)
        except requests.RequestException as exc:
            return BackendStatus(
                "llama.cpp",
                False,
                f"llama-server is installed but not reachable: {exc}",
            )

        if response.ok:
            return BackendStatus("llama.cpp", True, "llama-server is installed and reachable.")
        return BackendStatus(
            "llama.cpp",
            False,
            f"llama-server responded with HTTP {response.status_code}.",
        )

    def launch_command(self) -> list[str]:
        """Build the argv list for starting llama-server.

        Returns:
            An empty list when no model path is configured. Otherwise the
            executable (configured path or ``"llama-server"``), ``-m`` with
            the model path, and ``--mmproj`` with the mmproj path when one
            is configured.
        """
        if not self.config.model_path:
            return []

        command = [self.config.server_path or "llama-server", "-m", self.config.model_path]
        if self.config.mmproj_path:
            command.extend(["--mmproj", self.config.mmproj_path])
        return command

    def chat(self, system_prompt: str, user_prompt: str) -> str:
        """Send a system + user message pair and return the reply text.

        Args:
            system_prompt: Content of the ``system`` message.
            user_prompt: Content of the ``user`` message.

        Returns:
            The extracted reply, or a bracket-tagged failure message.
        """
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        return self._post_chat(messages)

    def vision_chat(self, has_image: bool, prompt: str, image: Any = None) -> str:
        """Answer a prompt that may be accompanied by an image.

        Image payloads are not serialized yet: when ``has_image`` is true a
        fixed explanatory note is returned and no request is made.

        Args:
            has_image: Whether the caller attached an image.
            prompt: User prompt, sent as a single ``user`` message when no
                image is attached.
            image: Accepted for interface compatibility and ignored.

        Returns:
            The explanatory note, the extracted reply, or a bracket-tagged
            failure message.
        """
        del image  # intentionally unused until multimodal payloads are supported
        if has_image:
            return (
                "[llama.cpp vision note]\n\n"
                "Image upload requires a running llama-server with an mmproj file. "
                "The current scaffold validates the server path but does not yet serialize "
                "Gradio images into llama.cpp multimodal payloads."
            )
        return self._post_chat([{"role": "user", "content": prompt}])

    def _post_chat(self, messages: list[dict[str, str]]) -> str:
        """Post ``messages`` to the chat completions endpoint.

        The backend status is checked first, so an unavailable server is
        reported with setup guidance instead of a raw connection error.

        Args:
            messages: Chat messages as ``{"role": ..., "content": ...}``
                dictionaries.

        Returns:
            The extracted reply text, or a bracket-tagged message describing
            why the request could not be completed.
        """
        status = self.status(
            server_url=self.config.server_url,
            server_path=self.config.server_path,
        )
        if not status.available:
            return (
                "[llama.cpp unavailable]\n\n"
                f"{status.detail}\n\n"
                "Install llama.cpp, start llama-server with an explicit GGUF model, "
                "then retry."
            )

        # Strip trailing slashes so the endpoint path is not doubled up.
        base_url = self.config.server_url.rstrip("/")
        try:
            response = requests.post(
                f"{base_url}/v1/chat/completions",
                json={
                    "messages": messages,
                    "temperature": 0.7,
                    "max_tokens": 512,
                },
                timeout=self.timeout_seconds,
            )
            response.raise_for_status()
        except requests.RequestException as exc:
            return f"[llama.cpp request failed]\n\n{exc}"

        try:
            payload = response.json()
        except ValueError as exc:
            # JSON decode errors are ValueError subclasses; report them the
            # same way as transport failures instead of raising.
            return f"[llama.cpp request failed]\n\nServer returned an invalid JSON body: {exc}"

        if not isinstance(payload, dict):
            # A list/scalar payload cannot be a chat completion object.
            return (
                "[llama.cpp request failed]\n\n"
                f"Expected a JSON object from llama-server, got {type(payload).__name__}."
            )

        # Pass a shallow copy, as before, so the parsed payload is not shared.
        return self._extract_response(dict(payload))

    @staticmethod
    def _extract_response(data: dict[str, Any]) -> str:
        """Delegate reply extraction to ``extract_chat_response``."""
        return extract_chat_response(data)


def local_file_status(path: str) -> str:
    """Describe whether a configured filesystem path is usable.

    Args:
        path: Path to check; an empty string means nothing was configured.

    Returns:
        ``"not configured"`` for an empty path, ``"found"`` when the path
        exists, and ``"missing"`` otherwise.
    """
    if path:
        if Path(path).exists():
            return "found"
        return "missing"
    return "not configured"