Spaces:
Sleeping
Sleeping
File size: 4,600 Bytes
7f9dfed | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 | from __future__ import annotations
import shutil
from collections.abc import Callable
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import requests
from models.base import BackendStatus
from models.model_catalog import ModelInfo
from models.response_parsing import extract_chat_response
@dataclass(frozen=True)
class LlamaCppConfig:
    """Runtime configuration for a local llama.cpp server.

    Instances are immutable (``frozen=True``). Every field defaults to a
    value that means "use the built-in default / not configured".
    """

    # Base URL of the llama-server HTTP endpoint.
    server_url: str = "http://127.0.0.1:8080"
    # Path to the llama-server executable; empty means "look it up on PATH".
    server_path: str = ""
    # Path passed to llama-server via ``-m``; empty means no model configured.
    model_path: str = ""
    # Path passed to llama-server via ``--mmproj``; empty means omit the flag.
    mmproj_path: str = ""
class LlamaCppService:
    """llama.cpp HTTP client for local GGUF inference.

    The chat methods always return text. Problems (backend unavailable,
    HTTP failure, unreadable response body) are reported as bracket-tagged
    messages inside that returned text rather than raised.
    """

    def __init__(
        self,
        model: ModelInfo,
        config: LlamaCppConfig | None = None,
        timeout_seconds: float = 60,
    ) -> None:
        """Store the model descriptor, server configuration and timeout.

        Args:
            model: Model descriptor; kept on the instance as-is.
            config: Server settings. A default ``LlamaCppConfig`` is used
                when omitted.
            timeout_seconds: Timeout passed to the chat completion request.
        """
        self.model = model
        self.config = config or LlamaCppConfig()
        self.timeout_seconds = timeout_seconds

    @staticmethod
    def status(
        which_func: Callable[[str], str | None] = shutil.which,
        get_func: Callable[..., requests.Response] = requests.get,
        server_url: str = "http://127.0.0.1:8080",
        server_path: str = "",
    ) -> BackendStatus:
        """Report whether llama-server is installed and answers ``/health``.

        Args:
            which_func: Executable lookup; replaceable by callers.
            get_func: HTTP GET callable; replaceable by callers.
            server_url: Base URL of the server. Trailing slashes are ignored.
            server_path: Explicit executable path. When empty,
                ``llama-server`` is looked up through ``which_func``.

        Returns:
            A ``BackendStatus`` built positionally from the backend name,
            an availability flag and a human-readable detail message.
        """
        if server_path:
            # An explicit path must exist on disk; PATH is not consulted.
            if not Path(server_path).exists():
                return BackendStatus(
                    "llama.cpp",
                    False,
                    f"Configured llama-server was not found: {server_path}",
                )
        elif which_func("llama-server") is None:
            return BackendStatus("llama.cpp", False, "llama-server was not found on PATH.")

        # Normalise the base URL so a configured trailing slash does not
        # produce ".../​/health".
        base_url = server_url.rstrip("/")
        try:
            response = get_func(f"{base_url}/health", timeout=2)
        except requests.RequestException as exc:
            return BackendStatus(
                "llama.cpp",
                False,
                f"llama-server is installed but not reachable: {exc}",
            )

        if response.ok:
            return BackendStatus("llama.cpp", True, "llama-server is installed and reachable.")
        return BackendStatus(
            "llama.cpp",
            False,
            f"llama-server responded with HTTP {response.status_code}.",
        )

    def launch_command(self) -> list[str]:
        """Build the argv for starting llama-server from the configuration.

        Returns:
            An empty list when no model path is configured; otherwise the
            executable (configured path or ``llama-server``), ``-m`` with
            the model path, and ``--mmproj`` with its path when configured.
        """
        if not self.config.model_path:
            return []
        command = [self.config.server_path or "llama-server", "-m", self.config.model_path]
        if self.config.mmproj_path:
            command.extend(["--mmproj", self.config.mmproj_path])
        return command

    def chat(self, system_prompt: str, user_prompt: str) -> str:
        """Send a system + user message pair and return the reply text."""
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        return self._post_chat(messages)

    def vision_chat(self, has_image: bool, prompt: str, image: Any = None) -> str:
        """Answer a prompt that may come with an image.

        Images are not sent to the server: when ``has_image`` is true a fixed
        explanatory note is returned. Otherwise the prompt is posted as a
        single user message.

        Args:
            has_image: Whether the caller attached an image.
            prompt: User text.
            image: Accepted for interface compatibility; ignored.
        """
        del image  # deliberately unused
        if has_image:
            return (
                "[llama.cpp vision note]\n\n"
                "Image upload requires a running llama-server with an mmproj file. "
                "The current scaffold validates the server path but does not yet serialize "
                "Gradio images into llama.cpp multimodal payloads."
            )
        return self._post_chat([{"role": "user", "content": prompt}])

    def _post_chat(self, messages: list[dict[str, str]]) -> str:
        """POST ``messages`` to ``/v1/chat/completions`` and return the reply.

        The backend status is checked first. Any failure (backend
        unavailable, transport/HTTP error, body that is not a JSON object)
        is returned as a bracket-tagged message instead of being raised.
        """
        backend = self.status(
            server_url=self.config.server_url,
            server_path=self.config.server_path,
        )
        if not backend.available:
            return (
                "[llama.cpp unavailable]\n\n"
                f"{backend.detail}\n\n"
                "Install llama.cpp, start llama-server with an explicit GGUF model, "
                "then retry."
            )

        # Same normalisation as in ``status``: tolerate a trailing slash.
        base_url = self.config.server_url.rstrip("/")
        try:
            response = requests.post(
                f"{base_url}/v1/chat/completions",
                json={
                    "messages": messages,
                    "temperature": 0.7,
                    "max_tokens": 512,
                },
                timeout=self.timeout_seconds,
            )
            response.raise_for_status()
            # Decoding happens inside the try so a non-JSON body is reported
            # like any other request failure. ValueError covers requests
            # versions whose JSON error is not a RequestException.
            payload = response.json()
        except (requests.RequestException, ValueError) as exc:
            return f"[llama.cpp request failed]\n\n{exc}"

        if not isinstance(payload, dict):
            # Valid JSON, but not the object the response parser expects.
            return (
                "[llama.cpp request failed]\n\n"
                f"Unexpected response payload of type {type(payload).__name__}."
            )
        return self._extract_response(payload)

    @staticmethod
    def _extract_response(data: dict[str, Any]) -> str:
        """Delegate reply extraction to ``extract_chat_response``."""
        return extract_chat_response(data)
def local_file_status(path: str) -> str:
    """Describe whether a configured local path is usable.

    Args:
        path: Filesystem path from the configuration; may be empty.

    Returns:
        ``"not configured"`` when ``path`` is empty, ``"found"`` when the
        path exists on disk, and ``"missing"`` otherwise.
    """
    if not path:
        return "not configured"
    if Path(path).exists():
        return "found"
    return "missing"
|