Spaces:

build-small-hackathon
/

workbench

Sleeping

File size: 4,600 Bytes

7f9dfed

from __future__ import annotations

import shutil
from collections.abc import Callable
from dataclasses import dataclass
from pathlib import Path
from typing import Any

import requests

from models.base import BackendStatus
from models.model_catalog import ModelInfo
from models.response_parsing import extract_chat_response


@dataclass(frozen=True)
class LlamaCppConfig:
    """Immutable settings describing how to reach or start a llama.cpp server.

    Every field is a plain string. The three path fields default to the empty
    string, which the rest of this module treats as "not configured".
    """

    # Base URL that HTTP requests (health probe, chat completions) are sent to.
    server_url: str = "http://127.0.0.1:8080"
    # Explicit location of the llama-server executable; empty means use PATH.
    server_path: str = ""
    # GGUF model file handed to the server through ``-m``.
    model_path: str = ""
    # Optional projector file handed to the server through ``--mmproj``.
    mmproj_path: str = ""


class LlamaCppService:
    """HTTP client for a local llama.cpp ``llama-server`` (GGUF inference).

    The chat methods always return a string: either the text extracted from
    the server's reply or a bracket-tagged diagnostic
    (``[llama.cpp unavailable]``, ``[llama.cpp request failed]``,
    ``[llama.cpp vision note]``). Transport and JSON-decoding failures are
    reported as text rather than raised.
    """

    def __init__(
        self,
        model: ModelInfo,
        config: LlamaCppConfig | None = None,
        timeout_seconds: float = 60,
    ) -> None:
        """Store the model descriptor, server configuration and request timeout.

        Args:
            model: Catalog entry this service is associated with.
            config: Server settings; a default ``LlamaCppConfig`` is used when
                omitted.
            timeout_seconds: Timeout applied to chat-completion requests.
        """
        self.model = model
        self.config = config or LlamaCppConfig()
        self.timeout_seconds = timeout_seconds

    @staticmethod
    def status(
        which_func: Callable[[str], str | None] = shutil.which,
        get_func: Callable[..., requests.Response] = requests.get,
        server_url: str = "http://127.0.0.1:8080",
        server_path: str = "",
    ) -> BackendStatus:
        """Report whether llama-server is installed and answering health checks.

        Args:
            which_func: PATH lookup used when ``server_path`` is empty.
            get_func: HTTP GET callable used for the health probe.
            server_url: Base URL of the server; trailing slashes are ignored.
            server_path: Explicit executable path. When set it must exist on
                disk and PATH is not consulted.

        Returns:
            A ``BackendStatus`` for ``"llama.cpp"``. It is marked available
            only when the executable was located and ``GET <server_url>/health``
            produced a response whose ``ok`` flag is true; otherwise it carries
            a message describing which step failed.
        """
        if server_path:
            if not Path(server_path).exists():
                return BackendStatus(
                    "llama.cpp",
                    False,
                    f"Configured llama-server was not found: {server_path}",
                )
        elif which_func("llama-server") is None:
            return BackendStatus("llama.cpp", False, "llama-server was not found on PATH.")

        # Strip trailing slashes so "http://host:8080/" does not become
        # "http://host:8080//health".
        health_url = f"{server_url.rstrip('/')}/health"
        try:
            response = get_func(health_url, timeout=2)
        except requests.RequestException as exc:
            return BackendStatus(
                "llama.cpp",
                False,
                f"llama-server is installed but not reachable: {exc}",
            )

        if response.ok:
            return BackendStatus("llama.cpp", True, "llama-server is installed and reachable.")
        return BackendStatus(
            "llama.cpp",
            False,
            f"llama-server responded with HTTP {response.status_code}.",
        )

    def launch_command(self) -> list[str]:
        """Build the argv list that would start llama-server for this config.

        Returns:
            ``[executable, "-m", model_path]`` followed by
            ``["--mmproj", mmproj_path]`` when a projector is configured, or an
            empty list when no model path is set. The executable falls back to
            ``"llama-server"`` when ``server_path`` is empty.
        """
        if not self.config.model_path:
            return []

        command = [self.config.server_path or "llama-server", "-m", self.config.model_path]
        if self.config.mmproj_path:
            command.extend(["--mmproj", self.config.mmproj_path])
        return command

    def chat(self, system_prompt: str, user_prompt: str) -> str:
        """Send a system + user message pair and return the reply text.

        Args:
            system_prompt: Content of the ``system`` message.
            user_prompt: Content of the ``user`` message.

        Returns:
            The extracted reply, or a diagnostic string on failure.
        """
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        return self._post_chat(messages)

    def vision_chat(self, has_image: bool, prompt: str, image: Any = None) -> str:
        """Answer a prompt that may be accompanied by an image.

        Image payloads are not serialized yet: when ``has_image`` is true a
        fixed explanatory note is returned and no request is sent. Otherwise
        the prompt is posted as a single ``user`` message.

        Args:
            has_image: Whether the caller attached an image.
            prompt: Text of the user message.
            image: Accepted for interface compatibility; currently unused.

        Returns:
            The explanatory note, the extracted reply, or a diagnostic string.
        """
        del image  # Intentionally unused until multimodal payloads are supported.
        if has_image:
            return (
                "[llama.cpp vision note]\n\n"
                "Image upload requires a running llama-server with an mmproj file. "
                "The current scaffold validates the server path but does not yet serialize "
                "Gradio images into llama.cpp multimodal payloads."
            )
        return self._post_chat([{"role": "user", "content": prompt}])

    def _post_chat(self, messages: list[dict[str, str]]) -> str:
        """POST ``messages`` to the chat-completions endpoint and return text.

        The backend status is checked first so an unavailable server yields an
        actionable message instead of a raw connection error.

        Args:
            messages: Chat messages, each with ``role`` and ``content`` keys.

        Returns:
            The extracted reply on success; otherwise a string starting with
            ``[llama.cpp unavailable]`` or ``[llama.cpp request failed]``.
        """
        status = self.status(
            server_url=self.config.server_url,
            server_path=self.config.server_path,
        )
        if not status.available:
            return (
                "[llama.cpp unavailable]\n\n"
                f"{status.detail}\n\n"
                "Install llama.cpp, start llama-server with an explicit GGUF model, "
                "then retry."
            )

        # Strip trailing slashes so a configured "http://host:8080/" does not
        # produce a double slash in the request path.
        endpoint = f"{self.config.server_url.rstrip('/')}/v1/chat/completions"
        try:
            response = requests.post(
                endpoint,
                json={
                    "messages": messages,
                    "temperature": 0.7,
                    "max_tokens": 512,
                },
                timeout=self.timeout_seconds,
            )
            response.raise_for_status()
            # Decode inside the ``try``: a successful status with a non-JSON
            # body must become a diagnostic string, not an exception. JSON
            # decode errors are ``ValueError`` subclasses.
            payload = response.json()
        except (requests.RequestException, ValueError) as exc:
            return f"[llama.cpp request failed]\n\n{exc}"

        # The response extractor takes a mapping; reject arrays and scalars.
        if not isinstance(payload, dict):
            return (
                "[llama.cpp request failed]\n\n"
                f"Expected a JSON object but received {type(payload).__name__}."
            )
        return self._extract_response(payload)

    @staticmethod
    def _extract_response(data: dict[str, Any]) -> str:
        """Delegate to ``extract_chat_response`` to pull the reply text out of ``data``."""
        return extract_chat_response(data)


def local_file_status(path: str) -> str:
    """Describe a configured filesystem path as a short status word.

    Args:
        path: Path to check; an empty string means nothing was configured.

    Returns:
        ``"not configured"`` for an empty path, ``"found"`` when the path
        exists on disk, and ``"missing"`` otherwise.
    """
    if path:
        if Path(path).exists():
            return "found"
        return "missing"
    return "not configured"