File size: 4,698 Bytes
659322c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python3
"""Launch llama.cpp `llama-server` for a quantized GGUF (local / edge only)."""

from __future__ import annotations

import argparse
import os
import shlex
import shutil
import subprocess
import sys
from pathlib import Path

# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parent.parent
# Model file used when neither --quant nor --model is given.
DEFAULT_MODEL = ROOT.joinpath("models", "model-Q4_K_M.gguf")
# Hub repo used for downloads unless overridden on the command line or via env.
DEFAULT_HF_REPO = "ysingh-aiml/tinyllama-alpaca-lora-gguf"
# Maps each --quant choice to the GGUF filename it selects.
QUANT_FILES = dict(
    q4_k_m="model-Q4_K_M.gguf",
    q5_k_m="model-Q5_K_M.gguf",
    q8_0="model-Q8_0.gguf",
)


def download_gguf(
    repo_id: str,
    filename: str,
    dest_dir: Path,
    revision: str | None = None,
) -> Path:
    """Download one file from a Hugging Face repo into ``dest_dir``.

    Args:
        repo_id: Hub repository id to download from.
        filename: Name of the file inside that repository.
        dest_dir: Local directory to download into; created (with parents)
            if it does not exist yet.
        revision: Optional Hub revision forwarded to ``hf_hub_download``.

    Returns:
        The path reported by ``hf_hub_download``, wrapped in a ``Path``.
    """
    # Imported inside the function, so huggingface_hub is only required when
    # a download is actually attempted.
    from huggingface_hub import hf_hub_download

    dest_dir.mkdir(parents=True, exist_ok=True)
    return Path(
        hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            local_dir=str(dest_dir),
            local_dir_use_symlinks=False,
            revision=revision,
        )
    )


def resolve_llama_server() -> str:
    """Return the ``llama-server`` executable to launch, or ``""`` if none is known.

    A non-blank ``LLAMA_SERVER`` environment variable takes priority and is
    returned whitespace-stripped without checking that it exists. Otherwise
    ``PATH`` is searched for ``llama-server``.
    """
    override = os.environ.get("LLAMA_SERVER", "").strip()
    return override or shutil.which("llama-server") or ""


def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the launcher."""
    parser = argparse.ArgumentParser(description="Start llama.cpp llama-server with a GGUF model.")
    parser.add_argument(
        "--quant",
        choices=sorted(QUANT_FILES.keys()),
        default=None,
        help="Pick a Hub filename under --hf-repo (sets model to models/<file>).",
    )
    parser.add_argument("--model", type=Path, default=None, help="Path to .gguf (default: models/model-Q4_K_M.gguf)")
    parser.add_argument(
        "--hf-repo",
        default=os.environ.get("TASK4_GGUF_REPO", DEFAULT_HF_REPO),
        help="Hugging Face model repo id for --fetch (default: env TASK4_GGUF_REPO or built-in)",
    )
    parser.add_argument("--revision", default=None, help="Hub git revision (branch / tag / commit) for download")
    parser.add_argument(
        "--no-fetch",
        action="store_true",
        help="Do not download from the Hub if the model file is missing",
    )
    parser.add_argument(
        "--fetch-only",
        action="store_true",
        help="Download the GGUF from the Hub then exit (no llama-server)",
    )
    parser.add_argument("--host", default="127.0.0.1")
    parser.add_argument("--port", type=int, default=8080)
    parser.add_argument("--threads", type=int, default=8)
    parser.add_argument("--ctx-size", type=int, default=2048)
    parser.add_argument(
        "--n-gpu-layers",
        type=int,
        default=0,
        help="GPU/Metal offload layer count; 0 = CPU only",
    )
    return parser


def _select_model_path(args: argparse.Namespace) -> Path:
    """Return the resolved GGUF path implied by --quant, --model, or the default."""
    if args.quant:
        if args.model is not None:
            # --quant has always taken precedence over --model; report it
            # instead of dropping the user's --model without a word.
            print(
                f"Both --quant and --model were given; using --quant {args.quant} and ignoring --model.",
                file=sys.stderr,
            )
        return (ROOT / "models" / QUANT_FILES[args.quant]).resolve()
    if args.model is not None:
        return args.model.resolve()
    return DEFAULT_MODEL.resolve()


def _ensure_model(args: argparse.Namespace, model_path: Path) -> Path:
    """Return a path to an existing model file, downloading it from the Hub if allowed."""
    if model_path.is_file():
        return model_path

    if args.no_fetch:
        print(
            f"Model not found: {model_path}\n"
            "Remove --no-fetch to download from the Hub, or place a .gguf at this path.",
            file=sys.stderr,
        )
        raise SystemExit(1)

    print(f"Downloading {model_path.name} from {args.hf_repo} …", file=sys.stderr)
    try:
        downloaded = download_gguf(
            args.hf_repo,
            model_path.name,
            model_path.parent,
            revision=args.revision,
        )
    except Exception as e:
        # Top-level boundary: whatever made the download fail is reported as
        # a one-line error and a non-zero exit rather than a traceback.
        print(f"Download failed: {e}", file=sys.stderr)
        raise SystemExit(1) from e

    model_path = downloaded.resolve()
    if not model_path.is_file():
        print(f"Expected file after download: {model_path}", file=sys.stderr)
        raise SystemExit(1)
    print(f"Model ready: {model_path}", file=sys.stderr)
    return model_path


def main() -> None:
    """Parse arguments, make sure the GGUF exists, then run ``llama-server``.

    The model path comes from ``--quant`` (wins if both are given), then
    ``--model``, then the built-in default. A missing model is downloaded
    from ``--hf-repo`` unless ``--no-fetch`` is set. With ``--fetch-only``
    the model path is printed to stdout and the server is not started.

    Raises:
        SystemExit: Always. The code is 0 after ``--fetch-only``, 1 when the
            model or the ``llama-server`` executable is unavailable or cannot
            be started, 130 when interrupted with Ctrl-C, and otherwise the
            return code of ``llama-server``.
    """
    args = _build_parser().parse_args()
    model_path = _ensure_model(args, _select_model_path(args))

    if args.fetch_only:
        print(model_path)
        raise SystemExit(0)

    exe = resolve_llama_server()
    if not exe:
        print(
            "llama-server not found. Build llama.cpp and set LLAMA_SERVER=/path/to/llama-server "
            "or put `llama-server` on PATH.",
            file=sys.stderr,
        )
        raise SystemExit(1)

    cmd = [
        exe,
        "-m",
        str(model_path),
        "--host",
        args.host,
        "--port",
        str(args.port),
        "--threads",
        str(args.threads),
        "--ctx-size",
        str(args.ctx_size),
        "--n-gpu-layers",
        str(args.n_gpu_layers),
        "--parallel",
        "1",
        "--no-warmup",
    ]
    # shlex.join quotes arguments containing spaces so the echoed command can
    # be pasted into a shell; flush so it appears before the child's output.
    print("Running:", shlex.join(cmd), flush=True)
    try:
        exit_code = subprocess.call(cmd)
    except OSError as e:
        # LLAMA_SERVER is returned unchecked by resolve_llama_server(), so it
        # may name a file that is missing or not executable.
        print(f"Failed to start llama-server ({exe}): {e}", file=sys.stderr)
        raise SystemExit(1) from e
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the server; exit without a traceback.
        print("Interrupted; llama-server stopped.", file=sys.stderr)
        raise SystemExit(130) from None
    raise SystemExit(exit_code)


# Run the launcher only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()