ysingh-aiml's picture
Reinitialize Space repo with latest code
659322c
#!/usr/bin/env python3
"""Launch llama.cpp `llama-server` for a quantized GGUF (local / edge only)."""
from __future__ import annotations
import argparse
import os
import shutil
import subprocess
import sys
from pathlib import Path
# Directory two levels above this file; model paths below are built from it.
ROOT = Path(__file__).resolve().parent.parent

# GGUF file used when neither --quant nor --model is given on the command line.
DEFAULT_MODEL = ROOT.joinpath("models", "model-Q4_K_M.gguf")

# Hub repo used for downloads unless overridden by TASK4_GGUF_REPO or --hf-repo.
DEFAULT_HF_REPO = "ysingh-aiml/tinyllama-alpaca-lora-gguf"

# Maps each --quant choice to the GGUF filename it selects.
QUANT_FILES = dict(
    q4_k_m="model-Q4_K_M.gguf",
    q5_k_m="model-Q5_K_M.gguf",
    q8_0="model-Q8_0.gguf",
)
def download_gguf(
    repo_id: str,
    filename: str,
    dest_dir: Path,
    revision: str | None = None,
) -> Path:
    """Download a single file from a Hugging Face Hub repo into ``dest_dir``.

    ``huggingface_hub`` is imported inside the function, so the package is
    only required when a download is actually attempted.

    Args:
        repo_id: Hub repository id to download from.
        filename: Name of the file inside the repository.
        dest_dir: Local directory to download into; created if missing.
        revision: Optional Hub revision forwarded to ``hf_hub_download``.

    Returns:
        The location reported by ``hf_hub_download``, as a ``Path``.
    """
    from huggingface_hub import hf_hub_download

    dest_dir.mkdir(parents=True, exist_ok=True)
    request = {
        "repo_id": repo_id,
        "filename": filename,
        "local_dir": str(dest_dir),
        # NOTE(review): newer huggingface_hub releases appear to deprecate
        # this argument -- confirm against the version this project pins.
        "local_dir_use_symlinks": False,
        "revision": revision,
    }
    return Path(hf_hub_download(**request))
def resolve_llama_server() -> str:
    """Return the llama-server executable to run, or ``""`` if none is found.

    A non-blank ``LLAMA_SERVER`` environment variable wins and is returned
    stripped of surrounding whitespace, without checking that it exists.
    Otherwise ``llama-server`` is looked up on ``PATH``.
    """
    override = os.environ.get("LLAMA_SERVER", "").strip()
    if not override:
        found = shutil.which("llama-server")
        return "" if not found else found
    return override
def main(argv: list[str] | None = None) -> None:
    """Parse CLI options, make sure the GGUF model exists, then run llama-server.

    The model path is chosen in this order: ``--quant`` (a file under
    ``ROOT / "models"``), then ``--model``, then ``DEFAULT_MODEL``. If that
    file is missing it is downloaded from ``--hf-repo`` unless ``--no-fetch``
    is given. With ``--fetch-only`` the resolved path is printed to stdout and
    the server is not started.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Raises:
        SystemExit: Always. The code is 0 after ``--fetch-only``; 1 for a
            missing model, a failed download, or a missing or unlaunchable
            llama-server; 2 for argparse usage errors; 130 when interrupted
            with Ctrl-C; otherwise the return code of the llama-server
            process.
    """
    # Local import: only this function needs shlex, for echoing the command.
    import shlex

    parser = argparse.ArgumentParser(description="Start llama.cpp llama-server with a GGUF model.")
    parser.add_argument(
        "--quant",
        choices=sorted(QUANT_FILES.keys()),
        default=None,
        help="Pick a Hub filename under --hf-repo (sets model to models/<file>).",
    )
    parser.add_argument("--model", type=Path, default=None, help="Path to .gguf (default: models/model-Q4_K_M.gguf)")
    parser.add_argument(
        "--hf-repo",
        default=os.environ.get("TASK4_GGUF_REPO", DEFAULT_HF_REPO),
        help="Hugging Face model repo id to download from (default: env TASK4_GGUF_REPO or built-in)",
    )
    parser.add_argument("--revision", default=None, help="Hub git revision (branch / tag / commit) for download")
    parser.add_argument(
        "--no-fetch",
        action="store_true",
        help="Do not download from the Hub if the model file is missing",
    )
    parser.add_argument(
        "--fetch-only",
        action="store_true",
        help="Download the GGUF from the Hub then exit (no llama-server)",
    )
    parser.add_argument("--host", default="127.0.0.1")
    parser.add_argument("--port", type=int, default=8080)
    parser.add_argument("--threads", type=int, default=8)
    parser.add_argument("--ctx-size", type=int, default=2048)
    parser.add_argument(
        "--n-gpu-layers",
        type=int,
        default=0,
        help="GPU/Metal offload layer count; 0 = CPU only",
    )
    args = parser.parse_args(argv)

    if args.quant:
        if args.model is not None:
            # Precedence is unchanged (--quant wins); just make it visible.
            print(
                "Both --quant and --model given; using --quant and ignoring --model.",
                file=sys.stderr,
            )
        filename = QUANT_FILES[args.quant]
        model_path = (ROOT / "models" / filename).resolve()
    elif args.model is not None:
        model_path = args.model.resolve()
    else:
        model_path = DEFAULT_MODEL.resolve()

    if not model_path.is_file():
        if args.no_fetch:
            print(
                f"Model not found: {model_path}\n"
                "Remove --no-fetch to download from the Hub, or place a .gguf at this path.",
                file=sys.stderr,
            )
            raise SystemExit(1)
        print(f"Downloading {model_path.name} from {args.hf_repo} …", file=sys.stderr)
        try:
            downloaded = download_gguf(
                args.hf_repo,
                model_path.name,
                model_path.parent,
                revision=args.revision,
            )
        except Exception as e:
            # Top-level CLI boundary: report any download failure and exit.
            print(f"Download failed: {e}", file=sys.stderr)
            raise SystemExit(1) from e
        model_path = downloaded.resolve()
        if not model_path.is_file():
            print(f"Expected file after download: {model_path}", file=sys.stderr)
            raise SystemExit(1)
        print(f"Model ready: {model_path}", file=sys.stderr)

    if args.fetch_only:
        # The path goes to stdout (status messages use stderr) so scripts can capture it.
        print(model_path)
        raise SystemExit(0)

    exe = resolve_llama_server()
    if not exe:
        print(
            "llama-server not found. Build llama.cpp and set LLAMA_SERVER=/path/to/llama-server "
            "or put `llama-server` on PATH.",
            file=sys.stderr,
        )
        raise SystemExit(1)

    cmd = [
        exe,
        "-m",
        str(model_path),
        "--host",
        args.host,
        "--port",
        str(args.port),
        "--threads",
        str(args.threads),
        "--ctx-size",
        str(args.ctx_size),
        "--n-gpu-layers",
        str(args.n_gpu_layers),
        "--parallel",
        "1",
        "--no-warmup",
    ]
    # shlex.join quotes arguments containing spaces so the echoed line is
    # unambiguous; flush so it is emitted before the child writes anything.
    print("Running:", shlex.join(cmd), flush=True)
    try:
        returncode = subprocess.call(cmd)
    except OSError as e:
        # E.g. LLAMA_SERVER points at a missing or non-executable file.
        print(f"Failed to launch {exe}: {e}", file=sys.stderr)
        raise SystemExit(1) from e
    except KeyboardInterrupt:
        # Ctrl-C is the usual way to stop the server; exit with the
        # conventional 128 + SIGINT code instead of printing a traceback.
        raise SystemExit(130) from None
    raise SystemExit(returncode)
# Run the launcher only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()